# Context Selection Development

**TODO 10.03.18**

Currently the algorithm excludes phrase atoms that begin with the direct object marker. This is a problem, of course, for Objc function phrases. `is_preposition_subj` should be rewritten to check for valid phrase types. This may be tricky since not all את markers will be functioning as an object marker. So: fixing this requires first an exploration of the subphrase relation codes or phrase atom relations that can occur with את. I'm assuming that the specification relation will communicate an adjectival, rather than grammatical, force.

This gets even trickier when nouns in the object phrase are marked by markers other than את. I should check whether this happens and, if it does, whether a workaround can be built with the relation features.

For testing: 

('Joshua', 24, 18) <br>
722643 אֶת־כָּל־הָעַמִּ֗ים וְאֶת־הָאֱמֹרִ֛י <br>
sp:  אֶת־כָּל־הָעַמִּ֗ים <br>
par:  אֶת־הָאֱמֹרִ֛י <br>
עַמִּ֗ים <br>

In [1]:
# First, I load the necessary modules, data, and helper functions.
import collections
from tf.fabric import Fabric
from functions.helpers import show_results, filter_results

# Load the BHSA data into Text-Fabric (TF).
# NOTE(review): `locations` points at a local clone of the BHSA repository and
# `modules='c'` presumably selects data version "c" -- confirm for your setup.
TF = Fabric(locations='~/github/etcbc/bhsa/tf', modules='c', silent=True)
# Only the features listed here are requested; they are the ones the
# functions in this notebook read through F.<feature> / E.<feature>.
api = TF.load('''
                book chapter verse
                function sp pdp mother
                rela typ lex
              ''', silent=True)
# Put the TF API objects into the global namespace. The code below relies on
# the globals F, L, E and T (presumably supplied by this call -- verify
# against the Text-Fabric documentation).
api.makeAvailableIn(globals()) # globalize TF methods

In [126]:
def is_preposition_subj(word):
    '''
    Tell whether a word sits in a phrase that consists solely of a
    prepositional phrase atom.

    This is needed for subjects that are marked inside a prepositional
    phrase, such as in passive clauses.

    word: a word node.
    Returns True or False.

    *Caution*
    Does not capture cases such as Gen 21:5 (ca# 516487)
    '''
    # phrase that contains the word
    enclosing_phrase = L.u(word, otype='phrase')[0]

    # phrase atom types that are disregarded (negations and conjunctions)
    ignored_types = {'NegP', 'CP'}

    # types of the remaining phrase atoms, in phrase order
    remaining_types = [
        atom_type
        for atom_type in (F.typ.v(atom)
                          for atom in L.d(enclosing_phrase, otype='phrase_atom'))
        if atom_type not in ignored_types
    ]

    # prepositional subject <=> exactly one atom is left and it is a PP
    return remaining_types == ['PP']

def get_KL_head(KL_wordnode, good_pdp, good_sp):
    '''
    Find the head noun of a כֹל construct chain.

    The head is simply the first word of the related subphrase that
    qualifies as a substantive and lies inside the phrase of the כֹל word.

    KL_wordnode: word node of the כֹל word.
    good_pdp: accepted values of the `pdp` feature.
    good_sp: accepted values of the `sp` feature.

    Returns the word node of the first matching noun,
    False if the `mother` edge lookup on the word yields nothing
    (כֹל is not in a normal construct, e.g. with verbs),
    or None if the related subphrase holds no matching noun.
    '''
    # nodes linked to the word through the `mother` edge;
    # the first one is treated as the rectum subphrase
    linked = E.mother.t(KL_wordnode)

    # words of the enclosing phrase, used as the phrase boundary
    phrase_node = L.u(KL_wordnode, 'phrase')[0]
    boundary_words = L.d(phrase_node, 'word')

    if not linked:
        return False  # KL not in norm. construct (e.g. w/ verbs)

    # walk the rectum words in order and stop at the first noun
    for candidate in L.d(linked[0], 'word'):
        if (F.sp.v(candidate) in good_sp
                and F.pdp.v(candidate) in good_pdp
                and candidate in boundary_words):
            return candidate

    return None # no noun found, return nothing
    
    
def get_heads(phrase):
    '''
    Returns substantive head nouns, if there are any, from a phrase node.
    "Substantive" does not include pronouns.

    For every word in the phrase, the word's part-of-speech features and
    the relation features of its phrase atom and subphrases are compared
    against the sets defined below. Words that pass all checks are heads.
    For a כֹל word the noun found by get_KL_head is collected instead of
    the כֹל word itself; if no noun is found the word is skipped.

    phrase: a phrase node.
    Returns a list of word nodes (possibly empty), in phrase order.

    *Caution*
    This function works reasonably well,
    but there are a number of edge cases that it does not catch.
    Fine-tuning this function would make a nice notebook in itself.
    See Gen 20:5 for a good edge case example, in which both היא pronouns
    are registered as subjects, but only one should be.
    '''

    # accepted values for the `sp` feature and for the `pdp` feature;
    # `sp` additionally admits 'adjv' so that adjectives whose `pdp`
    # is substantival are kept
    good_sp = {'subs', 'nmpr', 'adjv'}
    good_pdp = {'subs', 'nmpr'}

    # exclude words in phrase_atoms with these relation features
    omit_pa_rela = {'Appo', # apposition
                    'Spec'} # specification

    # exclude words in subphrases with these relation features
    omit_sp_rela = {'rec', # nomen rectum
                    'adj', # adjunct
                    'atr', # attributive
                    'mod', # modifier
                    'dem'} # demonstrative

    heads = [] # nouns go here

    for word in L.d(phrase, 'word'):

        # is noun: each feature is checked against its own set,
        # the same pairing that get_KL_head uses
        if F.sp.v(word) not in good_sp or F.pdp.v(word) not in good_pdp:
            continue

        # the word's phrase atom must have a valid relation ...
        phrase_atom = L.u(word, 'phrase_atom')[0]
        if F.rela.v(phrase_atom) in omit_pa_rela:
            continue

        # ... and must be a noun phrase, unless the word is a prep. subj.
        if F.typ.v(phrase_atom) != 'NP' and not is_preposition_subj(word):
            continue

        # none of the word's subphrases may carry an excluded relation
        sp_relas = {F.rela.v(sp) for sp in L.u(word, 'subphrase')}
        if sp_relas & omit_sp_rela:
            continue

        # handle כֹל constructs by retrieving their noun:
        if F.lex.v(word) == 'KL/':
            KL_head = get_KL_head(word, good_pdp, good_sp) # word node, None or False
            if KL_head:
                heads.append(KL_head) # כֹל + noun found
            # otherwise no noun was found; skip the word
        else:
            heads.append(word) # word is a head

    return heads

In [143]:
# results = []
# ct = 0

# for phrase in F.function.s('Objc'):
    
#     heads = get_heads(phrase)
    
#     if heads and len(heads) < len(L.d(phrase, 'word')):
        
#         print(T.sectionFromNode(phrase))
#         print(T.text(L.d(L.u(phrase, 'clause')[0], 'word')))
#         print(phrase)
#         print(T.text(L.d(phrase, 'word')))
#         heads = [T.text([head]) for head in heads]
#         print(' | '.join(heads))
#         print()
        
#         ct += 1
        
#         if ct == 100:
#             break

In [28]:
# for i, word in enumerate(F.lex.s('KL/')):
        
#     daughter = E.mother.t(word)
    
#     if F.otype.v(word) == 'lex':
#         continue
    
#     phrase = L.u(word, 'phrase')[0]
    
#     if F.function.v(phrase) not in {'Subj', 'Objc'}:
#         continue
    
#     heads = get_heads(phrase)
#     lemmas = list(F.lex.v(w) for w in L.d(phrase, 'word'))
    
#     if heads and len(heads) > 1 and lemmas.count('KL/') == 1:
                
#         print(T.sectionFromNode(word))
#         print(T.text(L.d(phrase, 'word')), phrase)
#         print(T.text(L.d(daughter[0], 'word')))
#         print('|'.join(T.text([n]) for n in heads))
#         print()
        
#     if i > 200:
#         break

In [142]:
# tests = list()

# for i, word in enumerate(F.lex.s('KL/')):
            
#     if F.otype.v(word) == 'lex':
#         continue
    
#     phrase = L.u(word, 'phrase')[0]
    
#     if F.function.v(phrase) not in {'Subj', 'Objc'}:
#         continue
    
#     subphrases = L.u(word, 'subphrase')
    
#     for sp in subphrases:
        
#         daughter = E.mother.t(sp)[0] if E.mother.t(sp) else None
        
#         if not daughter:
#             continue
        
#         d_words = set(F.lex.v(w) for w in L.d(daughter, 'word'))
        
#         if F.rela.v(daughter) == 'par' and not d_words & {'KL/'}:
            
#             heads = get_heads(phrase)
            
#             print(T.sectionFromNode(word))
#             print(phrase, T.text(L.d(phrase, 'word')))
#             print('sp: ', T.text(L.d(sp, 'word')))
#             print('par: ', T.text(L.d(daughter, 'word')))
#             print('|'.join(T.text([n]) for n in heads))
#             print()
            