# Context Selection Development

**TODO 12.03.18**

Current the phrase type function only checks at subphrase, since I wrongly assumed every phrase atom has a subphrase (they don't). 

Thus, the phrase type function should first look for a phrase atom and then isolate the prepositional phrase.

To be determined:
    * if a phrase atom has no subphrase, does that also mean there will not be more than one preposition? That assumption SHOULD be true, since an additional preposition would introduce a relation and hence require a subphrase.
    * if the above is true, then selecting the prepositino from a phrase atom is very simple:

> `w for w in L.d(phrase_atom, 'word') if F.pdp.v(w) == 'prep'`



In [1]:
# First, I load the necesssary modules, data, and helper functions.
import collections
from tf.fabric import Fabric
from functions.helpers import show_results, filter_results

# load BHSA data into TF
TF = Fabric(locations='~/github/etcbc/bhsa/tf', modules='c', silent=True)
api = TF.load('''
                book chapter verse
                function sp pdp mother
                rela typ lex
              ''', silent=True)
api.makeAvailableIn(globals()) # globalize TF methods

In [234]:
def good_phrs_type(phrase_atom, subphrases):
    '''
    Return boolean on whether a phrase atom is an acceptable type.
    Acceptable is either a noun phrase (NP) or
    a prepositional phrase (PP) that is governed only by את.
    '''

    if F.typ.v(phrase_atom) == 'NP': # noun phrase
        return True
    
    # check for subphrase with preposition
    # necessary since this function is called at word level
    # i.e. some words are in phrase atom that is PP but not in sp with prep.
    sp = [sp for sp in subphrases
             if 'prep' in set(F.pdp.v(w) for w in L.d(sp, 'word'))]
    
    if F.typ.v(phrase_atom) == 'PP' and sp: # check for את
        
        # !! TO FIX: NOT ALL PHRASE ATOMS HAVE A SUBPHRASE
        # IN THESE CASES, THE ONLY WAY TO CHECK FOR >T 
        # IS WITH THE PHRASE ATOM ALONE
        
        subphrase = sp[0]
        
        preps = [w for w in L.d(subphrase, 'word') 
                     if F.pdp.v(w) == 'prep']
        
#         if len(preps) > 1: # sanity check
#             raise Exception(f'Check multiple preps at subphrase {subphrase}')
        
        if F.lex.v(preps[0]) == '>T':
            return True
        else:
            return False
    else:
        return False

def get_KL_head(KL_wordnode, good_pdp, good_sp):
    '''
    Extract the head noun in a כֹל construct chain.
    The function simply returns the first substantive in the chain.
    '''
    
    rectum = E.mother.t(KL_wordnode) # get rectum subphrase
    KL_phrase = L.d(L.u(KL_wordnode, 'phrase')[0], 'word') # for phrase boundary
    
    if not rectum:
        return False  # KL not in norm. construct (e.g. w/ verbs)
    
    # get words and nouns in the rectum subphrase
    r_words = L.d(rectum[0], 'word')
    r_nouns = [w for w in r_words 
               if F.sp.v(w) in good_sp
               and F.pdp.v(w) in good_pdp
               and w in KL_phrase
              ]
    
    if r_nouns:
        return r_nouns[0] # return the first noun
    else:
        return None # no noun found, return nothing
    
    
def get_heads(phrase):
    '''
    Returns substantive head nouns, if there are any, from a phrase node.
    "substantive" does not include prounouns.
    
    Based on a supplied phrase get phrase atom and subphrase features 
    and compare them against a group of sets.
    Define those sets first. Then make the comparison.
    
    *Caution* 
    This function works reasonably well,
    but there are a number of edge cases that it does not catch.
    Fine-tuning this function would make a nice notebook in itself.
    See Gen 20:5 for a good edge case example, in which both היא pronouns
    are registered as subjects, but only one should be.
    '''
    
    good_sp = {'subs', 'nmpr', 'adjv'}
    good_pdp = {'subs', 'nmpr'}

    # exclude words in phrase_atoms with these relation features
    omit_pa_rela = {'Appo', # apposition
                    'Spec'} # specification
    
    # exclude words in subphrases with these relation features
    omit_sp_rela = {'rec', # nomen rectum
                    'adj', # adjunct 
                    'atr', # attributive
                    'mod', # modifier
                    'dem'} # demontrative
        
    heads = [] # nouns go here
    phrase_words = L.d(phrase, 'word')
        
    for word in phrase_words:
        
        # get phrases's phrase atoms, subphrases, and subphrase relations
        phrase_atom = L.u(word, 'phrase_atom')[0]
        subphrases = L.u(word, 'subphrase') 
        sp_relas = set(F.rela.v(sp) for sp in subphrases)

        # compare word/phrase features
        if all([
                F.pdp.v(word) in good_pdp, # is noun
                F.sp.v(word) in good_sp, # is noun
                good_phrs_type(phrase_atom, subphrases), # is NP or PP with את
                F.rela.v(phrase_atom) not in omit_pa_rela, # is valid hrase_atom rela.
                not sp_relas & omit_sp_rela, # is valid subphrase rela.
               ]):
        
            # handle כֹל constructs by retrieving their noun:
            if F.lex.v(word) == 'KL/':
                KL_head = get_KL_head(word, good_pdp, good_sp) # returns word node or None
                if KL_head:
                    heads.append(KL_head) # כֹל + noun found
                else:
                    continue # no noun found, skip it
            else:
                heads.append(word) # word is a head
    
        else:
            continue
            
    return heads

## Subject and Object Omissions

A previous version of the valid preposition function only identified nouns from prepositional phrase atoms that spanned the entire functional phrase. That omits cases such as Josh 24:18 with constructs:
> ('Joshua', 24, 18) <br>
> 722643 אֶת־כָּל־הָעַמִּ֗ים וְאֶת־הָאֱמֹרִ֛י <br>

The search below identifies phrases that begin with a preposition besides את and function as an object or subject (N.B. the prep. את can mark subjects in passive constructions). These are the cases which will be excluded by the new version of the function. A survey of these cases confirms that none of them contain nouns that are of interest: that is, none of these prepositions appear to grammaticaly mark a subject or object, but appear to be specifiers. 

For the old function, see `is_preposition_subj` in the [old version](https://github.com/codykingham/tfNotebooks/blob/master/4Q246_Participants/participant_functions/subjects.py).

In [190]:
targets = []

for phrase in F.otype.s('phrase'):
    
    if F.function.v(phrase) not in {'Subj', 'Objc'}:
        continue
    
    phrase_atoms = L.d(phrase, 'phrase_atom')
    pa_lex = set(F.lex.v(w) for w in L.d(phrase_atoms[0], 'word'))
        
    if F.typ.v(phrase_atoms[0]) == 'PP' and '>T' not in pa_lex:
        
        targets.append((L.u(phrase, 'clause')[0], phrase, phrase_atoms[0]))
        
show_results(targets, limit=5, highlight=[1])

93 results



-------------------- 



-------------------- 



-------------------- 



-------------------- 



-------------------- 

results cut off at 5


Further inspection of subject or object phrases that do not begin with את but contain את later on in the phrase shows that most of these are cases of adjectival specification, with a few parallel relationships reflected. In particular, it was important to be sure that את in the adjectival sense, especially where it has the sense of "with" rather than a grammatical one, would be excluded from the noun selector. Many of the finds by this search were more appositional than used in this "with sense". But a handful were found. 1 Chronicles 20:5, broken down below this search, confirms that את phrases in this adjectival sense are marked as `Spec` for specification. Thus, it is completely safe in the preposition parser function to take any את prepositional phrase. The acceptable subphrase relation set will then eliminate any match that is a specifier.

In [191]:
targets = []

for phrase in F.otype.s('phrase'):
    
    if F.function.v(phrase) not in {'Subj', 'Objc'}:
        continue
    
    phrase_atoms = L.d(phrase, 'phrase_atom')
    first_pa = phrase_atoms[0]
    f_pa_lex = set(F.lex.v(w) for w in L.d(first_pa, 'word'))
    
    if '>T' in f_pa_lex:
        continue
    
    for i in range(1, len(phrase_atoms)):
        
        other_pa = phrase_atoms[i]
        pa_lex = set(F.lex.v(w) for w in L.d(other_pa, 'word'))
        
        if F.typ.v(other_pa) == 'PP' and '>T' in pa_lex:

            targets.append((L.u(phrase, 'clause')[0], phrase, other_pa))
            break
        
show_results(targets, limit=5, highlight=[1, 2])

54 results



-------------------- 



-------------------- 



-------------------- 



-------------------- 



-------------------- 

results cut off at 5


As seen below, "war *with Philistines*" is marked as `Spec`. 

In [192]:
# ("1_Chronicles", 20, 5)
for sp in (L.d(892236, 'phrase_atom')):
    print(sp)
    print(T.text(L.d(sp, 'word')))
    print(F.rela.v(sp))
    print()

1158377
מִלְחָמָ֖ה 
NA

1158378
אֶת־פְּלִשְׁתִּ֑ים 
Spec



### Ordering of Subphrases on a L.u call from Word Node

This search demonstrates that subphrase nodes are indeed ordered by size when called from a word node, i.e. subphrases that contain less words receive smaller node numbers. If they are equal in size, either one might have the lower node number.

It is indeed acceptable to select the first subphrase on a `L.u` call from a word for preposition testing. That is the subphrase which will be closest to the word level.

In [300]:
not_true = []
true = 0
no_sp = 0

for word in F.otype.s('word'):
    
    subphrases = sorted(L.u(word, 'subphrase'))
    
    if not subphrases:
        no_sp += 1
        continue
        
    sp_len = sorted((len(L.d(sp, 'word')), sp) for sp in subphrases) # sort by word length
    sp_check = [sp[1] for sp in sp_len] # iterate over sorted list and grab subphrase nodes
    
    if sp_check == subphrases: # check them
        true += 1
        
    else:
        not_true.append(word)

len(not_true)

0

## Testing the Function

In [193]:
# test the function

results = []
ct = 0

for phrase in F.function.s('Objc'):
    
    heads = get_heads(phrase)
    
    if heads and len(heads) < len(L.d(phrase, 'word')):
        
        print(T.sectionFromNode(phrase))
        print(T.text(L.d(L.u(phrase, 'clause')[0], 'word')))
        print(phrase)
        print(T.text(L.d(phrase, 'word')))
        heads = [T.text([head]) for head in heads]
        print(' | '.join(heads))
        print()
        
        ct += 1
        
        if ct == 100:
            break

('Genesis', 1, 1)
בְּרֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים אֵ֥ת הַשָּׁמַ֖יִם וְאֵ֥ת הָאָֽרֶץ׃ 
651506
אֵ֥ת הַשָּׁמַ֖יִם וְאֵ֥ת הָאָֽרֶץ׃ 
שָּׁמַ֖יִם  | אָֽרֶץ׃ 

('Genesis', 1, 11)
תַּֽדְשֵׁ֤א הָאָ֨רֶץ֙ דֶּ֔שֶׁא עֵ֚שֶׂב עֵ֣ץ פְּרִ֞י 
651619
דֶּ֔שֶׁא עֵ֚שֶׂב עֵ֣ץ פְּרִ֞י 
דֶּ֔שֶׁא  | עֵ֚שֶׂב  | עֵ֣ץ 

('Genesis', 1, 12)
וַתֹּוצֵ֨א הָאָ֜רֶץ דֶּ֠שֶׁא עֵ֣שֶׂב וְעֵ֧ץ 
651635
דֶּ֠שֶׁא עֵ֣שֶׂב וְעֵ֧ץ 
דֶּ֠שֶׁא  | עֵ֣שֶׂב  | עֵ֧ץ 

('Genesis', 1, 20)
יִשְׁרְצ֣וּ הַמַּ֔יִם שֶׁ֖רֶץ נֶ֣פֶשׁ חַיָּ֑ה 
651711
שֶׁ֖רֶץ נֶ֣פֶשׁ חַיָּ֑ה 
שֶׁ֖רֶץ 

('Genesis', 1, 21)
וַיִּבְרָ֣א אֱלֹהִ֔ים אֶת־הַתַּנִּינִ֖ם הַגְּדֹלִ֑ים וְאֵ֣ת כָּל־נֶ֣פֶשׁ לְמִֽינֵהֶ֗ם 
651719
אֶת־הַתַּנִּינִ֖ם הַגְּדֹלִ֑ים וְאֵ֣ת כָּל־נֶ֣פֶשׁ 
תַּנִּינִ֖ם  | נֶ֣פֶשׁ 

('Genesis', 1, 24)
תֹּוצֵ֨א הָאָ֜רֶץ נֶ֤פֶשׁ חַיָּה֙ לְמִינָ֔הּ 
651763
נֶ֤פֶשׁ חַיָּה֙ 
נֶ֤פֶשׁ 

('Genesis', 1, 24)
בְּהֵמָ֥ה וָרֶ֛מֶשׂ וְחַֽיְתֹו־אֶ֖רֶץ לְמִינָ֑הּ 
651765
בְּהֵמָ֥ה וָרֶ֛מֶשׂ וְחַֽיְתֹו־אֶ֖רֶץ 
בְּהֵמָ֥ה  | רֶ֛מֶשׂ  | חַֽיְתֹו־

('Genesis', 2, 4)
עֲשֹׂ֛ות יְהוָ֥ה 

Exception: Check multiple preps at subphrase 1301482

In [28]:
# for i, word in enumerate(F.lex.s('KL/')):
        
#     daughter = E.mother.t(word)
    
#     if F.otype.v(word) == 'lex':
#         continue
    
#     phrase = L.u(word, 'phrase')[0]
    
#     if F.function.v(phrase) not in {'Subj', 'Objc'}:
#         continue
    
#     heads = get_heads(phrase)
#     lemmas = list(F.lex.v(w) for w in L.d(phrase, 'word'))
    
#     if heads and len(heads) > 1 and lemmas.count('KL/') == 1:
                
#         print(T.sectionFromNode(word))
#         print(T.text(L.d(phrase, 'word')), phrase)
#         print(T.text(L.d(daughter[0], 'word')))
#         print('|'.join(T.text([n]) for n in heads))
#         print()
        
#     if i > 200:
#         break

In [142]:
# tests = list()

# for i, word in enumerate(F.lex.s('KL/')):
            
#     if F.otype.v(word) == 'lex':
#         continue
    
#     phrase = L.u(word, 'phrase')[0]
    
#     if F.function.v(phrase) not in {'Subj', 'Objc'}:
#         continue
    
#     subphrases = L.u(word, 'subphrase')
    
#     for sp in subphrases:
        
#         daughter = E.mother.t(sp)[0] if E.mother.t(sp) else None
        
#         if not daughter:
#             continue
        
#         d_words = set(F.lex.v(w) for w in L.d(daughter, 'word'))
        
#         if F.rela.v(daughter) == 'par' and not d_words & {'KL/'}:
            
#             heads = get_heads(phrase)
            
#             print(T.sectionFromNode(word))
#             print(phrase, T.text(L.d(phrase, 'word')))
#             print('sp: ', T.text(L.d(sp, 'word')))
#             print('par: ', T.text(L.d(daughter, 'word')))
#             print('|'.join(T.text([n]) for n in heads))
#             print()
            

In [296]:
# for phrase in F.typ.s('PP'):
    
#     words = L.d(phrase, 'word')
    
#     if F.function.v(phrase) not in ('Subj', 'Objc'):
#         continue
    
#     if F.pdp.v(words[0]) != 'prep':
        
#         clause = L.u(phrase, 'clause')[0]
        
#         print(T.sectionFromNode(phrase))
#         print(T.text(L.d(clause, 'word')))
#         print(phrase)
#         print(T.text(L.d(phrase, 'word')))
#         print([T.text([w]) for w in words if F.pdp.v(w) == 'prep'])
        
#         print()