# Context Selection Development

In [1]:
# First, I load the necesssary modules, data, and helper functions.
import collections, random
from tf.fabric import Fabric
from functions.helpers import show_results, filter_results

# load BHSA data into TF
TF = Fabric(locations=['~/github/etcbc/bhsa/tf', '~/github/semantics/tf'], modules='c')
api = TF.load('''
                book chapter verse
                function sp pdp mother
                rela typ lex ls 
              ''')
api.makeAvailableIn(globals()) # globalize TF methods

This is Text-Fabric 3.2.2
Api reference : https://github.com/Dans-labs/text-fabric/wiki/Api
Tutorial      : https://github.com/Dans-labs/text-fabric/blob/master/docs/tutorial.ipynb
Example data  : https://github.com/Dans-labs/text-fabric-data

115 features found and 0 ignored
  0.00s loading features ...
   |     0.01s B book                 from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.01s B chapter              from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.01s B verse                from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.06s B function             from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.11s B sp                   from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.11s B pdp                  from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.18s B mother               from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.18s B rela                 from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.18s B typ                  from /Users/cody/github/etcbc/bh

# Head Noun Extraction

In [2]:
def good_phrs_type(phrase_atom, subphrases, diagnose=False):
    '''
    Return boolean on whether a phrase atom is an acceptable type.
    Acceptable is either a noun phrase (NP) or
    a prepositional phrase (PP) that is governed only by את.
    '''

    if F.typ.v(phrase_atom) == 'NP': # noun phrase
        return True
    
    # for logic on this selection criteria, see [?]
    prep_sp = sorted(sp for sp in subphrases # sorted sp with prepositions
                         if 'prep' in set(F.pdp.v(w) for w in L.d(sp, 'word')))
    phrase_type = prep_sp or (phrase_atom,)
    preps = [w for w in L.d(phrase_type[0], 'word') 
                    if F.pdp.v(w) == 'prep']
    
    if F.typ.v(phrase_atom) == 'PP' and preps: # check for את
        prep = preps[0]
        if F.lex.v(prep) == '>T':
            return True
        else:
            if diagnose:
                print('>T not found...')
            return False
    else:
        if diagnose:
            print('neither NP or PP...')
            print('phrase_type: ', phrase_type)
        return False

def get_genitive(abs_wnode, good_pdp, good_sp, diagnose=False):
    '''
    Extract the head noun in a כֹל construct chain.
    The function simply returns the first substantive in the chain.
    '''
    
    rectum = E.mother.t(abs_wnode) # get rectum subphrase
    abs_phrase = L.d(L.u(abs_wnode, 'phrase')[0], 'word') # for phrase boundary
    
    if not rectum:
        if diagnose:
            print('no rectum found at word', abs_wnode)
        return None  # abs not in norm. construct (e.g. w/ verbs)
    
    # get words and nouns in the rectum subphrase
    r_words = L.d(rectum[0], 'word')
    r_nouns = [w for w in r_words 
                   if F.sp.v(w) in good_sp
                   and F.pdp.v(w) in good_pdp
                   and w in abs_phrase]
    if r_nouns:
        return r_nouns[0] # return the first noun
    else:
        if diagnose:
            print('no noun found for word', abs_wnode)
        return None # no noun found, return nothing
    
def independent(phrase_atom, subphrases, heads_list):
    
    '''
    Checks phrase and subphrase relations for dependency relations.
    Requires a list of previously analyzed head nouns.
    This list is required to double check parallel (coordinate) relations. 
    '''
    # exclude words in phrase_atoms with these relation features
    omit_pa_rela = {'Appo', # apposition
                    'Spec'} # specification
    
    # exclude words in subphrases with these relation features
    omit_sp_rela = {'rec', # nomen rectum
                    'adj', # adjunct 
                    'atr', # attributive
                    'mod', # modifier
                    'dem'} # demontrative
    
    parallels = {'par', 'Para'} # parallel i.e. coordination specification
    omit_relas = omit_pa_rela | omit_sp_rela
    phrase_units = list(subphrases) + [phrase_atom]
    
    relas = set(F.rela.v(obj) for obj in list(subphrases) + [phrase_atom])
    
    if not relas & omit_relas and not parallels & relas: # good relas
        return True
    
    elif not relas & omit_relas and parallels & relas: # check parallel relations
        
        # assemble acceptable phrase mothers from the already accepted head nouns
        head_mothers = set(L.u(w, 'phrase_atom')[0] for w in heads_list)
        head_mothers |= set(L.u(w, 'subphrase') for w in heads_list)
        
        for pu in phrase_units:
            if F.rela.v(pu) in parallels:
                mother = E.mother.f(pu)[0]
                if mother in head_mothers:
                    return True
                else:
                    return False
                
    else: # noun is not independent
        return False  
        
    
def get_heads(phrase, diagnose=False):
    '''
    Returns substantive head nouns, if there are any, from a phrase node.
    "substantive" does not include prounouns.
    
    Based on a supplied phrase get phrase atom and subphrase features 
    and compare them against a group of sets.
    Define those sets first. Then make the comparison.
    
    *Caution* 
    This function works reasonably well,
    but there are a number of edge cases that it does not catch.
    Fine-tuning this function would make a nice notebook in itself.
    See Gen 20:5 for a good edge case example, in which both היא pronouns
    are registered as subjects, but only one should be.
    '''
    
    good_sp = {'subs', 'nmpr', 'adjv'}
    good_pdp = {'subs', 'nmpr'}
        
    heads = [] # nouns go here
    phrase_words = L.d(phrase, 'word')
        
    for word in phrase_words:
        
        # get phrases's phrase atoms, subphrases, and subphrase relations
        phrase_atom = L.u(word, 'phrase_atom')[0]
        subphrases = L.u(word, 'subphrase') 
        sp_relas = set(F.rela.v(sp) for sp in subphrases)
        
        test_good = [F.pdp.v(word) in good_pdp, # is noun
                     F.sp.v(word) in good_sp, # is noun
                     good_phrs_type(phrase_atom, subphrases, diagnose), # is NP or PP with את
                     independent(phrase_atom, subphrases, heads) 
                    ] # is valid subphrase rela.
        
        # compare word/phrase features
        if all(test_good):
        
            # handle quantifiers
            if F.lex.v(word) == 'KL/' or F.ls.v(word) == 'card':
                genitive_head = get_genitive(word, good_pdp, good_sp, diagnose) # returns word node or None
                if genitive_head:
                    heads.append(genitive_head) # valid quantified noun found
                else:
                    continue # no noun found, skip it
            else:
                heads.append(word) # word is a head
    
        else:
            if diagnose: 
                print(T.text([word]), word)
                print('test_good', tuple(zip(test_good, ('pdp', 'sp', 'phr_typ', 'pAt_rela', 'sp_rela'))))
                print('subphrases', subphrases)
                print('phrase_atom', phrase_atom)
                print()
            continue
            
    return heads

## Subject and Object Omissions

A previous version of the valid preposition function only identified nouns from prepositional phrase atoms that spanned the entire functional phrase. That omits cases such as Josh 24:18 with constructs:
> ('Joshua', 24, 18) <br>
> 722643 אֶת־כָּל־הָעַמִּ֗ים וְאֶת־הָאֱמֹרִ֛י <br>

The search below identifies phrases that begin with a preposition besides את and function as an object or subject (N.B. the prep. את can mark subjects in passive constructions). These are the cases which will be excluded by the new version of the function. A survey of these cases confirms that none of them contain nouns that are of interest: that is, none of these prepositions appear to grammaticaly mark a subject or object, but appear to be specifiers. 

For the old function, see `is_preposition_subj` in the [old version](https://github.com/codykingham/tfNotebooks/blob/master/4Q246_Participants/participant_functions/subjects.py).

In [3]:
targets = []

for phrase in F.otype.s('phrase'):
    
    if F.function.v(phrase) not in {'Subj', 'Objc'}:
        continue
    
    phrase_atoms = L.d(phrase, 'phrase_atom')
    pa_lex = set(F.lex.v(w) for w in L.d(phrase_atoms[0], 'word'))
        
    if F.typ.v(phrase_atoms[0]) == 'PP' and '>T' not in pa_lex:
        
        targets.append((L.u(phrase, 'clause')[0], phrase, phrase_atoms[0]))
        
show_results(targets, limit=5, highlight=[1])

93 results



-------------------- 



-------------------- 



-------------------- 



-------------------- 



-------------------- 

results cut off at 5


Further inspection of subject or object phrases that do not begin with את but contain את later on in the phrase shows that most of these are cases of adjectival specification, with a few parallel relationships reflected. In particular, it was important to be sure that את in the adjectival sense, especially where it has the sense of "with" rather than a grammatical one, would be excluded from the noun selector. Many of the finds by this search were more appositional than used in this "with sense". But a handful were found. 1 Chronicles 20:5, broken down below this search, confirms that את phrases in this adjectival sense are marked as `Spec` for specification. Thus, it is completely safe in the preposition parser function to take any את prepositional phrase. The acceptable subphrase relation set will then eliminate any match that is a specifier.

In [4]:
targets = []

for phrase in F.otype.s('phrase'):
    
    if F.function.v(phrase) not in {'Subj', 'Objc'}:
        continue
    
    phrase_atoms = L.d(phrase, 'phrase_atom')
    first_pa = phrase_atoms[0]
    f_pa_lex = set(F.lex.v(w) for w in L.d(first_pa, 'word'))
    
    if '>T' in f_pa_lex:
        continue
    
    for i in range(1, len(phrase_atoms)):
        
        other_pa = phrase_atoms[i]
        pa_lex = set(F.lex.v(w) for w in L.d(other_pa, 'word'))
        
        if F.typ.v(other_pa) == 'PP' and '>T' in pa_lex:

            targets.append((L.u(phrase, 'clause')[0], phrase, other_pa))
            break
        
show_results(targets, limit=5, highlight=[1, 2])

54 results



-------------------- 



-------------------- 



-------------------- 



-------------------- 



-------------------- 

results cut off at 5


As seen below, "war *with Philistines*" is marked as `Spec`. 

In [5]:
# ("1_Chronicles", 20, 5)
for sp in (L.d(892236, 'phrase_atom')):
    print(sp)
    print(T.text(L.d(sp, 'word')))
    print(F.rela.v(sp))
    print()

1158377
מִלְחָמָ֖ה 
NA

1158378
אֶת־פְּלִשְׁתִּ֑ים 
Spec



### Ordering of Subphrases on a L.u call from Word Node

This search demonstrates that subphrase nodes are indeed ordered by size when called from a word node, i.e. subphrases that contain less words receive smaller node numbers. If they are equal in size, either one might have the lower node number.

It is indeed acceptable to select the first subphrase on a `L.u` call from a word for preposition testing. That is the subphrase which will be closest to the word level.

In [6]:
not_true = []
true = 0
no_sp = 0

for word in F.otype.s('word'):
    
    subphrases = sorted(L.u(word, 'subphrase'))
    
    if not subphrases:
        no_sp += 1
        continue
        
    sp_len = sorted((len(L.d(sp, 'word')), sp) for sp in subphrases) # sort by word length
    sp_check = [sp[1] for sp in sp_len] # iterate over sorted list and grab subphrase nodes
    
    if sp_check == subphrases: # check them
        true += 1
        
    else:
        not_true.append(word)

len(not_true)

0

## Testing the Function

In [7]:
# # test the function

# results = []
# ct = 0

# object_phrases = list(F.function.s('Objc'))

# random.shuffle(object_phrases)

# for phrase in object_phrases:
    
#     heads = get_heads(phrase)
    
#     if heads and len(heads) < len(L.d(phrase, 'word')):
        
#         print(T.sectionFromNode(phrase))
#         print(T.text(L.d(L.u(phrase, 'clause')[0], 'word')))
#         print(phrase)
#         print(T.text(L.d(phrase, 'word')))
#         heads = [T.text([head]) for head in heads]
#         print(' | '.join(heads))
#         print()
        
#         ct += 1
        
#         if ct == 100:
#             break

### How Many Phrases Does it Validate?

In [8]:
phrase_counts = collections.Counter()

for phrase in F.otype.s('phrase'):
    
    if F.function.v(phrase) not in {'Subj', 'Objc'}:
        continue
        
    heads = get_heads(phrase)
    pdps = set(F.pdp.v(w) for w in L.d(phrase))
    
    phrase_counts['total'] += 1
    
    if heads:
        phrase_counts['heads_found'] += 1
        
    elif pdps & {'prps', 'prde', 'prin'}:
        phrase_counts['pronoun_excluded'] += 1
    
    elif pdps & {'intj'}:
        phrase_counts['interjection_excluded']
    
    else:
        phrase_counts['unknown'] += 1     
        
phrase_counts

Counter({'heads_found': 36417,
         'pronoun_excluded': 5740,
         'total': 54599,
         'unknown': 12441})

### Push to TF as Edges

Apply the function phrase atoms and phrases that serve as subjects or objects in the BHSA. The relationships will be pushed out to TF as an edge relation from a phrase to its head noun word nodes.

In [12]:
meta = {'': {'created_by': 'Cody Kingham',
             'coreData': 'BHSA',
             'coreVersion': 'c'
            },
        'heads' : {'source': 'see the notebooks at https://github.com/codykingham/semantics',
                  'valueType': 'int',
                  'edgeValues': False}
       }

heads = {}

for phrase in F.otype.s('phrase'):
    
    # only push features for these two types for now
    if F.function.v(phrase) not in {'Subj', 'Objc'}:
        continue
    
    phrase_heads = get_heads(phrase)
    
    if phrase_heads:
        heads[phrase] = set(phrase_heads)
    
    for phrase_atom in L.d(phrase, 'phrase_atom'):
        phraseAt_heads = get_heads(phrase_atom)
        if phraseAt_heads:
            heads[phrase_atom] = set(phraseAt_heads)
        
new_edges = {'heads': heads}

saveTF = Fabric('tf/c')
saveTF.save(nodeFeatures={}, edgeFeatures=new_edges, metaData=meta)

This is Text-Fabric 3.2.2
Api reference : https://github.com/Dans-labs/text-fabric/wiki/Api
Tutorial      : https://github.com/Dans-labs/text-fabric/blob/master/docs/tutorial.ipynb
Example data  : https://github.com/Dans-labs/text-fabric-data

0 features found and 0 ignored


  0.00s Warp feature "otype" not found in

  0.00s Warp feature "oslots" not found in



  0.00s Warp feature "otext" not found. Working without Text-API

  0.00s Exporting 0 node and 1 edge and 0 config features to tf/c:
   |     0.20s T heads                to tf/c
  0.20s Exported 0 node features and 1 edge features and 0 config features to tf/c


### Test Heads Feature

In [10]:
TF.load('heads', add=True)

  0.00s loading features ...


   |     0.00s Feature "heads" not available in
   |   /Users/cody/github/etcbc/bhsa/tf/c
   |   	/Users/cody/github/semantics/tf/c
  0.01s Not all features could be loaded/computed


   |     0.00s M otext                from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.03s B otype                from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.51s B oslots               from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.00s M otext                from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.01s B book                 from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.01s B chapter              from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.01s B verse                from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.12s B g_cons               from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.16s B g_cons_utf8          from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.12s B g_lex                from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.18s B g_lex_utf8           from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.14s B g_word               from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.18s B g_word_utf8          from /Users/cody/github/et

In [11]:
ct = 0 

for phrase in F.otype.s('phrase'):
    
    if F.function.v(phrase) not in {'Subj', 'Objc'}:
        continue
        
    if E.heads.f(phrase):
        
        clause = L.u(phrase, 'clause')[0]
        
        print(T.sectionFromNode(phrase))
        print(T.text(L.d(clause, 'word')))
        print(T.text(L.d(phrase, 'word')))
        print(E.heads.f(phrase))
        print(T.text(E.heads.f(phrase)))
        print()
    
    if ct > 100:
        break
        
    ct += 1

AttributeError: 'EdgeFeatures' object has no attribute 'heads'