# Template Lab

Evaluating and constructing TF search templates

In [1]:
import collections, os, sys, random, re
from tf.fabric import Fabric
from tf.extra.bhsa import Bhsa
os.sys.path.append('..')
from experiments2 import Experiment
from semspace import SemSpace

# Locations of the Text-Fabric feature data handed to Fabric: the BHSA
# features plus two directories from the verb_semantics project (heads, sdbh).
# NOTE(review): these are home-relative paths tied to one machine layout.
bhsa_data_paths=['~/github/etcbc/bhsa/tf/c',
                 '~/github/verb_semantics/project_code/lingo/heads/tf/c',
                 '~/github/verb_semantics/project_code/sdbh']

# Load only the features named in the string below.
TF = Fabric(bhsa_data_paths)
tf_api = TF.load('''
                function lex vs language
                pdp freq_lex gloss domain ls
                mother rela typ sp st code txt
                heads prep_obj
                prs prs_gn prs_nu prs_ps
                sem_domain sem_domain_code
              ''', silent=True)

# Put the API handles into the notebook namespace; the cells below use
# F, E, L and S unqualified.
tf_api.makeAvailableIn(globals())
# NOTE(review): name='' is presumably why the "This notebook online" links in
# this cell's saved output end in "/.ipynb" with no notebook name -- confirm
# and pass the notebook's file name here.
B = Bhsa(api=tf_api, name='', version='c')

This is Text-Fabric 4.3.4
Api reference : https://dans-labs.github.io/text-fabric/Api/General/
Tutorial      : https://github.com/Dans-labs/text-fabric/blob/master/docs/tutorial.ipynb
Example data  : https://github.com/Dans-labs/text-fabric-data

118 features found and 0 ignored


**Documentation:** <a target="_blank" href="https://etcbc.github.io/bhsa" title="{provenance of this corpus}">BHSA</a> <a target="_blank" href="https://etcbc.github.io/bhsa/features/hebrew/c/0_home.html" title="{CORPUS} feature documentation">Feature docs</a> <a target="_blank" href="https://dans-labs.github.io/text-fabric/Api/Bhsa/" title="BHSA API documentation">BHSA API</a> <a target="_blank" href="https://dans-labs.github.io/text-fabric/Api/General/" title="text-fabric-api">Text-Fabric API 4.3.4</a> <a target="_blank" href="https://dans-labs.github.io/text-fabric/Api/General/#search-templates" title="Search Templates Introduction and Reference">Search Reference</a>


This notebook online:
<a target="_blank" href="http://nbviewer.jupyter.org/github/verb_semantics/project_code/blob/master/datareview/.ipynb">NBViewer</a>
<a target="_blank" href="https://github.com/verb_semantics/project_code/blob/master/datareview/.ipynb">GitHub</a>


In [96]:
# Standard predicate target template.
# The text is completed with str.format:
#   {pred_funct} -> the phrase functions accepted for the predicate phrase
#                   (participle clauses additionally accept PreC)
#   {basis}      -> the template lines that select the basis element(s)
# The template names the clause `c1`, the predicate phrase `p1` and the verb
# word `target`, and ties `target` to a lex node with freq_lex>9.
# Everything between the triple quotes is template text, so explanatory
# notes must stay outside the string.

pred_target = '''

c1:clause
    p1:phrase

    /with/
    clause typ#Ptcp
        p:phrase function={pred_funct}
            -heads> word pdp=verb language=Hebrew
        p = p1
    /or/
    clause typ=Ptcp
        p:phrase function=PreC|{pred_funct}
            -heads> word pdp=verb language=Hebrew
        p = p1
    /-/

        target:word pdp=verb
    
{basis}

lex freq_lex>9
   lexword:word 
   lexword = target
'''

all_preds = 'Pred|PreO|PreS|PtcO' # all predicate phrase functions

def verb_token(target):
    '''
    Standard verb target tokenizer: combines the `lex` and
    `vs` feature values of the target word as "<lex>.<vs>".
    '''
    stem = F.vs.v(target)
    lexeme = F.lex.v(target)
    return '{}.{}'.format(lexeme, stem)

# SDBH codes: objects, events, referents, contexts.
# Raw string: the backslashes are regex escapes, not Python string escapes.
good_sem_codes = r'1\.00[1-3][0-9]*|2\.[0-9]*'

# Three groups of regex patterns, one group per coarse category, in the order
# animate, inanimate, events. Every pattern has exactly one capture group that
# holds the matched code.
# ordered in terms of selection preferences, select animate first, etc.
# Character classes list digits only: a comma or space inside [...] would be
# matched literally.
code_priorities = ((r'(1\.001001[0-9]*)',  # ANIMATE
                   r'(1\.00300100[36])',
                   r'(1\.00300101[03])',
                   r'(2\.075[0-9]*)'),

                  (r'(1\.00100[2-6][0-9]*)',  # INANIMATE
                   r'(1\.00300100[124789])',
                   r'(1\.00300101[12])',
                   r'(1\.00[13]$)',
                   r'(1\.00[13])\|',
                   r'(1\.003001$)',
                   r'(1\.003001)\|',
                   r'(1\.003001005$)', # names of groups (!)
                   r'(1\.003001005)\|',
                   r'(2\.[0-9]*)'), # frames

                  (r'(1\.002[0-9]*)', # EVENTS
                   r'(1\.003002[0-9]*)',
                   r'(1\.002$)',
                   r'(1\.002)\|'))

def code2tag(code):
    '''
    Reduces an SDBH semantic domain code string to one of
    three coarse tags: 'animate', 'inanimate' or 'event'.

    The pattern groups of code_priorities are tried in that
    order and the first group found in the code decides the
    tag. A code that fits none of them raises an Exception
    carrying the code.
    '''
    labels = ('animate', 'inanimate', 'event')
    for label, patterns in zip(labels, code_priorities):
        if re.search('|'.join(patterns), code):
            return label
    raise Exception(code) # avoid accidental selections

        
def code2domain(word):
    '''
    Selects the preferred SDBH semantic domain code
    and maps it to the longer form domain.

    The word's sem_domain_code and sem_domain values are read as
    "|"-separated lists in which position i of the codes belongs
    to position i of the domains. The pattern groups of
    code_priorities are tried in order; for the first group found
    in the code string, the first captured code is located among
    the separate codes and the domain at that position is returned.

    Raises Exception(word) when no group matches or when the lookup
    fails (for instance a missing feature value, or a captured code
    that is not itself one of the "|"-separated codes). The
    underlying error is attached as __cause__.
    '''

    code = F.sem_domain_code.v(word)
    domain = F.sem_domain.v(word)
    try:
        for patterns in code_priorities:
            pattern = '|'.join(patterns)
            if not re.search(pattern, code):
                continue
            # findall yields one tuple per hit with a slot per capture
            # group; take the first non-empty slot of the first hit
            match = next(m for groups in re.findall(pattern, code) for m in groups if m)
            code_index = code.split('|').index(match)
            return domain.split('|')[code_index]
        raise Exception(code) # avoid accidental selections
    except Exception as err:
        # only ordinary errors are rewrapped; KeyboardInterrupt and
        # SystemExit pass through untouched
        raise Exception(word) from err

    
def domainer(basis, target):
    '''
    Basis tokenizer for semantic domains: passes the basis word's
    sem_domain_code value through code2tag. `target` is not used.
    '''
    return code2tag(F.sem_domain_code.v(basis))

def prep_o_domainer(basis, target):
    '''
    Makes "<lex of basis>_<tag>" tokens, where the tag is the code2tag
    category of the first node reached from basis over the prep_obj
    edge. `target` is not used.
    '''
    governed = E.prep_obj.f(basis)[0]
    tag = code2tag(F.sem_domain_code.v(governed))
    return '{}_{}'.format(F.lex.v(basis), tag)

def lexer(basis, target):
    '''
    Basis tokenizer for simple lexemes: the `lex` feature
    value of the basis word. `target` is not used.
    '''
    lexeme = F.lex.v(basis)
    return lexeme

def prep_o_lexer(basis, target):
    '''
    Makes "<lex of basis>_<lex of prep object>" tokens; the object is
    the first node reached from basis over the prep_obj edge.
    `target` is not used.
    '''
    governed = E.prep_obj.f(basis)[0]
    return '{}_{}'.format(F.lex.v(basis), F.lex.v(governed))
    
def nuller(basis, target):
    '''
    Basis tokenizer for blank values: ignores both
    arguments and always yields the null sign.
    '''
    null_token = 'ø'
    return null_token

def functioner(basis, target):
    '''
    Function basis tokens: the `function` feature value
    of the basis node. `target` is not used.
    '''
    phrase_function = F.function.v(basis)
    return phrase_function

def relationer(basis, target):
    '''
    Clause relation basis tokens: the `rela` feature value
    of the basis node. `target` is not used.
    '''
    relation = F.rela.v(basis)
    return relation

def rela_prep_lexer(basis, target):
    '''
    Returns "<rela>.<prep lex>_<basis lex>": the rela value of the
    clause above basis, the lexeme of the first word with pdp 'prep'
    in the phrase above basis, and the lexeme of basis itself.
    StopIteration escapes when that phrase has no such word.
    '''
    clause = L.u(basis, 'clause')[0]
    rela = F.rela.v(clause)
    phrase = L.u(basis, 'phrase')[0]
    prep = next(filter(lambda w: F.pdp.v(w) == 'prep', L.d(phrase, 'word')))
    return '{}.{}_{}'.format(rela, F.lex.v(prep), F.lex.v(basis))

def rela_conj_lexer(basis, target):
    '''
    Returns "<rela>.<conjunction string>_<basis lex>": the rela value
    of the clause above basis, the concatenated lexemes of the words
    in that clause's first phrase with typ 'CP', and the lexeme of
    basis. StopIteration escapes when the clause has no such phrase.
    '''
    clause = L.u(basis, 'clause')[0]
    rela = F.rela.v(clause)
    conj_phrase = next(filter(lambda ph: F.typ.v(ph) == 'CP', L.d(clause, 'phrase')))
    conj_string = ''.join(map(F.lex.v, L.d(conj_phrase, 'word')))
    return '{}.{}_{}'.format(rela, conj_string, F.lex.v(basis))
   
def rela_lexer(basis, target):
    '''
    Returns "<rela>.<lex>": the rela value of the clause above
    basis and the lexeme of basis. `target` is not used.
    '''
    clause = L.u(basis, 'clause')[0]
    return '{}.{}'.format(F.rela.v(clause), F.lex.v(basis))


'''
Frame Methodology Notes:
Within the frame, every capturable element
must be present. If there is an uncapturable element, 
we must exclude the entire clause. Examples of "uncapturable
elements" are daughter clauses that are verbless without a 
conjunction. It is not possible to condense these down into
a lexical token, as can be done with כאשר + verb, for instance.
Thus, not only these clauses, but also their mothers, must be excluded.

In order to know which clauses should be excluded, we have
to run the whole experiment twice so that every clause relation can be
checked and validated. The first time we run it here in this module.

The second time the queries are run in the Experiment class to produce results.
The results are then cross-referenced against the first run to make sure that all
eligible functions are present in the complete result.

The class validateFrame (below) completes this task. The data is prepared
within the module and is then called to filter the final results.
'''

class validateFrame:
    '''
    Gathers frame validation data from search templates
    and afterwards filters result tuples against it.

    Attributes set up by the constructor:
        good_mothers   -- set of nodes found at index mother_ri
                          of every mother template result
        good_daughters -- maps a rela value to the set of nodes found
                          at index daughter_ri of the daughter results
        mother_ri, daughter_ri -- the result indices used above
    '''

    def __init__(self, mother_templates=tuple(),
                       daughter_templates=tuple(),
                       mother_ri = 0,
                       daughter_ri = 3,
                       exp_name = ''):

        print(f'Preparing frame validation data for {exp_name}...')

        self.mother_ri = mother_ri
        self.daughter_ri = daughter_ri
        self.good_mothers = set()
        self.good_daughters = collections.defaultdict(set)

        print('\tpreparing good mother set...')
        for template in mother_templates:
            self.good_mothers.update(hit[mother_ri] for hit in set(S.search(template)))

        print('\tpreparing good daughter set...')
        for template in daughter_templates:
            for hit in set(S.search(template)):
                daughter = hit[daughter_ri]
                self.good_daughters[F.rela.v(daughter)].add(daughter)

        print('\t√ Frame validation data prep complete.')

    def _daughters_pass(self, mother, tracked_relas):
        '''
        True when every node that has `mother` as its mother, and whose
        rela is one of tracked_relas, is a known good daughter for that
        rela. Nodes with any other rela are not examined.
        '''
        return all(
            daughter in self.good_daughters[F.rela.v(daughter)]
            for daughter in E.mother.t(mother)
            if F.rela.v(daughter) in tracked_relas
        )

    def mothers(self, results):
        '''
        Keeps the results whose mother (at index mother_ri)
        is a good mother and whose daughters are all valid.
        '''
        tracked_relas = set(self.good_daughters.keys())
        kept = []
        for hit in results:
            mother = hit[self.mother_ri]
            daughters_ok = self._daughters_pass(mother, tracked_relas)
            if daughters_ok and mother in self.good_mothers:
                kept.append(hit)
        return kept

    def daughters(self, results):
        '''
        Keeps the results whose daughters are all valid.
        NB: the mother is assumed to sit at index 0 of each result.
        '''
        tracked_relas = set(self.good_daughters.keys())
        return [hit for hit in results if self._daughters_pass(hit[0], tracked_relas)]

'''
The following search templates are specialized for
selecting carefully defined clause relations. These
templates have been crafted to select elements from the 
clauses which can easily be lexicalized as basis strings.
They exclude a small number of clause relations that cannot 
easily be lexicalized, such as verbless clauses without conjunction
elements (e.g. כאשר)
'''
    
# Clause relation: verbal daughter clause (kind=VC) that has a conjunction
# phrase (typ=CP) ordered before its predicate phrase (p1 < p2); the verb in
# the predicate phrase is named `basis`.
# {relas} and {reqs} are str.format placeholders for the accepted rela values
# and for extra constraints on the basis word.
# `c1` is not defined in this text; it has to come from the template this
# text is inserted into (pred_target names its clause c1).
clR_vc_CP = '''

#basis @ 6

c2:clause
    p1:phrase typ=CP
    p2:phrase
    /with/
    clause kind=VC rela={relas} typ#Ptcp
        p3:phrase function=Pred|PreS|PreO
        p3 = p2
    /or/
    clause kind=VC rela={relas} typ=Ptcp
        p3:phrase function=PreC|PtcO
        p3 = p2
    /-/

        basis:word pdp=verb {reqs}

c1 <mother- c2
c2 [[ p2
p1 < p2
'''

# Clause relation: verbal daughter clause (kind=VC) with no conjunction
# phrase (typ=CP), whose predicate phrase holds a preposition word followed
# by the verb.
# {relas} and {reqs} are str.format placeholders for the accepted rela values
# and for extra constraints on the verb.
# `c1` has to come from the template this text is inserted into.
clR_vc_prep = '''

#basis @ 6

c2:clause
/without/
    phrase typ=CP
/-/
    p2:phrase
    /with/
    clause kind=VC rela={relas} typ#Ptcp
        p:phrase function=Pred|PreS|PreO
        p = p2
    /or/
    clause kind=VC rela={relas} typ=Ptcp
        p:phrase function=PreC|PtcO
        p = p2
    /-/
    
        word pdp=prep
        < word pdp=verb {reqs} 

c1 <mother- c2
'''

# Clause relation: verbal daughter clause (kind=VC) with no conjunction
# phrase (typ=CP), no word with pdp prin|inrg, and a predicate phrase without
# a preposition; the verb in the predicate phrase is named `basis`.
# {relas} and {reqs} are str.format placeholders for the accepted rela values
# and for extra constraints on the basis word.
# `c1` has to come from the template this text is inserted into.
clR_vc_verb = '''

#basis @ 5

c2:clause
/without/
    phrase typ=CP
/-/
/without/
    word pdp=prin|inrg
/-/

    p2:phrase
    
    /with/
    clause kind=VC rela={relas} typ#Ptcp
        p:phrase function=Pred|PreS|PreO
        /without/
            word pdp=prep
        /-/
        p = p2
    /or/
    clause kind=VC rela={relas} typ=Ptcp
        p:phrase function=PreC|PtcO
        /without/
            word pdp=prep
        /-/
        p = p2
    /-/
    
        basis:word pdp=verb {reqs}

c1 <mother- c2
'''

# Clause relation: daughter clause of kind=NC with a conjunction phrase
# (typ=CP) ordered before a PreC phrase; the selected word is a head of the
# PreC phrase whose pdp is none of prep|prps|prde|prin|inrg.
# {relas} and {reqs} are str.format placeholders for the accepted rela values
# and for extra constraints on that head word.
# `c1` has to come from the template this text is inserted into.
clR_nc_CP = '''
c2:clause kind=NC rela={relas}
    phrase typ=CP
    < phrase function=PreC
        -heads> word pdp#prep|prps|prde|prin|inrg {reqs}

c1 <mother- c2
'''

# Clause relation: daughter clause of kind=NC without a conjunction phrase
# (typ=CP) and with an adverbial PreC phrase (typ=AdvP); the selected word is
# a head of that phrase whose pdp is none of prep|prps|prde|prin|inrg.
# {relas} and {reqs} are str.format placeholders for the accepted rela values
# and for extra constraints on that head word.
# `c1` has to come from the template this text is inserted into.
clR_nc_PreC_adv = '''
#only for use with adj/cmpl relations 

c2:clause kind=NC rela={relas}
/without/
    phrase typ=CP
/-/
    phrase function=PreC typ=AdvP
        -heads> word pdp#prep|prps|prde|prin|inrg {reqs}

c1 <mother- c2
'''

# Clause relation: daughter clause of kind=NC without a conjunction phrase
# (typ=CP) and with a prepositional PreC phrase (typ=PP); the phrase head is
# a preposition and the selected word is its prep_obj, with a pdp that is
# none of prep|prps|prde|prin|inrg.
# {relas} and {reqs} are str.format placeholders for the accepted rela values
# and for extra constraints on the prepositional object.
# `c1` has to come from the template this text is inserted into.
clR_nc_PreC_prep = '''
#only for use with adj/cmpl functions 

c2:clause kind=NC rela={relas}
/without/
    phrase typ=CP
/-/
    phrase function=PreC typ=PP
        -heads> word pdp=prep
        -prep_obj> word pdp#prep|prps|prde|prin|inrg {reqs}

c1 <mother- c2
'''

## Tests

In [106]:
# Clause-level conditions for the frame templates, completed with str.format:
#   {relas}       -> the phrase functions that count as frame elements
#   {clause_reqs} -> further clause-level template lines
#   {word_reqs}   -> constraints on the head word (non-PP phrases) or on the
#                    prepositional object (PP phrases)
# The clause may not contain a phrase with one of the {relas} functions whose
# typ is outside NP|PrNP|AdvP|PP. The /where/ ... /have/ parts require the
# word constraints for every phrase they describe.
# `c1 = c2` equates this clause with the clause named c1 by the template this
# text is inserted into (pred_target names its clause c1).
vf_clause_conditions = '''

c2:clause
/without/
    phrase function={relas} typ#NP|PrNP|AdvP|PP
/-/
{clause_reqs}

/where/
    phrase function={relas} typ#PP
/have/
    -heads> word pdp#prep|prps|prde|prin|inrg {word_reqs}
/-/

/where/
    phrase function={relas} typ=PP
/have/
    /where/
        -heads> word pdp=prep
    /have/
        -prep_obj> word pdp#prep|prps|prde|prin|inrg {word_reqs}
    /-/
/-/

c1 = c2
'''



# Semantic-domain (SD) variant of the clause conditions: the word constraint
# is a sem_domain_code regex match against good_sem_codes, and clauses that
# contain a phrase with function=Rela are excluded.
vf_all_arg_conditionsSD = vf_clause_conditions.format(relas='Objc|Cmpl|Adju|Time|Loca|PrAd', 
                                                      word_reqs=f'sem_domain_code~{good_sem_codes}',                                  
                                                      clause_reqs='/without/\n    phrase function=Rela\n/-/')

# The three templates below are the ones listed as mother_templates in the
# commented-out validateFrame call further down. Each one embeds the SD
# clause conditions plus one phrase pattern as the basis of pred_target.

# Basis: NP / PrNP / AdvP phrases with any of the frame functions.
vf_allarg_sd_np = pred_target.format(basis=f'''

{vf_all_arg_conditionsSD}

    phrase function=Objc|Cmpl|Adju|Time|Loca|PrAd typ=NP|PrNP|AdvP
        -heads> word
        
''', pred_funct='Pred|PreS')

# Basis: PP phrases with a non-object frame function, head plus prep object.
vf_allarg_sd_pp = pred_target.format(basis=f'''

{vf_all_arg_conditionsSD}

    phrase function=Cmpl|Adju|Time|Loca|PrAd typ=PP
        -heads> word
        -prep_obj> word
        
''', pred_funct='Pred|PreS')

# Basis: PP phrases with function Objc, head plus prep object.
vf_allarg_sd_pp_obj = pred_target.format(basis=f'''

{vf_all_arg_conditionsSD}

    phrase function=Objc typ=PP
        -heads> word
        -prep_obj> word
     
''', pred_funct='Pred|PreS')

# Clause Relations
# All six daughter templates share the same word requirement and predicate
# functions; only the clause-relation template and its rela values differ.
def _sd_clause_relation(template, relas):
    '''Fill a clause-relation template and embed it as the basis of pred_target.'''
    sd_basis = template.format(relas=relas, reqs=f'sem_domain_code~{good_sem_codes}')
    return pred_target.format(basis=sd_basis, pred_funct='Pred|PreS')

vf_argsSD_cr_vc_CP = _sd_clause_relation(clR_vc_CP, 'Objc|Cmpl|Adju')
vf_argsSD_cr_vc_prep = _sd_clause_relation(clR_vc_prep, 'Objc|Cmpl|Adju')
vf_argsSD_cr_vc_verb = _sd_clause_relation(clR_vc_verb, 'Objc|Cmpl|Adju')
vf_argsSD_cr_nc_CP = _sd_clause_relation(clR_nc_CP, 'Objc|Cmpl|Adju')
vf_argsSD_cr_nc_Prec_adv = _sd_clause_relation(clR_nc_PreC_adv, 'Cmpl|Adju')
vf_argsSD_cr_nc_Prec_prep = _sd_clause_relation(clR_nc_PreC_prep, 'Cmpl|Adju')

# valSD = validateFrame(mother_templates=(vf_allarg_sd_np,
#                                         vf_allarg_sd_pp, 
#                                         vf_allarg_sd_pp_obj),
#                       daughter_templates = (vf_argsSD_cr_vc_CP,
#                                             vf_argsSD_cr_vc_prep, 
#                                             vf_argsSD_cr_vc_verb,
#                                             vf_argsSD_cr_nc_CP,
#                                             vf_argsSD_cr_nc_Prec_adv,
#                                             vf_argsSD_cr_nc_Prec_prep),
#                       exp_name='vf_allarg_sd')

In [107]:
# Run the NP / PrNP / AdvP mother template; the saved output reports 8526 results.
test = B.search(vf_allarg_sd_np)

8526 results


In [104]:
# Show the fully assembled template text.
# NOTE(review): the saved output below still contains a literal
# {good_sem_codes}, which the current f-string definition would interpolate.
# This cell ran as In [104], before the defining cell In [106], so the output
# looks stale -- re-run after the definitions.
print(vf_allarg_sd_np)



c1:clause
    p1:phrase

    /with/
    clause typ#Ptcp
        p:phrase function=Pred|PreS
            -heads> word pdp=verb language=Hebrew
        p = p1
    /or/
    clause typ=Ptcp
        p:phrase function=PreC|Pred|PreS
            -heads> word pdp=verb language=Hebrew
        p = p1
    /-/

        target:word pdp=verb
    




c2:clause
/without/
    phrase function=Objc|Cmpl|Adju|Time|Loca|PrAd typ#NP|PrNP|AdvP|PP
/-/
/without/
    phrase function=Rela
/-/

/where/
    phrase function=Objc|Cmpl|Adju|Time|Loca|PrAd typ#PP
/have/
    -heads> word pdp#prep|prps|prde|prin|inrg sem_domain_code~{good_sem_codes}
/-/

/where/
    phrase function=Objc|Cmpl|Adju|Time|Loca|PrAd typ=PP
/have/
    /where/
        -heads> word pdp=prep
    /have/
        -prep_obj> word pdp#prep|prps|prde|prin|inrg sem_domain_code~{good_sem_codes}
    /-/
/-/

c1 = c2


    phrase function=Objc|Cmpl|Adju|Time|Loca|PrAd typ=NP|PrNP|AdvP
        -heads> word
        


lex freq_lex>9
   lexword:word 
   l

In [105]:
# Have the search engine check the template and set up its search space;
# the log below shows how each quantifier reduces the candidate nodes.
S.study(vf_allarg_sd_np)

   |     0.00s Feature overview: 111 for nodes; 6 for edges; 1 configs; 7 computed
  0.00s Checking search template ...
  0.01s Setting up search space for 8 objects ...
   |     0.00s "Quantifier on "p1:phrase"
   |      |   /with/
   |      |   p1:phrase
   |      |   clause typ#Ptcp
   |      |       p:phrase function=Pred|PreS
   |      |           -heads> word pdp=verb language=Hebrew
   |      |       p = p1
   |      |     1.81s adding 57255 to 0 yields 57255 nodes
   |      |   /or/
   |      |   p1:phrase
   |      |   clause typ=Ptcp
   |      |       p:phrase function=PreC|Pred|PreS
   |      |           -heads> word pdp=verb language=Hebrew
   |      |       p = p1/-/
   |      |     1.30s adding 5037 to 57255 yields 62292 nodes
   |     1.29s reduction from 253210 to 62292 nodes
   |     0.00s "Quantifier on "c2:clause"
   |      |   /without/
   |      |   c2:clause
   |      |       phrase function=Objc|Cmpl|Adju|Time|Loca|PrAd typ#NP|PrNP|AdvP|PP
   |      |   /-/
   | 