# Evaluating Adjunct, Complement, and Object Clause Relation Selection Procedures

In [117]:
import collections, os, sys, random, re
from tf.fabric import Fabric
from tf.extra.bhsa import Bhsa
os.sys.path.append('..')
from experiments2 import Experiment
from semspace import SemSpace

# Text-Fabric data locations: the BHSA corpus plus two project-specific
# feature directories (`heads` and `sdbh`).
# NOTE(review): these are user-home-relative checkouts; the notebook only
# runs where the same directory layout exists.
bhsa_data_paths=['~/github/etcbc/bhsa/tf/c',
                 '~/github/verb_semantics/project_code/lingo/heads/tf/c',
                 '~/github/verb_semantics/project_code/sdbh']

# Load the explicitly listed features from those locations.
TF = Fabric(bhsa_data_paths)
tf_api = TF.load('''
                function lex vs language
                pdp freq_lex gloss domain ls
                mother rela typ sp st code txt
                heads prep_obj
                prs prs_gn prs_nu prs_ps
                sem_domain sem_domain_code
              ''', silent=True)

# Publishes the API handles into the notebook namespace; the cells below
# use F, E, L and S without further qualification.
tf_api.makeAvailableIn(globals())
# B is the helper used below for `B.search(...)` and `B.show(...)`.
B = Bhsa(api=tf_api, name='', version='c')

This is Text-Fabric 4.3.0
Api reference : https://dans-labs.github.io/text-fabric/Api/General/
Tutorial      : https://github.com/Dans-labs/text-fabric/blob/master/docs/tutorial.ipynb
Example data  : https://github.com/Dans-labs/text-fabric-data

118 features found and 0 ignored


**Documentation:** <a target="_blank" href="https://etcbc.github.io/bhsa" title="{provenance of this corpus}">BHSA</a> <a target="_blank" href="https://etcbc.github.io/bhsa/features/hebrew/c/0_home.html" title="{CORPUS} feature documentation">Feature docs</a> <a target="_blank" href="https://dans-labs.github.io/text-fabric/Api/Bhsa/" title="BHSA API documentation">BHSA API</a> <a target="_blank" href="https://dans-labs.github.io/text-fabric/Api/General/" title="text-fabric-api">Text-Fabric API 4.3.0</a> <a target="_blank" href="https://dans-labs.github.io/text-fabric/Api/General/#search-templates" title="Search Templates Introduction and Reference">Search Reference</a>


This notebook online:
<a target="_blank" href="http://nbviewer.jupyter.org/github/verb_semantics/project_code/blob/master/datareview/.ipynb">NBViewer</a>
<a target="_blank" href="https://github.com/verb_semantics/project_code/blob/master/datareview/.ipynb">GitHub</a>


In [4]:
# Smoke test of B.search: clauses that contain a phrase with function=Pred.
# NOTE(review): the "#here is a test note" line is part of the template
# string itself; presumably it checks that templates tolerate comment
# lines -- confirm.
test = B.search('''

#here is a test note

clause
    phrase function=Pred

''')

57070 results


In [6]:
# Standard predicate target template shared by the search patterns below.
# It is completed with str.format:
#   {pred_funct} -> function value(s) of the phrase that holds the target verb
#   {basis}      -> the pattern-specific part describing the related clause
# The trailing `lex` section requires the target verb to be a word of a
# lexeme with freq_lex > 9.
pred_target = '''
c1:clause
    phrase function={pred_funct}
        target:word pdp=verb language=Hebrew

{basis}

lex freq_lex>9
   lexword:word 
   lexword = target
'''


# --- Complement / adjunct daughter clauses (rela=Cmpl|Adju, kind=VC) ---
# Every template requires `c2 -mother> c1`, i.e. c2 is a daughter of the
# clause that holds the target verb.

# c2 begins with a VP (c2 =: p1) in which a preposition is immediately
# followed (w1 <: w2) by the verb that heads the phrase.
vf_allarg_clRela_CmpAdju_prep = pred_target.format(basis='''

c2:clause rela=Cmpl|Adju kind=VC
    p1:phrase typ=VP
        w1:word pdp=prep
        w2:word pdp=verb
    
c2 -mother> c1
c2 =: p1
p1 -heads> w2
w1 <: w2
''', pred_funct='Pred|PreS')

# c2 begins with a VP that itself begins with its head verb.
vf_allarg_clRela_CmpAdju_verb = pred_target.format(basis='''

c2:clause rela=Cmpl|Adju kind=VC
    p1:phrase typ=VP
        w1:word pdp=verb
    
c2 -mother> c1
p1 -heads> w1
p1 =: w1
p1 =: c2
''', pred_funct='Pred|PreS')

# c2 contains a CP phrase that precedes a VP headed by a verb.
vf_allarg_clRela_CmpAdju_conj = pred_target.format(basis='''

c2:clause rela=Cmpl|Adju kind=VC
    phrase typ=CP
    < phrase typ=VP
        -heads> word pdp=verb
    
c2 -mother> c1
''', pred_funct='Pred|PreS')

# --- Object daughter clauses (rela=Objc) ---
# Same three shapes as above, restricted to rela=Objc.

# Preposition + verb inside the clause-initial VP.
vf_allarg_clRela_Objc_prep = pred_target.format(basis='''

c2:clause rela=Objc kind=VC
    p1:phrase typ=VP
        w1:word pdp=prep
        w2:word pdp=verb
    
c2 -mother> c1
c2 =: p1
p1 -heads> w2
w1 <: w2
''', pred_funct='Pred|PreS')

# Clause-initial VP that begins with its head verb.
vf_allarg_clRela_Objc_verb = pred_target.format(basis='''

c2:clause rela=Objc kind=VC
    p1:phrase typ=VP
        w1:word pdp=verb
    
c2 -mother> c1
c2 =: p1
p1 -heads> w1
p1 =: w1
''', pred_funct='Pred|PreS')

# CP phrase followed by a verb-headed VP, in a verbal clause.
vf_allarg_clRela_Objc_VC_conj = pred_target.format(basis='''

c2:clause rela=Objc kind=VC
    phrase typ=CP
    < phrase typ=VP
        -heads> word pdp=verb

c2 -mother> c1
''', pred_funct='Pred|PreS')

# CP phrase followed by a PreC phrase (any head word) in a clause whose
# `kind` value does not start with "VC".
vf_allarg_clRela_Objc_NC_conj = pred_target.format(basis='''

c2:clause rela=Objc kind~^(?!VC)
    phrase typ=CP
    < phrase function=PreC
        -heads> word

c2 -mother> c1
''', pred_funct='Pred|PreS')

In [4]:
# Templates grouped by the relation of the daughter clause they select.
adjcmpl_patterns = (
    vf_allarg_clRela_CmpAdju_prep,
    vf_allarg_clRela_CmpAdju_verb,
    vf_allarg_clRela_CmpAdju_conj,
)

obj_patterns = (
    vf_allarg_clRela_Objc_prep,
    vf_allarg_clRela_Objc_verb,
    vf_allarg_clRela_Objc_VC_conj,
    vf_allarg_clRela_Objc_NC_conj,
)

# For every template, keep the first node of each result tuple (the main
# clause c1) and pool those nodes per relation group.
adjcmpl_clauses = {
    hit[0]
    for template in adjcmpl_patterns
    for hit in S.search(template)
}
obj_clauses = {
    hit[0]
    for template in obj_patterns
    for hit in S.search(template)
}

print(f'{len(adjcmpl_clauses)} adjcmpl clauses selected...')
print(f'{len(obj_clauses)} obj clauses selected...')

4744 adjcmpl clauses selected...
985 obj clauses selected...


In [5]:
# Unrestricted reference queries: any daughter clause with the given
# relation, whatever its internal structure.
all_adjcmpl_template = pred_target.format(pred_funct='Pred|PreS', basis='''

c2:clause rela=Cmpl|Adju

c1 <mother- c2
''')

all_obj_template = pred_target.format(pred_funct='Pred|PreS', basis='''

c2:clause rela=Objc

c1 <mother- c2
''')

# Index 0 of every result is the main clause c1.
all_adjcmpl = {hit[0] for hit in S.search(all_adjcmpl_template)}
all_obj = {hit[0] for hit in S.search(all_obj_template)}

print(f'All {len(all_adjcmpl)} adjcmpl clauses selected...')
print(f'All {len(all_obj)} obj clauses selected...')

All 5071 adjcmpl clauses selected...
All 1168 obj clauses selected...


In [6]:
# Main clauses returned by the unrestricted queries but by none of the
# structural patterns.
no_ac = all_adjcmpl.difference(adjcmpl_clauses)
no_obj = all_obj.difference(obj_clauses)

for unmatched, label in ((no_ac, 'non matched adjunct clauses'),
                         (no_obj, 'non matched object clauses')):
    print(len(unmatched), label)

327 non matched adjunct clauses
183 non matched object clauses


In [16]:
# Tally daughter-clause shapes, keyed by "<clause typ>\t<phrase functions>",
# separately for matched and unmatched main clauses.

no_ac_kinds = collections.Counter()
no_obj_kinds = collections.Counter()
ac_kinds = collections.Counter()
obj_kinds = collections.Counter()

tally_plan = (
    (no_ac_kinds, no_ac, {'Adju', 'Cmpl'}),
    (no_obj_kinds, no_obj, {'Objc'}),
    (ac_kinds, adjcmpl_clauses, {'Adju', 'Cmpl'}),
    (obj_kinds, obj_clauses, {'Objc'}),
)

for tally, mother_clauses, wanted_relas in tally_plan:
    for mother_clause in mother_clauses:
        # E.mother.t(...) yields the clauses that have mother_clause as mother
        for daughter in E.mother.t(mother_clause):
            if F.rela.v(daughter) not in wanted_relas:
                continue
            functions = '|'.join(F.function.v(phrase) for phrase in L.d(daughter, 'phrase'))
            tally[f'{F.typ.v(daughter)}\t{functions}'] += 1

In [43]:
# for k in sorted(no_ac_kinds.keys()):
#     print(k)

NameError: name 'no_ac_kinds' is not defined

In [38]:
# for k in sorted(no_obj_kinds.keys()):
#     print(k)

In [36]:
# for k in sorted(ac_kinds.keys()):
#     print(k)

In [37]:
# for k in sorted(obj_kinds.keys()):
#     print(k)

## New Tests

In [67]:
# Clauses in which no phrase has typ CP.
not_exist_CP = set(cl for cl in F.otype.s('clause') if 'CP' not in set(F.typ.v(ph) for ph in L.d(cl, 'phrase')))
# Clauses with neither a PreC nor a Subj phrase.
# NOTE(review): the name mentions only PreC, but Subj is excluded as well.
not_exist_PreC = set(cl for cl in F.otype.s('clause') if not {'PreC', 'Subj'} & set(F.function.v(ph) for ph in L.d(cl, 'phrase')))

# Daughter clauses (Adju/Cmpl/Objc) from the PreC/Subj-less set whose `kind`
# does not start with "VC" and whose typ does not start with Ellp, CPen or Reop.
verbless = '''

c1:clause
    phrase function=Pred|PreS|PreO|PtcO

c2:clauseNoPreC typ~^(?!Ellp|CPen|Reop) kind~^(?!VC) rela=Adju|Cmpl|Objc

c1 <mother- c2
'''

# `verbless` is rebound from the template text to the search results.
# NOTE(review): 'clauseNoCP' is supplied here but the template above only
# refers to clauseNoPreC.
verbless = B.search(verbless, sets={'clauseNoCP': not_exist_CP, 'clauseNoPreC': not_exist_PreC})

14 results


In [68]:
B.show(verbless[:10])


##### Verse 1



##### Verse 2



##### Verse 3



##### Verse 4



##### Verse 5



##### Verse 6



##### Verse 7



##### Verse 8



##### Verse 9



##### Verse 10


In [36]:
# Verb phrases (typ == 'VP') that contain no word with pdp == 'prep'.
VP_no_prep = set(ph for ph in F.otype.s('phrase') if F.typ.v(ph) == 'VP' and 'prep' not in set(F.pdp.v(w) for w in L.d(ph, 'word')))

# Verbal daughter clauses without a CP phrase that contain a prepositionless VP.
# NOTE(review): the variable name says "adj", but the template restricts
# rela to Cmpl only -- confirm which is intended.
adj_verb_noCP = '''

c1:clause
    phrase function=Pred|PreS|PreO|PtcO

c2:clauseNoCP kind=VC rela=Cmpl
    VP_prepless

c1 <mother- c2
'''

# Relies on `not_exist_CP` from the earlier "New Tests" cell.
bare_verb = B.search(adj_verb_noCP, sets={'clauseNoCP': not_exist_CP, 'VP_prepless': VP_no_prep})

6 results


In [93]:
# Nominal daughter clauses (kind=NC) without a CP phrase that contain a
# predicate-complement phrase.
search = '''

c1:clause
    phrase function=Pred|PreS|PreO|PtcO

c2:clauseNoCP kind=NC rela=Cmpl|Adju|Objc
    phrase function=PreC

c1 <mother- c2
'''


# `search` is rebound from the template text to the result list; the
# following cell displays it via B.show(search).
search = B.search(search, sets={'clauseNoCP': not_exist_CP})

94 results


In [56]:
B.show(search)


##### Verse 1



##### Verse 2



##### Verse 3



##### Verse 4



##### Verse 5



##### Verse 6



##### Verse 7



##### Verse 8



##### Verse 9



##### Verse 10



##### Verse 11


# New Patterns

Here are some new patterns that handle all clause relation types.

In [258]:
def filterPreC(results, clause_index=3, phrase_index=5):
    '''
    Filters results with predicate complement basis
    from clause types that are not participial.

    A result is dropped when the phrase at `phrase_index` has
    function PreC while the clause at `clause_index` has a typ
    that does not start with "Ptcp". All other results are kept
    in their original order.

    results      -- iterable of search result tuples (node numbers)
    clause_index -- position of the related clause in each tuple
    phrase_index -- position of the basis phrase in each tuple

    The defaults (3 and 5) fit templates in which a CP phrase sits
    between the clause and the basis phrase. Templates without that
    CP phrase have the basis phrase one position earlier and need
    phrase_index=4; with the default, the node at index 5 is not a
    phrase there and nothing is filtered.
    '''
    new_results = []
    for r in results:
        # look up the phrase function first; the clause typ only matters for PreC
        if F.function.v(r[phrase_index]) == 'PreC':
            if not F.typ.v(r[clause_index]).startswith('Ptcp'):
                continue
        new_results.append(r)
    return new_results
    
    
# Custom node sets referenced by name in the search templates.
sets = {
    # clauses in which no phrase has typ CP
    'clauseNoCP': {
        clause
        for clause in F.otype.s('clause')
        if all(F.typ.v(phrase) != 'CP' for phrase in L.d(clause, 'phrase'))
    },
    # phrases in which no word has pdp prep
    'phraseNoPrep': {
        phrase
        for phrase in F.otype.s('phrase')
        if all(F.pdp.v(word) != 'prep' for word in L.d(phrase, 'word'))
    },
}
    
# Verbal daughter clause with a CP phrase that precedes a predicate-type
# phrase containing a verb.
clR_vc_CP = pred_target.format(basis='''

c2:clause kind=VC rela=Objc|Cmpl|Adju
    phrase typ=CP
    < phrase function=Pred|PreS|PreO|PtcO|PreC
        word pdp=verb

c1 <mother- c2
''', pred_funct='Pred|PreS')

# Verbal daughter clause without a CP phrase whose predicate-type phrase
# contains both a preposition and a verb.
clR_vc_prep = pred_target.format(basis='''

c2:clauseNoCP kind=VC rela=Objc|Cmpl|Adju
    phrase function=Pred|PreS|PreO|PtcO|PreC
        word pdp=prep
        word pdp=verb

c1 <mother- c2

''', pred_funct='Pred|PreS')

# Verbal daughter clause without a CP phrase whose predicate-type phrase
# has no preposition (custom set phraseNoPrep) and contains a verb.
clR_vc_verb = pred_target.format(basis='''

c2:clauseNoCP kind=VC rela=Objc|Cmpl|Adju
    phraseNoPrep function=Pred|PreS|PreO|PtcO|PreC
        word pdp=verb

c1 <mother- c2

''', pred_funct='Pred|PreS')

# Nominal daughter clause (kind=NC) with a CP phrase that precedes a
# predicate-complement phrase; the basis is the head word of that phrase.
# A single assignment is enough: the template text is fixed.
clR_nc_CP = pred_target.format(basis='''

c2:clause kind=NC rela=Objc|Cmpl|Adju
    phrase typ=CP
    < phrase function=PreC
        -heads> word

c1 <mother- c2

''', pred_funct='Pred|PreS')

# ADJU/CMPL ONLY FROM THIS POINT ON
# (the two templates below restrict rela to Adju|Cmpl; Objc is not included)

# Nominal daughter clause without a CP phrase whose predicate complement is
# an adverbial phrase (typ=AdvP); the basis is that phrase's head word.
clR_nc_PreC_adv = pred_target.format(basis='''

c2:clauseNoCP kind=NC rela=Adju|Cmpl
    phrase function=PreC typ=AdvP
        -heads> word

c1 <mother- c2

''', pred_funct='Pred|PreS')

# Nominal daughter clause without a CP phrase whose predicate complement is
# a prepositional phrase: head preposition plus the word reached through
# its prep_obj edge.
clR_nc_PreC_prep = pred_target.format(basis='''

c2:clauseNoCP kind=NC rela=Adju|Cmpl
    phrase function=PreC typ=PP
        -heads> word pdp=prep
        -prep_obj> word

c1 <mother- c2

''', pred_funct='Pred|PreS')

In [252]:
# Each entry pairs a search template with an optional post-filter.
# NOTE(review): filterPreC reads the basis phrase from result index 5.
# Assuming result tuples follow the order of the template lines, index 5 is
# the predicate-type phrase only for clR_vc_CP. In clR_vc_prep and
# clR_vc_verb there is no CP phrase line, so the phrase is at index 4 and
# index 5 is a word; the filter then likely removes nothing -- confirm.
patterns = ((clR_vc_CP, filterPreC),
            (clR_vc_prep, filterPreC),
            (clR_vc_verb, filterPreC),
            (clR_nc_CP, None),
            (clR_nc_PreC_adv, None),
            (clR_nc_PreC_prep, None))

covered_clauses = set()

for pattern, filt in patterns:
    results = [r for r in S.search(pattern, sets=sets)]
    # apply the post-filter only where one is given
    results = results if not filt else filt(results)
    # record the main clause (index 0) of every surviving result
    covered_clauses |= set(r[0] for r in results)
    
len(covered_clauses)

5899

In [253]:
# Reference query: every Objc/Adju/Cmpl daughter clause whose typ does not
# start with Ellp or CPen.
all_others_template = pred_target.format(pred_funct='Pred|PreS', basis='''

c2:clause rela=Objc|Adju|Cmpl typ~^(?!Ellp|CPen)

c1 <mother- c2
''')

# Keep the results whose main clause (index 0) none of the patterns covered.
all_others = [
    hit
    for hit in S.search(all_others_template)
    if hit[0] not in covered_clauses
]

len(all_others)

92

There are only 92 cases not covered by the new parameters when instances of ellipsis and casus pendens are excluded.

In [257]:
# Count uncovered results whose daughter clause (index 3) contains a word
# with sp value 'prin' or 'inrg'.
sum(1 for hit in all_others
    if any(F.sp.v(word) in {'prin', 'inrg'} for word in L.d(hit[3], 'word')))

41

41 of these 92 cases are instances of interrogatives. Many others are object clauses that are complete sentences and are thus not conducive to simplification.

### Check for Clauses with Two Relas in the Dataset

In [5]:
# Main clauses that are mother to two different adjunct clauses.
# NOTE(review): only rela=Adju is tested here, although the section heading
# speaks of two relas in general.
# NOTE(review): `c2 # c3` only requires the two nodes to differ, so each
# pair presumably appears twice (once per ordering) in the result count --
# confirm before interpreting the number.
two_rela = pred_target.format(basis='''

c2:clause rela=Adju
c3:clause rela=Adju

c1 <mother- c2
c1 <mother- c3
c2 # c3
''', pred_funct='Pred|PreS')

# `two_rela` is rebound from the template text to the search results.
two_rela = B.search(two_rela)

288 results


In [6]:
B.show(two_rela[:5])


##### Passage 1



##### Passage 2



##### Passage 3



##### Passage 4



##### Passage 5


In [266]:
len(covered_clauses)

5899

In [267]:
# Scratch check: membership of 1..9 in a small tuple.
test = (1, 2, 3, 4)

list(map(test.__contains__, range(1, 10)))

[True, True, True, True, False, False, False, False, False]

In [8]:
# Regular expressions for semantic-domain codes, grouped by category:
# index 0 = animate, 1 = inanimate, 2 = events. Within a group the patterns
# are joined with "|" by the code that consumes them, so their order matters.
#
# Raw strings keep "\." as a literal-dot escape without triggering Python's
# invalid-escape-sequence warning. Character classes list digits only: a
# class such as [3,6] would also match a literal comma.
code_priorities = ((r'1\.001001[0-9]*',  # ANIMATE
                    r'1\.00300100[36]',
                    r'1\.00300101[03]',
                    r'2\.075[0-9]*'),

                   (r'1\.00100[2-6][0-9]*',  # INANIMATE
                    r'1\.00300100[1247-9]',
                    r'1\.00300101[12]',
                    r'1\.00[13]$',
                    r'1\.003001',
                    r'1\.003001005', # names of groups (!)
                    r'2\.[0-9]*'), # frames

                   # NOTE(review): [1-9]* excludes the digit 0; confirm this
                   # is intended rather than [0-9]*.
                   (r'1\.002[1-9]*', # EVENTS
                    r'1\.003002[1-9]*',
                    r'1\.002$'))

import re

In [27]:
code = '1.001001009|3.0010001'

# Find the position of the first "|"-separated code that matches one of the
# animate patterns. Each part is tested on its own, so a pattern that
# matches only a prefix of a code still yields a valid position (looking up
# a matched substring with list.index would raise ValueError in that case).
animate_pattern = re.compile('|'.join(code_priorities[0]))
code_parts = code.split('|')
matching_positions = [position for position, part in enumerate(code_parts)
                      if animate_pattern.match(part)]

# As before, an IndexError is raised when no part matches.
code_index = matching_positions[0]
match = code_parts[code_index]

In [28]:
code_index

0

## Quantifier Experiments

Exclude clauses with a phrase typ CP

In [4]:
# Quantifier version of the "no CP phrase" condition: the `no: ... end:`
# block excludes clauses for which the enclosed line can be matched.
test = '''

clause
no:
    ^ phrase typ=CP
end:

'''

# `test` is rebound from the template text to the search results.
test = B.search(test)

36249 results


Compared with hand coded result...

In [5]:
# Hand-coded equivalent: count clauses in which no phrase has typ CP.
sum(1 for clause in F.otype.s('clause')
    if all(F.typ.v(phrase) != 'CP' for phrase in L.d(clause, 'phrase')))

36249

Great!

Now with the default target template.

In [10]:
# Same quantifier, embedded in the standard target template: verbal daughter
# clauses without a CP phrase whose predicate-type phrase contains a
# preposition and a verb (the verb is labelled `basis`).
test = pred_target.format(basis='''

c2:clause kind=VC rela=Adju|Cmpl|Objc
no:
   ^ phrase typ=CP
end:
    phrase function=Pred|PreS|PreO|PtcO|PreC
        word pdp=prep
        basis:word pdp=verb

c1 <mother- c2
''', pred_funct='Pred|PreS')

# `test` is rebound from the template text to the search results.
test = B.search(test)

4353 results


Compare with the old template...

In [86]:
# Custom node sets for the set-based ("old") templates.
RelaSets = {
    # clauses in which no phrase has typ CP
    'clauseNoCP': {
        clause
        for clause in F.otype.s('clause')
        if all(F.typ.v(phrase) != 'CP' for phrase in L.d(clause, 'phrase'))
    },
    # phrases in which no word has pdp prep
    'phraseNoPrep': {
        phrase
        for phrase in F.otype.s('phrase')
        if all(F.pdp.v(word) != 'prep' for word in L.d(phrase, 'word'))
    },
}

In [14]:
# Set-based counterpart of the quantifier query: the custom set clauseNoCP
# takes the place of the `no:` block.
test2 = pred_target.format(basis='''

c2:clauseNoCP kind=VC rela=Adju|Cmpl|Objc
    phrase function=Pred|PreS|PreO|PtcO|PreC
        word pdp=prep
        basis:word pdp=verb

c1 <mother- c2''', pred_funct='Pred|PreS')

# `test2` is rebound from the template text to the search results.
test2 = B.search(test2, sets=RelaSets)

4353 results


It works! Excellent!

Can it solve my old PreC problem? A PreC phrase should only be selected in the case of participial clauses. How to code this with quantifiers? 

In [105]:
# Quantifier-only formulation of the PreC rule: the `either:` branch takes
# Pred|PreS|PreO phrases in clauses with typ#Ptcp, the `or:` branch takes
# PreC|PtcO phrases in clauses with typ=Ptcp. The `no:` block excludes
# clauses with a CP phrase.
# In the results, index 1 is c1, index 2 is c2 and index 3 is p2 -- assuming
# result tuples follow the order of the template lines.
test = '''

s1:sentence
    c1:clause
    c2:clause 
    no:
       ^ phrase typ=CP
    end:
    p2:phrase
    
    either:
        clause kind=VC rela=Adju|Cmpl|Objc typ#Ptcp
            p3:phrase function=Pred|PreS|PreO
            p3 = p2
    or:
        clause kind=VC rela=Adju|Cmpl|Objc typ=Ptcp
            p3:phrase function=PreC|PtcO
            p3 = p2
    end:
    
        -heads> word pdp=verb

c1 <mother- c2
c2 [[ p2
'''

# `test` is rebound from the template text to the search results.
test = B.search(test)

5868 results


In [98]:
#B.show(test[:6])

In [99]:
#B.show([r for r in test if F.function.v(r[3]) == 'PreC'][:5])

Let's compare the results to the old solution...

In [100]:
def filterPreC(results, clause_index=1, phrase_index=2):
    '''
    Filters results with predicate complement basis
    from clause types that are not participial.

    A result is dropped when the phrase at `phrase_index` has
    function PreC while the clause at `clause_index` has a typ
    that does not start with "Ptcp". All other results are kept
    in their original order.

    results      -- iterable of search result tuples (node numbers)
    clause_index -- position of the related clause in each tuple
    phrase_index -- position of the basis phrase in each tuple

    The defaults (1 and 2) fit the `test_old` template of this
    cell (c1, c2, phrase, basis word). Pass other positions for
    templates with a different line order.
    '''
    new_results = []
    for r in results:
        # look up the phrase function first; the clause typ only matters for PreC
        if F.function.v(r[phrase_index]) == 'PreC':
            if not F.typ.v(r[clause_index]).startswith('Ptcp'):
                continue
        new_results.append(r)
    return new_results

# Set-based ("old") formulation: clauseNoCP replaces the `no:` quantifier and
# the PreC restriction is applied afterwards by filterPreC.
# Result layout assumed by filterPreC: index 1 = c2, index 2 = the phrase.
test_old = '''

c1:clause
c2:clauseNoCP kind=VC rela=Adju|Cmpl|Objc
    phrase function=Pred|PreS|PreO|PtcO|PreC
        -heads> basis:word 

c1 <mother- c2

'''

# `test_old` is rebound twice: template text -> raw results -> filtered results.
test_old = S.search(test_old, sets=RelaSets)
test_old = filterPreC(test_old)

len(test_old)

5868

In [104]:
set(r[1] for r in test_old) == set(r[2] for r in test)

True

The results are equivalent! While the first template is longer and more complicated, it avoids the ambiguity and opacity of relying on sets and post-processing functions defined elsewhere. It allows the experiment to be completely operationalized in one statement.

### Further Tests

In [280]:
# NOTE(review): this re-assigns `pred_target` with the same text as the
# definition in the template cell near the top of the notebook.
pred_target = '''
c1:clause
    phrase function={pred_funct}
        target:word pdp=verb language=Hebrew

{basis}

lex freq_lex>9
   lexword:word 
   lexword = target
'''

# The string literal below is neither assigned nor passed to anything, so it
# has no effect when the cell runs; it holds a phrase-level either/or
# quantifier (non-PP argument phrases, or Objc phrases of typ PP).
'''
    p1:phrase
    either:
        p2:phrase function=Objc|Cmpl|Adju|Time|Loca|PrAd typ#PP
        p1 = p2
    or:
        p2:phrase function=Objc typ=PP
        p1 = p2
    end:
'''


# Clause-level variant: c3 is required to equal c1 and must satisfy one of
# the two branches (a non-PP argument phrase, or an Objc phrase of typ PP).
test = B.search(pred_target.format(basis='''
        
c3:clause
    either:
        c2:clause
            phrase function=Objc|Cmpl|Adju|Time|Loca|PrAd typ#PP
        c2 = c3
    or:
        c2:clause
            phrase function=Objc typ=PP
        c2 = c3
    end:
    
    
c3 = c1
''', pred_funct='Pred|PreS'))

20099 results


In [281]:
# Phrase-level variant of the either/or quantifier: p1 must be either a
# non-PP phrase with one of the listed functions or an Objc phrase of typ PP.
# The next cell compares the main clauses (index 0) of `test2` and `test`.
test2 = B.search(pred_target.format(basis='''
        
    p1:phrase
    either:
        p2:phrase function=Objc|Cmpl|Adju|Time|Loca|PrAd typ#PP
        p1 = p2
    or:
        p2:phrase function=Objc typ=PP
        p1 = p2
    end:

''', pred_funct='Pred|PreS'))

21475 results


In [282]:
set(r[0] for r in test2) - set(r[0] for r in test) 

set()