# 4. Semantic Space Construction

In this NB, I will apply the new `heads` edge feature to extract head nouns from their phrase and record their co-occurring verbs, subjects, objects, and coordinates. Each of these relationships is assigned a weight. Those co-occurrences are then placed into a matrix. Then I assign an associational measure to the counts

In [273]:
import collections
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tf.fabric import Fabric

TF = Fabric(locations='~/github', modules=['etcbc/bhsa/tf/c', 'semantics/tf/c'])
api = TF.load('''
                book chapter verse
                function lex vs language
                pdp freq_lex gloss domain
                voc_lex_utf8
                heads
              ''')
api.makeAvailableIn(globals())

This is Text-Fabric 3.2.2
Api reference : https://github.com/Dans-labs/text-fabric/wiki/Api
Tutorial      : https://github.com/Dans-labs/text-fabric/blob/master/docs/tutorial.ipynb
Example data  : https://github.com/Dans-labs/text-fabric-data

116 features found and 0 ignored
  0.00s loading features ...
   |     0.01s B book                 from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.00s B chapter              from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.00s B verse                from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.07s B function             from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.11s B lex                  from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.10s B vs                   from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.10s B language             from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.10s B pdp                  from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.07s B freq_lex             from /Users/cody/github/etcbc/bh

## Gather and Count Noun Relations

Now I will gather nouns from the Hebrew Bible and count syntactic co-occurrences.

In [576]:
# configure weights
path_weights = {'Subj': {'Pred': 1,
                         'Objc': 1
                        },
                'Objc': {
                         'Pred': 1,
                         'Subj': 1
                        },
                'coor': 1
               }

In [578]:
cooccurrences = collections.defaultdict(lambda: collections.Counter()) # noun counts here

# Subj/Objc Counts
for phrase in F.otype.s('phrase'):
    
    # skip non-Hebrew sections
    language = F.language.v(L.d(phrase, 'word')[0]) 
    if language != 'Hebrew':
        continue
    
    # skip non subject/object phrases
    function = F.function.v(phrase)
    if function not in {'Subj', 'Objc'}:
        continue
    
    # get head nouns
    nouns = set(F.lex.v(w) for w in E.heads.f(phrase)) # count lexemes only once
    if not nouns:
        continue
    
    # restrict on frequency
    freq = [F.freq_lex.v(L.u(w, 'lex')[0]) for w in E.heads.f(phrase)]
    if min(freq) < 25:
        continue

    # restrict on proper names
    types = set(F.pdp.v(w) for w in E.heads.f(phrase))
    if {'nmpr'} & types:
        continue

    # restrict on domain
#     if F.domain.v(L.u(phrase, 'clause')[0]) != 'N':
#         continue
    
    # gather contextual data
    clause = L.u(phrase, 'clause')[0]
    good_paths = path_weights[function]
    paths = [phrase for phrase in L.d(clause, 'phrase')
                if F.function.v(phrase) in good_paths.keys()
            ]
    
    # make the counts
    for path in paths:
        
        pfunct = F.function.v(path)
        weight = good_paths[pfunct]
        
        # count for verb
        if pfunct == 'Pred':
            verb = [w for w in L.d(path, 'word') if F.pdp.v(w) == 'verb'][0]
            verb_lex = F.lex.v(verb)
            verb_stem = F.vs.v(verb)
            verb_basis = function + '.' + verb_lex + '.' + verb_stem # with function name added
            if verb and F.lex.v(verb) not in {'HJH['}: # omit "to be" verbs, others?
                for noun in nouns:
                    cooccurrences[noun][verb_basis] += 1
        
        # count for subj/obj
        else:
            conouns = E.heads.f(path)
            cnoun_bases = set(function + '.' + F.lex.v(w) + f'.{pfunct}' for w in conouns) # with function name added
            counts = dict((basis, weight) for basis in cnoun_bases)
            if counts:
                for noun in nouns:
                    cooccurrences[noun].update(counts)
    
    # count coordinates
    for noun in nouns:
        for cnoun in nouns:
            if cnoun != noun:
                cnoun_basis = 'coor.'+cnoun # with coordinate function name
                cooccurrences[noun][cnoun_basis] += path_weights['coor']

cooccurrences = pd.DataFrame(cooccurrences).fillna(0)                

print(len(cooccurrences.columns), 'nouns')
print(len(cooccurrences.index), 'cooccurrences')

616 nouns
3656 cooccurrences


## Apply Association Measure

In [579]:
def safe_log(number):
    '''
    Evaluate for zero before applying log function.
    '''
    if number == 0:
        return 0
    else:
        return math.log(number)
    
def loglikelihood(k, l, m, n, log):
    '''
    Returns the log-likelihood when the supplied elements are given.
    '''
    
    p1 = (k*log(k)) + (l*log(l)) + (m*log(m)) + (n*log(n))        
    p2 = ((k+l)*log(k+l)) - ((k+m)*log(k+m))
    p3 = ((l+n)*log(l+n)) - ((m+n)*log(m+n))
    p4 = ((k+l+m+n))*log(k+l+m+n)
    llikelihood = 2*(p1-p2-p3+p4)
    return llikelihood

def apply_loglikelihood(comatrix):
    
    '''
    Adjusts values in a cooccurrence matrix using log-likelihood. 
    Requires a cooccurrence matrix.
    '''
    new_matrix = comatrix.copy()
    i = 0 
    indent(reset=True)
    info('beginning calculations...')
    indent(1, reset=True)
    for target in comatrix.columns:
        for basis in comatrix.index:
            k = comatrix[target][basis]
            
            if not k:
                i += 1
                if i % 500000 == 0:
                    indent(1)
                    info(f'at iteration {i}')
                continue
            
            l = comatrix.loc[basis].sum() - k
            m = comatrix[target].sum() - k
            n = comatrix.values.sum() - (k+l+m)
            ll = loglikelihood(k, l, m, n, safe_log)
            new_matrix[target][basis] = ll

            i += 1
            if i % 500000 == 0:
                indent(1)
                info(f'at iteration {i}')
    indent(0)
    info(f'FINISHED at iteration {i}')
    return new_matrix

In [580]:
test_ll = apply_loglikelihood(cooccurrences)

  0.00s beginning calculations...
   |     9.72s at iteration 500000
   |       18s at iteration 1000000
   |       27s at iteration 1500000
   |       34s at iteration 2000000
    38s FINISHED at iteration 2252096


## PMI
Pointwise Mutual Information Scores

In [581]:
def apply_pmi(col):
    
    '''
    Apply PMI to a given column.
    '''
    
    expected = col * cooccurrences.sum(axis=1) / cooccurrences.values.sum()
    pmi = np.log(col / expected).fillna(0)
    
    return pmi

test_pmi = cooccurrences.apply(lambda k: apply_pmi(k))
print('done!')

done!


## Calculate Similarities

In [582]:
def cosine_sim(vectA, vectB):
    '''
    Calculate the similarity between a supplied vector A and vector B.
    '''
    new_matrix = comatrix.copy()
    i = 0 
    indent(reset=True)
    info('beginning calculations...')
    indent(1, reset=True)
    
    cos = sum(vectA * vectB) / (math.sqrt(sum(vectA*vectA)) * math.sqrt(sum(vectB*vectB)))
    
    for target in comatrix.columns:
        for basis in comatrix.index:
            pass

In [606]:
test = test_ll

testw = 'CMC/' # test sun

def show_sim(testw, test):
    
    test_sims = []
    vectA = test[testw]

    print(f'Similar nouns to {testw}:\n')

    for word in test.columns:

        if word  == testw:
            continue

        vectB = test[word]

        cosine = sum(vectA * vectB) / (math.sqrt(sum(vectA*vectA)) * math.sqrt(sum(vectB*vectB)))

        test_sims.append((cosine, word))

    for score, word in sorted(test_sims, reverse=True)[:25]:

        lex = [lex for lex in F.otype.s('lex') if F.lex.v(lex) == word][0]

        print([score, word, F.gloss.v(lex), F.freq_lex.v(lex)])
        
show_sim(testw, test)

Similar nouns to CMC/:

[0.4150043639884484, 'JRX=/', 'moon', 27]
[0.3161405227898356, 'QY/', 'end', 67]
[0.29466753668347084, 'KWKB/', 'star', 37]
[0.24312437451112762, 'QYP/', 'anger', 28]
[0.2395651181474568, 'FVN/', 'adversary', 27]
[0.2374447820273706, 'NG</', 'stroke', 78]
[0.21233345826232736, 'LBN/', 'white', 29]
[0.21125236182853413, '>XWR/', 'back(wards)', 41]
[0.21124328795843902, 'CMH/', 'destruction', 41]
[0.21124328795843902, '<WD/', 'duration', 490]
[0.20947448300570387, '>WR/', 'light', 115]
[0.19948636735047134, 'GDWD/', 'band', 33]
[0.19919327844060208, 'MF>/', 'burden', 44]
[0.1921868436569057, 'TWDH/', 'thanksgiving', 32]
[0.1903742354218136, 'MWPT/', 'sign', 36]
[0.17972525000054967, 'JHWDJ/', 'Jewish', 82]
[0.17161640991552882, 'LB/', 'heart', 601]
[0.17139924370651088, '<CN/', 'smoke', 25]
[0.17106615836304698, 'ML>K/', 'messenger', 213]
[0.16834566202038767, '>WT/', 'sign', 79]
[0.16004780497575885, 'XRWN/', 'anger', 41]
[0.15921924527229037, 'JC</', 'help', 36]

In [607]:
show_sim(testw, test_pmi)

Similar nouns to CMC/:

[0.4179322089361344, 'JRX=/', 'moon', 27]
[0.23525002406994303, 'KWKB/', 'star', 37]
[0.17617422385425718, '>WR/', 'light', 115]
[0.17206312358043505, 'QY/', 'end', 67]
[0.15512414634439806, 'NG</', 'stroke', 78]
[0.1483442427192811, 'FVN/', 'adversary', 27]
[0.14445147965146882, 'LBN/', 'white', 29]
[0.1291546158113168, '>XWR/', 'back(wards)', 41]
[0.1290635165594666, 'CMH/', 'destruction', 41]
[0.1290635165594666, '<WD/', 'duration', 490]
[0.10970210464824995, 'TWDH/', 'thanksgiving', 32]
[0.10800930917389484, '<FR/', '-teen', 203]
[0.10529720026553621, 'LBB/', 'heart', 252]
[0.10187635307751156, 'XRWN/', 'anger', 41]
[0.10165501228030492, 'QYP/', 'anger', 28]
[0.09886326158042938, 'JWM/', 'day', 2304]
[0.0978175375382063, 'LB/', 'heart', 601]
[0.09346768755395984, 'CW>/', 'vanity', 53]
[0.09270785479141942, 'NXL/', 'wadi', 139]
[0.09263273402933128, 'KZB/', 'lie', 31]
[0.09095904311726694, 'MWPT/', 'sign', 36]
[0.08915226805645277, 'JHWDJ/', 'Jewish', 82]
[0.