## preliminaries

In [2]:
import pandas as pd
from collections import Counter, defaultdict
from scipy.stats import fisher_exact
import numpy as np
from itertools import combinations
from datetime import datetime
import os
from multiprocessing import Pool
from subprocess import call
import networkx as nx

## Globals

In [3]:
doculects = pd.read_csv('./files/language_sample.csv', sep = '\t').language.to_list()
#
MIN_P = 1e-6 # theta_fe in the paper
MIN_D = 3 # theta_bt in the paper
#
core_pos = {'NOUN','ADJ','VERB'}
peri_pos = {'ADP', 'ADV', 'PART', 'PROPN', 'AFX'}

## Run eflomal, Round 1

In [29]:
def run_eflomal(doc, seedtype, targtype='mrph', outdir='./alignments/'):
    #
    fwdname = outdir + doc + (('.' + seedtype) if seedtype != 'txt' else '') + '.fwd'
    revname = outdir + doc + (('.' + seedtype) if seedtype != 'txt' else '') + '.rev'
    symname = outdir + doc + (('.' + seedtype) if seedtype != 'txt' else '') + '.sym'
    if not os.path.isfile(fwdname):
        call(['eflomal-align', '-s', './bitexts/seed.%s' % seedtype, '-t', './bitexts/%s.%s' % (doc, targtype), '--model', '3', 
              '-f', fwdname, '-r', revname])
    with open(symname, 'w') as fout:
        call(['./atools', '-c', 'grow-diag-final-and', '-i', fwdname, '-j', revname], stdout=fout)
    print(doc, datetime.now())

In [None]:
with Pool(10) as p:
    p.starmap(run_eflomal, map(lambda doc : (doc, 'txt', 'mrph', './alignments/'), doculects))

## Step 1: create a circumlexified corpus

### functions

In [6]:
def get_seed_index(sbd, core_words):
    """
    mapping all tokens of core words (N,A,V), represented as line-ix, token-ix pairs to a unique index
    """
    seed_index = {}
    for vi,l in enumerate(sbd):
        for wi,w in enumerate(l):
            if w.split('/')[0] in core_words:
                seed_index[vi,wi] = len(seed_index)
    print(len(seed_index))
    return seed_index

In [7]:
def get_counts_and_builder(doculects, core_words):
    """
    gather alignment counts and uniquely aligned words in target languages given a seed language word
    and a 'builder' object containing all relevant seed languages mapped to their translations in F
    """
    f2e = defaultdict(lambda : defaultdict(lambda : Counter()))
    builder = defaultdict(lambda : set())
    #
    for di,doc in enumerate(doculects):
        if di % 25 == 0: print(di,doc,datetime.now())
        for i,(a,l1,l2) in enumerate(zip(open('./alignments/' + doc + '.sym'),
                                         open('./bitexts/seed.txt'),
                                         open('./bitexts/%s.mrph'% doc))):
            a = [tuple(map(int, ax.split('-'))) for ax in a.strip('\n').split()]
            w1,w2 = l1.strip('\n').split(), l2.strip('\n').split()
            f2e_a,e2f_a = defaultdict(lambda : set()),defaultdict(lambda : set())
            for ae,af in a:
                f2e_a[af].add(ae)
                e2f_a[ae].add(af)
            for ae, AF in e2f_a.items():
                we = w1[ae]
                if we.split('/')[0].lower() in core_words:
                    WF = {(doc,w2[af]) for af in AF if f2e_a[af]-{ae} == set()}
                    # only translations to AF that do not backtranslate to any other word than AE
                    builder[i,ae,we] |= WF
            for ae,af in a:
                we, wf = w1[ae].split('/')[0], w2[af]
                #if we.split('/')[1] in core_pos | peri_pos:
                f2e[doc][wf][we] += 1
    return f2e, builder

In [8]:
def get_tok2bt(builder, bt_max):
    """
    given the translations, retrieve mapping to backtranslations for each target doculect
    """
    tok2bt = {}
    lemct = Counter()
    for i,(k,v) in enumerate(builder.items()):
        if i % 10000 == 0: print(i,len(builder),k, datetime.now())
        lemct[k[2].split('/')[0].lower()] += 1
        bts = defaultdict(lambda : set())
        for h in v:
            #print(h)
            bt = bt_max[h[0]][h[1]]
            if bt != None:
                bts[h[0]].add(bt)
        tok2bt[k] = {t: bt for t,bt in bts.items()}
    return tok2bt, lemct

In [9]:
def get_typ2bt(tok2bt):
    """
    aggregate tok2bt into types
    """
    typ2bt = defaultdict(lambda : defaultdict(lambda : Counter()))
    lgtot = Counter()
    for k,bts in tok2bt.items():
        for lg,bt in bts.items():
            #if k[2].split('/')[0].lower() in bt: continue
            for i in range(len(bt)):
                for bti in combinations(bt,i+1):
                    #if k[2].split('/')[0].lower() in bt and k[2].split('/')[0].lower() not in bti: continue
                    typ2bt[k[2].split('/')[0].lower()][lg][frozenset(bti)] += 1
            lgtot[lg] += 1
    return typ2bt, lgtot

In [10]:
def get_lg2bt(typ2bt):
    lg2bt = {}#defaultdict(lambda : Counter())
    for lem in typ2bt:
        for lg in typ2bt[lem]:
            if lg not in lg2bt: 
                lg2bt[lg] = {}
            for bt,v in typ2bt[lem][lg].items():
                lg2bt[lg][bt] = lg2bt[lg].get(bt, 0) + v
    return lg2bt

In [11]:
def calculate_fisher_exact(a,b,c,d):
    if a/(a+b) < c/(c+d) or a/(a+c) < b/(b+d): 
        return 0
    return -np.log(fisher_exact([[a,b],[c,d]], alternative='greater')[1])

In [12]:
def get_candidates(typ2bt, lemct, lg2bt, lgtot, core_words, peri_words):
    good_inputs = [(lem, lg, bt, a) for lem in typ2bt for lg in typ2bt[lem] for bt,a in typ2bt[lem][lg].items()
                   if (len(bt) > 1 and 
                       any(map(lambda e : e.split('/')[0] in core_words, bt)) and
                       all(map(lambda e : e.split('/')[0] in peri_words, bt)))]
    pool_inputs = ((a, lemct[lem]-a, lg2bt[lg][bt]-a, lgtot[lg] - (lemct[lem]+lg2bt[lg][bt]-2*a))
                    for lem, lg, bt, a in good_inputs)
    with Pool(12) as p:
        scores = p.starmap(calculate_fisher_exact, pool_inputs)
    candidates = {}
    for (lem, lg, bt, a), score in zip(good_inputs, scores):
        if score > -np.log(MIN_P):
            candidates[lem,bt] = candidates.get((lem,bt), {}) | {lg:score}
    return candidates

In [13]:
def creates_a_cycle(replace_graph, top):
    replace_graph.add_edges_from(((top[0],ki) for ki in top[1] if top[0] != ki))
    try:
        c = nx.find_cycle(replace_graph)
        replace_graph.remove_edges_from(((top[0],ki) for ki in top[1] if top[0] != ki))
        return replace_graph, True
    except nx.NetworkXNoCycle:
        return replace_graph, False

### gather candidate replacements

In [14]:
#  preliminaries
sbd = [list(filter(lambda x: x.split('/')[1] not in {'PUNCT'}, v.strip('\n').split(' '))) for v in open('./bitexts/seed.txt')]
wct = Counter([w for l in open('./bitexts/seed.txt') for w in l.strip('\n').split()])
# gets lemma/pos pair counts
#
word2pos = defaultdict(lambda : Counter())
for w,c in wct.items():
    word2pos[w.split('/')[0].lower()][w.split('/')[1]] += c
# frequency of PoS per lemma

core_words = {w.split('/')[0].lower() for w,c in wct.items()
              if word2pos[w.split('/')[0].lower()].most_common(1)[0][0] in core_pos}
peri_words = {w.split('/')[0].lower() for w,c in wct.items()
              if word2pos[w.split('/')[0].lower()].most_common(1)[0][0] in core_pos|peri_pos}
# define core and peri(pheral) words on the basis of their dominant PoS
print(len(core_words), len(peri_words))

2497 3328


In [15]:
# run
seed_index = get_seed_index(sbd, core_words)
f2e, builder = get_counts_and_builder(doculects, core_words)
bt_max = {doc : {wf : next((we for we,c in wes.most_common(1)),None)
                 for wf,wes in bts.items()} for doc, bts in f2e.items()}
# most frequent backtranslations per word type per language
tok2bt, lemct = get_tok2bt(builder, bt_max)
typ2bt, lgtot = get_typ2bt(tok2bt)
lg2bt = get_lg2bt(typ2bt)
candidates = get_candidates(typ2bt, lemct, lg2bt, lgtot, core_words, peri_words) 

56613
0 AVAANT 2025-04-19 12:01:44.838440
25 CRHIBT 2025-04-19 12:01:53.264354
50 LEFTBL 2025-04-19 12:02:03.087417
75 NOAWBT 2025-04-19 12:02:12.777504
100 YUZNTM 2025-04-19 12:02:22.825266
125 TACPBC 2025-04-19 12:02:33.009422
150 MLPTBL 2025-04-19 12:02:42.764288
175 ROOWBT 2025-04-19 12:02:52.656363
0 57245 (0, 1, 'book/NOUN') 2025-04-19 12:03:03.198994
10000 57245 (1503, 15, 'tree/NOUN') 2025-04-19 12:03:07.392000
20000 57245 (3077, 17, 'crowd/NOUN') 2025-04-19 12:03:11.193367
30000 57245 (4684, 8, 'shut/VERB') 2025-04-19 12:03:14.063697
40000 57245 (6361, 26, 'day/NOUN') 2025-04-19 12:03:17.248657
50000 57245 (7770, 10, 'earth/NOUN') 2025-04-19 12:03:20.787949


  return -np.log(fisher_exact([[a,b],[c,d]], alternative='greater')[1])
  return -np.log(fisher_exact([[a,b],[c,d]], alternative='greater')[1])
  return -np.log(fisher_exact([[a,b],[c,d]], alternative='greater')[1])
  return -np.log(fisher_exact([[a,b],[c,d]], alternative='greater')[1])
  return -np.log(fisher_exact([[a,b],[c,d]], alternative='greater')[1])
  return -np.log(fisher_exact([[a,b],[c,d]], alternative='greater')[1])
  return -np.log(fisher_exact([[a,b],[c,d]], alternative='greater')[1])
  return -np.log(fisher_exact([[a,b],[c,d]], alternative='greater')[1])
  return -np.log(fisher_exact([[a,b],[c,d]], alternative='greater')[1])
  return -np.log(fisher_exact([[a,b],[c,d]], alternative='greater')[1])


### iterative extraction of replacements

In [None]:
top = max(candidates, key = lambda x : (len(candidates[x]), np.mean(list(candidates[x].values()))), default=None)
# top contains on every iteration the candidate v_s,P pair with the greatest number of languages.
replace_graph = nx.DiGraph()
replacements = {}
fh = open('./files/step1_output.txt','w')
#
while top != None and len(candidates[top]) >= MIN_D:
    #
    replace_graph, creates_cycle = creates_a_cycle(replace_graph, top) # avoids cycles in replacements
    #
    # Retrieve modeled tokens
    tokens_per_lg = {lg : set() for lg in candidates[top]}
    all_tokens_lem = set()
    for t in filter(lambda t : t[2].split('/')[0].lower() == top[0], tok2bt):
        all_tokens_lem.add(t)
        for lg in filter(lambda lg : lg in tok2bt[t] and set(top[1]) <= tok2bt[t][lg], candidates[top]):
            tokens_per_lg[lg].add(t)
    tokens = set.union(*list(tokens_per_lg.values()) + [set()])
    # tokens contains all modeled tokens given the split
    #
    # Writing output
    avg_overlap = np.mean([len(vi&vj)/len(vi|vj) for vi,vj in combinations(tokens_per_lg.values(),2)])
    print(top, len(candidates[top]), datetime.now())
    fh.write('#%d %s %s\tnLg=%d avgAssoc=%.2f coverage=%.2f (N=%d; overlap=%.2f)\ncoverage per lg: %s\n%s\n\n' % 
          (len(replacements), top[0],'+'.join(top[1]), len(candidates[top]),
           np.mean(list(candidates[top].values())), len(tokens)/lemct[top[0]], len(tokens), avg_overlap,
           Counter({lg:len(t) for lg,t in tokens_per_lg.items()}).most_common(), datetime.now()))
    for lg in np.random.choice(list(candidates[top]), size=5):
        W1,W2 = open('./bitexts/seed.txt').readlines(),open('./bitexts/%s.txt'% lg).readlines()     
        for t in list(tokens_per_lg[lg])[:5]:
            fh.write('>> covered    : %s %s\t%s\n%s%s\n' % (lg, t, {w[1]:bt_max[lg][w[1]] for w in builder[t] if w[0] == lg}, W1[t[0]], W2[t[0]]))
        for t in list(all_tokens_lem - tokens_per_lg[lg])[:5]:
            fh.write('>> not covered: %s %s\t%s\n%s%s\n' % (lg,t, {w[1]:bt_max[lg][w[1]] for w in builder[t] if w[0] == lg}, W1[t[0]], W2[t[0]]))
    # If the additions does not contain a cycle, update the replacements dictionary
    if creates_cycle: fh.write('==> creates a cycle\n'); print('==> creates a cycle')
    else: replacements[top] = tokens
    #
    # Part B: Update all counts
    lemct[top[0]] -= len(tokens)
    for m in tokens:
        bts = tok2bt.pop(m)
        for lg,bt in bts.items():
            #if m[2].split('/')[0].lower() in bt: continue
            lgtot[lg] -= 1
            for i in range(len(bt)):
                for bti in combinations(bt, i+1):
                    #if m[2].split('/')[0].lower() in bt and m[2].split('/')[0].lower() not in bti: continue
                    typ2bt[top[0]][lg][frozenset(bti)] -= 1
                    lg2bt[lg][frozenset(bti)] -= 1
    # Part C: Recalculate significance of v_s,P associations
    for cand in list(filter(lambda c : c[0] == top[0], candidates)):
        for lg in list(candidates[cand]):
            a = typ2bt[top[0]][lg][top[1]]
            b = lemct[top[0]] - a
            c = (lg2bt[lg][top[1]] - a) if top[1] in lg2bt[lg] else 0
            d = lgtot[lg] - (a+b+c)
            score = -np.log(fisher_exact([[a,b],[c,d]], alternative='greater')[1])
            if score <= -np.log(MIN_P): candidates[cand].pop(lg)
            else: candidates[cand][lg] = score
        if len(candidates[cand]) == 0: candidates.pop(cand)
    top = max(candidates, key = lambda x : (len(candidates[x]), np.mean(list(candidates[x].values()))), default = None)
fh.close()

### write out replacement corpus

In [70]:
word2reps = {}
for k in replacements:
    word2reps[k[0]] = word2reps.get(k[0], []) + [k[1]]
    
lnew, dnew = [], []
for li,l in enumerate(open('./bitexts/seed.dep')):
    lnew.append([])
    dnew.append([])
    for wi,w in enumerate(l.strip('\n').split()):
        word,pos,ix,hix = w.split('/')
        heads_deps = {wj.split('/')[0] for wj in l.strip('\n').split() if wj != w and 
                      (wj.split('/')[2] == hix or wj.split('/')[3] == ix)}
        #print(w, heads_deps) # block replacements if already a head or dep
        if word in word2reps:
            right_replacement = next((replacement for replacement in word2reps[word]
                                      if (li,wi,word+'/'+pos) in replacements[word, replacement]), None)
            if right_replacement != None:
                replacement_l = [wj + '/' + word2pos[wj.lower()].most_common(1)[0][0] 
                                 for wj in right_replacement if wj not in heads_deps]
            else:
                replacement_l = [ word + '/' + pos ]
        else:
            replacement_l = [ word + '/' + pos ]
        #
        new_deps = list(map(lambda x : '%s/%s/%s' % (x,ix,hix), replacement_l))
        # #print(new_deps)
        lnew[-1].extend(replacement_l)
        dnew[-1].extend(new_deps)
    #print(dnew[-1])
    
with open('./bitexts/seed.splt', 'w') as fout:
    fout.write('\n'.join(' '.join(ln) for ln in lnew))
with open('./bitexts/seed.splt.dep', 'w') as fout:
    fout.write('\n'.join(' '.join(ln) for ln in dnew))   

### generate Step-1 result table for paper

In [72]:
step1_output = [[x.split(' ') for x in l.strip('\n').split('\t')] for l in open('./files/step1_output.txt')
                if l[0] == '#' or l[:3] == '==>']
step1_output = [s for si,s in enumerate(step1_output) if s[0][0] != '==>' and si < len(step1_output)-1 and step1_output[si+1][0][0] != '==>']
df_builder = []
for si,s in enumerate(step1_output):
    if si < 20 or si % 30 == 29:
        df_builder.append({'rank' : int(s[0][0][1:])+1, '$v_s$' : s[0][1], '$P$' : s[0][2], 
                           '$N$ doculects' : int(s[1][0].split('=')[1]),
                           'avg. -log $p$' : float(s[1][1].split('=')[1]), 
                           '$N$ tokens' : int(s[1][3][3:-1]), 
                           'token coverage' : float(s[1][2].split('=')[1])})
ctext = 'Select output of Step 1 (top-20 extractions and every 30th extraction after)'
print(pd.DataFrame(df_builder).to_latex(index=False,caption=ctext,float_format='%.2f'))

\begin{table}
\caption{Select output of Step 1 (top-20 extractions and every 30th extraction after)}
\begin{tabular}{rllrrrr}
\toprule
rank & $v_s$ & $P$ & $N$ doculects & avg. -log $p$ & $N$ tokens & token coverage \\
\midrule
1 & answer & answer+say & 94 & inf & 239 & 0.97 \\
2 & write & write+say & 85 & 82.84 & 161 & 0.76 \\
3 & scribe & law+scribe & 84 & 242.82 & 66 & 1.00 \\
4 & widow & widow+woman & 77 & 57.86 & 27 & 0.96 \\
5 & heal & heal+sick & 64 & 30.67 & 49 & 0.61 \\
6 & vinegar & vinegar+wine & 64 & 30.24 & 6 & 1.00 \\
7 & prostitute & prostitute+woman & 58 & 42.50 & 13 & 1.00 \\
8 & forgive & sin+forgive & 54 & 47.37 & 61 & 0.94 \\
9 & cup & cup+wine & 54 & 28.28 & 18 & 0.55 \\
10 & hear & word+hear & 52 & 110.50 & 348 & 0.87 \\
11 & knock & knock+door & 52 & 28.51 & 9 & 1.00 \\
12 & come & to+come & 49 & 65.89 & 572 & 0.47 \\
13 & life & life+eternal & 48 & 60.16 & 134 & 0.64 \\
14 & repent & sin+repent & 47 & 57.79 & 56 & 0.98 \\
15 & loaf & bread+loaf & 47 & 43.27 & 27

## Run eflomal on the circumlexified data

In [None]:
TARGTYPE = 'mrax'
with Pool(12) as p:
    p.starmap(run_eflomal, map(lambda doc : (doc, 'splt', TARGTYPE, './alignments/'), doculects))

## Step 2: gather synlexifications

### all potential synlexifications

In [17]:
def bad_k(k):
    if k[0] == k[1]: return True
    elif 'AFX' in k[0]+k[1]: return True
    return False

def word_type_check(k, core_words, peri_words):
    return any(map(lambda wq : wq[0].lower() in core_words, k)) and all(map(lambda wq : wq[0].lower() in peri_words, k))  

hits = defaultdict(lambda : set())
ctr = 0 # scaffold
for li,l in enumerate(open('./bitexts/seed.splt.dep').readlines()):
    words = list(map(lambda w : w.split('/'), l.strip('\n').split()))
    ix2word = defaultdict(lambda : list())
    word2align = {tuple(w) : i for i,w in enumerate(words)}
    for word in words:
        ix2word[word[2]] = ix2word.get(word[2], []) + [word]
    #
    for w in words:
        if w[0].lower() in peri_words:
            heads = ix2word[w[3]]
            for head in heads:
                if head == w: continue
                wa,wb = sorted([tuple(w), tuple(head)])
                k = (wa[:2], wb[:2])
                if bad_k(k): 
                    continue
                if word_type_check(k, core_words, peri_words):
                    hits[k].add((li, word2align[wa], word2align[wb]))
                if head[1] == 'ADP':
                    headheads = ix2word[head[3]]
                    for headhead in headheads:
                        wa,wb = sorted([tuple(w), tuple(headhead)])
                        k = (wa[:2], wb[:2])
                        if bad_k(k):
                            continue
                        if word_type_check(k, core_words, peri_words):
                            hits[k].add((li, word2align[wa], word2align[wb]))
    for ix in ix2word:
        for wx,wy in combinations(sorted(ix2word[ix]),2):
            wa,wb = sorted([tuple(wx), tuple(wy)])
            k = (wa[:2], wb[:2])
            if word_type_check(k, core_words, peri_words):
                hits[k].add((li, word2align[wa], word2align[wb]))

In [18]:
verse2syn = {}
for h in hits:
    if len(hits[h]) < 10: continue
    for (v,wi,wj) in hits[h]:
        verse2syn[v] = verse2syn.get(v, {}) | {(wi,wj) : h}

In [19]:
sum(len(v) for v in verse2syn.values())

82559

### get candidate synlexifications

In [21]:
def get_synlexifications(doc, verse2syn, verbose=False, outdir = './characterizations'):
    eset = set()
    ctt = Counter()
    if not os.path.isdir(outdir):
        os.mkdir(outdir)
    #
    # read in
    A = [[tuple(map(int, ax.split('-'))) for ax in a.strip('\n').split()] 
         for a in open('./bitexts/%s.splt.sym' % (TARGTYPE, doc))]
    L1 = [l.strip('\n').split() for l in open('./bitexts/seed.splt')]
    L2 = [l.strip('\n').split() for l in open('./bitexts/%s.%s'% (doc,TARGTYPE)).readlines()]
    #
    # I: gather all
    synlexifications = defaultdict(lambda : {})
    for i,(a,w1,w2) in enumerate(zip(A, L1, L2)):
        e2f_a, f2e_a = defaultdict(lambda : set()), defaultdict(lambda : set())
        for ae,af in a: e2f_a[ae].add(af)
        for ae,af in a: f2e_a[af].add(ae)
        #
        if i not in verse2syn: continue
        for (wi,wj),h in verse2syn[i].items():
            lab = '+'.join(sorted([w1[wi],w1[wj]]))
            t1 = set(map(lambda x : w2[x], e2f_a[wi]))
            t2 = set(map(lambda x : w2[x], e2f_a[wj]))
            bt1 = {w1[x] for y in e2f_a[wi] for x in f2e_a[y] - {wi,wj}}
            bt2 = {w1[x] for y in e2f_a[wi] for x in f2e_a[y] - {wi,wj}}
            t1 = nobt1 = {w2[y] for y in e2f_a[wi] if f2e_a[y] - {wi,wj} == eset}
            t2 = nobt2 = {w2[y] for y in e2f_a[wj] if f2e_a[y] - {wi,wj} == eset}
            #
            category = (4 if t1 & t2 != eset else 
                        (3 if t1 != eset and t2 != eset else
                         (1 if t1 == eset and t2 != eset else (2 if t1 != eset and t2 == eset else 0))))
            if verbose and TARGWRD in lab: 
                print(i, category, len(synlexifications[lab]), lab, t1, t2, bt1, bt2, '\n' + ' '.join(w1) + '\n' + ' '.join(w2) + '\n')
                ctt[tuple(t1),tuple(t2),category] += 1
            synlexifications[lab][(i,(wi,wj))] = (tuple([tuple(t) for t in [t1,t2]]), category)
    #
    syn_builder = []
    #
    for i in sorted(verse2syn):
        w1,w2 = L1[i], L2[i]
        for (wi,wj), h in sorted(verse2syn[i].items()):
            lab = '+'.join(sorted([w1[wi],w1[wj]]))
            (t1,t2), cat = synlexifications[lab][i,(wi,wj)]
            syn_builder.append({'category' : cat, 't1' : t1, 't2' : t2})
    pd.DataFrame(syn_builder).to_excel('%s/%s.xlsx' % (outdir,doc))
    if verbose: print(ctt.most_common(5))
    print(doc, datetime.now())

In [None]:
TARGTYPE = 'mrax'
TARGWRD = 'enter/VERB+in/ADP'
get_synlexifications('BOATBL', verse2syn, True, './results/')

In [None]:
outdir = './results/'
pd.DataFrame([{'verse': v, 'wi':wi, 'wj':wj, 'class': '+'.join(['/'.join(hi) for hi in h])} 
             for v in sorted(verse2syn) for (wi,wj),h in sorted(verse2syn[v].items())]).to_excel(outdir + '/indices.xlsx')
with Pool(4) as p:
    p.starmap(get_synlexifications, ((doc, verse2syn, False, outdir) for doc in doculects))

### evaluation

In [31]:
evaluation = {
    'wife.NOUN_woman.NOUN' : ['BOATBL','EUSNLT','MIQSBN'],
    'clean.ADJ_un.AFX' : ['DJKWBT', 'JAVNRF','CRKWCV'],
    'dead.ADJ_die.VERB' : ['ACMAS3','CHEIBT','INDASV'],
    'king.NOUN_throne.NOUN': ['INDASV','KMSPNG'],
    'whole.ADJ_world.NOUN': ['BVZYSS', 'DTSABM', 'INDASV'],
    'go.VERB_way.NOUN' : ['AAUWBT', 'BOATBL', 'ITAR27','TPIPNG'],
    'to.ADP_world.NOUN' : ['BOATBL', 'DJKWBT', 'EUSNLT'],
    'from.ADP_go.VERB' : ['KHQBIV', 'ITAR27', 'KPVIBT'],
    'go.VERB_out.ADP' : ['EUSNLT', 'ZNEZNE', 'INDASV'],
    'go.VERB_up.ADP' : ['ROOWBT', 'KPVIBT', 'TIHBSM'],
    'take.VERB_way.NOUN': ['DJKWBT', 'KGRLAI', 'KYCPNG', 'JAMBSW'],
    'enter.VERB_in.ADP' : ['FUVLTBL', 'JAMBSW', 'YUJWBT'],
    'door.NOUN_open.VERB' : ['KMOWBT', 'KBHWBT', 'CRNWBT', 'IBATIV'],
    'answer.VERB_say.VERB' : ['TPIPNG', 'BIBWBT', 'IBATIV'],
    'fish.NOUN_net.NOUN' : ['KYCPNG', 'MIQSBN' ],
    'blind.ADJ_eye.NOUN' : ['KGRLAI', 'AAUWBT','CJPTJV','HAKTHV','MDYBSE','MZMWBT','NIJLAI','PRKBSM'],
}
print(len(evaluation))

16


In [29]:
h2i = {}
for h in sorted(verse2syn):
    for t in sorted(verse2syn[h]):
        h2i[h,t] = len(h2i)
print(len(h2i))

82559


In [32]:
cats = ['unlexified', '1-underspecified', '2-underspecified', 'circumlexified', 'synlexified', 'infrequent/error']
for k in evaluation:
    kx = tuple(map(lambda x : tuple(x.split('.')), k.split('_')))
    H = hits[kx]
    print(k,kx,len(H))
    if 'blind' not in k: continue
    L1 = [l.strip('\n').split() for l in open('./bitexts/seed.splt')]
    for doc in evaluation[k]:
        with open('/files/%s_%s.txt' % (k, doc), 'w') as fout:
            tot = Counter()
            dfdoc = pd.read_excel('/files/%s.xlsx' % doc)
            L2 = [l.strip('\n').split() for l in open('./bitexts/%s.%s'% (doc,TARGTYPE)).readlines()]
            for i,(v,wi,wj) in enumerate(H):
                try: fv = {c:dfdoc.iloc[h2i[v,(wi,wj)]][c] for c in ['category', 't1', 't2']}
                except: continue
                kk = str(fv['t1']),str(fv['t2']),str(fv['category']) 
                tot[kk] += 1
                if i % len(H)//15 == 0:
                    fout.write('verse = %s; words = (%d, %d)\nextraction = %s\n' % (v,wi,wj,fv))
                    fout.write(' '.join(L1[v]) + '\n' + ' '.join(L2[v]) + '\n\n')
            fout.write('most common responses: %s\n' % tot.most_common())
            fout.write('most common categories: %s\n' % 
                       {cats[int(c)]:v for c,v in Counter([c[2] for c,v in tot.items() for i in range(v)]).most_common()})
            fout.write('-1 = infrequent/error; 0 = unlexified; 1 = 1-underspecified; 2 = 2-underspecified;\n3 = circumlexified; 4 = synlexified')                 

wife.NOUN_woman.NOUN (('wife', 'NOUN'), ('woman', 'NOUN')) 87
clean.ADJ_un.AFX (('clean', 'ADJ'), ('un', 'AFX')) 46
dead.ADJ_die.VERB (('dead', 'ADJ'), ('die', 'VERB')) 90
king.NOUN_throne.NOUN (('king', 'NOUN'), ('throne', 'NOUN')) 62
whole.ADJ_world.NOUN (('whole', 'ADJ'), ('world', 'NOUN')) 13
go.VERB_way.NOUN (('go', 'VERB'), ('way', 'NOUN')) 112
to.ADP_world.NOUN (('to', 'ADP'), ('world', 'NOUN')) 61
from.ADP_go.VERB (('from', 'ADP'), ('go', 'VERB')) 109
go.VERB_out.ADP (('go', 'VERB'), ('out', 'ADP')) 117
go.VERB_up.ADP (('go', 'VERB'), ('up', 'ADP')) 61
take.VERB_way.NOUN (('take', 'VERB'), ('way', 'NOUN')) 55
enter.VERB_in.ADP (('enter', 'VERB'), ('in', 'ADP')) 191
door.NOUN_open.VERB (('door', 'NOUN'), ('open', 'VERB')) 34
answer.VERB_say.VERB (('answer', 'VERB'), ('say', 'VERB')) 279
fish.NOUN_net.NOUN (('fish', 'NOUN'), ('net', 'NOUN')) 15
blind.ADJ_eye.NOUN (('blind', 'ADJ'), ('eye', 'NOUN')) 58


FileNotFoundError: [Errno 2] No such file or directory: '/files/blind.ADJ_eye.NOUN_KGRLAI.txt'