## libraries and globals

In [1]:
import dill as pickle
import csv
import re
import numpy as np
import networkx as nx
# #
from scipy.stats import fisher_exact
from scipy.sparse import csr_matrix
from datetime import datetime
from collections import Counter, defaultdict
from unidecode import unidecode
from editdistance import eval as ed
from itertools import chain, combinations
from multiprocessing import Pool
import json

In [2]:
# BITEXT FUNCTIONS
good_pos = {'NOUN','ADJ','VERB'}
excluded_lemmas = {'be', 'other', 'have', 'let', 'one', 'lot', 'same', 'such', 't', 's'}

## Liu et al reimplementation

In [3]:
## fragment extraction

def get_fragments(w, min_len=2, max_len=7, yield_self=True):
    """
    takes a string *w* and extracts all substrings of minimal length *min_len* and maximal length *max_len*
    """
    if yield_self and len(w) > max_len: yield w
    for i in range(len(w)):
        for j in range(i+min_len, min(i+max_len+1,len(w)+1)):
            if (not (i == 0 or j == len(w))) or (j-i) >= min_len+1:
                yield w[i:j]

def get_all_fragments(F, split_words = True, frequency_threshold = 1):
    """
    given a document stored in a dictionary *tbd*, mapping an identifier key 
    onto a string containing the text, and a set of *all_verses*
    (the shared identifier keys between tbd and the source document(,
    this function returns a sparse matrix *fragments* of identifier key (rows) 
    to substrings of the text (column), with the matrix being True if the fragment
    occurs for that identifier key and False otherwise.
    as well as dictionaries for the identification
    of the rows and columns.
    (memory/computation efficient format, but a bit densely written)
    """
    wordcount = Counter((w for l in F for w in l))
    if split_words: 
        word_fragments = {w : set(get_fragments('^%s$' % unidecode(w).lower())) for w in wordcount.keys() }
    else: 
        word_fragments = {w : {unidecode(w).lower()} for w in wordcount.keys() }
    
    fragment_count = Counter((f for w,F in word_fragments.items() for f in F if f != '' for i in range(wordcount[w])))
    fragment_ixx = {f:i for i,(f,c) in enumerate(fragment_count.most_common()) if c >= frequency_threshold }
    #
    R,C = [], []
    for line_ix, line_f in enumerate(F):
        if len(line_f) == 0: continue
        line_frags = list(map(lambda f : fragment_ixx[f],
                              filter(lambda f : f in fragment_ixx,
                                     set.union(*map(lambda w : word_fragments[w], line_f)))))
        R.extend([line_ix]*len(line_frags))
        C.extend(line_frags)
    fragments = csr_matrix((np.ones(len(R)), (R,C)), dtype=bool, shape = (len(F),max(C)+1))
    return fragments, fragment_ixx

In [4]:
def create_alignments(doc, dataset, min_freq = 1, verbose=False):
    #
    E,F = zip(*[([w.split('/')[2] for w in l.strip('\n').split(' ||| ')[1].split() 
                  if w.count('/') >= 3 and w.split('/')[3] in good_pos and w.split('/')[2] not in excluded_lemmas],
                  l.strip('\n').split(' ||| ')[0].split())
                 for l in open('./generated/%s_bitexts/%s.spc' % (dataset,doc))])
    e_fragments, e_dic = get_all_fragments(E, split_words=False, frequency_threshold=1)
    e_counts = Counter([unidecode(e).lower() for l in E for e in l if e != ''])
    e_seed = {e : e_dic[e] for e in e_counts if e_counts[e] >= min_freq and e != ''}
    f_fragments, f_dic = get_all_fragments(F, split_words=True, frequency_threshold=min_freq)
    f_list = np.array(sorted(f_dic, key = lambda k : f_dic[k]))
    #
    # get TEs
    tes, te_words = {}, {}
    for erank,(e,ei) in enumerate(sorted(e_seed.items(), key = lambda x : -e_counts[x[0]])):
        pos = e_fragments[:,ei].nonzero()[0]
        tes[e] = extract_tes(pos, f_fragments, f_dic, coverage=.95, min_trans=0.01, min_backtrans=0.10)
        #tes[e] = merge_similar_tes(tes[e])
        te_words[e] = { te : get_te_words(f_fragments, f_list, te, te_pos) for te,te_pos in tes[e].items() }
        if verbose: print(doc, e, '%d/%d' % (erank,len(e_seed)), e_counts[e], datetime.now(), 
                          Counter({t:len(p) for t,p in tes[e].items()}).most_common(3))
    print(doc, datetime.now(), len(tes))
    with open('./generated/%s_output/%s.json' % (dataset,doc),'w') as fout:
        fout.write(json.dumps({'tes' : tes, 'te_words' : te_words}))

In [5]:
def longest_nonoverlapping(m, rev, frags):
    frags = [rev[f] for f in frags]
    Ma = [mi for mi in frags if m in mi and 
          next((False for mj in frags if mj != mi and mi in mj),True)]
    return Ma

def get_te_words(fragments_F, rev, te, te_pos):
    ctr = Counter()
    for pos in te_pos:
        frags = fragments_F[pos].nonzero()[1]
        Ma = longest_nonoverlapping(te, rev, frags)
        for m in Ma:
            ctr[m] += 1
    return ctr

In [6]:
def extract_tes(pos_ixx, fragments, fixx, coverage = 0.95, min_trans = 0.05, min_backtrans = 0.25, verbose=False):
    """
    implements one forward pass of the Liu et al. (2023) approach.
    Takes a list *pos* of positive instances (row identifiers for fragments)
    As well as the sparse matrix *fragments* and the two dictionaries
    (vixx -- verse_ixx and fixx -- fragment_ixx).
    The parameters determine the fragments considered in the extraction: the iteration keeps going until
    either no good fragments can be found (significance under .001) or the *coverage* has been reached.
    *min_trans* is a float [0,1] that filters out all fragments that occur in hit verses in less than min_trans
    proportion of all hit verses.
    """
    neg_ixx = list(set(range(fragments.shape[0]))-set(pos_ixx))
    flist = [None] * len(fixx)
    for k,v in fixx.items():
        flist[v] = k
    #sorted(fixx, key = lambda k : fixx[k])
    #
    pos_tot_orig = pos_tot = len(pos_ixx)
    neg_tot = len(neg_ixx)
    #
    pos_ct = fragments[pos_ixx].sum(0).A[0]
    neg_ct = fragments[neg_ixx].sum(0).A[0]
    #
    good_fragments = np.where((pos_ct >= 1) & ((pos_ct/pos_tot_orig) >= min_trans) &
                             (pos_ct/(pos_ct+neg_ct) >= min_backtrans))[0]
    string_props = [(f[0] == '^', f[-1] == '$', len(f)) for f in flist]
                    #len(re.sub('[.*]', '', f)))
    #
    hits = defaultdict(lambda : [])
    ct = 0
    fe_scores = {}
    while len(pos_ixx) >= (1-coverage) * pos_tot_orig:
        ct += 1
        #
        # GET BEST
        assoc_scores = Counter()
        for f in good_fragments:
            table = ((pos_ct[f],pos_tot-pos_ct[f]),(neg_ct[f],neg_tot-neg_ct[f]))
            try: fe_score = fe_scores[table]
            except KeyError: fe_score = fe_scores[table] = -np.log(fisher_exact(table, alternative='greater')[1])
            assoc_scores[f] = (fe_score, string_props[f])
        best, best_score = next((x for x in assoc_scores.most_common(1)),(None,None))
        if best == None or best_score[0] < -np.log(5e-2):
            break
        # print([flist[k] for k,v in assoc_scores.most_common(10)])
        #
        # UPDATE
        new_pos_ixx = []
        for pos_v in pos_ixx:
            if fragments[pos_v,best] > 0: hits[flist[best]].append(int(pos_v))
            else: new_pos_ixx.append(pos_v)
        #
        pos_ixx = new_pos_ixx
        pos_tot = len(pos_ixx)
        pos_ct = fragments[pos_ixx].sum(0).A[0]
        neg_ct = fragments[neg_ixx].sum(0).A[0]
        #
        good_fragments = np.where((pos_ct >= 1) & (pos_ct/pos_tot_orig >= min_trans) &
                                    (pos_ct/(pos_ct+neg_ct) >= min_backtrans))[0]
        if verbose: print(ct, flist[best])
    return hits

### execution

In [69]:
with Pool(8) as p:
    p.starmap(create_alignments, map(lambda doc : (doc.strip('\n'), 'doreco', 1), open('./doreco_english_doculects.txt')))

  (pos_ct/(pos_ct+neg_ct) >= min_backtrans))[0]
  (pos_ct/(pos_ct+neg_ct) >= min_backtrans))[0]
  (pos_ct/(pos_ct+neg_ct) >= min_backtrans))[0]
  (pos_ct/(pos_ct+neg_ct) >= min_backtrans))[0]
  (pos_ct/(pos_ct+neg_ct) >= min_backtrans))[0]
  (pos_ct/(pos_ct+neg_ct) >= min_backtrans))[0]
  (pos_ct/(pos_ct+neg_ct) >= min_backtrans))[0]
  (pos_ct/(pos_ct+neg_ct) >= min_backtrans))[0]


hoch1243 2025-06-14 13:52:56.153782 655
jeha1242 2025-06-14 13:53:04.254245 514
jeju1234 2025-06-14 13:53:05.155374 963


  except KeyError: fe_score = fe_scores[table] = -np.log(fisher_exact(table, alternative='greater')[1])


cash1254 2025-06-14 13:53:06.930712 976
anal1239 2025-06-14 13:53:08.441910 1189
goro1270 2025-06-14 13:53:16.247044 1115
even1259 2025-06-14 13:53:19.961585 1038
apah1238 2025-06-14 13:53:22.238760 670
arap1274 2025-06-14 13:53:32.069427 1056
guri1247 2025-06-14 13:53:36.514740 724
goem1240 2025-06-14 13:53:38.035993 989


  except KeyError: fe_score = fe_scores[table] = -np.log(fisher_exact(table, alternative='greater')[1])


beja1238 2025-06-14 13:53:51.538532 1317
nngg1234 2025-06-14 13:54:05.580415 1054
kaka1265 2025-06-14 13:54:05.957231 1804
movi1243 2025-06-14 13:54:07.018497 1088
cabe1245 2025-06-14 13:54:13.244728 813
ngal1292 2025-06-14 13:54:17.488503 491
dolg1241 2025-06-14 13:54:20.102199 1389
nort2641 2025-06-14 13:54:25.044741 1105
bain1259 2025-06-14 13:54:26.143536 1454
kama1351 2025-06-14 13:54:26.857306 1421
nort2875 2025-06-14 13:54:27.518135 926
sanz1248 2025-06-14 13:54:37.551356 710
pnar1238 2025-06-14 13:54:41.785540 1340
orko1234 2025-06-14 13:54:43.576503 806
komn1238 2025-06-14 13:54:44.708051 1503
savo1255 2025-06-14 13:54:46.889652 564
port1286 2025-06-14 13:54:49.256550 554
sout2856 2025-06-14 13:54:50.185939 941
teop1238 2025-06-14 13:54:52.503506 667
kark1256 2025-06-14 13:54:52.774417 887


  except KeyError: fe_score = fe_scores[table] = -np.log(fisher_exact(table, alternative='greater')[1])


ligh1234 2025-06-14 13:54:52.871246 459
ruul1235 2025-06-14 13:55:02.663705 1297
vera1241 2025-06-14 13:55:04.163825 771
texi1237 2025-06-14 13:55:06.298937 613
tsim1256 2025-06-14 13:55:07.824600 669
sumi1235 2025-06-14 13:55:08.657437 735
svan1243 2025-06-14 13:55:09.550826 1224
sadu1234 2025-06-14 13:55:10.984447 969
toto1304 2025-06-14 13:55:12.019754 1050
warl1254 2025-06-14 13:55:12.880729 442
taba1259 2025-06-14 13:55:17.380680 583
urum1249 2025-06-14 13:55:35.610131 976
trin1278 2025-06-14 13:55:50.052480 1420


## AwesomeAlign model

In [None]:
# generate to generated/doreco_aa

In [None]:
# 1 create alignments
from subprocess import call
import os
if not os.path.isdir('./generated/doreco_aa_output'):
    os.mkdir('./generated/doreco_aa_output')
for docname in map(lambda x : x.strip(), open('./doreco_english_doculects.txt')):
    print(docname, datetime.now())
    infile = './generated/doreco_bitexts/%s.txt' % docname
    outfile = './generated/doreco_aa_output/%s.aligns' % docname
    
    call(['awesome-align', '--output_file=%s' % outfile, '--model_name_or_path=bert-base-multilingual-cased', '--data_file=%s' % infile,
          '--extraction', 'softmax', '--batch_size', '32'])

In [68]:
# 2 extract right lemmas
for docname in map(lambda x : x.strip(), open('./doreco_english_doculects.txt')):
    A = [[tuple(map(int, x.split('-'))) for x in l.strip().split()] for l in open('./generated/doreco_aa_output/%s.aligns' % docname)]
    E,F = zip(*[([(w.split('/')[2], w.split('/')[3] in good_pos and w.split('/')[2] not in excluded_lemmas) for w in l.strip('\n').split(' ||| ')[1].split() 
                  if w.count('/') >= 3],
                  l.strip('\n').split(' ||| ')[0].split())
                 for l in open('./generated/doreco_bitexts/%s.spc' % (docname))])
    print(docname, len(A), len(E), len(F), datetime.now())
    hits= []
    map_counts = defaultdict(lambda : Counter())
    for i in range(len(A)):
        maps = {}
        #if i < 10: print(i, '|', len(E[i]), len(A[i]), '|', max(a[1] for a in A[i]), max(a[0] for a in A[i]), A[i])
        for s,r in A[i]:
            maps[r] = maps.get(r, []) + [s]
        for j,(r,good) in enumerate(E[i]):
            if not good: continue
            hits.append(len(maps.get(j,[]))==2)
            for s in maps.get(j,[]):
                map_counts[r][F[i][s]] += 1

    tes, te_words = defaultdict(lambda : defaultdict(lambda : [])), defaultdict(lambda : defaultdict(lambda : Counter()))
    for i in range(len(A)):
        maps = {}
        for s,r in A[i]:
            #print(E[i][r], F[i][s])
            maps[r] = maps.get(r, []) + [s]
        for j,(r,good) in enumerate(E[i]):
            if not good: continue 
            s = max(maps.get(j,[]), key = lambda s : map_counts[r][F[i][s]], default = None)
            if s != None:
                tes[r][F[i][s]].append(i)
                te_words[r][F[i][s]][F[i][s]] += 1
            
        #print('='*20)
    print(sum(hits), np.mean(hits))
    with open('./generated/doreco_aa_output/%s.json' % (docname),'w') as fout:
        fout.write(json.dumps({'tes' : tes, 'te_words' : te_words}))

anal1239 2855 2855 2855 2025-06-14 13:52:33.498427
661 0.09971338060039221
apah1238 3268 3268 3268 2025-06-14 13:52:33.555471
327 0.077524893314367
arap1274 3230 3230 3230 2025-06-14 13:52:33.759950
647 0.07553998832457677
bain1259 2932 2932 2932 2025-06-14 13:52:33.830597
644 0.09248886974005457
beja1238 6330 6330 6330 2025-06-14 13:52:33.911204
835 0.07755898198030838
cabe1245 2058 2058 2058 2025-06-14 13:52:33.992223
808 0.12986178077788493
cash1254 1766 1766 1766 2025-06-14 13:52:34.148127
508 0.07718018839258584
dolg1241 2429 2429 2429 2025-06-14 13:52:34.214448
1236 0.12395948249924782
even1259 2311 2311 2311 2025-06-14 13:52:34.288019
620 0.1000645577792124
goem1240 2606 2606 2606 2025-06-14 13:52:34.362691
1219 0.10547719996538894
goro1270 4333 4333 4333 2025-06-14 13:52:34.448296
712 0.07926965041193498
guri1247 1326 1326 1326 2025-06-14 13:52:34.611473
298 0.0837549184935357
hoch1243 972 972 972 2025-06-14 13:52:34.643450
389 0.0968384366442619
jeha1242 1332 1332 1332 2025-06

## Eflomal

In [65]:
# 1 create alignments
from subprocess import call
import os
if not os.path.isdir('./generated/doreco_ef_output'):
    os.mkdir('./generated/doreco_ef_output')
for docname in map(lambda x : x.strip(), open('./doreco_english_doculects.txt')):
    print(docname, datetime.now())
    infile = './generated/doreco_bitexts/%s.txt' % docname
    fwdfile = './generated/doreco_ef_output/%s.fwd' % docname
    revfile = './generated/doreco_ef_output/%s.rev' % docname
    
    call(['eflomal-align', '-i', infile, '-f', fwdfile, '-r', revfile, '-m', '3'])
    with open('./generated/doreco_ef_output/%s.gdfa' % docname,'w') as fout:
        dump = open('dump.txt', 'w')
        call(['./atools', '-i', fwdfile, '-j', revfile, '-c', 'grow-diag-final-and'], stdout=fout, stderr=dump)
        dump.close()
        os.remove('dump.txt')

anal1239 2025-06-14 13:18:02.827023
apah1238 2025-06-14 13:18:03.402415
arap1274 2025-06-14 13:18:03.832117
bain1259 2025-06-14 13:18:04.498916
beja1238 2025-06-14 13:18:05.132296
cabe1245 2025-06-14 13:18:05.683570
cash1254 2025-06-14 13:18:06.458669
dolg1241 2025-06-14 13:18:07.139121
even1259 2025-06-14 13:18:08.182965
goem1240 2025-06-14 13:18:08.818280
goro1270 2025-06-14 13:18:10.077142
guri1247 2025-06-14 13:18:10.805136
hoch1243 2025-06-14 13:18:11.378240
jeha1242 2025-06-14 13:18:12.145142
jeju1234 2025-06-14 13:18:12.669839
kaka1265 2025-06-14 13:18:13.360523
kama1351 2025-06-14 13:18:14.821348
kark1256 2025-06-14 13:18:15.633825
komn1238 2025-06-14 13:18:16.308339
ligh1234 2025-06-14 13:18:17.054584
movi1243 2025-06-14 13:18:17.582431
ngal1292 2025-06-14 13:18:18.132541
nngg1234 2025-06-14 13:18:18.496929
nort2641 2025-06-14 13:18:19.162596
nort2875 2025-06-14 13:18:20.193225
orko1234 2025-06-14 13:18:20.681578
pnar1238 2025-06-14 13:18:21.373883
port1286 2025-06-14 13:18:23

In [66]:
# 2 extract right lemmas
for docname in map(lambda x : x.strip(), open('./doreco_english_doculects.txt')):
    A = [[tuple(map(int, x.split('-'))) for x in l.strip().split()] for l in open('./generated/doreco_ef_output/%s.gdfa' % docname)]
    E,F = zip(*[([(w.split('/')[2], w.split('/')[3] in good_pos and w.split('/')[2] not in excluded_lemmas) for w in l.strip('\n').split(' ||| ')[1].split() 
                  if w.count('/') >= 3],
                  l.strip('\n').split(' ||| ')[0].split())
                 for l in open('./generated/doreco_bitexts/%s.spc' % (docname))])
    print(docname, len(A), len(E), len(F), datetime.now())
    hits= []
    map_counts = defaultdict(lambda : Counter())
    for i in range(len(A)):
        maps = {}
        #if i < 10: print(i, '|', len(E[i]), len(A[i]), '|', max(a[1] for a in A[i]), max(a[0] for a in A[i]), A[i])
        for s,r in A[i]:
            maps[r] = maps.get(r, []) + [s]
        for j,(r,good) in enumerate(E[i]):
            if not good: continue
            hits.append(len(maps.get(j,[]))==2)
            for s in maps.get(j,[]):
                map_counts[r][F[i][s]] += 1

    tes, te_words = defaultdict(lambda : defaultdict(lambda : [])), defaultdict(lambda : defaultdict(lambda : Counter()))
    for i in range(len(A)):
        maps = {}
        for s,r in A[i]:
            #print(E[i][r], F[i][s])
            maps[r] = maps.get(r, []) + [s]
        for j,(r,good) in enumerate(E[i]):
            if not good: continue 
            s = max(maps.get(j,[]), key = lambda s : map_counts[r][F[i][s]], default = None)
            if s != None:
                tes[r][F[i][s]].append(i)
                te_words[r][F[i][s]][F[i][s]] += 1
            
        #print('='*20)
    print(sum(hits), np.mean(hits))
    with open('./generated/doreco_ef_output/%s.json' % (docname),'w') as fout:
        fout.write(json.dumps({'tes' : tes, 'te_words' : te_words}))

anal1239 2855 2855 2855 2025-06-14 13:19:15.072404
564 0.08508070598883694
apah1238 3268 3268 3268 2025-06-14 13:19:15.127649
204 0.04836415362731152
arap1274 3230 3230 3230 2025-06-14 13:19:15.189377
105 0.012259194395796848
bain1259 2932 2932 2932 2025-06-14 13:19:15.259729
269 0.038632773229929626
beja1238 6330 6330 6330 2025-06-14 13:19:15.445002
281 0.02610068734906186
cabe1245 2058 2058 2058 2025-06-14 13:19:15.527914
472 0.07585985213757634
cash1254 1766 1766 1766 2025-06-14 13:19:15.586204
330 0.05013673655423884
dolg1241 2429 2429 2429 2025-06-14 13:19:15.654454
538 0.05395647377394444
even1259 2311 2311 2311 2025-06-14 13:19:15.840584
279 0.045029051000645574
goem1240 2606 2606 2606 2025-06-14 13:19:15.914285
189 0.01635372501514234
goro1270 4333 4333 4333 2025-06-14 13:19:16.002254
305 0.03395680249387664
guri1247 1326 1326 1326 2025-06-14 13:19:16.063398
157 0.044125913434513775
hoch1243 972 972 972 2025-06-14 13:19:16.097351
232 0.05775454319143639
jeha1242 1332 1332 1332 