# Imputation notebook

## 1. Libraries

In [2]:
import json
import dill as pickle
import numpy as np
import pandas as pd
from collections import Counter, defaultdict
from unidecode import unidecode
from multiprocessing import Pool
import networkx as nx
from itertools import combinations, product
from difflib import SequenceMatcher
import os
from datetime import datetime
#
import tensorflow as tf
from transformers import BertTokenizer
# Cased BERT word-piece tokenizer; get_vectors uses it to look up the token id
# of a surface word and to decode token ids back to text.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# POS tags a word must carry to be processed by get_vectors.
good_pos = {'NOUN','ADJ','VERB'}
# NOTE(review): presumably lemmas to be skipped downstream; not referenced in
# the cells visible here — confirm it is still needed.
excluded_lemmas = {'be', 'other', 'have', 'let', 'one', 'lot', 'same', 'such', 't', 's'}
# One doculect identifier per line. The context manager closes the file as
# soon as it has been read instead of leaving the handle to the garbage collector.
with open('./doreco_english_doculects.txt') as doculects_file:
    doculects = [line.strip('\n') for line in doculects_file]

## 2. Data Import

In [4]:
# files
# `dataset` selects which corpus the notebook runs on; every path below is
# derived from it.
dataset = 'doreco'
if dataset == 'doreco':
    # NOTE(review): absolute, machine-specific path — consider making it
    # relative to a configurable data directory.
    metadata = '/home/xxxx/Google Drive/data/doreco_v2/doreco_files_metadata.csv'
    # Read the doculect list inside a context manager so the handle is closed.
    with open('./doreco_english_doculects.txt') as doculects_file:
        doculects = [tl.strip() for tl in doculects_file]
    freq_threshold = 3
else:
    # NOTE: `metadata` is not defined on this branch.
    doculects = ['es', 'fi', 'tr']
    freq_threshold = 30

# Pickled per-doculect lookup tables (loaded in the next cell).
bert_indices = './generated/%s_bert/indices.p' % dataset
bert_input_ids = './generated/%s_bert/input_ids.p' % dataset
# Folder holding one `<doculect>.json` file per doculect (read by get_vectors).
te_folder = './generated/%s_output/' % dataset
# Folder with the `<doculect>_<i>.npy` BERT vector files (read by get_vectors).
bert_doc = './generated/%s_bert/' % dataset
# Folder where get_vectors writes `<doculect>_by_lemma*.npy/.xlsx`
# (currently the same directory as bert_doc).
prepared_vectors_doc = './generated/%s_bert/' % dataset
# Folder with the `<doculect>.spc` bitext files (read by get_vectors).
bitext_dir = './generated/%s_bitexts/' % dataset

In [5]:
# NOTE(security): unpickling executes arbitrary code — only load files that
# were generated by this project.
# Both tables are indexed by doculect id in get_vectors: `indices[d]` holds a
# corpus line number per position, `input_ids[d]` the matching token ids.
with open(bert_indices, 'rb') as indices_file:
    indices = pickle.load(indices_file)
with open(bert_input_ids, 'rb') as input_ids_file:
    input_ids = pickle.load(input_ids_file)

2025-06-14 15:25:31.667423: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


In [6]:
def string_mergable(ci,cj):
    """Decide whether two strings are similar enough in form to be merged.

    The strings qualify when their longest common contiguous substring is at
    least 3 characters long and covers at least half of each string.

    Parameters
    ----------
    ci, cj : str
        The two strings to compare.

    Returns
    -------
    bool
        True when both length criteria are met, False otherwise.
    """
    longest = SequenceMatcher(None, ci, cj).find_longest_match(0, len(ci), 0, len(cj))
    shared = longest.size
    # Guard first: below 3 shared characters nothing else matters (this also
    # keeps empty strings away from the divisions below).
    if shared < 3:
        return False
    return shared/len(ci) >= 0.5 and shared/len(cj) >= 0.5

def distrib_mergable(wi,wj):
    """Decide whether two word collections overlap enough to be merged.

    Both arguments are reduced to sets of their elements (for a dict, its
    keys). They qualify when the number of shared elements is at least half
    the size of the larger set.

    Parameters
    ----------
    wi, wj : iterable of hashable
        The two collections to compare.

    Returns
    -------
    bool
        True when the overlap ratio is >= 0.5; False otherwise, including
        when both collections are empty.
    """
    words_i, words_j = set(wi), set(wj)
    larger = max(len(words_i), len(words_j))
    if larger == 0:
        # Two empty collections share nothing; without this guard the ratio
        # below would raise ZeroDivisionError.
        return False
    return len(words_i & words_j) / larger >= 0.5

def merge_lemmas(tl_json):
    """Group similar (source word, marker) pairs and pick one representative per group.

    A graph is built with one node per ``(sw, c)`` pair found in
    ``tl_json['tes']``; each node stores ``freq``, the sum of the values in
    ``tl_json['te_words'][sw][c]``. Edges are added between

    * two markers of the same source word when ``string_mergable`` accepts them;
    * markers of two different source words when ``string_mergable`` accepts
      the marker strings and ``distrib_mergable`` accepts their ``te_words``
      entries.

    Every connected component is then collapsed onto its highest-``freq`` node.

    Parameters
    ----------
    tl_json : dict
        Must provide ``'tes'`` (source word -> iterable of markers) and
        ``'te_words'`` (source word -> marker -> mapping with numeric values).

    Returns
    -------
    dict
        Maps every ``(sw, c)`` node to the ``(sw, c)`` node chosen as the
        representative of its component (a node may map to itself).
    """
    tes = tl_json['tes']
    te_words = tl_json['te_words']
    mergers = nx.Graph()
    merger_dict = {}
    for sw in tes:
        for ci in tes[sw]:
            mergers.add_node((sw,ci), freq=sum(te_words[sw][ci].values()))
        # within one source word, string similarity alone is enough
        for ci,cj in combinations(tes[sw],2):
            if string_mergable(ci,cj):
                mergers.add_edge((sw,ci), (sw,cj))
    # across source words, the word distributions must overlap as well
    for swi,swj in combinations(tes, 2):
        for ci,cj in product(tes[swi],tes[swj]):
            if string_mergable(ci,cj) and distrib_mergable(te_words[swi][ci], te_words[swj][cj]):
                mergers.add_edge((swi,ci), (swj,cj))
    # The graph is final from here on, so the frequency table and each
    # component's representative are computed once rather than once per node.
    freq = nx.get_node_attributes(mergers, 'freq')
    for c in nx.connected_components(mergers):
        # On equal frequencies, max() keeps the first node met while iterating c.
        cmax = max(c, key=freq.get)
        for ci in c:
            merger_dict[ci] = cmax
    return merger_dict

In [7]:
def get_vectors(d):
    """Extract one BERT vector per selected word occurrence of doculect ``d`` and save them.

    Inputs read from disk / notebook globals:

    * ``<te_folder>/<d>.json`` — must contain ``'tes'`` and ``'te_words'``
      (the latter is consumed by ``merge_lemmas``);
    * ``<bitext_dir><d>.spc`` — one line per corpus line, two sides separated
      by ``' ||| '``; the second side holds whitespace-separated words whose
      fields are joined by ``'/'`` (fields 2 and 3 are used as the word form
      and its POS tag);
    * ``<bert_doc><d>_<i>.npy`` for i = 0, 1, ... (at most 100 files, stopping
      at the first missing one);
    * the globals ``indices`` and ``input_ids`` (per-doculect position tables),
      ``tokenizer``, ``good_pos`` and ``dataset``.

    For every corpus line, each word whose normalised form is a key of
    ``tes``, whose POS is in ``good_pos`` and for which a marker lists the
    line is matched to a BERT position on that line (first by exact token id,
    then by substring of the decoded token). At most one occurrence per
    normalised word and line is kept.

    Outputs written to ``prepared_vectors_doc``:

    * ``<d>_by_lemma_<k>.npy`` — the collected vectors, flushed to a new file
      whenever more than 250000 rows have accumulated, plus a final file with
      the remainder (possibly empty);
    * ``<d>_by_lemma.xlsx`` — one row per collected vector with ``term``,
      ``line``, ``marker`` (and ``original`` when ``dataset == 'opus'``).

    Parameters
    ----------
    d : str
        Doculect identifier used to build all file names and to index
        ``indices`` / ``input_ids``.

    Returns
    -------
    None
    """
    def stack_rows(rows):
        # One concatenation onto an empty (0, 768) float64 seed yields the
        # same matrix as growing it row by row, without re-copying everything
        # for each appended row.
        return np.vstack([np.zeros((0,768))] + rows)

    rows, builder = [], []
    #
    with open('%s/%s.json' % (te_folder, d), 'rb') as te_file:
        T = json.load(te_file)
    tes = T['tes']

    #
    # corpus[li] = [tokens of the first side, [fields of each word on the second side]]
    corpus = []
    with open(bitext_dir + d + '.spc') as bitext_file:
        for f in bitext_file:
            sides = f.strip('\n').split(' ||| ')
            corpus.append([sides[0].split(), [w.split('/') for w in sides[1].split()]])
    if dataset == 'opus':
        matched = pd.read_excel('./files/' + d + '_matched_files.xlsx')
        orig_files = {'el/%s/%s/%s.gz' % (r.tl_y, r.tl_f, r.tl_v) for i,r in matched.iterrows()}
        # files[li] = [file name, whether that file is one of the matched originals]
        files = [[r['la' if d < 'en' else 'lb'],r['la' if d < 'en' else 'lb'] in orig_files]
                 for i,r in pd.read_excel(bitext_dir + d + '_bitext_metadata.xlsx').iterrows()]
    mergers = merge_lemmas(T)
    #
    offset = 0
    pvd_ct = 0
    # Array views of the position tables, built once on first use instead of
    # once per corpus line.
    all_indices = all_input_ids = None
    for i in range(100):
        if not os.path.isfile(bert_doc + d + '_' + str(i) + '.npy'): break
        vectors = np.load('%s/%s_%d.npy' % (bert_doc, d, i))[0]
        # Line numbers covered by this vector file, minus the lowest and the
        # highest one. NOTE(review): presumably lines cut at the file
        # boundary — confirm.
        corpus_lines = sorted(set(indices[d][offset:offset+vectors.shape[0]]))[1:-1]
        if corpus_lines and all_indices is None:
            all_indices = np.array(indices[d])
            all_input_ids = np.array(input_ids[d])[0]
        for li in corpus_lines:
            if li % 500 == 0: print(i, li, files[li] if dataset == 'opus' else '', datetime.now())
            # global positions belonging to line li, and the token ids found there
            line_positions = np.where(all_indices == li)[0]
            line_input_ids = all_input_ids[line_positions]
            seen_sw = set()
            for w in corpus[li][1]:
                sw = unidecode(w[2].lower())
                if sw not in tes or w[3] not in good_pos: continue
                sw_o = w[2]
                # first marker of sw that lists this line, mapped to its merged representative
                mrk = next((mergers[sw,m] for m in tes[sw] if li in tes[sw][m]),None)
                if mrk is None: continue
                #
                if sw in seen_sw: continue
                seen_sw.add(sw)
                #
                # [1] skips the leading special token added by encode()
                bert_target = tokenizer.encode(sw_o)[1]
                # positions are global, the vector file is local: subtract offset
                bert_index = next((bix-offset for j,bix in enumerate(line_positions) if line_input_ids[j]== bert_target),None)
                if bert_index is None:
                    # fall back to the first token whose decoded text contains the word form
                    bert_index = next((bix-offset for j,bix in enumerate(line_positions) if sw_o in tokenizer.decode(line_input_ids[j])),None)
                if bert_index is not None:
                    # copy() detaches the row so `vectors` can be released between files
                    rows.append(vectors[bert_index].reshape(1,-1).copy())
                    if dataset == 'opus':
                        builder.append({'term' : sw, 'line' : li, 'marker' : mrk, 'original' : files[li][1]})
                    else:
                        builder.append({'term' : sw, 'line' : li, 'marker' : mrk})
            if len(rows) > 250000:
                np.save('%s/%s_by_lemma_%d.npy' % (prepared_vectors_doc, d, pvd_ct), stack_rows(rows))
                pvd_ct += 1
                rows = []

        #
        offset += vectors.shape[0]
        # NOTE(review): stops reading further vector files once a shard has
        # been flushed — looks like a deliberate size cap; confirm.
        if pvd_ct > 0: break

    np.save('%s/%s_by_lemma_%d.npy' % (prepared_vectors_doc, d, pvd_ct), stack_rows(rows))
    df = pd.DataFrame(builder)
    df.to_excel('%s/%s_by_lemma.xlsx' % (prepared_vectors_doc, d))

In [8]:
# Switch: set to False to skip regenerating the per-doculect vector files.
regen_vectors = True
for d in (doculects if regen_vectors else ()):
    print(d)  # progress marker, one line per doculect
    get_vectors(d)

anal1239
0 500  2025-06-14 15:25:54.788367
0 1000  2025-06-14 15:25:56.480064
0 1500  2025-06-14 15:25:59.120519
0 2000  2025-06-14 15:26:02.300909
0 2500  2025-06-14 15:26:06.156668
apah1238
0 500  2025-06-14 15:26:14.707489
0 1000  2025-06-14 15:26:15.149046
0 1500  2025-06-14 15:26:15.735143
0 2000  2025-06-14 15:26:16.557496
0 2500  2025-06-14 15:26:17.475350
0 3000  2025-06-14 15:26:18.524599
arap1274
0 500  2025-06-14 15:26:43.541184
0 1000  2025-06-14 15:26:44.948913
0 1500  2025-06-14 15:26:46.737621
0 2000  2025-06-14 15:26:48.783254
0 2500  2025-06-14 15:26:51.297972
0 3000  2025-06-14 15:26:57.143306
bain1259
0 500  2025-06-14 15:27:17.050890
0 1000  2025-06-14 15:27:18.311925
0 1500  2025-06-14 15:27:20.077427
0 2000  2025-06-14 15:27:22.255022
0 2500  2025-06-14 15:27:24.540185
beja1238
0 500  2025-06-14 15:27:47.778292
0 1000  2025-06-14 15:27:48.640087
0 1500  2025-06-14 15:27:49.778942
0 2000  2025-06-14 15:27:51.059531
0 2500  2025-06-14 15:27:52.474541
0 3000  2025-06