# Imputation notebook

## 1. Libraries

In [1]:
import json
import dill as pickle
import numpy as np
import pandas as pd
from collections import Counter, defaultdict
from unidecode import unidecode
from multiprocessing import Pool
import networkx as nx
from itertools import combinations, product
from difflib import SequenceMatcher
import os
from datetime import datetime
#
import tensorflow as tf
from transformers import BertTokenizer
# WordPiece tokenizer used later to look up the first sub-token id of a word.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
# #
# Tags accepted in the fourth '/'-separated field of a bitext token
# (presumably universal POS tags -- confirm against the .spc producer).
good_pos = {'NOUN','ADJ','VERB'}
# NOTE(review): not referenced by any of the cells visible in this notebook.
excluded_lemmas = {'be', 'other', 'have', 'let', 'one', 'lot', 'same', 'such', 't', 's'}
# One doculect identifier per line; the context manager closes the handle,
# which the previous bare open() inside map() never did.
with open('./doreco_doculects.txt') as doculect_file:
    doculects = [line.strip('\n') for line in doculect_file]

2025-04-25 01:28:56.117183: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-25 01:28:56.207107: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745558936.243584 1728444 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745558936.254905 1728444 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-25 01:28:56.333584: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

## 2. Data Import

In [2]:
# files
# `dataset` selects which corpus the rest of the notebook operates on; every
# path below is derived from it.
dataset = 'doreco'
if dataset == 'doreco':
    # NOTE(review): absolute, machine-specific path -- consider making it
    # configurable (it is not read by any cell visible here).
    metadata = '/home/xxxx/Google Drive/data/doreco_v2/doreco_files_metadata.csv'
    # Read the doculect list through a context manager so the file handle is
    # closed deterministically instead of being left to the garbage collector.
    with open('./doreco_doculects.txt') as doculect_file:
        doculects = [tl.strip() for tl in doculect_file]
    freq_threshold = 3
else:
    doculects = ['es', 'fi', 'tr']
    freq_threshold = 30

# Pickled per-doculect lookups loaded by the next cell.
bert_indices = './generated/%s_bert/indices.p' % dataset
bert_input_ids = './generated/%s_bert/input_ids.p' % dataset
# Folder holding one '<doculect>.json' file per doculect.
te_folder = './generated/%s_output/' % dataset
# Input embeddings ('<doculect>_<i>.npy') and prepared output vectors currently
# live in the same directory.
bert_doc = './generated/%s_bert/' % dataset
prepared_vectors_doc = './generated/%s_bert/' % dataset
# Folder holding one '<doculect>.spc' bitext file per doculect.
bitext_dir = './generated/%s_bitexts/' % dataset

In [3]:
# NOTE(review): unpickling executes arbitrary code embedded in the file; only
# load these from a trusted ./generated directory.
# Context managers close both handles (the bare open() calls leaked them).
with open(bert_indices, 'rb') as indices_file:
    indices = pickle.load(indices_file)
with open(bert_input_ids, 'rb') as input_ids_file:
    input_ids = pickle.load(input_ids_file)

2025-04-25 01:29:02.136987: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


In [4]:
def string_mergable(ci,cj):
    """Return True when two strings share a long enough common substring.

    The longest contiguous block common to `ci` and `cj` must be at least
    three characters long and must cover at least half of each string.
    """
    overlap = SequenceMatcher(None, ci, cj).find_longest_match().size
    if overlap < 3:
        return False
    # Covering half of the longer string implies covering half of the shorter.
    return overlap / max(len(ci), len(cj)) >= 0.5

def distrib_mergable(wi,wj):
    """Return True when two collections overlap in at least half their items.

    Both arguments are reduced to sets (for a dict this means its keys). The
    size of the intersection is divided by the size of the larger set and the
    pair qualifies when that ratio is >= 0.5.

    Two empty collections return False rather than raising
    ZeroDivisionError.
    """
    items_i, items_j = set(wi), set(wj)
    larger = max(len(items_i), len(items_j))
    if larger == 0:
        # Nothing on either side: no evidence for a merge, and no safe division.
        return False
    return len(items_i & items_j) / larger >= 0.5

def merge_lemmas(tl_json):
    """Group similar (word, candidate) pairs and pick one representative each.

    Parameters
    ----------
    tl_json : dict
        Must contain 'tes' (word -> iterable of candidate strings) and
        'te_words' (word -> candidate -> mapping whose values are summed to
        give the candidate's frequency).

    Returns
    -------
    dict
        Maps every (word, candidate) pair to the highest-frequency
        (word, candidate) pair in its connected component.

    Two candidates of the same word are linked when `string_mergable` accepts
    them. Candidates of different words are linked only when both
    `string_mergable` and `distrib_mergable` (on their 'te_words' entries)
    accept them.
    """
    tes = tl_json['tes']
    te_words = tl_json['te_words']
    mergers = nx.Graph()
    merger_dict = {}
    for sw in tes:
        for ci in tes[sw]:
            mergers.add_node((sw,ci), freq=sum(te_words[sw][ci].values()))
        # Within one word: string similarity alone is sufficient.
        for ci,cj in combinations(tes[sw],2):
            if string_mergable(ci,cj):
                mergers.add_edge((sw,ci), (sw,cj))
    # Across words: additionally require overlapping 'te_words' entries.
    for swi,swj in combinations(tes, 2):
        for ci,cj in product(tes[swi],tes[swj]):
            if string_mergable(ci,cj) and distrib_mergable(te_words[swi][ci], te_words[swj][cj]):
                mergers.add_edge((swi,ci), (swj,cj))
    # The frequency table and the per-component maximum do not change while
    # the components are walked, so compute each exactly once instead of once
    # per component member.
    node_freq = nx.get_node_attributes(mergers, 'freq')
    for component in nx.connected_components(mergers):
        representative = max(component, key=node_freq.get)
        for node in component:
            merger_dict[node] = representative
    return merger_dict

In [5]:
def get_vectors(d):
    """Collect one contextual vector per (line, word) hit for doculect `d`.

    Reads (all paths come from module-level configuration):
      * '<te_folder>/<d>.json'      -- dict with 'tes' and 'te_words'
      * '<bitext_dir><d>.spc'       -- lines of 'left ||| tok/../.. tok/../..'
      * '<bert_doc>/<d>_<i>.npy'    -- embedding files, i = 0, 1, ... (max 100)
      * module-level `indices[d]` / `input_ids[d]` giving, per embedded token,
        its corpus line and its tokenizer id.

    For each corpus line covered by an embedding file, every token whose
    unidecoded, lower-cased third field is a key of `tes` and whose fourth
    field is in `good_pos` is considered. At most one vector per such word and
    line is kept, and only if one of the word's candidates lists the line.

    Writes:
      * '<prepared_vectors_doc>/<d>_by_lemma_<k>.npy' -- float matrix with one
        768-wide row per hit (a new file is started once more than 250000 rows
        have accumulated)
      * '<prepared_vectors_doc>/<d>_by_lemma.xlsx'    -- one row per hit with
        'term', 'line', 'marker' (plus 'original' for the 'opus' dataset).

    Returns None.
    """
    def as_matrix(row_list):
        # Seeding with an empty (0, 768) float64 block keeps the saved array's
        # shape and dtype well defined even when no rows were collected.
        return np.vstack([np.zeros((0,768))] + row_list)

    rows, builder = [], []
    #
    with open('%s/%s.json' % (te_folder, d), 'rb') as te_file:
        T = json.load(te_file)
    tes = T['tes']

    # Each bitext line becomes [left-hand tokens, right-hand tokens split on '/'].
    corpus = []
    with open(bitext_dir + d + '.spc') as bitext_file:
        for raw_line in bitext_file:
            fields = raw_line.strip('\n').split(' ||| ')
            corpus.append([fields[0].split(), [w.split('/') for w in fields[1].split()]])
    if dataset == 'opus':
        matched = pd.read_excel('./files/' + d + '_matched_files.xlsx')
        orig_files = {'el/%s/%s/%s.gz' % (r.tl_y, r.tl_f, r.tl_v) for i,r in matched.iterrows()}
        side = 'la' if d < 'en' else 'lb'
        files = [[r[side], r[side] in orig_files]
                 for i,r in pd.read_excel(bitext_dir + d + '_bitext_metadata.xlsx').iterrows()]
    mergers = merge_lemmas(T)
    # Convert the per-token lookups once; rebuilding these arrays for every
    # corpus line made the loop below quadratic in corpus size.
    line_of_token = np.array(indices[d])
    token_ids = np.array(input_ids[d])[0]
    #
    offset = 0   # position in indices[d] of the first row of the current file
    pvd_ct = 0   # number of output chunks already written
    for i in range(100):
        if not os.path.isfile(bert_doc + d + '_' + str(i) + '.npy'): break
        vectors = np.load('%s/%s_%d.npy' % (bert_doc, d, i))[0]
        # The smallest and largest line ids of the window are dropped
        # (presumably padding / lines straddling a file boundary -- confirm).
        corpus_lines = sorted(set(indices[d][offset:offset+vectors.shape[0]]))[1:-1]
        for li in corpus_lines:
            if li % 500 == 0: print(i, li, files[li] if dataset == 'opus' else '', datetime.now())
            # Named token_positions so it no longer shadows the module-level
            # `bert_indices` path variable.
            token_positions = np.where(line_of_token == li)[0]
            line_token_ids = token_ids[token_positions]
            seen_sw = set()
            for w in corpus[li][1]:
                sw = unidecode(w[2].lower())
                if sw not in tes or w[3] not in good_pos: continue
                sw_o = w[2]
                # First candidate of this word that lists the line, mapped to
                # the representative of its merge group.
                mrk = next((mergers[sw,cand] for cand in tes[sw] if li in tes[sw][cand]),None)
                if mrk is None: continue
                # Only the first qualifying occurrence per line is used, even
                # when no matching sub-token is found for it below.
                if sw in seen_sw: continue
                seen_sw.add(sw)
                # Id of the word's first sub-token (index 0 is skipped --
                # presumably the [CLS] id prepended by encode(); confirm).
                bert_target = tokenizer.encode(sw_o)[1]
                bert_index = next((bix-offset for j,bix in enumerate(token_positions)
                                   if line_token_ids[j] == bert_target),None)
                if bert_index is None:
                    # Fallback: first token whose decoded text contains the word.
                    bert_index = next((bix-offset for j,bix in enumerate(token_positions)
                                       if sw_o in tokenizer.decode(line_token_ids[j])),None)
                if bert_index is not None:
                    # Copy so the collected row does not keep the whole
                    # embedding file alive; rows are stacked once per chunk
                    # instead of re-copying the full matrix for every hit.
                    rows.append(vectors[bert_index].reshape(1,-1).copy())
                    if dataset == 'opus':
                        builder.append({'term' : sw, 'line' : li, 'marker' : mrk, 'original' : files[li][1]})
                    else:
                        builder.append({'term' : sw, 'line' : li, 'marker' : mrk})
            if len(rows) > 250000:
                np.save('%s/%s_by_lemma_%d.npy' % (prepared_vectors_doc, d, pvd_ct), as_matrix(rows))
                pvd_ct += 1
                rows = []

        #
        offset += vectors.shape[0]
        # NOTE(review): processing stops after the file in which the first
        # chunk was flushed; later embedding files are never read. Behaviour
        # kept as-is -- confirm this cap is intended.
        if pvd_ct > 0: break

    np.save('%s/%s_by_lemma_%d.npy' % (prepared_vectors_doc, d, pvd_ct), as_matrix(rows))
    df = pd.DataFrame(builder)
    df.to_excel('%s/%s_by_lemma.xlsx' % (prepared_vectors_doc, d))

In [6]:
# Switch: set to False to skip rebuilding the per-doculect vector files.
regen_vectors = True
# With the switch off the loop simply has nothing to iterate over.
for d in (doculects if regen_vectors else ()):
    print(d)
    get_vectors(d)

apah1238
0 500  2025-04-25 01:29:10.550096
0 1000  2025-04-25 01:29:11.533396
0 1500  2025-04-25 01:29:12.909711
0 2000  2025-04-25 01:29:14.872701
0 2500  2025-04-25 01:29:17.368278
0 3000  2025-04-25 01:29:20.051767
anal1239
0 500  2025-04-25 01:29:39.732677
0 1000  2025-04-25 01:29:40.898012
0 1500  2025-04-25 01:29:42.537785
0 2000  2025-04-25 01:29:45.285086
0 2500  2025-04-25 01:29:50.415787
arap1274
0 500  2025-04-25 01:30:22.957221
0 1000  2025-04-25 01:30:24.787910
0 1500  2025-04-25 01:30:26.990766
0 2000  2025-04-25 01:30:29.608382
0 2500  2025-04-25 01:30:32.781812
0 3000  2025-04-25 01:30:39.726566
beja1238
0 500  2025-04-25 01:31:05.287660
0 1000  2025-04-25 01:31:06.341526
0 1500  2025-04-25 01:31:07.662313
0 2000  2025-04-25 01:31:09.277843
0 2500  2025-04-25 01:31:11.204074
0 3000  2025-04-25 01:31:13.175946
0 3500  2025-04-25 01:31:15.362318
0 4000  2025-04-25 01:31:17.736590
0 4500  2025-04-25 01:31:21.522944
0 5000  2025-04-25 01:31:27.309632
0 5500  2025-04-25 01:3

In [None]:
# def C(constraints, E):
#     cost = 0
#     for S in constraints:
#         cost += (1-ncc(set(S.nonzero()[1]), E) if S.nnz > 0 else 0)
#     return cost

# def ncc(S, E):
#     cc = 0
#     explored = set()
#     for v in S:
#         if v in explored: continue
#         else: cc += 1
            
#         frontier = [v]
#         while len(frontier) > 0:
#             u = frontier.pop()
#             if u in explored: continue
#             explored.add(u)
#             if u not in E: continue
#             frontier.extend(list(S.intersection(E[u])))
#     #print('\t',S, E, cc)

#     return cc

# def find_edges(constraints):
#     # determine possible edges
#     x = constraints.T.dot(constraints)
#     x2 = (x > 1)
#     V2c = Counter()
#     for i in range(x2.shape[1]):
#         for j in x2[:i,i].nonzero()[0]:
#             V2c[i,j] = x[i,j]
#     V2 = set(V2c)
#     print('n edges', len(V2))
#     # initialize edges and weight
#     E = { i : set() for i in np.arange(x.shape[0])}
#     E_weight = {}
#     #
#     total_cost = C(constraints, E)
#     last_best_decrease = max(V2c.values())
#     E_prev = { i : set() for i in np.arange(x.shape[0])}
#     last_scores = {}
#     while total_cost < 0:
#         best_decrease = 0
#         best_edge = (None, None)
#         for i,(u,v) in enumerate(list(V2)):
#             if i % 500 == 0: print('\t', i, len(V2), best_decrease, best_edge)
#             E[u].add(v)
#             E[v].add(u)
#             c = last_scores[u,v] = C(constraints, E)
#             E[u].remove(v)
#             E[v].remove(u)
#             new_decrease = c - total_cost
#             #print('\t\ttry', u,v, '\t', c, total_cost, new_decrease)
#             if new_decrease > best_decrease:
#                 best_decrease = new_decrease
#                 best_edge = (u,v)
#                 if best_decrease == last_best_decrease:
#                     break
#             if new_decrease == 0: V2.remove((u,v))

#         u,v = best_edge
#         print('\t',u,v,best_decrease, total_cost+best_decrease)
#         E[u].add(v)
#         E[v].add(u)
#         V2.remove((u,v))
#         E_weight[u,v] = E_weight[v,u] = best_decrease
#         total_cost += best_decrease
#         last_best_decrease = best_decrease
#         E_prev = {k : v.copy() for k,v in E.items()}
#     return E, E_weight

# def build_semantic_map(constraints):
#     E, E_weight = find_edges(constraints)
    
#     best_edges = []
#     for u in E:
#         best_edges.extend([tuple(sorted((u,v))) for v in E[u]])
#     return (set(range(constraints.shape[0])), set(best_edges), E_weight)

# def show_graph(V, E, E_weight, pos = None, outfile=None):
#     plt.figure(figsize=(20,20))
#     label_dict = {i : v for i,v in enumerate(V)}
#     gr = nx.Graph()
#     gr.add_nodes_from(np.arange(len(V)))
#     gr.add_edges_from(E)
#     if pos == None:
#         pos = nx.spring_layout(gr, k=0.30,iterations=20)
#     nx.draw(gr, pos, labels=label_dict, node_size=1500, node_color='w', font_size=15, with_labels=True)
#     if outfile is None: plt.show()