# Bilingual dictionary enrichment via graph completion

Current

In [1]:
import logging
import sys
import os

logging.basicConfig(format='%(asctime)s | %(levelname)s : %(message)s',
                     level=logging.INFO, stream=sys.stdout)
import json
import re

In [None]:
import numpy as np

In [2]:
from numpy import vectorize

In [2]:
from collections import Counter
from math import exp

In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
import networkx as nx
import xml.etree.ElementTree as ET
import requests

In [5]:
import matplotlib.pyplot as plt

## Language codes

In [None]:
from numpy import na
import pandas as pd
lang_codes = pd.read_csv('./files/language-codes-full_csv.csv', na_values = 0)
lang_codes = lang_codes[['alpha3-b','alpha2']]
lang_codes = lang_codes.dropna()


lang_codes = [{i[0]:i[1] for i in np.array(lang_codes)}, {i[1]:i[0] for i in np.array(lang_codes)}]

with open ('./files/lang_codes.json', 'w') as f:
    json.dump(lang_codes, f)

In [5]:
with open ('./files/lang_codes.json', 'r') as f:
    lang_codes = json.load(f)

def l(lang, mode=3):
    mode = mode % 2
    if len(lang)==2:
        if lang in lang_codes[mode]:
            return lang_codes[mode][lang]
        else:
            return lang
    else:
        return lang
l('tt', 3)

'tat'

## Loading dictionaries

### PyGithub

** Load user with login and password from secret file **

In [5]:
from github import Github

with open ('secure.json') as f:
    SECRET = json.loads(f.read())

github = Github(SECRET['USER'], SECRET['PASSWORD'])

user = github.get_user('apertium')

In [None]:
user.get_repos()

** Generator ** : yield all repos that match name pattern

In [6]:
def repo_names(user):
    for repo in user.get_repos():
        if re.match('apertium-[a-z]{2,3}(_[a-zA-Z]{2,3})?-[a-z]{2,3}(_[a-zA-Z]{2,3})?', repo.name):
            yield repo.name

Looks like heavy function. But I don't see any improvements yet, except for having certain repo for all bidix copies. But this one above is the most up-to-date. It filters not languages pair repos, it is needed not to look for bidix where it can't be. Function saves a lot of time.

In [18]:
%time w = list(repo_names(user))

Wall time: 26.8 s


** Find bidix **

Length sorting to reduce number of files to check (bidix is lone of the longest)

In [7]:
def bidix_url(repo):
    for i in sorted(repo.get_dir_contents('/'), key = lambda x: (len(x.path), 1000-ord(('   '+x.path)[-3])), reverse=True):
        if re.match('apertium-.*?\.[a-z]{2,3}(_[a-zA-Z]{2,3})?-[a-z]{2,3}(_[a-zA-Z]{2,3})?.dix$', i.path):
            return i.download_url
        elif len(i.path) < 23:
            return None

In [27]:
%time bidix_url(github.get_repo(user.name+'/'+w[22]))

Wall time: 709 ms


'https://raw.githubusercontent.com/apertium/apertium-cat-srd/master/apertium-cat-srd.cat-srd.dix'

** Only relevant for certain language pair **

There are **164 ** pairs at this moment

In [91]:
def download_all_bidixes(user):
    logging.info('Start')
    if not os.path.exists('./dictionaries/'):
        os.makedirs('./dictionaries/')
    for repo_name in repo_names(user):
        bidix = bidix_url(github.get_repo(user.name+'/'+repo_name))
        langs = [l(i) for i in repo_name.split('-')[1:]]
        filename = './dictionaries/'+'-'.join(langs)+'.dix'
        if bidix:
            response = requests.get(bidix)
            response.encoding = 'UTF-8'
            with open(filename, 'w', encoding='UTF-8') as f:
                f.write(response.text)
    logging.info('Finish')

In [92]:
download_all_bidixes(user)

2018-05-18 11:22:17,854 | INFO : Start
2018-05-18 11:30:11,682 | INFO : Finish


In [18]:
def get_relevant_languages(l1, l2):
    G = nx.Graph()
    for root, dirs, files in os.walk ('./dictionaries/'):
        for fl in files :
            pair = fl.replace('.dix', '').split('-')
            G.add_edge(pair[0], pair[1])
    pair = [l(l1), l(l2)]
    with open('language_list.csv','w', encoding='utf-8') as f:
        nodes = set()
        for i in range(1,5):
            w = nx.single_source_shortest_path_length(G, pair[0], cutoff=i)
            v = nx.single_source_shortest_path_length(G, pair[1], cutoff=i)
            H = G.subgraph(w.keys())
            H.remove_node(pair[0])
            H2 = G.subgraph(v.keys())
            H2.remove_node(pair[1])
            if pair[1] in H.nodes():
                v = nx.node_connected_component(H, pair[1])
            else:
                v = set()
            if pair[0] in H2.nodes():
                w = nx.node_connected_component(H, pair[1])
            else:
                w = set() 
            nodes2 = v & w | set([pair[0], pair[1]])
            nodes2 = nodes2 - nodes
            for node in nodes2:
                f.write('{}\t{}\n'.format(i*2, node))
            nodes = nodes | nodes2

In [16]:
get_relevant_languages('bel', 'rus')

In [105]:
def check_languages():
    for root, dirs, files in os.walk ('./dictionaries/'):
        for fl in files:
            #print (root+fl)
            try:
                s = ET.parse(root+fl)
            except:
                print ('ERROR :'+fl)

In [106]:
check_languages()

ERROR :epo-bul.dix
ERROR :epo-per.dix
ERROR :epo-pol.dix
ERROR :fin-fra.dix
ERROR :pol-lav.dix
ERROR :sah-eng.dix


In [7]:
def existance(pair, nodes):
    if pair[0] in nodes and pair[1] in nodes:
        return True
    else:
        return False

In [9]:
def load_chosen():
    with open ('language_list.csv','r',encoding='utf-8') as f:
        languages = set([i.split('\t')[1].strip() for i in f.readlines()])
    for root, dirs, files in os.walk ('./dictionaries/'):
        for fl in files:
            pair = fl.replace('.dix','').split('-')
            if existance(pair, languages):
                try:
                    with open (root+fl, 'r', encoding='utf-8') as d:
                        dictionary = d.read().replace('<b/>',' ').replace('<.?g>','')
                        yield ET.fromstring(dictionary), pair[0], pair[1]
                except:
                    print ('ERROR: ', fl)

In [7]:
%time len(list(load_chosen()))

ERROR:  epo-bul.dix
ERROR:  epo-per.dix
ERROR:  epo-pol.dix
ERROR:  fin-fra.dix
ERROR:  pol-lav.dix
ERROR:  sah-eng.dix
Wall time: 3min 47s


269

## Object classes

** Word **

- lemma : lemma
- lang : language
- pos : part of speech

In [25]:
class Word:
    def __init__(self, lemma, lang, s=[]):
        self.lemma = lemma
        self.lang = lang
        self.s = s
    
    def __str__(self):
        return str(self.lang)+'_'+str(self.lemma)+'_'+str('-'.join(self.s))
    
    __repr__ = __str__
    
    def __eq__(self, other):
        return self.lemma == other.lemma and self.lang == other.lang and self.s == other.s
    
    def __hash__(self):
        return hash(str(self))

## Parsing

### Bidix parsing

In [11]:
%time T = tree('https://raw.githubusercontent.com/apertium/apertium-eng-ita/master/apertium-eng-ita.eng-ita.dix')

Wall time: 921 ms


In [11]:
def one_word(word, lang):
    s = word.findall('.//s')
    s = [i.attrib['n'] for i in s]
    return Word(word.text, lang, s)

In [12]:
def parse_bidix (tree, l1, l2):
    tree = tree.find('section')
    if not tree:
        print (l1, l2)
    else:
        for e in tree:
            if 'n' in e.attrib:
                side = e.attrib['n']
            else:
                side = None
            p = e.find('p')
            if p:
                yield one_word(p.find('l'), l1), one_word(p.find('r'), l2), side
            else:
                try:
                    i = e.find('i')
                    yield one_word(i, l1), one_word(i, l2), side
                except:
                    pass

In [147]:
% time len(list(parse_bidix (T, 'bel','rus')))

Wall time: 882 ms


48880

In [13]:
def add_bidix(G, T, l1, l2):
    for word1, word2, side in parse_bidix (T, l1, l2):
        if side == None:
            G.add_edge(word1, word2)
            G.add_edge(word2, word1)
        elif side == 'LR':
            G.add_edge(word1, word2)
        elif side == 'RL':
            G.add_edgr(word2, word1)
        else:
            print (side)

In [145]:
G = nx.DiGraph()
%time add_bidix(G, T, 'bel', 'rus')

Wall time: 1.21 s


In [14]:
G = nx.DiGraph()
logging.info('Start')
for T, l1, l2 in load_chosen():
    add_bidix(G, T, l1, l2)
logging.info('Finish')

2018-05-20 14:31:55,751 | INFO : Start
ERROR:  epo-bul.dix
ERROR:  epo-per.dix
ERROR:  epo-pol.dix
ERROR:  fin-fra.dix
lit lav
ERROR:  pol-lav.dix
ERROR:  sah-eng.dix
2018-05-20 14:35:14,629 | INFO : Finish


In [24]:
class SetWithFilter(set):
    def lemma(self, value):
        return set(i for i in self if i.lemma == value)
    def pos(self, value):
        return set(i for i in self if i.pos == value)
    def lang(self, value):
        return set(i for i in self if i.lang == value)
    def notlang(self, value):
        return set(i for i in self if i.lang != value)

In [22]:
def dictionaries(G, l1,l2):
    l1 = l(l1)
    l2 = l(l2)
    d_l1 = SetWithFilter()
    d_l2 = SetWithFilter()
    for i in G.nodes():
        if i.lang == l1:
            d_l1.add(i)
        elif i.lang == l2:
            d_l2.add(i)
    return d_l1, d_l2

In [30]:
def lemma_search (G, lemma, d_l1, l2, cutoff):
    lemmas = d_l1.lemma(lemma)
    print (lemmas)
    results = {str(word):{} for word in lemmas}
    for word in lemmas:
        print(word, end='\t')
        s = SetWithFilter(nx.single_source_shortest_path_length(G, word, cutoff=cutoff))
        print ('all: ', str(len(s)), end='\t')
        s = s.lang(l2)
        print ('filtered: ', str(len(s)))
        for translation in s:
            t = list(nx.all_simple_paths(G, word, translation, cutoff=cutoff))
            t = [len(i) for i in t]
            t = Counter(t)
            coef = 0
            for i in t:
                coef += exp(-t[i])
            results[str(word)][str(translation)] = coef
    return results

In [19]:
def print_results(results):
    for i in results:
        print ('\n\t\t', i)
        for j in sorted(results[i], key=results[i].get, reverse=True)[:7]:
            print (j, results[i][j])

### RUS-FRA

In [20]:
get_relevant_languages('rus', 'fra')

In [17]:
G = nx.DiGraph()
logging.info('Start')
for T, l1, l2 in load_chosen():
    add_bidix(G, T, l1, l2)
logging.info('Finish')

2018-05-20 11:26:48,971 | INFO : Start
ERROR:  epo-bul.dix
ERROR:  epo-per.dix
ERROR:  epo-pol.dix
ERROR:  fin-fra.dix
lit lav
ERROR:  pol-lav.dix
ERROR:  sah-eng.dix
2018-05-20 11:29:36,417 | INFO : Finish


In [25]:
import pickle as pkl
with open('graph.pkl','wb') as f:
    pkl.dump(G, f)

In [8]:
import pickle as pkl
with open('graph.pkl','rb') as f:
    G = pkl.load(f)

In [26]:
d_l1, d_l2 = dictionaries(G, 'rus','fra')

In [27]:
def check_lemma (G, lemma, d_l1, l2):
    lemmas = d_l1.lemma(lemma)
    print (lemmas)
    results = {str(word):{} for word in lemmas}
    for word in lemmas:
        print (word)
        for cutoff in range(1, 8):
            print (cutoff, end='\t')
            s = SetWithFilter(nx.single_source_shortest_path_length(G, word, cutoff=cutoff))
            print ('all: ', str(len(s)), end='\t')
            s = s.lang(l2)
            print ('filtered: ', str(len(s)))
            if len(s)>150:
                break

In [63]:
%time print_results(lemma_search (G, 'собака', d_l1, 'fra', 4))

{rus_собака_n-f, rus_собака_n, rus_собака_n-f-aa}
rus_собака_n-f	all:  252	filtered:  3
rus_собака_n	all:  800	filtered:  20
rus_собака_n-f-aa	all:  2499	filtered:  47

		 rus_собака_n-f
fra_chien_n-GD 0.7357588823428847
fra_chien_n 0.3861950800601765
fra_chien_n-m 0.36787944117144233

		 rus_собака_n
fra_chien_n 0.5032208686204084
fra_chien_n-m 0.503214724408055
fra_colimaçon_n-m 0.4176665095393063
fra_chien_n-GD 0.4176665095393063
fra_limaçon_n-m 0.3861950800601765
fra_singe_n-m 0.36787944117144233
fra_goal_n-m 0.36787944117144233

		 rus_собака_n-f-aa
fra_vigile_n-m 0.503214724408055
fra_surveillant_n-f 0.503214724408055
fra_surveillant_n-m 0.503214724408055
fra_chien_n-GD 0.4180019721672088
fra_chien_n 0.3703581933481431
fra_chien_n-m 0.36879132313699686
fra_haillon_n-m 0.36787944117144233
Wall time: 618 ms


In [94]:
%time print_results(lemma_search (G, 'поле', d_l1, 'fra', 3))

{rus_поле_n, rus_поле_n-nt-nn, rus_поле_n-nt-nn-pl, rus_поле_n-nt}
rus_поле_n	all:  299	filtered:  7
rus_поле_n-nt-nn	all:  831	filtered:  12
rus_поле_n-nt-nn-pl	all:  3	filtered:  0
rus_поле_n-nt	all:  148	filtered:  2

		 rus_поле_n
fra_camp_n-m 0.7357588823428847
fra_champ_n-m 0.7357588823428847
fra_marge_n-m 0.7357588823428847
fra_plantation_n-f 0.7357588823428847
fra_terrain_n-m 0.36787944117144233
fra_berge_n-f 0.1353352832366127
fra_rive_n-f 0.1353352832366127

		 rus_поле_n-nt-nn
fra_camp_n-m 0.3746173881705278
fra_champ_n-m 0.3746173881705278
fra_camp_n 0.36787944117144233
fra_terrain_n-m 0.36787944117144233
fra_patinoire_n-f 0.36787944117144233
fra_cadre_n 0.36787944117144233
fra_champ_n 0.36787944117144233

		 rus_поле_n-nt-nn-pl

		 rus_поле_n-nt
fra_champ_n-m 0.36787944117144233
fra_camp_n-m 0.36787944117144233
Wall time: 3.55 s


In [96]:
%time print_results(lemma_search (G, 'serpent', d_l2, 'rus', 4))

{fra_serpent_n-m}
fra_serpent_n-m	all:  767	filtered:  10

		 fra_serpent_n-m
rus_змей_n 0.5035501870359576
rus_змея_n 0.5035501870359576
rus_змейка_n-f-nn 0.3861950800601765
rus_змея_n-f-aa 0.37461750070570254
rus_уж_n 0.3746173881705278
rus_змей_n-m-aa 0.3703581933481087
rus_гадюка_n-f-aa 0.36787944117144233
Wall time: 6.02 s


In [95]:
for node in G.nodes():
    if len(node.s) > 6:
        print (node)

ben_prpers_prn-p3-infml-aa-mf-sg-gen
cym_rhywun_prn-tn-m-sg-tn-m-sg
kaz_сіз_prn-pers-p2-sg-frm-gen-subst-nom
eng_you're_prn-subj-p2-mf-sp-vbser-pres
sco_ye're_prn-subj-p2-mf-sp-vbser-pres
fin_sama_adj-pos-sg-ess-n-sg-ess
hbs_na_pr-acc-prn-pers-clt-p3-m-sg-acc
hin_वह_prn-dem-p3-mf-sg-dst-nom
hin_वह_prn-dem-p3-mf-pl-dst-nom
ita_Milà_np-cog-cog-cog-cog-mf-sp


### Word tags

In [95]:
d = {}
for i in d_l1:
    if i.lemma not in d:
        d[i.lemma] = set()
    d[i.lemma].add('_'.join(i.s))

In [100]:
a = [' | '.join(list(sorted(d[i]))) for i in d if len(d[i])>1]
print(len(a)/len(d_l1))
a = Counter(a)

0.17975050961623681


In [91]:
for i in sorted(a, key=a.get, reverse=True)[:20]:
    print (i, a[i])

n | n_m_nn 2841
n | n_f_nn 2549
n | n_m_aa 1178
adj_sint | n 1149
n | n_nt_nn 921
adj | adj_sint | n 586
adj | adj_sint 382
n | vblex_perf | vblex_perf_tv 353
n | vblex_perf_tv 344
n | n_f | n_f_nn 287
n | n_m | n_m_nn 265
vblex_perf | vblex_perf_tv 251
adv | n 236
n | vblex_impf 218
n | vblex_impf_tv 210
n | n_f_aa 197
n | vblex_impf | vblex_impf_tv 190
adj | n 183
vblex_impf | vblex_impf_tv 154
n_m_aa | n_m_nn 132


In [104]:
d = {}
for i in d_l2:
    if i.lemma not in d:
        d[i.lemma] = set()
    d[i.lemma].add('_'.join(i.s))

In [110]:
c = [d[i] for i in d if 'n' in d[i]]

In [115]:
len(c)/len([i.lemma for i in d_l2 if 'n' in i.s])

0.2980614543114543

In [102]:
a = [' | '.join(list(sorted(d[i]))) for i in d if len(d[i])>1]
print(len(a)/len(d_l2))
a = Counter(a)
for i in sorted(a, key=a.get, reverse=True)[:20]:
    print (i, a[i])

0.17001942380570761
n | n_f 2496
n | n_m 1651
np | np_cog_mf_sp 811
adj | n 583
adj | adj_mf 386
np | np_ant | np_ant_m_sg 330
np | np_cog 325
np | np_loc_f 311
np | np_ant | np_ant_f_sg 288
adj | n_m 257
np | np_ant 230
adj | adj_GD | adj_f | adj_m 193
adj | n | n_m 172
n_f | n_m 166
np | np_ant_m_sg 118
n | n_mf 117
np_ant_f_sg | np_cog_mf_sp 98
np | np_cog | np_cog_mf_sp 84
adj | n | n_mf 83
num | num_mf_sp 83


## Evaluation

- for every bidix
- get relevant
- try 4 max
- exclude pair (skip)
- for every word in actual bidix find best translation
- compare accuracy

In [20]:
def relevant (l1, l2):
    G = nx.Graph()
    for root, dirs, files in os.walk ('./dictionaries/'):
        for fl in files :
            pair = fl.replace('.dix', '').split('-')
            G.add_edge(pair[0], pair[1])
    pair = [l(l1), l(l2)]
    nodes = set()
    i = 2
    w = nx.single_source_shortest_path_length(G, pair[0], cutoff=i)
    v = nx.single_source_shortest_path_length(G, pair[1], cutoff=i)
    H = G.subgraph(w.keys())
    H.remove_node(pair[0])
    H2 = G.subgraph(v.keys())
    H2.remove_node(pair[1])
    if pair[1] in H.nodes():
        v = nx.node_connected_component(H, pair[1])
    else:
        v = set()
    if pair[0] in H2.nodes():
        w = nx.node_connected_component(H, pair[1])
    else:
        w = set() 
    nodes = v & w | set([pair[0], pair[1]])
    if len(nodes) > 20 and len(nodes) < 40:
        return nodes

In [17]:
for root, dirs, files in os.walk ('./dictionaries/'):
    for fl in files :
        pair = fl.replace('.dix', '').split('-')
        one_comparison(pair[0], pair[1])

arg cat 35
bel rus 35
chv rus 25
chv tat 25
chv tur 25
cos ita 36
eus fin 30
eus sme 30
grn spa 32
guc spa 32
kir uzb 21
kpv fin 27
krl olo 26
liv fin 26
mrj fin 26
myv fin 26
oci cat 36
oci fra 36
oci spa 36
olo fin 26
quz spa 32
rum ita 35
scn spa 32
udm kpv 36
udm rus 36
wel spa 32
zho spa 32


In [None]:
def target_dictionaries(l1, l2):
    parse_bidix (tree, l1, l2)

In [None]:
def word compare(G, d1, d2, d1_t, d2_t):
    for word in d1:
        if word in G.nodes():
            

In [19]:
def one_comparison(l1_m, l2_m):
    logging.info('Start\t'+'{}\t{}'.format(l1, l2))
    nodes = relevant (l1_m, l2_m)
    G = nx.DiGraph()
    logging.info('Start loading')
    for T, l1, l2 in load_chosen():
        if (l1_m != l1 and l2_m != 1l) and (l2_m != l1 and l1_m != l2)
            add_bidix(G, T, l1, l2)
    logging.info('Finish loading')
    d1_t, d2_t = 
    