# Bilingual dictionary enrichment via graph completion

Current

In [1]:
import logging
import sys
import os

logging.basicConfig(format='%(asctime)s | %(levelname)s : %(message)s',
                     level=logging.INFO, stream=sys.stdout)
import json
import re
from collections import Counter
from math import exp

In [2]:
import networkx as nx
import xml.etree.ElementTree as ET

In [3]:
from collections import Counter, defaultdict
from math import exp

In [4]:
import warnings
warnings.filterwarnings("ignore")

In [50]:
import numpy as np
from numpy import vectorize

In [37]:
import pandas as pd

In [4]:
import requests

In [5]:
import matplotlib.pyplot as plt

## Language codes

In [51]:
from numpy import nan
import pandas as pd
lang_codes = pd.read_csv('./files/language-codes-full_csv.csv', na_values = nan, sep='\t', header=0)
lang_codes = lang_codes[['3','2']]
lang_codes = lang_codes.dropna()


lang_codes = [{i[0]:i[1] for i in np.array(lang_codes)}, {i[1]:i[0] for i in np.array(lang_codes)}]

with open ('./files/lang_codes.json', 'w') as f:
    json.dump(lang_codes, f)

In [5]:
with open ('./files/lang_codes.json', 'r') as f:
    lang_codes = json.load(f)

def l(lang, mode=3):
    mode = mode % 2
    if len(lang)==2:
        if lang in lang_codes[mode]:
            return lang_codes[mode][lang]
        else:
            return lang
    else:
        return lang
l('fr', 3)

'fra'

## Loading dictionaries

### PyGithub

** Load user with login and password from secret file **

In [6]:
from github import Github

with open ('secure.json') as f:
    SECRET = json.loads(f.read())

github = Github(SECRET['USER'], SECRET['PASSWORD'])

user = github.get_user('apertium')

In [None]:
user.get_repos()

** Generator ** : yield all repos that match name pattern

In [7]:
def repo_names(user):
    for repo in user.get_repos():
        if re.match('apertium-[a-z]{2,3}(_[a-zA-Z]{2,3})?-[a-z]{2,3}(_[a-zA-Z]{2,3})?', repo.name):
            yield repo.name

Looks like heavy function. But I don't see any improvements yet, except for having certain repo for all bidix copies. But this one above is the most up-to-date. It filters not languages pair repos, it is needed not to look for bidix where it can't be. Function saves a lot of time.

In [18]:
%time w = list(repo_names(user))

Wall time: 26.8 s


** Find bidix **

Length sorting to reduce number of files to check (bidix is lone of the longest)

In [8]:
def bidix_url(repo):
    for i in sorted(repo.get_dir_contents('/'), key = lambda x: (len(x.path), 1000-ord(('   '+x.path)[-3])), reverse=True):
        if re.match('apertium-.*?\.[a-z]{2,3}(_[a-zA-Z]{2,3})?-[a-z]{2,3}(_[a-zA-Z]{2,3})?.dix$', i.path):
            return i.download_url
        elif len(i.path) < 23:
            return None

In [27]:
%time bidix_url(github.get_repo(user.name+'/'+w[22]))

Wall time: 709 ms


'https://raw.githubusercontent.com/apertium/apertium-cat-srd/master/apertium-cat-srd.cat-srd.dix'

** Only relevant for certain language pair **

There are **164 ** pairs at this moment

In [11]:
def download_all_bidixes(user):
    logging.info('Start')
    if not os.path.exists('./dictionaries/'):
        os.makedirs('./dictionaries/')
    for repo_name in repo_names(user):
        bidix = bidix_url(github.get_repo(user.name+'/'+repo_name))
        langs = [l(i) for i in repo_name.split('-')[1:]]
        filename = './dictionaries/'+'-'.join(langs)+'.dix'
        if bidix:
            response = requests.get(bidix)
            response.encoding = 'UTF-8'
            with open(filename, 'w', encoding='UTF-8') as f:
                f.write(response.text)
    logging.info('Finish')

In [12]:
download_all_bidixes(user)

2018-05-22 15:26:56,733 | INFO : Start
2018-05-22 15:44:22,590 | INFO : Finish


In [13]:
def get_relevant_languages(l1, l2):
    G = nx.Graph()
    for root, dirs, files in os.walk ('./dictionaries/'):
        for fl in files :
            pair = fl.replace('.dix', '').split('-')
            G.add_edge(pair[0], pair[1])
    pair = [l(l1), l(l2)]
    with open('language_list.csv','w', encoding='utf-8') as f:
        nodes = set()
        for i in range(1,5):
            w = nx.single_source_shortest_path_length(G, pair[0], cutoff=i)
            v = nx.single_source_shortest_path_length(G, pair[1], cutoff=i)
            H = G.subgraph(w.keys())
            H.remove_node(pair[0])
            H2 = G.subgraph(v.keys())
            H2.remove_node(pair[1])
            if pair[1] in H.nodes():
                v = nx.node_connected_component(H, pair[1])
            else:
                v = set()
            if pair[0] in H2.nodes():
                w = nx.node_connected_component(H, pair[1])
            else:
                w = set() 
            nodes2 = v & w | set([pair[0], pair[1]])
            nodes2 = nodes2 - nodes
            for node in nodes2:
                f.write('{}\t{}\n'.format(i*2, node))
            nodes = nodes | nodes2

In [16]:
get_relevant_languages('bel', 'rus')

In [105]:
def check_languages():
    for root, dirs, files in os.walk ('./dictionaries/'):
        for fl in files:
            #print (root+fl)
            try:
                s = ET.parse(root+fl)
            except:
                print ('ERROR :'+fl)

In [106]:
check_languages()

ERROR :epo-bul.dix
ERROR :epo-per.dix
ERROR :epo-pol.dix
ERROR :fin-fra.dix
ERROR :pol-lav.dix
ERROR :sah-eng.dix


In [6]:
def existance(pair, nodes):
    if pair[0] in nodes and pair[1] in nodes:
        return True
    else:
        return False

def load_chosen():
    with open ('language_list.csv','r',encoding='utf-8') as f:
        languages = set([i.split('\t')[1].strip() for i in f.readlines()])
    for root, dirs, files in os.walk ('./dictionaries/'):
        for fl in files:
            pair = fl.replace('.dix','').split('-')
            if existance(pair, languages):
                try:
                    with open (root+fl, 'r', encoding='utf-8') as d:
                        dictionary = d.read().replace('<b/>',' ').replace('<.?g>','')
                        yield ET.fromstring(dictionary), pair[0], pair[1]
                except:
                    print ('ERROR: ', fl)

In [7]:
%time len(list(load_chosen()))

ERROR:  epo-bul.dix
ERROR:  epo-per.dix
ERROR:  epo-pol.dix
ERROR:  fin-fra.dix
ERROR:  pol-lav.dix
ERROR:  sah-eng.dix
Wall time: 3min 47s


269

## Object classes

** Word **

- lemma : lemma
- lang : language
- pos : part of speech

In [7]:
class Word:
    def __init__(self, lemma, lang, s=[]):
        self.lemma = lemma
        self.lang = lang
        self.s = s
    
    def __str__(self):
        return str(self.lang)+'_'+str(self.lemma)+'_'+str('-'.join(self.s))
    
    __repr__ = __str__
    
    def __eq__(self, other):
        return self.lemma == other.lemma and self.lang == other.lang and self.s == other.s
    
    def __hash__(self):
        return hash(str(self))

## Parsing

### Bidix parsing

In [11]:
%time T = tree('https://raw.githubusercontent.com/apertium/apertium-eng-ita/master/apertium-eng-ita.eng-ita.dix')

Wall time: 921 ms


In [8]:
def one_word(word, lang):
    s = word.findall('.//s')
    s = [i.attrib['n'] for i in s]
    return Word(word.text, lang, s)

In [9]:
def parse_bidix (tree, l1, l2):
    tree = tree.find('section')
    if not tree:
        print (l1, l2)
    else:
        for e in tree:
            if 'n' in e.attrib:
                side = e.attrib['n']
            else:
                side = None
            p = e.find('p')
            if p:
                yield one_word(p.find('l'), l1), one_word(p.find('r'), l2), side
            else:
                try:
                    i = e.find('i')
                    yield one_word(i, l1), one_word(i, l2), side
                except:
                    pass

In [147]:
% time len(list(parse_bidix (T, 'bel','rus')))

Wall time: 882 ms


48880

In [10]:
def add_bidix(G, T, l1, l2):
    for word1, word2, side in parse_bidix (T, l1, l2):
        if side == None:
            G.add_edge(word1, word2)
            G.add_edge(word2, word1)
        elif side == 'LR':
            G.add_edge(word1, word2)
        elif side == 'RL':
            G.add_edgr(word2, word1)
        else:
            print (side)

In [11]:
class SetWithFilter(set):
    def lemma(self, value):
        return set(i for i in self if i.lemma == value)
    #def pos(self, value):
    #    return set(i for i in self if i.pos == value)
    def lang(self, value):
        return set(i for i in self if i.lang == value)
    #def notlang(self, value):
    #    return set(i for i in self if i.lang != value)

In [12]:
def dictionaries(G, l1,l2):
    l1 = l(l1)
    l2 = l(l2)
    d_l1 = SetWithFilter()
    d_l2 = SetWithFilter()
    for i in G.nodes():
        if i.lang == l1:
            d_l1.add(i)
        elif i.lang == l2:
            d_l2.add(i)
    return d_l1, d_l2

In [13]:
def lemma_search (G, lemma, d_l1, l2, cutoff):
    lemmas = d_l1.lemma(lemma)
    print (G.degree(lemmas))
    print (lemmas)
    results = {str(word):{} for word in lemmas}
    for word in lemmas:
        print(word, end='\t')
        s = SetWithFilter(nx.single_source_shortest_path_length(G, word, cutoff=cutoff))
        print ('all: ', str(len(s)), end='\t')
        s = s.lang(l2)
        print ('filtered: ', str(len(s)))
        for translation in s:
            t = list(nx.all_simple_paths(G, word, translation, cutoff=cutoff))
            t = [len(i) for i in t]
            t = Counter(t)
            coef = 0
            for i in t:
                coef += exp(-t[i])
            results[str(word)][str(translation)] = coef
    return results

In [14]:
def print_results(results):
    for i in results:
        print ('\n\t\t', i)
        for j in sorted(results[i], key=results[i].get, reverse=True)[:7]:
            print (j, results[i][j])

### RUS-FRA

In [29]:
get_relevant_languages('rus', 'fra')

In [16]:
G = nx.DiGraph()
logging.info('Start')
for T, l1, l2 in load_chosen():
    add_bidix(G, T, l1, l2)
logging.info('Finish')

2018-05-22 21:46:47,006 | INFO : Start
ERROR:  epo-bul.dix
ERROR:  epo-fas.dix
ERROR:  epo-pol.dix
ERROR:  fin-fra.dix
lit lav
ERROR:  pol-lav.dix
ERROR:  sah-eng.dix
2018-05-22 21:52:49,343 | INFO : Finish


In [None]:
import pickle as pkl
with open('graph.pkl','wb') as f:
    pkl.dump(G, f)

In [None]:
import pickle as pkl
with open('graph.pkl','rb') as f:
    G = pkl.load(f)

In [18]:
d_l1, d_l2 = dictionaries(G, 'rus','fra')

In [37]:
def check_lemma (G, lemma, d_l1, l2):
    lemmas = d_l1.lemma(lemma)
    print (lemmas)
    results = {str(word):{} for word in lemmas}
    for word in lemmas:
        print (word)
        for cutoff in range(1, 8):
            print (cutoff, end='\t')
            s = SetWithFilter(nx.single_source_shortest_path_length(G, word, cutoff=cutoff))
            print ('all: ', str(len(s)), end='\t')
            s = s.lang(l2)
            print ('filtered: ', str(len(s)))
            if len(s)>150:
                break

In [19]:
%time print_results(lemma_search (G, 'собака', d_l1, 'fra', 4))

{rus_собака_n-f-aa: 22, rus_собака_n-f: 2, rus_собака_n: 6}
{rus_собака_n-f-aa, rus_собака_n-f, rus_собака_n}
rus_собака_n-f-aa	all:  2883	filtered:  37
rus_собака_n-f	all:  239	filtered:  3
rus_собака_n	all:  571	filtered:  15

		 rus_собака_n-f-aa
fra_chien_n-GD 0.4180019721672088
fra_chien_n 0.36821490379934485
fra_chien_n-m 0.3679248411012048
fra_docteur_n 0.36787944117144233
fra_docteur_n-GD 0.36787944117144233
fra_sport_n-m 0.36787944117144233
fra_compagnon_n 0.36787944117144233

		 rus_собака_n-f
fra_chien_n-GD 0.503214724408055
fra_chien_n 0.36879132313699686
fra_chien_n-m 0.36787944117144233

		 rus_собака_n
fra_chien_n-m 0.503214724408055
fra_chien_n 0.41767265375165963
fra_colimaçon_n-m 0.4176665095393063
fra_limaçon_n-m 0.4176665095393063
fra_chien_n-GD 0.4176665095393063
fra_arobace_n-f 0.36787944117144233
fra_but_n-m 0.36787944117144233
Wall time: 1.45 s


In [39]:
%time print_results(lemma_search (G, 'поле', d_l1, 'fra', 3))

{rus_поле_n: 6, rus_поле_n-nt: 2, rus_поле_n-nt-nn-pl: 2, rus_поле_n-nt-nn: 26}
{rus_поле_n, rus_поле_n-nt, rus_поле_n-nt-nn-pl, rus_поле_n-nt-nn}
rus_поле_n	all:  278	filtered:  7
rus_поле_n-nt	all:  144	filtered:  2
rus_поле_n-nt-nn-pl	all:  3	filtered:  0
rus_поле_n-nt-nn	all:  1504	filtered:  11

		 rus_поле_n
fra_camp_n-m 0.7357588823428847
fra_champ_n-m 0.7357588823428847
fra_marge_n-m 0.7357588823428847
fra_plantation_n-f 0.7357588823428847
fra_marge_n-f 0.36787944117144233
fra_rive_n-f 0.049787068367863944
fra_berge_n-f 0.049787068367863944

		 rus_поле_n-nt
fra_camp_n-m 0.36787944117144233
fra_champ_n-m 0.36787944117144233

		 rus_поле_n-nt-nn-pl

		 rus_поле_n-nt-nn
fra_camp_n-m 0.3746173881705278
fra_champ_n-m 0.3746173881705278
fra_cadre_n 0.36787944117144233
fra_camp_n 0.36787944117144233
fra_patinoire_n-f 0.36787944117144233
fra_champ_n 0.36787944117144233
fra_domaine_n-m 0.36787944117144233
Wall time: 689 ms


In [40]:
%time print_results(lemma_search (G, 'serpent', d_l2, 'rus', 4))

{fra_serpent_n-m: 18, fra_serpent_n: 4, fra_serpent_n-m-ND: 2}
{fra_serpent_n-m, fra_serpent_n, fra_serpent_n-m-ND}
fra_serpent_n-m	all:  725	filtered:  14
fra_serpent_n	all:  3	filtered:  0
fra_serpent_n-m-ND	all:  3	filtered:  0

		 fra_serpent_n-m
rus_змея_n 0.41768321124009655
rus_змей_n 0.41768321124009655
rus_змейка_n-f-nn 0.3861950800601765
rus_змея_n-f-aa 0.3746173889287838
rus_уж_n 0.3746173881705278
rus_змей_n-m-aa 0.36879132313699686
rus_гадюка_n-f-aa 0.36787944117144233

		 fra_serpent_n

		 fra_serpent_n-m-ND
Wall time: 623 ms


In [41]:
%time print_results(lemma_search (G, 'enfant', d_l2, 'rus', 4))

{fra_enfant_n-m: 10, fra_enfant_n: 8, fra_enfant_n-n: 2, fra_enfant_n-mf-pl: 8, fra_enfant_n-m-ND: 2, fra_enfant_n-mf: 30, fra_enfant_n-mf-ND: 4, fra_enfant_n-mf-sg: 2}
{fra_enfant_n-m, fra_enfant_n, fra_enfant_n-n, fra_enfant_n-mf-pl, fra_enfant_n-m-ND, fra_enfant_n-mf, fra_enfant_n-mf-ND, fra_enfant_n-mf-sg}
fra_enfant_n-m	all:  852	filtered:  19
fra_enfant_n	all:  782	filtered:  38
fra_enfant_n-n	all:  833	filtered:  17
fra_enfant_n-mf-pl	all:  45	filtered:  0
fra_enfant_n-m-ND	all:  32	filtered:  0
fra_enfant_n-mf	all:  6576	filtered:  88
fra_enfant_n-mf-ND	all:  586	filtered:  8
fra_enfant_n-mf-sg	all:  8	filtered:  0

		 fra_enfant_n-m
rus_мальчик_n-m-aa 0.3746173881705278
rus_девочка_n-f-aa 0.3746173881705278
rus_мадемуазель_n 0.36787944117144233
rus_барышня_n 0.36787944117144233
rus_паренек_n-m-aa 0.36787944117144233
rus_сын_n-m-aa 0.36787944117144233
rus_ребёнок_n 0.36787944117144233

		 fra_enfant_n
rus_мальчик_n 0.7357588823428847
rus_ребенок_n-m-aa 0.503214724408055
rus_мла

In [42]:
for i in G.nodes():
    if i.lemma == 'enfant':
        print (i)

fra_enfant_n-mf
fra_enfant_n-m
fra_enfant_n
fra_enfant_n-mf-ND
fra_enfant_n-m-ND
fra_enfant_n-mf-pl
fra_enfant_n-mf-sg
fra_enfant_n-n


### Word tags

In [95]:
for node in G.nodes():
    if len(node.s) > 6:
        print (node)

ben_prpers_prn-p3-infml-aa-mf-sg-gen
cym_rhywun_prn-tn-m-sg-tn-m-sg
kaz_сіз_prn-pers-p2-sg-frm-gen-subst-nom
eng_you're_prn-subj-p2-mf-sp-vbser-pres
sco_ye're_prn-subj-p2-mf-sp-vbser-pres
fin_sama_adj-pos-sg-ess-n-sg-ess
hbs_na_pr-acc-prn-pers-clt-p3-m-sg-acc
hin_वह_prn-dem-p3-mf-sg-dst-nom
hin_वह_prn-dem-p3-mf-pl-dst-nom
ita_Milà_np-cog-cog-cog-cog-mf-sp


In [95]:
d = {}
for i in d_l1:
    if i.lemma not in d:
        d[i.lemma] = set()
    d[i.lemma].add('_'.join(i.s))

In [100]:
a = [' | '.join(list(sorted(d[i]))) for i in d if len(d[i])>1]
print(len(a)/len(d_l1))
a = Counter(a)

0.17975050961623681


In [91]:
for i in sorted(a, key=a.get, reverse=True)[:20]:
    print (i, a[i])

n | n_m_nn 2841
n | n_f_nn 2549
n | n_m_aa 1178
adj_sint | n 1149
n | n_nt_nn 921
adj | adj_sint | n 586
adj | adj_sint 382
n | vblex_perf | vblex_perf_tv 353
n | vblex_perf_tv 344
n | n_f | n_f_nn 287
n | n_m | n_m_nn 265
vblex_perf | vblex_perf_tv 251
adv | n 236
n | vblex_impf 218
n | vblex_impf_tv 210
n | n_f_aa 197
n | vblex_impf | vblex_impf_tv 190
adj | n 183
vblex_impf | vblex_impf_tv 154
n_m_aa | n_m_nn 132


In [104]:
d = {}
for i in d_l2:
    if i.lemma not in d:
        d[i.lemma] = set()
    d[i.lemma].add('_'.join(i.s))

In [110]:
c = [d[i] for i in d if 'n' in d[i]]

In [115]:
len(c)/len([i.lemma for i in d_l2 if 'n' in i.s])

0.2980614543114543

In [102]:
a = [' | '.join(list(sorted(d[i]))) for i in d if len(d[i])>1]
print(len(a)/len(d_l2))
a = Counter(a)
for i in sorted(a, key=a.get, reverse=True)[:20]:
    print (i, a[i])

0.17001942380570761
n | n_f 2496
n | n_m 1651
np | np_cog_mf_sp 811
adj | n 583
adj | adj_mf 386
np | np_ant | np_ant_m_sg 330
np | np_cog 325
np | np_loc_f 311
np | np_ant | np_ant_f_sg 288
adj | n_m 257
np | np_ant 230
adj | adj_GD | adj_f | adj_m 193
adj | n | n_m 172
n_f | n_m 166
np | np_ant_m_sg 118
n | n_mf 117
np_ant_f_sg | np_cog_mf_sp 98
np | np_cog | np_cog_mf_sp 84
adj | n | n_mf 83
num | num_mf_sp 83


## Evaluation

- for every bidix
- get relevant
- try 4 max
- exclude pair (skip)
- for every word in actual bidix find best translation
- compare accuracy

In [17]:
for root, dirs, files in os.walk ('./dictionaries/'):
    for fl in files :
        pair = fl.replace('.dix', '').split('-')
        one_comparison(pair[0], pair[1])

arg cat 35
bel rus 35
chv rus 25
chv tat 25
chv tur 25
cos ita 36
eus fin 30
eus sme 30
grn spa 32
guc spa 32
kir uzb 21
kpv fin 27
krl olo 26
liv fin 26
mrj fin 26
myv fin 26
oci cat 36
oci fra 36
oci spa 36
olo fin 26
quz spa 32
rum ita 35
scn spa 32
udm kpv 36
udm rus 36
wel spa 32
zho spa 32


In [15]:
def relevant (l1, l2):
    G = nx.Graph()
    for root, dirs, files in os.walk ('./dictionaries/'):
        for fl in files :
            pair = fl.replace('.dix', '').split('-')
            G.add_edge(pair[0], pair[1])
    pair = [l(l1), l(l2)]
    nodes = set()
    i = 2
    w = nx.single_source_shortest_path_length(G, pair[0], cutoff=i)
    v = nx.single_source_shortest_path_length(G, pair[1], cutoff=i)
    H = G.subgraph(w.keys())
    H.remove_node(pair[0])
    H2 = G.subgraph(v.keys())
    H2.remove_node(pair[1])
    if pair[1] in H.nodes():
        v = nx.node_connected_component(H, pair[1])
    else:
        v = set()
    if pair[0] in H2.nodes():
        w = nx.node_connected_component(H, pair[1])
    else:
        w = set() 
    nodes = v & w | set([pair[0], pair[1]])
    #if len(nodes) > 20 and len(nodes) < 40:
    return nodes

In [None]:
def word compare(G, d1, d2, d1_t, d2_t):
    for word in d1:
        if word in G.nodes():
            

In [17]:
def one_comparison(l1_m, l2_m):
    logging.info('Start\t'+'{}\t{}'.format(l1_m, l2_m))
    nodes = relevant (l1_m, l2_m)
    #print (nodes)
    with open('language_list.csv','w', encoding='utf-8') as f:
        for node in nodes:
            f.write('{}\t{}\n'.format(4, node))
    G = nx.DiGraph()
    logging.info('Start loading')
    for T, l1, l2 in load_chosen():
        if not (l1 in [l1_m, l2_m] and l2 in [l1_m, l2_m]):
            add_bidix(G, T, l1, l2)
    d1_t, d2_t = dictionaries(G, l1_m, l2_m)
    logging.info('Finish loading')
    return G, d1_t, d2_t

In [None]:
del G

In [18]:
G, d1_t, d2_t = one_comparison('eng', 'rus')

2018-05-23 20:23:30,991 | INFO : Start	eng	rus
2018-05-23 20:23:31,003 | INFO : Start loading
ERROR:  epo-bul.dix
ERROR:  epo-fas.dix
ERROR:  epo-pol.dix
ERROR:  fin-fra.dix
lit lav
ERROR:  pol-lav.dix
ERROR:  sah-eng.dix
2018-05-23 20:27:33,874 | INFO : Finish loading


In [19]:
len(d1_t)

258131

In [21]:
len(d2_t)

92083

In [25]:
def exact_search (G, word, lang, cutoff):
    results = {}
    if word in G:
        s = SetWithFilter(nx.single_source_shortest_path_length(G, word, cutoff=cutoff))
        #print ('all: ', str(len(s)), end='\t')
        s = s.lang(lang)
        #print ('filtered: ', str(len(s)))
        for translation in s:
            t = list(nx.all_simple_paths(G, word, translation, cutoff=cutoff))
            t = [len(i) for i in t]
            t = Counter(t)
            coef = 0
            for i in t:
                coef += exp(-t[i])
            results[str(translation)] = coef
        if results:
            for j in sorted(results, key=results.get, reverse=True)[:7]:
                #return [word, j, results[j]]
                return [str(word), str(j), str(results[j])]
        else:
            None

In [23]:
exact_search (G, Word('школа','bel',['n']), 'rus', 4)

['bel_школа_n', 'rus_школа_n', '0.503214724408055']

In [26]:
with open ('test.csv', 'w', encoding='utf-8') as f:
    for i in d1_t:
        result = exact_search (G, i, 'rus', 4)
        if result:
            f.write('\t'.join(result)+'\n')

KeyboardInterrupt: 

## New

In [11]:
ET.parse('./dictionaries/arg-cat.dix')

<xml.etree.ElementTree.ElementTree at 0x1f6a9abe390>

In [11]:
def load_language(lang):
    dictionary = defaultdict(lambda: defaultdict(lambda: 0))
    for root, dirs, files in os.walk ('./dictionaries/'):
        for fl in files :
            pair = fl.replace('.dix','').split('-')
            if lang in pair:
                print (fl)
                try:
                    t = ET.parse(root+fl)
                    for word1, word2, side in parse_bidix (t, pair[0], pair[1]):
                        #word1 = str(word1)
                        #word2 = str(word2)
                        if lang == pair[0]:
                            dictionary[word1.lemma]['-'.join(word1.s)] += 1
                        else:
                            dictionary[word2.lemma]['-'.join(word2.s)] += 1
                except:
                    pass
    return dictionary

In [76]:
Counter(['word'])

Counter({'word': 1})

In [20]:
%time dictionary = load_language('rus')

ava-rus.dix
bel-rus.dix
bul-rus.dix
ces-rus.dix
chv-rus.dix
epo-rus.dix
hbs-rus.dix
isl-rus.dix
kaz-rus.dix
kom-rus.dix
pol-rus.dix
rus-eng.dix
rus-ukr.dix
tat-rus.dix
udm-rus.dix
Wall time: 5.29 s


In [21]:
len(dictionary)

69366

In [22]:
k = 0
for i in dictionary:
    k += len(dictionary[i])
k

91724

In [23]:
dictionary['собака']

defaultdict(<function __main__.load_language.<locals>.<lambda>.<locals>.<lambda>()>,
            {'n-f-aa': 11, 'n-f': 1, 'n': 3})

In [24]:
dictionary['кошка']

defaultdict(<function __main__.load_language.<locals>.<lambda>.<locals>.<lambda>()>,
            {'n-f-aa': 11})

In [25]:
dictionary['дверь']

defaultdict(<function __main__.load_language.<locals>.<lambda>.<locals>.<lambda>()>,
            {'n-f': 1, 'n': 1, 'n-f-nn': 5})

In [28]:
dictionary['идти']

defaultdict(<function __main__.load_language.<locals>.<lambda>.<locals>.<lambda>()>,
            {'vblex-impf': 21, 'vblex-impf-iv': 9, 'n': 2, 'vblex-imperf': 4})

In [None]:
def get (w):
    for i in 

In [18]:
w = set()
for i in dictionary:
    w = w | set(dictionary[i].keys())

In [16]:
v = set()
for i in w:
    v = v | set(i.split('-'))

** SPA **

In [130]:
dictionary['capital']

defaultdict(<function __main__.load_language.<locals>.<lambda>.<locals>.<lambda>()>,
            {'n-m': 17,
             'n-f': 18,
             'n-m-sg': 1,
             'n-m-pl': 1,
             'adj-mf': 7,
             'n': 9,
             'adj': 9,
             'adj-mf-sg': 4,
             'adj-mf-pl': 2,
             'n-f-sg': 5})

In [111]:
dictionary['man']

defaultdict(<function __main__.load_language.<locals>.<lambda>.<locals>.<lambda>()>,
            {'n': 145, 'vblex': 7, 'n-ND': 3, 'np': 1, 'n-sg': 1, '': 1})

In [117]:
dictionary['pen']

defaultdict(<function __main__.load_language.<locals>.<lambda>.<locals>.<lambda>()>,
            {'n': 81, 'vblex': 12, 'n-sg': 1})

In [126]:
word = 'gun'
beginning = ''
for key in sorted(dictionary[word]):
    #if re.match('^'+beginning, key):
    print(key, dictionary[word][key])

n 78
n-ND 1
n-sg 2
vblex 5


** FRA **

In [98]:
dictionary['porte']

defaultdict(<function __main__.load_language.<locals>.<lambda>.<locals>.<lambda>()>,
            {'n-f': 12, 'n-f-ND': 1, 'n': 6})

In [99]:
dictionary['enfant']

defaultdict(<function __main__.load_language.<locals>.<lambda>.<locals>.<lambda>()>,
            {'n-mf': 23,
             'n-m': 8,
             'np-ant': 1,
             'adj': 1,
             'n': 7,
             'n-mf-ND': 2,
             'n-m-ND': 1,
             'n-mf-pl': 6,
             'n-mf-sg': 1,
             'n-n': 1})

In [101]:
dictionary['coup']

defaultdict(<function __main__.load_language.<locals>.<lambda>.<locals>.<lambda>()>,
            {'n-m': 60, 'n-m-ND': 3, 'n': 20, 'm': 1})

In [102]:
dictionary['jour']

defaultdict(<function __main__.load_language.<locals>.<lambda>.<locals>.<lambda>()>,
            {'n-m': 20, 'm-sg': 1, 'n': 17, 'n-m-ND': 2})

In [None]:
for i in dictionary:
    if len(dictionary[i]) > 5:
        print (i, ' | '.join(dictionary[i].keys()))