In [1]:
import logging
import sys
import os

logging.basicConfig(format='%(asctime)s | %(levelname)s : %(message)s',
                     level=logging.INFO, stream=sys.stdout)
import json
import re
from collections import Counter
from math import exp

In [2]:
import networkx as nx
import xml.etree.ElementTree as ET

In [3]:
from collections import Counter, defaultdict
from math import exp

In [4]:
def enc(word):
    """Round-trip ``word`` through UTF-8 and return the decoded string.

    For text that is representable in UTF-8 the result equals the input;
    otherwise the encoding step raises ``UnicodeEncodeError``.
    """
    return word.encode('utf-8').decode('utf-8')

In [5]:
class Word:
    """A dictionary entry: a lemma in one language plus its tags.

    Attributes:
        lemma: lemma text ('' when constructed with ``None``).
        lang:  language code of the entry.
        s:     tags; either a flat list of tag strings or a list of tag
               groups (each group itself a list of tag strings).

    NOTE: ``__eq__`` is fuzzy (it also accepts one tag container being an
    element of the other) while ``__hash__`` is derived from the exact string
    form, so two words that compare equal can still hash differently.
    """

    def __init__(self, lemma, lang, s=None):
        self.lemma = '' if lemma is None else enc(lemma)
        self.lang = lang
        # A fresh list per instance: a shared mutable default would leak tags
        # between every Word created without an explicit ``s``.
        self.s = [] if s is None else s

    def __str__(self):
        if self.s:
            if isinstance(self.s[0], list):
                # list of tag groups: tags joined by '-', groups joined by '_'
                w = '[' + '_'.join(['-'.join(i) for i in self.s]) + ']'
            else:
                w = '[' + '-'.join(self.s) + ']'
        else:
            w = '-'
        return str(self.lang) + '$' + str(self.lemma) + '$' + str(w)

    __repr__ = __str__

    def __eq__(self, other):
        # Duck-typed on purpose; anything without the three attributes is
        # simply "not comparable" instead of raising AttributeError.
        try:
            other_lemma, other_lang, other_s = other.lemma, other.lang, other.s
        except AttributeError:
            return NotImplemented
        if self.lemma != other_lemma or self.lang != other_lang:
            return False
        return self.s == other_s or other_s in self.s or self.s in other_s

    def __lt__(self, other):
        """True when same lang and lemma and ``self.s`` is a proper subset of ``other.s``."""
        if self.lang != other.lang or self.lemma != other.lemma:
            # Previously fell through and returned None for differing lemmas.
            return False
        return set(self.s) < set(other.s)

    def __hash__(self):
        return hash(str(self))

    def write(self, mode='mono'):
        """Serialise the word as a tab-separated line fragment.

        mode='mono': ``lemma<TAB>tags``; mode='bi': ``lang<TAB>lemma<TAB>tags``.
        Tag items are joined with '$'. Any other mode returns ``None``.
        """
        tags = '$'.join([str(i) for i in self.s])
        if mode == 'mono':
            return self.lemma + '\t' + tags
        elif mode == 'bi':
            return self.lang + '\t' + self.lemma + '\t' + tags
        return None

### Languages

In [8]:
def all_languages():
    """Write all language codes found under ./dictionaries/ to the file 'languages'.

    Each file name is read as ``<lang1>-<lang2>.dix`` and added as an edge of
    an undirected graph. The codes are written tab-separated, ordered by
    descending degree (number of distinct partner languages).
    """
    G = nx.Graph()
    for root, dirs, files in os.walk('./dictionaries/'):
        for fl in files:
            pair = fl.replace('.dix', '').split('-')
            if len(pair) < 2:
                # Not a "<l1>-<l2>" file name: no edge can be formed from it.
                continue
            G.add_edge(pair[0], pair[1])
    # dict() accepts both the plain dict returned by networkx 1.x and the
    # DegreeView (iterable of (node, degree) pairs) returned by networkx 2+,
    # which has no .get method.
    degrees = dict(G.degree())
    ordered = sorted(degrees, key=degrees.get, reverse=True)
    with open('languages', 'w', encoding='utf-8') as f:
        f.write('\t'.join(ordered))

In [6]:
%time all_languages()

Wall time: 5 ms


### Monodix

In [6]:
class Tags(list):
    """A list of tags that compares (and hashes) as an unordered set.

    ``<`` / ``<=`` / ``>`` / ``>=`` mean proper-subset / subset /
    proper-superset / superset of the tag sets; ``==`` means equal tag sets.
    The string form is the tags joined with '-' in list order.
    """

    def __le__(self, other):
        return set(self) <= set(other)

    def __lt__(self, other):
        return set(self) < set(other)

    def __ge__(self, other):
        return set(self) >= set(other)

    def __gt__(self, other):
        return set(self) > set(other)

    def __eq__(self, other):
        return set(self) == set(other)

    def __ne__(self, other):
        # list.__ne__ would otherwise be inherited and compare element order,
        # contradicting __eq__. Non-list operands keep the inherited outcome
        # (NotImplemented -> Python's default fallback).
        if not isinstance(other, list):
            return NotImplemented
        return not self == other

    def __str__(self):
        return '-'.join(self)

    __repr__ = __str__

    def __hash__(self):
        # Must agree with the order-insensitive __eq__, otherwise equal tag
        # sets end up under different dict keys / set members.
        return hash(frozenset(self))

In [7]:
class WordDict(dict):
    """Dict subclass that can carry a ``lemma`` label.

    Calling ``lemma(value)`` stores ``value`` as the instance attribute
    ``lemma``; from then on that attribute shadows this method on the
    instance, so ``obj.lemma`` yields the stored value.
    """

    def lemma(self, lemma):
        setattr(self, 'lemma', lemma)
        
class FilteredDict(dict):
    """Per-language store mapping ``'<lang>_<lemma>'`` to a ``WordDict`` of tag counts."""

    def set_lang(self, lang):
        """Remember the language code used by ``lemma()`` to build keys."""
        self.lang = lang

    def lemma(self, lemma):
        """Return the entry stored under ``'<self.lang>_<lemma>'``."""
        return self[self.lang + '_' + lemma]

    def add(self, word):
        """Count one occurrence of ``word``'s tags under its ``'<lang>_<lemma>'`` key."""
        key = word.lang + '_' + word.lemma
        tags = Tags(word.s)
        if key not in self:
            entry = WordDict()
            entry.lemma(key)
            self[key] = entry
        counts = self[key]
        counts[tags] = counts.get(tags, 0) + 1

In [8]:
def one_language_dict(lang):
    """Collect every word of ``lang`` from all dictionaries that involve it.

    Walks ./dictionaries/, and for each ``<l1>-<l2>.dix`` file whose name
    contains ``lang`` parses the matching side ('l' if ``lang`` is the first
    code, otherwise 'r') and adds each word to a ``FilteredDict``.

    A file that cannot be read or parsed is logged and skipped; words added
    before the failure are kept.
    """
    dictionary = FilteredDict()
    dictionary.set_lang(lang)
    for root, dirs, files in os.walk('./dictionaries/'):
        for fl in files:
            pair = fl.replace('.dix', '').split('-')
            if lang not in pair:
                continue
            side = 'l' if lang == pair[0] else 'r'
            # os.walk roots of sub-directories carry no trailing separator.
            path = os.path.join(root, fl)
            try:
                with open(path, 'r', encoding='utf-8') as d:
                    text = d.read().replace('<b/>', ' ')
                # str.replace('<.?g>', '') only removed that literal text;
                # a regex is needed to strip the <g> and </g> tags.
                text = re.sub('</?g>', '', text)
                tree = ET.fromstring(text)
                for word in parse_one(tree, side, lang):
                    dictionary.add(word)
            except Exception as err:
                # Best effort, but no longer silent (and no longer swallowing
                # KeyboardInterrupt as the bare except did).
                logging.warning('skipping %s: %s', path, err)
    return dictionary

In [57]:
dictionary = one_language_dict('afr')

In [73]:
dictionary['afr_almal']

{prn-tn-m: 1, prn-tn-mf: 2, det-ind: 1, prn: 1}

In [76]:
shorten(dictionary['afr_almal'])

('almal', [[prn-tn-mf, prn], [det-ind], [prn-tn-m]])

In [9]:
def shorten(word_dict):
    """Group the tag sets of one lemma into chains of nested sets.

    Keys of ``word_dict`` are visited by descending count and, for equal
    counts, ascending length. A key joins every existing group in which it is
    a proper subset or superset of each member; if it joins none, it starts a
    new group.

    Returns:
        (lemma, groups) where ``lemma`` is ``word_dict.lemma`` without its
        ``'<lang>_'`` prefix and ``groups`` is a list of lists of keys.
    """
    short = []
    ranked = sorted(word_dict, key=lambda x: (word_dict[x], -len(x)), reverse=True)
    for tags in ranked:
        placed = False
        for group in short:
            if all(member < tags or tags < member for member in group):
                group.append(tags)
                placed = True
        if not placed:
            short.append([tags])
    # Strip the language prefix at the first '_' rather than assuming a
    # 3-letter code; fall back to the old slice if there is no '_'.
    _, sep, bare = word_dict.lemma.partition('_')
    word = bare if sep else word_dict.lemma[4:]
    return word, short

In [10]:
def one_word(word, lang):
    """Build a ``Word`` from an XML element.

    The lemma is the element's leading text ('' when there is none); the tags
    are the non-empty ``n`` attributes of all descendant ``<s>`` elements.

    NOTE: a later cell defines ``one_word`` again; whichever cell ran last is
    the definition in effect.
    """
    lemma = str(word.text) if word.text else ''
    tags = [node.attrib['n'] for node in word.findall('.//s')
            if node.attrib['n'] != '']
    return Word(lemma, lang, tags)

def parse_one(tree, side, lang):
    """Yield one ``Word`` per entry of the first ``<section>`` of ``tree``.

    For each entry, the ``side`` child ('l' or 'r') of its ``<p>`` element is
    used when present; otherwise its ``<i>`` element is used. Entries with
    neither are skipped. Yields nothing when there is no ``<section>``.
    """
    section = tree.find('section')
    if section is None:
        return
    for e in section:
        node = None
        p = e.find('p')
        # Compare with None explicitly: an Element without children is falsy,
        # so truth-testing silently dropped childless (tag-less) entries.
        if p is not None:
            node = p.find(side)
        if node is None:
            node = e.find('i')
        if node is not None:
            yield one_word(node, lang)

In [12]:
def dictionary_to_nodes(dictionary):
    """Yield one ``Word`` per tag group of every entry in ``dictionary``.

    Each entry is reduced with ``shorten``; underscores in the resulting lemma
    become spaces, and each group is wrapped in ``Tags`` after dropping items
    that compare unequal to ''.
    """
    for key in dictionary.keys():
        word, groups = shorten(dictionary[key])
        word = word.replace('_', ' ')
        for group in groups:
            yield Word(word, dictionary.lang, Tags([item for item in group if item != '']))

In [11]:
def monodix():
    """Write one monolingual dictionary file per language listed in 'languages'.

    For every tab-separated code in 'languages', builds the language's
    dictionary and writes its nodes, one per line in 'mono' format, to
    ./monodix/<lang>.dix (UTF-16). Logs the language code when done.
    """
    out_dir = './monodix/'
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    with open('languages', 'r', encoding='utf-8') as listing:
        langs = listing.read().split('\t')
    for lang in langs:
        dictionary = one_language_dict(lang)
        with open(out_dir + lang + '.dix', 'w', encoding='utf-16') as out:
            out.writelines(node.write(mode='mono') + '\n'
                           for node in dictionary_to_nodes(dictionary))
        logging.info(lang)

In [78]:
%time monodix()

2018-06-01 23:06:34,036 | INFO : eng
2018-06-01 23:06:45,989 | INFO : spa
2018-06-01 23:06:52,225 | INFO : fin
2018-06-01 23:07:01,691 | INFO : epo
2018-06-01 23:07:07,187 | INFO : rus
2018-06-01 23:07:13,218 | INFO : ita
2018-06-01 23:07:19,605 | INFO : fra
2018-06-01 23:07:21,951 | INFO : pol
2018-06-01 23:07:31,397 | INFO : cat
2018-06-01 23:07:34,256 | INFO : kaz
2018-06-01 23:07:35,907 | INFO : tur
2018-06-01 23:07:37,329 | INFO : ces
2018-06-01 23:07:41,079 | INFO : deu
2018-06-01 23:07:43,115 | INFO : por
2018-06-01 23:07:49,748 | INFO : sme
2018-06-01 23:07:52,042 | INFO : hin
2018-06-01 23:07:54,674 | INFO : swe
2018-06-01 23:07:54,839 | INFO : ina
2018-06-01 23:07:56,438 | INFO : hbs
2018-06-01 23:07:57,785 | INFO : tat
2018-06-01 23:07:58,892 | INFO : eus
2018-06-01 23:08:00,237 | INFO : nld
2018-06-01 23:08:02,053 | INFO : slv
2018-06-01 23:08:04,039 | INFO : ron
2018-06-01 23:08:04,767 | INFO : bul
2018-06-01 23:08:10,962 | INFO : nor
2018-06-01 23:08:11,746 | INFO : isl
2

In [13]:
'n'.split('-')

['n']

In [13]:
class DiGetItem:
    """Lookup container for the words of one language.

    Words with more than one item in ``s`` are kept in ``self.list`` (found
    by linear search using ``Word.__eq__``); all others are kept in
    ``self.dict`` keyed by the word itself.
    """

    def __init__(self):
        self.list = []
        self.dict = {}

    def add(self, word):
        """Store ``word`` in the list or the dict depending on ``len(word.s)``."""
        if len(word.s) > 1:
            self.list.append(word)
        else:
            self.dict[word] = word

    def __getitem__(self, key):
        """Return the stored word matching ``key``.

        Tries the dict with ``key``, then the dict with a copy of ``key``
        whose tags are ``['']``, then the list. When nothing matches, prints
        ``key`` and returns ``None``.
        """
        if key in self.dict:
            return self.dict[key]
        key2 = Word(key.lemma, key.lang, [''])
        if key2 in self.dict:
            return self.dict[key2]
        try:
            return self.list[self.list.index(key)]
        except Exception:
            # Was a bare except, which also swallowed KeyboardInterrupt during
            # the (potentially long) linear scan.
            print(key)
            return None

In [25]:
# Scratch check: list.index locates an element by equality, so indexing with
# its result returns the stored [] element.
a = [[],[1]]
key = a[a.index([])]
key

[]

In [14]:
def import_mono(lang):
    """Load ./monodix/<lang>.dix (UTF-16) into a ``DiGetItem``.

    Each line is ``lemma<TAB>tags`` where tag groups are separated by '$' and
    the tags inside a group by '-'; empty tag strings are dropped.
    """
    dictionary = DiGetItem()
    path = './monodix/{}.dix'.format(lang)
    with open(path, 'r', encoding='utf-16') as f:
        for line in f:
            fields = line.strip('\n').split('\t')
            groups = []
            for chunk in fields[1].strip().split('$'):
                groups.append(Tags([tag for tag in chunk.split('-') if tag != '']))
            dictionary.add(Word(fields[0], lang, groups))
    return dictionary

In [42]:
epo = import_mono('ukr')

<__main__.DiGetItem at 0x1a508fb3978>

In [None]:
# Scratch: print every dict-stored entry whose tags equal [['adj']].
# NOTE: in the cell shown above, `epo` is loaded with import_mono('ukr').
for i in epo.dict:
    if i.s == [['adj']]:
        print(i)

In [16]:
if '':
    print ('1')

In [17]:
'' in Tags([])

False

In [None]:
# Scratch: for every dict-stored entry whose tag list contains
# ['adj', 'sint'], print the tag list of the stored word.
for i in epo.dict:
    if ['adj','sint'] in i.s:
        print (list(epo.dict[i].s))

In [15]:
def one_word(word, lang):
    """Build a ``Word`` from an XML element, keeping every ``n`` attribute.

    Tags are the ``n`` attributes of all descendant ``<s>`` elements (empty
    values included) wrapped in ``Tags``; the lemma is the element's leading
    text ('' when there is none) with underscores replaced by spaces.

    NOTE: this redefines the ``one_word`` from an earlier cell.
    """
    tags = Tags([node.attrib['n'] for node in word.findall('.//s')])
    lemma = str(word.text) if word.text else ''
    return Word(lemma.replace('_', ' '), lang, tags)

def parse_bidix(tree, l1, l2):
    """Yield ``(word_l1, word_l2, side)`` for each entry of the first ``<section>``.

    ``side`` is the entry's ``n`` attribute ('' when absent). An entry with a
    ``<p>`` holding both ``<l>`` and ``<r>`` yields that pair; otherwise an
    ``<i>`` element, if present, is used for both languages. Yields nothing
    when there is no ``<section>``.
    """
    section = tree.find('section')
    if section is None:
        return
    for e in section:
        # NOTE(review): the direction restriction is read from attribute 'n';
        # confirm that is the attribute the dictionaries actually use.
        side = e.attrib.get('n', '')
        left = right = None
        p = e.find('p')
        # Explicit None checks: an Element without children is falsy, so
        # truth-testing silently skipped childless elements.
        if p is not None:
            left, right = p.find('l'), p.find('r')
        if left is not None and right is not None:
            yield one_word(left, l1), one_word(right, l2), side
            continue
        i = e.find('i')
        if i is not None:
            yield one_word(i, l1), one_word(i, l2), side

In [23]:
str(None)

'None'

In [16]:
def check(word1, word2, lang1, lang2):
    """Return ``(lang1[word1], lang2[word2])``: each word looked up in its container."""
    return lang1[word1], lang2[word2]

In [17]:
def existance(pair, nodes):
    """Return True when both ``pair[0]`` and ``pair[1]`` are in ``nodes``."""
    first, second = pair[0], pair[1]
    return all(code in nodes for code in (first, second))

def load_file(l1, l2):
    """Write all bilingual entries between selected languages to the file '<l1>-<l2>'.

    The languages are the second tab-separated column of 'language_list.csv'.
    For every ``<a>-<b>.dix`` under ./dictionaries/ whose two codes are both
    selected, each entry is resolved against the monolingual dictionaries of
    ``a`` and ``b`` and written as ``side<TAB>word1<TAB>word2`` (words in 'bi'
    format, output encoded as UTF-16). ``l1``/``l2`` only name the output.

    Entries that cannot be resolved are skipped; a file that cannot be parsed
    prints ``ERROR: <a>-<b>`` and is abandoned.
    """
    with open('language_list.csv', 'r', encoding='utf-8') as f:
        languages = set([i.split('\t')[1].strip() for i in f.readlines()])
    with open('{}-{}'.format(l1, l2), 'w', encoding='utf-16') as f:
        for root, dirs, files in os.walk('./dictionaries/'):
            for fl in files:
                pair = fl.replace('.dix', '').split('-')
                if len(pair) < 2 or not existance(pair, languages):
                    continue
                logging.info('{}-{} started'.format(pair[0], pair[1]))
                lang1 = import_mono(pair[0])
                lang2 = import_mono(pair[1])
                # os.walk roots of sub-directories carry no trailing separator.
                with open(os.path.join(root, fl), 'r', encoding='utf-8') as d:
                    try:
                        text = d.read().replace('<b/>', ' ')
                        # str.replace('<.?g>', '') only removed that literal
                        # text; a regex is needed to strip <g> and </g>.
                        tree = ET.fromstring(re.sub('</?g>', '', text))
                        for word1, word2, side in parse_bidix(tree, pair[0], pair[1]):
                            try:
                                word1, word2 = check(word1, word2, lang1, lang2)
                                if word1 is None or word2 is None:
                                    # Lookup found no matching monodix entry.
                                    continue
                                string = str(side) + '\t' + word1.write(mode='bi') + '\t' + word2.write(mode='bi') + '\n'
                                f.write(string)
                            except Exception:
                                # Best effort per entry; unlike a bare except
                                # this lets KeyboardInterrupt through.
                                continue
                    except Exception:
                        print('ERROR: {}-{}'.format(pair[0], pair[1]))

In [19]:
# Scratch check: take the part after the last '$' and split it on '-',
# dropping empty pieces.
s = 'eng$$-'
s = s.split('$')[-1]
s = [i for i in s.split('-') if i !='']
s

[]

In [37]:
'-'.join(['',''])

'-'

In [21]:
eng = import_mono('eng')

In [22]:
Word('general high school','eng',[''])

eng$general high school$[]

In [23]:
eng[Word('general high school','eng',[])]

eng$general high school$[]

In [25]:
afr = import_mono('afr')

In [26]:
afr[Word('self','afr',['prn'])]

afr$self$[prn_prn-ref]

In [50]:
key = Word('ryksgebied','afr',['n'])

In [51]:
# Step-by-step replay of the DiGetItem.__getitem__ lookup against `afr`,
# printing instead of returning. Unlike the method, a hit on key2 does not
# stop here: execution continues into the list lookup below.
key2 = Word(key.lemma, key.lang, [''])
if key in afr.dict:
    print ('return: ', afr.dict[key])
else:
    if key2 in afr.dict:
        print ('return: ', afr.dict[key2])
    try:
        key = afr.list[afr.list.index(key)]
        print ('return: ', key)
    except:
        print (key)

return:  afr$ryksgebied$[n]


In [80]:
%time load_file('rus', 'fra')

2018-06-01 23:09:18,466 | INFO : afr-nld started
2018-06-01 23:09:19,685 | INFO : asm-ben started
2018-06-01 23:09:20,029 | INFO : asm-eng started
2018-06-01 23:09:22,859 | INFO : asm-hin started
2018-06-01 23:09:23,564 | INFO : bel-epo started
2018-06-01 23:09:26,323 | INFO : bel-rus started
2018-06-01 23:09:55,960 | INFO : ben-eng started
2018-06-01 23:10:00,493 | INFO : ben-hin started
2018-06-01 23:10:01,228 | INFO : bre-cym started
cym$ffôn$[n-]
2018-06-01 23:10:01,604 | INFO : bre-fra started
2018-06-01 23:10:17,605 | INFO : bre-spa started
2018-06-01 23:10:27,003 | INFO : bul-ell started
2018-06-01 23:10:27,242 | INFO : bul-eng started
2018-06-01 23:10:36,612 | INFO : bul-rus started
2018-06-01 23:10:40,561 | INFO : cat-glg started
2018-06-01 23:11:40,383 | INFO : cat-ina started
2018-06-01 23:11:41,661 | INFO : cat-ita started
2018-06-01 23:12:14,963 | INFO : ces-ces started
ces$abê$[cnjsub]
ces$dêž$[cnjsub]
ces$bê$[cnjsub]
ces$abê$[cnjsub]
ces$dêbê$[cnjsub]
ces$dokáď$[cnjsub]


In [None]:
lang1.dict

In [None]:
# Single-file replay of the per-file body of load_file for afr-nld. The write
# to the output file is commented out, so this cell only exercises parsing
# and the monodix lookups.
fl = './dictionaries/afr-nld.dix'
pair = ['afr','nld']
with open ('language_list.csv','r',encoding='utf-8') as f:
        languages = set([i.split('\t')[1].strip() for i in f.readlines()])
if existance(pair, languages):
    logging.info('{}-{} started'.format(pair[0], pair[1]))
    lang1 = import_mono(pair[0])
    lang2 = import_mono(pair[1])
    with open (fl, 'r', encoding='utf-8') as d:
        try:
            # NOTE(review): str.replace treats '<.?g>' as literal text, not as
            # a regular expression.
            tree = ET.fromstring(d.read().replace('<b/>',' ').replace('<.?g>',''))
            for word1, word2, side in parse_bidix (tree, pair[0], pair[1]):
                try:
                    word1, word2 = check (word1, word2, lang1, lang2)
                    string = str(side) + '\t' + word1.write(mode='bi') + '\t' + word2.write(mode='bi') + '\n'
                    #f.write(string)
                except:
                    # Any failure for a single entry is silently skipped.
                    pass
        except:
            print ('ERROR: {}-{}'.format(pair[0], pair[1]))

## Reading from file

In [18]:
def change_encoding(file):
    """Rewrite ``file`` in place, converting it from UTF-16 to UTF-8."""
    with open(file, 'r', encoding='utf-16') as src:
        content = src.read()
    # Round trip through UTF-8 bytes before the file is reopened for writing.
    content = str(content.encode('utf-8'), 'utf-8')
    with open(file, 'w', encoding='utf-8') as dst:
        dst.write(content)

In [19]:
change_encoding('rus-fra')

In [20]:
def parse_line(line):
    """Parse one line of the bilingual file into ``(side, word1, word2)``.

    The line holds seven tab-separated fields:
    ``side, lang1, lemma1, tags1, lang2, lemma2, tags2``. Tag groups are
    separated by '$' and the tags inside a group by '-'.

    Empty tag strings are dropped, exactly as ``import_mono`` does, so a word
    without tags gets the same tag structure from both loaders and the two
    compare equal.
    """
    side, lang1, lemma1, tags1, lang2, lemma2, tags2 = line.strip('\n').split('\t')

    def groups(field):
        return [Tags([tag for tag in chunk.split('-') if tag != ''])
                for chunk in field.split('$')]

    return side, Word(lemma1, lang1, groups(tags1)), Word(lemma2, lang2, groups(tags2))

In [21]:
def nodes_from_file(file):
    """Lazily yield ``parse_line(line)`` for every line of the UTF-8 file ``file``."""
    with open(file, 'r', encoding='utf-8') as handle:
        yield from (parse_line(raw) for raw in handle)

# Loading graph

In [22]:
def built_from_file(file):
    """Build a directed translation graph from the bilingual file ``file``.

    For each ``(side, word1, word2)`` entry:
      * empty side -> edges in both directions,
      * 'LR'       -> edge word1 -> word2,
      * 'RL'       -> edge word2 -> word1,
      * otherwise  -> the side value is printed and no edge is added.
    """
    G = nx.DiGraph()
    for side, word1, word2 in nodes_from_file(file):
        if not side:
            G.add_edge(word1, word2)
            G.add_edge(word2, word1)
        elif side == 'LR':
            G.add_edge(word1, word2)
        elif side == 'RL':
            # Was `G.add_edgr`, which raised AttributeError on every RL entry.
            G.add_edge(word2, word1)
        else:
            print(side)
    return G

In [120]:
%time G = built_from_file('rus-fra')

Wall time: 38.6 s


In [121]:
len(G.nodes())

474441

In [126]:
# Scratch: print every graph node whose lemma is 'José'.
for i in G.nodes():
    if i.lemma == 'José':
        print (i)

eng$José$[np-ant_np_np-ant-m-sg_np-ant-m]
deu$José$[np]
ita$José$[np-ant_np-ant-m_np-ant-m-sg]
por$José$[np_np-ant]
fra$José$[np_np-ant_np-ant-m-sg]
epo$José$[n]
pol$José$[np-ant-mp]


In [125]:
len(G.edges())

804608

## Search (changed)

In [23]:
class SetWithFilter(set):
    """A set whose members can be filtered by their ``lemma`` or ``lang`` attribute."""

    def lemma(self, value):
        """Return a plain ``set`` of the members whose ``lemma`` equals ``value``."""
        return {member for member in self if member.lemma == value}

    def lang(self, value):
        """Return a plain ``set`` of the members whose ``lang`` equals ``value``."""
        return {member for member in self if member.lang == value}

In [24]:
def dictionaries(lang1, lang2):
    """Load the monolingual dictionaries of both languages as ``SetWithFilter`` sets.

    Each set holds the list-stored words plus the dict keys of the loaded
    ``DiGetItem``. Returns ``(set_for_lang1, set_for_lang2)``.
    """
    def as_filter_set(lang):
        mono = import_mono(lang)
        return SetWithFilter(mono.list + list(mono.dict.keys()))

    return as_filter_set(lang1), as_filter_set(lang2)

In [25]:
def lemma_search(G, lemma, d_l1, l2, cutoff):
    """Score candidate translations of ``lemma`` into language ``l2``.

    For every word in ``d_l1`` with this lemma, the nodes reachable from it
    within ``cutoff`` steps are restricted to language ``l2``. Each candidate
    is scored from the simple paths (up to ``cutoff``) leading to it: paths
    are grouped by their node count and every group adds ``exp(-group size)``.

    Prints progress counts per source word and returns
    ``{str(source): {str(candidate): score}}``.

    NOTE(review): with this formula several paths of the same length lower
    the contribution of that length - confirm the weighting is intended.
    """
    lemmas = d_l1.lemma(lemma)
    results = dict((str(word), {}) for word in lemmas)
    for word in lemmas:
        print(word, end='\t')
        reachable = SetWithFilter(
            nx.single_source_shortest_path_length(G, word, cutoff=cutoff))
        print ('all: ', str(len(reachable)), end='\t')
        candidates = reachable.lang(l2)
        print ('filtered: ', str(len(candidates)))
        scores = results[str(word)]
        for translation in candidates:
            length_counts = Counter(
                len(path)
                for path in nx.all_simple_paths(G, word, translation, cutoff=cutoff))
            coef = 0
            for n_paths in length_counts.values():
                coef += exp(-n_paths)
            scores[str(translation)] = coef
    return results

In [26]:
def print_results(results, n=7):
    """Print, for each source word, its ``n`` best-scored translations.

    ``results`` maps a source word to a ``{translation: score}`` mapping;
    translations are listed by descending score.
    """
    for source, scores in results.items():
        print ('\n\t\t', source)
        ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
        for target, score in ranked[:n]:
            print (target, score)

In [172]:
%time l1, l2 = dictionaries('rus', 'fra')

Wall time: 1.8 s


In [189]:
%time print_results(lemma_search (G, 'кот', l1, 'fra', 6))

rus$кот$[n-m-aa]	all:  53	filtered:  8

		 rus$кот$[n-m-aa]
fra$chat$[n-GD] 1.2389736067509398
fra$chat$[n-f] 1.2389736067509398
fra$chat mâle$[n-m] 0.36787944117144233
fra$matou$[n-m] 0.36787944117144233
fra$chat$[n-m_n_n-m-ND] 0.17196656101408103
fra$salon$[n-m_n_n-m-ND_n-m-sg] 0.01865110151663669
fra$bavardage$[n-m_n_n-m-ND] 0.018321783101087508
Wall time: 72.9 ms


## Full RUS-FRA (on many languages)

In [27]:
def get_relevant_languages(l1, l2):
    """Write the languages lying between ``l1`` and ``l2`` to 'language_list.csv'.

    A language graph is built from the ``<a>-<b>.dix`` file names under
    ./dictionaries/. For each radius ``i`` in 1..4:
      * ``v`` = languages connected to ``l2`` inside the radius-``i``
        neighbourhood of ``l1`` (with ``l1`` itself removed),
      * ``w`` = languages connected to ``l1`` inside the radius-``i``
        neighbourhood of ``l2`` (with ``l2`` itself removed),
      * the selection is ``(v & w) | {l1, l2}``.
    Languages selected for the first time at radius ``i`` are written as
    ``<i*2><TAB><lang>`` lines, in the order of the 'languages' file.
    """
    G = nx.Graph()
    for root, dirs, files in os.walk('./dictionaries/'):
        for fl in files:
            pair = fl.replace('.dix', '').split('-')
            if len(pair) < 2:
                # Not a "<l1>-<l2>" file name: no edge can be formed from it.
                continue
            G.add_edge(pair[0], pair[1])
    pair = [l1, l2]

    with open('languages', 'r', encoding='utf-8') as f:
        languages = f.read().split('\t')

    with open('language_list.csv', 'w', encoding='utf-8') as f:
        nodes = set()
        for i in range(1, 5):
            w = nx.single_source_shortest_path_length(G, pair[0], cutoff=i)
            v = nx.single_source_shortest_path_length(G, pair[1], cutoff=i)
            # .copy(): on networkx >= 2 subgraph() returns a read-only view,
            # so remove_node on it raises.
            H = G.subgraph(w.keys()).copy()
            H.remove_node(pair[0])
            H2 = G.subgraph(v.keys()).copy()
            H2.remove_node(pair[1])
            if pair[1] in H.nodes():
                v = nx.node_connected_component(H, pair[1])
            else:
                v = set()
            if pair[0] in H2.nodes():
                # Was node_connected_component(H, pair[1]) - a copy of the
                # branch above that ignored H2 and could fail when pair[1]
                # is not in H.
                w = nx.node_connected_component(H2, pair[0])
            else:
                w = set()
            nodes2 = v & w | set([pair[0], pair[1]])
            nodes2 = nodes2 - nodes
            for lang in languages:
                if lang in nodes2:
                    f.write('{}\t{}\n'.format(i * 2, lang))
            nodes = nodes | nodes2

In [202]:
get_relevant_languages('rus', 'fra')

**Loading file**

In [28]:
%time G = built_from_file('rus-fra')

Wall time: 5min 29s


In [29]:
print (len(G.nodes()), len(G.edges()))

1861933 4164055


In [30]:
# Scratch: print every graph node whose lemma is 'книга'.
for i in G.nodes():
    if i.lemma == 'книга':
        print (i)

rus$книга$[n-f-nn_n_n-f]
bul$книга$[n-f]
mkd$книга$[n-f_n]
kpv$книга$[n]
ukr$книга$[n-f_n-f-nn]
udm$книга$[n]


In [32]:
%time l1, l2 = dictionaries('rus', 'fra')

Wall time: 6min 17s


In [34]:
%time print_results(lemma_search (G, 'собака', l1, 'fra', 4), 10)

rus$собака$[n-f-aa_n_n-f]	all:  3327	filtered:  51

		 rus$собака$[n-f-aa_n_n-f]
fra$arobace$[n-f] 0.7357588823428847
fra$escargot$[n-m_n] 0.42440445653839176
fra$étau$[n-m_n] 0.4176665095393063
fra$chien$[n-GD] 0.374617429569905
fra$limaçon$[n-m] 0.3746173881705278
fra$singe$[n-m_n] 0.3746173881705278
fra$but$[n-m_n_n-m-ND] 0.3703581933481087
fra$goal$[n_n-m] 0.36787944117144233
fra$post$[n-m_n_n-m-ND] 0.36787944117144233
fra$ko$[n-acr-m] 0.36787944117144233
Wall time: 1.1 s
