In [3]:
import logging
import sys
import os

logging.basicConfig(format='%(asctime)s | %(levelname)s : %(message)s',
                     level=logging.INFO, stream=sys.stdout)
import json
import re
from collections import Counter
from math import exp

In [4]:
import networkx as nx
import xml.etree.ElementTree as ET

In [5]:
from collections import Counter, defaultdict
from math import exp

In [6]:
class Word:
    def __init__(self, lemma, lang, s=[]):
        self.lemma = lemma
        self.lang = lang
        self.s = s
        
    def __str__(self):
        if self.s:
            if isinstance(self.s[0],list):
                w = '['+'|'.join(['-'.join(i) for i in self.s])+']'
            else:
                w = '['+'-'.join(self.s)+']'
        else:
            w = '-'
        return str(self.lang)+'_'+str(self.lemma)+'_'+str(w)
    
    __repr__ = __str__
    
    def __eq__(self, other):
        return self.lemma == other.lemma and self.lang == other.lang and (self.s == other.s or other.s in self.s or self.s in other.s)
    
    def __lt__(self, other):
        if self.lang == other.lang:
            if self.lemma == other.lemma:
                s1 = set(self.s)
                s2 = set(other.s)
                if (not s1 - s2) and (s1&s2==s1) and (s2 - s1):
                    return True
                else:
                    return False
        else:
            return False
    
    def __hash__(self):
        return hash(str(self))
    
    def write(self, mode='mono'):
        if mode=='mono':
            return self.lemma + '\t' + '_'.join(self.s)
        elif mode == 'bi':
            return self.lang + '\t' +  self.lemma + '\t' + '_'.join(self.s)

### Languages

In [5]:
def all_languages():
    G = nx.Graph()
    for root, dirs, files in os.walk ('./dictionaries/'):
        for fl in files :
            pair = fl.replace('.dix', '').split('-')
            G.add_edge(pair[0], pair[1])
    d = G.degree()
    d = sorted(d, key=d.get, reverse=True)
    #print (d)
    with open('languages','w',encoding='utf-8') as f:
        f.write('\t'.join(d))

In [7]:
%time all_languages()

Wall time: 4.98 ms


### Monodix

In [7]:
class Tags(list):
    def __le__(self, other):
        s1 = set(self)
        s2 = set(other)
        if not s1 - s2 and s1&s2==s1:
            return True
        else:
            return False
    
    def __lt__(self, other):
        s1 = set(self)
        s2 = set(other)
        if (not s1 - s2) and (s1&s2==s1) and (s2 - s1):
            return True
        else:
            return False
        
    def __str__(self):
        return '-'.join(self)
    
    __repr__ = __str__
    
    def __hash__(self):
        return hash(str(self))

In [9]:
Tags(['1','2','3'])

1-2-3

In [8]:
class FilteredDict(dict):
    def set_lang(self, lang):
        self.lang = lang
    
    def lemma(self, lemma):
        return self[self.lang+'_'+lemma]
        
    def add(self, word):
        lemma = word.lang+'_'+word.lemma
        tags = Tags(word.s)
        if lemma in self:
            if tags in self[lemma]:
                self[lemma][tags] += 1
            else:
                self[lemma][tags] = 1
        else:
            self[lemma] = WordDict()
            self[lemma].lemma(lemma)
            self[lemma][tags] = 1

In [13]:
def one_language_dict(lang):
    dictionary = FilteredDict()
    dictionary.set_lang(lang)
    print (dictionary, dictionary.lang)
    for root, dirs, files in os.walk ('./dictionaries/'):
        for fl in files :
            pair = fl.replace('.dix','').split('-')
            if lang in pair:
                if lang == pair[0]: side = 'l'
                else: side = 'r'
                print (root+fl)
                try:
                    with open (root+fl, 'r', encoding='utf-8') as d:
                        t = ET.fromstring(d.read().replace('<b/>',' ').replace('<.?g>',''))     
                    print (t)
                    for word in parse_one(t, side, lang):
                        print (word)
                        dictionary.add(word)
                except:
                    pass
    print (dictionary)
    return dictionary

In [10]:
def shorten(word_dict):
    short = []
    for i in sorted(word_dict, key=lambda x: (word_dict[x], -len(x)), reverse=True):
        t = True
        for key, j in enumerate(short):
            if (j[0] < i) or (i < j[0]):
                short[key].append(i)
                t = False
                break
        if t:
            short.append([i])
    return word_dict.lemma, short

In [19]:
def one_word(word, lang):
    s = word.findall('.//s')
    s = Tags([i.attrib['n'] for i in s])
    return Word(word.text, lang, s)

def parse_one (tree, side, lang):
    tree = tree.find('section')
    print (len(tree))
    for e in tree:
        p = e.find('p')
        if p:
            word = one_word(p.find(side), lang)
            yield word
        else:
            i = e.find('i')
            if i:
                word = one_word(i, lang)
                yield word

In [None]:
with open ('./dictionaries/bel-rus.dix', 'r', encoding='utf-8') as d:
    t = ET.fromstring(d.read().replace('<b/>',' ').replace('<.?g>',''))
for word in parse_one(t, 'r', 'rus'):
    print (word)

In [None]:
t = ET.parse('./dictionaries/bel-rus.dix')#.find('section'):
list(parse_one(t, 'r', 'rus'))

In [11]:
def dictionary_to_nodes(dictionary, file):
    for i in list(dictionary.keys())[:10]:
        print (i)
        lang, word = i.split('_')
        tags = [list(j) for j in dictionary[i]]
        if len(tags) == 1:
            tags = tags[0]
        yield Word(word, lang, tags)

In [20]:
rus = one_language_dict('urd')

{} urd
./dictionaries/urd-hin.dix
<Element 'dictionary' at 0x0000022F4D97FA48>
4943
<Element 'p' at 0x0000022F4DA3C278>
./dictionaries/urd-snd.dix
<Element 'dictionary' at 0x0000022F4D97F818>
1549
<Element 'p' at 0x0000022F4D900E58>
{}


In [16]:
rus

{}

In [36]:
w = list(dictionary_to_nodes(rus, 'file'))[0]

IndexError: list index out of range

In [50]:
w

<generator object dictionary_to_nodes at 0x000001CD6D0AC360>

In [40]:
def monodix():
    #if not os.path.exists('./monodix/'):
    #    os.makedirs('./monodix/')
    with open('languages','r', encoding='utf-8') as f:
        langs = f.read().split('\t')[:5]
    for lang in langs:
        print (lang, end = '\t')
        %time dictionary = one_language_dict(lang)

In [41]:
monodix()

eng	Wall time: 10.3 s
spa	Wall time: 4.96 s
fin	Wall time: 2.12 s
epo	Wall time: 3.34 s
rus	Wall time: 2.23 s
