In [1]:
import logging
import sys
import os

logging.basicConfig(format='%(asctime)s | %(levelname)s : %(message)s',
                     level=logging.INFO, stream=sys.stdout)
import json
import re
from collections import Counter
from math import exp

In [2]:
#Testing

In [41]:
# Scratch check: write the string form of a Word with a non-ASCII lemma to
# the file '1' using UTF-8.
# NOTE: Word is defined in a later cell, so this cell only runs after that
# cell has been executed.
with open('1', 'w', encoding='utf-8') as f:
    f.write(str(Word('José','q',['1'])))

In [42]:
# Read the scratch file '1' back as UTF-8 and echo its content as the cell
# result.
with open('1', 'r', encoding='utf-8') as f:
    s = f.read()
s

'q_José_[1]'

In [2]:
import networkx as nx
import xml.etree.ElementTree as ET

In [3]:
from collections import Counter, defaultdict
from math import exp

In [4]:
def enc(word):
    """Round-trip ``word`` through UTF-8 and return the decoded string.

    ``word`` must provide ``encode`` (i.e. be a ``str``); text that cannot be
    encoded as UTF-8 raises ``UnicodeEncodeError``.
    """
    return word.encode('utf-8').decode('utf-8')

In [5]:
class Word:
    """A dictionary entry: a lemma in a language together with its tags.

    Attributes:
        lemma: the lemma text; ``''`` when constructed with ``None``.
        lang: language identifier, stored as given.
        s: tag container, stored as given. Either a flat sequence of tag
            strings or a sequence of tag lists; a fresh empty list when
            omitted.

    Caveat: ``__eq__`` is intentionally loose (two words also match when the
    tag container of one is an element of the other's), whereas ``__hash__``
    is derived from ``str(self)``. Words that compare equal can therefore
    hash differently, so set/dict lookups are stricter than ``==`` / ``in``
    on a list.
    """

    def __init__(self, lemma, lang, s=None):
        # Identity test: `lemma == None` would call a custom __eq__.
        self.lemma = '' if lemma is None else enc(lemma)
        self.lang = lang
        # A literal `[]` default would be one list shared by every Word
        # created without tags, so mutating one would change them all.
        self.s = [] if s is None else s

    def __str__(self):
        """Return ``<lang>_<lemma>_<tags>``; tags are ``-`` when empty."""
        if not self.s:
            tags = '-'
        elif isinstance(self.s[0], list):
            # Nested form: groups joined by '_', tags inside a group by '-'.
            tags = '[' + '_'.join('-'.join(group) for group in self.s) + ']'
        else:
            tags = '[' + '-'.join(self.s) + ']'
        return str(self.lang) + '_' + str(self.lemma) + '_' + tags

    __repr__ = __str__

    def __eq__(self, other):
        """Loose equality: same lemma and language, and matching tags.

        Tags match when the containers are equal or when one container is an
        element of the other. Objects without ``lemma``/``lang``/``s``
        attributes are not comparable (``NotImplemented``) instead of raising
        ``AttributeError``.
        """
        try:
            other_lemma, other_lang, other_s = other.lemma, other.lang, other.s
        except AttributeError:
            return NotImplemented
        return (self.lemma == other_lemma and self.lang == other_lang
                and (self.s == other_s or other_s in self.s or self.s in other_s))

    def __lt__(self, other):
        """True when both words share language and lemma and this word's
        tags are a proper subset of the other's; False otherwise."""
        if self.lang != other.lang or self.lemma != other.lemma:
            # The original fell through and returned None when only the
            # lemmas differed; always answer with a real bool.
            return False
        return set(self.s) < set(other.s)

    def __hash__(self):
        return hash(str(self))

    def write(self, mode='mono'):
        """Serialise as a tab-separated line (without trailing newline).

        ``mode='mono'`` gives ``lemma<TAB>tags``; ``mode='bi'`` gives
        ``lang<TAB>lemma<TAB>tags``. Tags are the ``str`` of every item of
        ``s`` joined with ``_``. Any other mode returns ``None``.
        """
        if mode == 'mono':
            return self.lemma + '\t' + '_'.join([str(i) for i in self.s])
        elif mode == 'bi':
            return self.lang + '\t' + self.lemma + '\t' + '_'.join([str(i) for i in self.s])
        return None

### Languages

In [6]:
def all_languages():
    """Rank languages by the number of dictionary pairs they take part in.

    Walks ``./dictionaries/`` and treats every file name of the form
    ``<a>-<b>`` (an optional ``.dix`` is stripped) as an edge between ``a``
    and ``b``. The language codes are then written, tab-separated and sorted
    by decreasing degree, to the file ``languages`` in the working directory.
    """
    G = nx.Graph()
    for root, dirs, files in os.walk('./dictionaries/'):
        for fl in files:
            pair = fl.replace('.dix', '').split('-')
            if len(pair) < 2:
                # Not a "<lang1>-<lang2>" name; there is no edge to add.
                continue
            G.add_edge(pair[0], pair[1])
    # networkx 1.x returns a dict here, 2.x a view of (node, degree) pairs
    # without `.get`; dict() normalises both to a plain mapping.
    degrees = dict(G.degree())
    ranked = sorted(degrees, key=degrees.get, reverse=True)
    with open('languages', 'w', encoding='utf-8') as f:
        f.write('\t'.join(ranked))

In [6]:
%time all_languages()

Wall time: 5 ms


### Monodix

In [6]:
class Tags(list):
    """A list of tags that compares and hashes like a set.

    Order and repetition of the items are ignored by every comparison:
    ``==`` is set equality, ``<=`` / ``<`` are (proper) subset tests and
    ``>=`` / ``>`` the corresponding superset tests. ``str()`` still joins
    the items in list order with ``-``.
    """

    def __le__(self, other):
        return set(self) <= set(other)

    def __lt__(self, other):
        return set(self) < set(other)

    def __ge__(self, other):
        # Overridden so that list's lexicographic ordering is never mixed
        # with the subset semantics of __le__/__lt__.
        return set(self) >= set(other)

    def __gt__(self, other):
        return set(self) > set(other)

    def __eq__(self, other):
        try:
            other_items = set(other)
        except TypeError:
            # `other` is not iterable or holds unhashable items.
            return NotImplemented
        return set(self) == other_items

    def __ne__(self, other):
        # list defines its own __ne__, so Python does not derive it from the
        # __eq__ above; without this, a == b and a != b could both be True.
        result = self.__eq__(other)
        return result if result is NotImplemented else not result

    def __str__(self):
        return '-'.join(self)

    __repr__ = __str__

    def __hash__(self):
        # Must agree with __eq__: equal tag sets need equal hashes, whatever
        # the item order, or dict/set lookups miss existing keys.
        return hash(frozenset(self))

In [7]:
class WordDict(dict):
    """Dict subclass that can additionally remember a ``lemma`` value."""
    def lemma(self, lemma):
        # Gotcha: the assignment stores an instance attribute with the same
        # name as this method, which shadows the method on that instance.
        # After the first call `obj.lemma` is the stored value, so a second
        # `obj.lemma(...)` call on the same object fails.
        self.lemma = lemma
        
class FilteredDict(dict):
    """Per-language dictionary: ``'<lang>_<lemma>'`` -> WordDict of tag counts."""

    def set_lang(self, lang):
        """Remember the language this dictionary belongs to."""
        self.lang = lang

    def lemma(self, lemma):
        """Return the entry stored for ``lemma`` in this dictionary's language."""
        return self[self.lang+'_'+lemma]

    def add(self, word):
        """Count one occurrence of ``word``'s tags under its language/lemma key."""
        key = word.lang+'_'+word.lemma
        tags = Tags(word.s)
        if key not in self:
            # First sighting of this lemma: create its counter and label it.
            entry = WordDict()
            entry.lemma(key)
            self[key] = entry
        counts = self[key]
        counts[tags] = counts.get(tags, 0) + 1

In [8]:
def one_language_dict(lang):
    """Collect every word of ``lang`` found in the bilingual dictionaries.

    Walks ``./dictionaries/``; for each file whose name (``.dix`` stripped,
    split on ``-``) contains ``lang``, parses the XML and adds every word
    yielded by ``parse_one`` for that language's side (``'l'`` when ``lang``
    is the first code of the pair, ``'r'`` otherwise).

    Files that cannot be read or parsed are skipped with a logged warning;
    words added before the failure are kept.

    Returns:
        The populated ``FilteredDict`` with its language set to ``lang``.
    """
    dictionary = FilteredDict()
    dictionary.set_lang(lang)
    for root, dirs, files in os.walk('./dictionaries/'):
        for fl in files:
            pair = fl.replace('.dix', '').split('-')
            if lang not in pair:
                continue
            side = 'l' if lang == pair[0] else 'r'
            # os.walk yields sub-directory roots without a trailing
            # separator, so plain `root + fl` only works at the top level.
            path = os.path.join(root, fl)
            try:
                with open(path, 'r', encoding='utf-8') as d:
                    # NOTE(review): str.replace matches literally, so the
                    # second call only removes the exact text '<.?g>'. If
                    # stripping <g>/</g> tags was intended a regex is needed;
                    # left unchanged so lemmas stay consistent with the
                    # bilingual parser in this notebook.
                    t = ET.fromstring(d.read().replace('<b/>', ' ').replace('<.?g>', ''))
                for word in parse_one(t, side, lang):
                    dictionary.add(word)
            except Exception as exc:
                # Best effort per file, but say which file failed and why
                # instead of swallowing everything silently.
                logging.warning('skipping %s: %r', path, exc)
    return dictionary

In [9]:
def one_word(word, lang):
    """Build a ``Word`` for ``lang`` from the XML element ``word``.

    The lemma is the element's leading text (``''`` when there is none) and
    the tags are the ``n`` attributes of all descendant ``<s>`` elements, in
    document order.
    """
    tags = [node.attrib['n'] for node in word.findall('.//s')]
    text = str(word.text) if word.text else ''
    return Word(text, lang, tags)

def parse_one(tree, side, lang):
    """Yield one ``Word`` per entry of the first ``<section>`` of ``tree``.

    For every child entry the word is taken from the ``side`` child
    (``'l'`` or ``'r'``) of its ``<p>`` element; entries without a usable
    ``<p>`` fall back to their ``<i>`` element. Entries with neither are
    skipped, and a tree without ``<section>`` yields nothing.
    """
    section = tree.find('section')
    if section is None:
        return
    for e in section:
        p = e.find('p')
        # Always compare with None: an Element that has no children is
        # falsy, so `if i:` dropped every `<i>word</i>` entry without tags.
        node = p.find(side) if p is not None else None
        if node is None:
            node = e.find('i')
        if node is not None:
            yield one_word(node, lang)

In [10]:
def shorten(word_dict, f):
    """Group the tag sets of one lemma and return ``(word, groups)``.

    Keys of ``word_dict`` are visited by decreasing count and, for equal
    counts, by increasing length. A key joins the first existing group whose
    leading member is ``<`` it or that it is ``<`` of; otherwise it starts a
    new group.

    The stored ``word_dict.lemma`` and the same value without its first four
    characters are written to ``f``; the latter is also returned as the word
    (presumably this strips a three-letter language code plus ``_`` - verify
    against the keys built by ``FilteredDict.add``).
    """
    ordered = sorted(word_dict,
                     key=lambda tags: (word_dict[tags], -len(tags)),
                     reverse=True)
    groups = []
    for tags in ordered:
        for group in groups:
            head = group[0]
            if (head < tags) or (tags < head):
                group.append(tags)
                break
        else:
            # No existing group is comparable with this tag set.
            groups.append([tags])
    f.write(word_dict.lemma)
    f.write(word_dict.lemma[4:])
    return word_dict.lemma[4:], groups

In [11]:
def dictionary_to_nodes(dictionary):
    """Yield one ``Word`` per tag group of every entry in ``dictionary``.

    Each entry is reduced with ``shorten``; underscores in the resulting word
    are replaced by spaces, and every group is turned into a ``Word`` in the
    dictionary's language whose tags are the non-empty members of the group.

    Side effect: (re)creates the file ``1`` in the working directory and
    writes trace lines to it while the generator is being consumed.
    """
    with open('1', 'w', encoding='utf-8') as f:
        for key in dictionary.keys():
            word, groups = shorten(dictionary[key], f)
            f.write(word+'\n')
            word = word.replace('_', ' ')
            for group in groups:
                f.write(str(Word(word,'1',['']))+'\n')
                kept = Tags([member for member in group if member])
                f.write(str(Word(word, dictionary.lang, kept))+'\n')
                yield Word(word, dictionary.lang, kept)

In [12]:
def monodix():
    """Write one monolingual word list per language into ``./monodix/``.

    The language codes are read, tab-separated, from the file ``languages``.
    For each of them the words collected by ``one_language_dict`` are turned
    into nodes and written, one ``write(mode='mono')`` line per node, to
    ``./monodix/<lang>.dix`` encoded as UTF-16. The language code is logged
    once its file is complete.
    """
    os.makedirs('./monodix/', exist_ok=True)
    with open('languages', 'r', encoding='utf-8') as f:
        langs = f.read().split('\t')
    for lang in langs:
        nodes = dictionary_to_nodes(one_language_dict(lang))
        with open('./monodix/'+lang+'.dix', 'w', encoding='utf-16') as out:
            for node in nodes:
                out.write(node.write(mode='mono')+'\n')
        logging.info(lang)

In [42]:
%time monodix()

2018-05-31 20:37:02,535 | INFO : eng
2018-05-31 20:37:15,077 | INFO : spa
2018-05-31 20:37:22,967 | INFO : fin
2018-05-31 20:37:34,538 | INFO : epo
2018-05-31 20:37:41,414 | INFO : rus
2018-05-31 20:37:48,238 | INFO : ita
2018-05-31 20:37:55,538 | INFO : fra
2018-05-31 20:37:58,382 | INFO : pol
2018-05-31 20:38:08,821 | INFO : cat
2018-05-31 20:38:12,289 | INFO : kaz
2018-05-31 20:38:14,272 | INFO : tur
2018-05-31 20:38:15,810 | INFO : ces
2018-05-31 20:38:20,463 | INFO : deu
2018-05-31 20:38:22,678 | INFO : por
2018-05-31 20:38:30,726 | INFO : sme
2018-05-31 20:38:33,662 | INFO : hin
2018-05-31 20:38:36,828 | INFO : swe
2018-05-31 20:38:37,033 | INFO : ina
2018-05-31 20:38:38,963 | INFO : hbs
2018-05-31 20:38:40,554 | INFO : tat
2018-05-31 20:38:41,757 | INFO : eus
2018-05-31 20:38:43,646 | INFO : nld
2018-05-31 20:38:45,976 | INFO : slv
2018-05-31 20:38:48,092 | INFO : ron
2018-05-31 20:38:49,040 | INFO : bul
2018-05-31 20:38:56,022 | INFO : nor
2018-05-31 20:38:56,904 | INFO : isl
2

In [13]:
'n'.split('-')

['n']

In [13]:
def import_mono(lang):
    """Load ``./monodix/<lang>.dix`` (UTF-16) as a list of ``Word`` objects.

    Every line is ``lemma<TAB>tags``; the stripped tags field is split on
    ``_`` into groups and each group on ``-`` into a ``Tags`` list.
    """
    with open('./monodix/{}.dix'.format(lang), 'r', encoding='utf-16') as f:
        rows = (line.split('\t') for line in f)
        return [
            Word(row[0], lang,
                 [Tags(group.split('-')) for group in row[1].strip().split('_')])
            for row in rows
        ]

In [14]:
def one_word(word, lang):
    """Create a ``Word`` for ``lang`` from the XML element ``word``.

    Tags are the ``n`` attributes of all descendant ``<s>`` elements; the
    lemma is the element's leading text, or ``''`` when it has none.
    """
    tag_names = []
    for tag_node in word.findall('.//s'):
        tag_names.append(tag_node.attrib['n'])
    lemma_text = '' if not word.text else str(word.text)
    return Word(lemma_text, lang, tag_names)

def parse_bidix(tree, l1, l2):
    """Yield ``(left_word, right_word, side)`` for the first ``<section>``.

    For an entry with a ``<p>`` holding both ``<l>`` and ``<r>``, the left
    word is built from ``<l>`` for ``l1`` and the right one from ``<r>`` for
    ``l2``. Otherwise an ``<i>`` child, when present, supplies both words.
    ``side`` is the entry's ``n`` attribute, or ``''`` when it has none
    (NOTE(review): confirm that ``n`` is the attribute carrying the
    direction restriction in these files). A tree without ``<section>``
    yields nothing.
    """
    section = tree.find('section')
    if section is None:
        return
    for e in section:
        side = e.attrib.get('n', '')
        # Compare with None throughout: an Element without children is
        # falsy, so `if i:` silently dropped `<i>word</i>` entries.
        p = e.find('p')
        left = right = None
        if p is not None:
            left, right = p.find('l'), p.find('r')
        if left is not None and right is not None:
            yield one_word(left, l1), one_word(right, l2), side
            continue
        i = e.find('i')
        if i is not None:
            yield one_word(i, l1), one_word(i, l2), side

In [99]:
str(None)

'None'

In [15]:
def check(word1, word2, lang1, lang2):
    """Swap each word for the first equal entry of its language list.

    Returns ``(entry_from_lang1, entry_from_lang2)``. ``list.index`` raises
    ``ValueError`` when a word has no equal entry; ``word1`` is looked up
    first.
    """
    match1 = lang1[lang1.index(word1)]
    return match1, lang2[lang2.index(word2)]

In [18]:
def existance(pair, nodes):
    """Return True when the first two items of ``pair`` are both in ``nodes``."""
    return pair[0] in nodes and pair[1] in nodes

def _index_by_lemma(words):
    """Group ``words`` by lemma, keeping their original order in each group."""
    index = {}
    for word in words:
        index.setdefault(word.lemma, []).append(word)
    return index


def _find_word(word, index):
    """Return the first entry of ``index`` equal to ``word``, or None."""
    for candidate in index.get(word.lemma, ()):
        if candidate == word:
            return candidate
    return None


def load_file(l1, l2):
    """Write the known word pairs of the bilingual dictionaries to ``<l1>-<l2>``.

    Language codes are read from the second tab-separated column of
    ``language_list.csv``. Every file under ``./dictionaries/`` whose name
    (``.dix`` stripped, split on ``-``) names two listed languages is parsed
    with ``parse_bidix``. A pair is written as
    ``side<TAB>word1<TAB>word2`` (both in ``write(mode='bi')`` form) when
    both words have an equal entry in the monolingual lists loaded with
    ``import_mono``. Otherwise the first missing word is printed.

    NOTE(review): ``l1`` and ``l2`` only name the output file; all listed
    pairs are processed regardless of them - confirm that this is intended.

    A dictionary that cannot be loaded or parsed is reported and skipped.
    """
    with open('language_list.csv', 'r', encoding='utf-8') as f:
        languages = set()
        for line in f:
            fields = line.split('\t')
            # Blank or single-column lines carry no language code.
            if len(fields) > 1:
                languages.add(fields[1].strip())
    with open('{}-{}'.format(l1, l2), 'w', encoding='utf-8') as f:
        for root, dirs, files in os.walk('./dictionaries/'):
            for fl in files:
                pair = fl.replace('.dix', '').split('-')
                if len(pair) < 2 or not existance(pair, languages):
                    continue
                logging.info('{}-{} started'.format(pair[0], pair[1]))
                try:
                    # Equal words always share a lemma, so a per-lemma index
                    # finds the same first match as a full list scan without
                    # the quadratic cost.
                    lang1 = _index_by_lemma(import_mono(pair[0]))
                    lang2 = _index_by_lemma(import_mono(pair[1]))
                    # os.walk roots of sub-directories have no trailing
                    # separator, so `root + fl` would build a wrong path.
                    with open(os.path.join(root, fl), 'r', encoding='utf-8') as d:
                        # NOTE(review): str.replace is literal, so '<.?g>'
                        # never matches real <g>/</g> tags; left unchanged to
                        # stay consistent with the monolingual parser.
                        tree = ET.fromstring(d.read().replace('<b/>', ' ').replace('<.?g>', ''))
                    for word1, word2, side in parse_bidix(tree, pair[0], pair[1]):
                        found1 = _find_word(word1, lang1)
                        found2 = _find_word(word2, lang2)
                        if found1 is None:
                            print('\t', word1, end='\t')
                        elif found2 is None:
                            print('\t', word2, end='\t')
                        else:
                            f.write(str(side) + '\t' + found1.write(mode='bi')
                                    + '\t' + found2.write(mode='bi') + '\n')
                    print()
                except Exception as exc:
                    print('ERROR: {}-{}'.format(pair[0], pair[1]))
                    logging.warning('%s-%s failed: %r', pair[0], pair[1], exc)

In [None]:
%time load_file('rus', 'fra')

2018-05-31 22:18:54,426 | INFO : bel-epo started

2018-05-31 22:19:03,068 | INFO : bel-rus started

2018-05-31 22:29:56,036 | INFO : eng-deu started


## Testing

In [97]:
# Membership test on a list goes through Word.__eq__: a word with a flat tag
# list is looked up in a list holding a word whose tags are Tags groups.
s = [Word('f','f',[Tags(['n','adj']),Tags(['v'])])]
Word('f','f',['v']) in s

True

In [122]:
isinstance(1, str)

False

In [121]:
print (isinstance(Word(1,1,Tags(['n','adj'])).lemma, str))

True


In [33]:
# Manual run of the bilingual lookup for a single pair, without the error
# handling used in load_file: any failure surfaces directly.
# Requires ./monodix/<lang>.dix to exist for both languages of the pair.
pair = ['eng', 'deu']
lang1 = import_mono(pair[0])
lang2 = import_mono(pair[1])
with open ('./dictionaries/eng-deu.dix', 'r', encoding='utf-8') as d:
    tree = ET.fromstring(d.read().replace('<b/>',' ').replace('<.?g>',''))
    for word1, word2, side in parse_bidix (tree, pair[0], pair[1]):
        # check() raises ValueError when a word is absent from its list.
        word1, word2 = check (word1, word2, lang1, lang2)
        #print (word1, word2)
        string = str(side) + '\t' + word1.write(mode='bi') + '\t' + word2.write(mode='bi') + '\n'
        #print (string)

FileNotFoundError: [Errno 2] No such file or directory: './monodix/deu.dix'

In [35]:
Word ('José','eng',['np']) in lang1

True

In [34]:
%time lang1 = import_mono('eng')

Wall time: 1.49 s


In [43]:
lang1[0] in lang1

True

In [131]:
word = Word('а', 'bel', ['n'])

In [130]:
word in lang1

False

In [21]:
word.lemma

'а'

In [22]:
lang1[0].lemma = str(lang1[0].lemma)

In [23]:
lang1[1].lemma == 'аазіс'

True

In [38]:
lang1[0].lemma

'а'

In [21]:
print (word.lemma, lang1[0].lemma)

а ﻿а


In [81]:
type(lang1[0].s[0])

__main__.Tags

In [91]:
lang1[10].lemma

"аб'яктыў"

In [91]:
lang1[0].s == word.s

True

In [88]:
word.lang

'bel'

In [83]:
print (lang1[0].lemma, word.s)

[n] ['n']


In [76]:
set(lang1[0].s[0])

{'n'}

In [86]:
word.lemma

'а'

In [93]:
lang1[0].lemma.decode('utf-8')# == word.lemma

AttributeError: 'str' object has no attribute 'decode'

In [75]:
Word('а', 'bel', Tags(['n'])) in lang1

False