In [3]:
import logging, sys, os, requests, json, re
from collections import Counter
from math import exp, log10
from itertools import islice
import networkx as nx
import xml.etree.ElementTree as ET
from github import Github
logging.basicConfig(format='%(asctime)s | %(levelname)s : %(message)s',level=logging.INFO, stream=sys.stdout)
from itertools import islice
import matplotlib.pyplot as plt
from heapdict import heapdict
#from tqdm import tqdm_notebook as tqdm
from tqdm import tqdm
import random
import numpy as np, scipy.stats as st

In [1]:
class Word:
    def __init__(self, lemma, lang, s=[]):
        if lemma == None: self.lemma = ''
        else: self.lemma = lemma
        self.lang = lang
        self.s = s
        
    def __str__(self):
        if self.s:
            if isinstance(self.s[0],list): w = '['+'_'.join(['-'.join(i) for i in self.s])+']'
            else: w = '['+'-'.join(self.s)+']'
        else: w = '-'
        return str(self.lang)+'$'+str(self.lemma)+'$'+str(w)
    
    __repr__ = __str__
    
    def __eq__(self, other):
        return self.lemma == other.lemma and self.lang == other.lang and (self.s == other.s or other.s in self.s or self.s in other.s)
    
    def __lt__(self, other):
        if self.lang == other.lang and self.lemma == other.lemma:
            s1 = set(self.s)
            s2 = set(other.s)
            if (not s1 - s2) and (s1&s2==s1) and (s2 - s1): return True
            else: return False
        else: return False
    
    def __hash__(self): return hash(str(self))
    
    def write(self, mode='mono'):
        "Mono: format to write in monodix, bi: format to write in bidix"
        if mode == 'mono': return self.lemma + '\t' + '$'.join([str(i) for i in self.s])
        elif mode == 'bi': return self.lang + '\t' +  self.lemma + '\t' + '$'.join([str(i) for i in self.s])
              
class Tags(list):
    def __le__(self, other):
        s1 = set(self)
        s2 = set(other)
        if not s1 - s2 and s1&s2==s1: return True
        else: return False
    
    def __lt__(self, other):
        s1 = set(self)
        s2 = set(other)
        if (not s1 - s2) and (s1&s2==s1) and (s2 - s1): return True
        else: return False
    
    def __eq__(self, other):
        if set(self) == set(other): return True
        else: return False
        
    def __str__(self): return '-'.join(self)
    
    __repr__ = __str__
    
    def __hash__(self): return hash(str(self))
    
class WordDict(dict):
    def lemma(self, lemma): self.lemma = lemma
        
class FilteredDict(dict):
    def set_lang(self, lang): self.lang = lang
    
    def lemma(self, lemma): return self[self.lang+'_'+lemma]
        
    def add(self, word):
        lemma = word.lang+'_'+word.lemma
        tags = Tags(word.s)
        if lemma in self:
            if tags in self[lemma]: self[lemma][tags] += 1
            else: self[lemma][tags] = 1
        else:
            self[lemma] = WordDict()
            self[lemma].lemma(lemma)
            self[lemma][tags] = 1
            
class DiGetItem:
    def __init__(self):
        self.list = []
        self.dict = {}
    
    def add(self, word):
        if len (word.s) > 1: self.list.append(word)
        else: self.dict[word] = word
    
    def __getitem__(self, key):
        key2 = Word(key.lemma, key.lang, [''])
        if key in self.dict: return self.dict[key]
        else:
            if key2 in self.dict: return self.dict[key2]
            try:
                key = self.list[self.list.index(key)]
                return key
            except:
                pass
    
    def __len__(self): return len(self.list)+len(self.dict)
                
class SetWithFilter(set):
    def lemma(self, value): return set(i for i in self if i.lemma == value)
    def lang(self, value): return set(i for i in self if i.lang == value)

class FilteredList(list):
    def lemma(self, value): return list(i for i in self if i.lemma == value)
    def lang(self, value): return list(i for i in self if i.lang == value)

# Loading

In [1]:
import logging, sys, os, re, requests
from github import Github
logging.basicConfig(format='%(asctime)s | %(levelname)s : %(message)s',level=logging.INFO, stream=sys.stdout)
from tool.data import lang_codes

In [6]:
download()

2018-07-14 09:06:36,240 | INFO : Start
2018-07-14 09:17:23,238 | INFO : Finish


In [2]:
def l(lang, mode=3):
    "Language code converter"
    if lang in lang_codes: return lang_codes[lang]
    else: return lang

def repo_names(user):
    "List of language pair repos in Apertium"
    for repo in user.get_repos():
        if re.match('apertium-[a-z]{2,3}(_[a-zA-Z]{2,3})?-[a-z]{2,3}(_[a-zA-Z]{2,3})?', repo.name):
            yield repo.name

def bidix_url(repo):
    "Find raw url for bidix. Sorting in order to find bidix faster as it is one of the longest filename in repo"
    for i in sorted(repo.get_dir_contents('/'), key = lambda x: (len(x.path), 1000-ord(('   '+x.path)[-3])), reverse=True):
        if re.match('apertium-.*?\.[a-z]{2,3}(_[a-zA-Z]{2,3})?-[a-z]{2,3}(_[a-zA-Z]{2,3})?.dix$', i.path): return i.download_url
        elif len(i.path) < 23: return None
        
def download():
    from tool.secure import SECRET
    github = Github(SECRET['USER'], SECRET['PASSWORD'])  #import username and password
    user = github.get_user('apertium')
    
    logging.info('Start')
    if not os.path.exists('./dictionaries/'): os.makedirs('./dictionaries/')
    for repo_name in repo_names(user):
        bidix = bidix_url(github.get_repo(user.name+'/'+repo_name))
        if bidix:
            filename = './dictionaries/'+bidix.split('/')[-1]
            response = requests.get(bidix)
            response.encoding = 'UTF-8'
            with open(filename, 'w', encoding='UTF-8') as f: f.write(response.text)
    logging.info('Finish')  

def set_github_user(user, password):
    with open ('./tool/secure.py', 'w', encoding='utf-8') as f:
        f.write('SECRET = {"USER": "'+user+'", "PASSWORD": "'+password+'"}')

def list_files(path='./dictionaries/'):
    from tool.data import remove
    with open ('filelist.txt','w', encoding='utf-8') as f:
        for root, dirs, files in os.walk (path):
            for file in files:
                if re.match('apertium-.*?\.[a-z]{2,3}(_[a-zA-Z]{2,3})?-[a-z]{2,3}(_[a-zA-Z]{2,3})?.dix$', file):
                    name = '-'.join(l(i) for i in file.split('.')[-2].split('-'))
                    if name not in remove:
                        f.write(os.path.abspath(os.path.join(root, file)).replace("\\","/")+'\n')

In [9]:
list_files()

In [None]:
def monodix():
    "Create all monodixes and add them to monodix folder"
    all_languages()
    logging.info('Start')
    if not os.path.exists('./monodix/'):
        os.makedirs('./monodix/')
    for lang in langs:
        dictionary = one_language_dict(lang)
        with open ('./monodix/'+lang+'.dix', 'w', encoding = 'utf-16') as f:
            for i in dictionary_to_nodes(dictionary):
                f.write (i.write(mode='mono')+'\n')
    logging.info('Finish')

In [12]:
def all_languages():
    s = set()
    with open ('./tool/langs.py','w',encoding='utf-8') as outp:
        with open ('filelist.txt','r',encoding='utf-8') as inp:
            for line in inp:
                name = [l(i) for i in line.split('.')[-2].split('-')]
                s.update(name)
        outp.write('langs='+str(s))

In [13]:
all_languages()

In [14]:
from tool.langs import langs

In [16]:
def bidix():
    with open ('./tool/stats.csv','w',encoding='utf-8') as stats:
        with open('./filelist.txt', 'r', encoding = 'utf-8') as f:
            for line in f:
                file = line.strip('\n')
                name = [l(i) for i in line.split('.')[-2].split('-')]
                if '-'.join(name) in rename: name = rename['-'.join(pair)]
                with open (file, 'r', encoding='utf-8') as d:
                    with open ('./parsed/'+'-',join(name), 'w', encoding='utf-8') as copy:
                        count = [0,0,0]
                        try:
                            tree = ET.fromstring(re.sub('\s{3,}','\t', d.read().replace('<b/>',' ').replace('<.?g>','')))
                            for word1, word2, side in parse_bidix (tree, name[0], self.name[1]):
                                try:
                                    word1, word2 = check (word1, word2, self.mono1, self.mono2)
                                    if not side: count[0]+=1
                                    elif side == 'LR': count[1] += 1
                                    elif side == 'RL': count[2] += 1
                                    string = str(side) + '\t' + word1.write(mode='bi') + '\t' + word2.write(mode='bi') + '\n'
                                    copy.write(string)
                                except: pass
                        except: pass
                    stats.write('\t'.join(name) + '\t'+ '\t'.join(count)+'\n')

In [None]:
def preprocessing():
    all_languages()
    from tool.langs import langs
    monodix()
    bidix()

In [1]:
from tool.func import *

In [2]:
preprocessing()

2018-07-14 10:22:35,035 | INFO : Start
2018-07-14 10:25:59,821 | INFO : Finish


FileNotFoundError: [Errno 2] No such file or directory: './parsed/afr-nld'