# Bilingual dictionary enrichment via graph completion

Current

In [1]:
import logging
import sys

logging.basicConfig(format='%(asctime)s | %(levelname)s : %(message)s',
                     level=logging.INFO, stream=sys.stdout)
import json
import numpy as np
import re

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
import networkx as nx
import matplotlib.pyplot as plt
import xml.etree.ElementTree as ET
import requests

## Language codes

In [None]:
from numpy import na
import pandas as pd
lang_codes = pd.read_csv('./files/language-codes-full_csv.csv', na_values = 0)
lang_codes = lang_codes[['alpha3-b','alpha2']]
lang_codes = lang_codes.dropna()


lang_codes = [{i[0]:i[1] for i in np.array(lang_codes)}, {i[1]:i[0] for i in np.array(lang_codes)}]

with open ('./files/lang_codes.json', 'w') as f:
    json.dump(lang_codes, f)

In [4]:
with open ('./files/lang_codes.json', 'r') as f:
    lang_codes = json.load(f)

def l(lang, mode=3):
    mode = mode % 2
    if len(lang)==2:
        if lang in lang_codes[mode]:
            return lang_codes[mode][lang]
        else:
            return lang
    else:
        return lang
l('tt', 3)

'tat'

## Loading dictionaries

### Git - not relevant

In [2]:
import git

def clone_folder(name='apertium-trunk', target='./data/'):
    repo = git.Repo.clone_from('https://github.com/apertium/'+name+'/', target)
    for i in git.objects.submodule.root.RootModule(repo).list_items(repo):
        git.Repo.clone_from('https://github.com/apertium/'+i.name, './data/'+i.name)
        logging.info(i.name)

clone_folder()

Took 3 hours to clone aprtium-trunk

### PyGithub

** Load user with login and password from secret file **

In [42]:
from github import Github

with open ('secure.json') as f:
    SECRET = json.loads(f.read())

github = Github(SECRET['USER'], SECRET['PASSWORD'])

user = github.get_user('apertium')

In [None]:
user.get_repos()

** Generator ** : yield all repos that match name pattern

In [6]:
def repo_names(user):
    for repo in user.get_repos():
        if re.match('apertium-[a-z]{2,3}(_[a-zA-Z]{2,3})?-[a-z]{2,3}(_[a-zA-Z]{2,3})?', repo.name):
            yield repo.name

Looks like heavy function. But I don't see any improvements yet, except for having certain repo for all bidix copies. But this one above is the most up-to-date. It filters not languages pair repos, it is needed not to look for bidix where it can't be. Function saves a lot of time.

In [172]:
%time w = list(repo_names(user))

Wall time: 29.3 s


** Find bidix **

In [7]:
def bidix_url(repo):
    bidix = repo.name+'.'+repo.name.replace('apertium-','')+'.dix'
    for i in repo.get_dir_contents('/'):
        if re.match('apertium-.*?\.[a-z]{2,3}(_[a-zA-Z]{2,3})?-[a-z]{2,3}(_[a-zA-Z]{2,3})?.dix', i.path):
            return i.download_url

In [174]:
%time bidix_url(github.get_repo(user.name+'/'+w[0]))

Wall time: 1.59 s


'https://raw.githubusercontent.com/apertium/apertium-afr-nld/master/apertium-afr-nld.afr-nld.dix'

Speed is ok. 

Check repos for bidixes

In [9]:
def download(user):
    for repo_name in repo_names(user):
        bidix = bidix_url(github.get_repo(user.name+'/'+repo_name))
        if not bidix:
            print (repo_name)

download(user)

apertium-en-lv
apertium-est-nor
apertium-ita-srd
apertium-ky-en
apertium-lex-tools
apertium-on-github
apertium-ru-cu
apertium-sc-pt
apertium-urd-pan


All these repos do not have bidixes except for the last one. There are two of them. So that's strange.

** XML reading ** : return xml tree object. Read file with request from github and return object

In [8]:
def tree(url):
    response = requests.get(url)
    return ET.fromstring(response.content)

We need to check bidixes, because errors occur.

In [167]:
def download():
    for repo_name in repo_names(user):
        url = bidix_url(github.get_repo(user.name+'/'+repo_name))
        if url:
            lang = re.findall('\.([a-zA-Z_]{2,7})-([a-zA-Z_]{2,7})\.dix$', url)
            l1, l2 = lang[0][0], lang[0][1]
            try:
                t = tree(url)
            except:
                print(l1, l2)

In [168]:
%time download()

eo bg
eo fa
eo pl
fin fra
pl lv
sah eng
Wall time: 12min 6s


** Errors **

- eo-bg : strange header section with sdefs    ParseError: mismatched tag: line 10, column 4
- eo-fa : same, looks like < sdef n="n" > should be < sdef n="n"/>
- eo-pl : same
- fin-fra : < !-- \n {{{ Punctuatkion and stuff \n {{{ puncts --> ParseError: not well-formed (invalid token): line 152, column 4
- pl-lv : Possibly, not closed < alphabet > (dictionary) ParseError: mismatched tag: line 296, column 2
- sah-eng : " [< Russ. " in text, this is parsed as tag : ParseError: not well-formed (invalid token): line 339, column 114

In [None]:
tree('https://raw.githubusercontent.com/apertium/apertium-sah-eng/master/apertium-sah-eng.sah-eng.dix')

Loading all files (even without parsing) takes a lot of time (12 minutes on Windows). So to reduce time in case of gathering files instead of one folder on github we need to reduce number of languages we download.

** Only relevant for certain language pair **

There are **164 ** pairs at this moment

In [11]:
def get_repos_for_pair(user, l1, l2, n=[2]):
    logging.info('Start')
    G = nx.DiGraph()
    lg = (l(l1,3), l(l2,3))
    for name in repo_names(user):
        w = re.findall('.*?-([a-zA-Z_]{2,7})-([a-zA-Z_]{2,7})$', name)[0]
        w = (l(w[0],3), l(w[1],3))
        if w[0] == lg[0] or w[1] == lg[1]: G.add_edge(w[0],w[1])
        elif w[0] == lg[1] or w[1] == lg[0]: G.add_edge(w[1],w[0])
        else:
            G.add_edge(w[0],w[1])
            G.add_edge(w[1],w[0])
    if (lg[0], lg[1]) in G.edges(): G.remove_edge(lg[0], lg[1])
    if (lg[1], lg[0]) in G.edges(): G.remove_edge(lg[1], lg[0])
    logging.info('Graph')
    
    for i in n:
        w = nx.single_source_shortest_path_length(G, lg[0], cutoff=i)
        v = nx.single_source_shortest_path_length(G, lg[1], cutoff=i)
        nodes = list((set(w.keys())&set(w.keys()) )| set([lg[0],lg[1]]))
        H = G.subgraph(nodes)
        logging.info('Length: {}\tNodes: {}'.format(i, len(nodes)))
    return G

In [12]:
%time G = get_repos_for_pair(user, 'tat', 'rus', list(range(1,6)))

2018-05-16 21:44:02,582 | INFO : Start
2018-05-16 21:44:30,430 | INFO : Graph
2018-05-16 21:44:30,430 | INFO : Length: 1	Nodes: 8
2018-05-16 21:44:30,430 | INFO : Length: 2	Nodes: 70
2018-05-16 21:44:30,447 | INFO : Length: 3	Nodes: 134
2018-05-16 21:44:30,451 | INFO : Length: 4	Nodes: 145
2018-05-16 21:44:30,455 | INFO : Length: 5	Nodes: 146
Wall time: 27.9 s


In [79]:
%time G = get_repos_for_pair(user, 'urd', 'hin',  list(range(1,6)))

2018-05-16 10:14:52,693 | INFO : Start
2018-05-16 10:15:18,908 | INFO : Graph
2018-05-16 10:15:18,908 | INFO : Length: 1	Nodes: 4
2018-05-16 10:15:18,908 | INFO : Length: 2	Nodes: 4
2018-05-16 10:15:18,922 | INFO : Length: 3	Nodes: 4
2018-05-16 10:15:18,924 | INFO : Length: 4	Nodes: 4
2018-05-16 10:15:18,926 | INFO : Length: 5	Nodes: 4
Wall time: 26.2 s


In [80]:
%time G = get_repos_for_pair(user, 'epo', 'spa', list(range(1,6)))

2018-05-16 10:15:18,937 | INFO : Start
2018-05-16 10:15:45,703 | INFO : Graph
2018-05-16 10:15:45,705 | INFO : Length: 1	Nodes: 25
2018-05-16 10:15:45,707 | INFO : Length: 2	Nodes: 104
2018-05-16 10:15:45,710 | INFO : Length: 3	Nodes: 137
2018-05-16 10:15:45,713 | INFO : Length: 4	Nodes: 140
2018-05-16 10:15:45,716 | INFO : Length: 5	Nodes: 140
Wall time: 26.8 s


In [81]:
%time G = get_repos_for_pair(user, 'zul', 'hin',  list(range(1,6)))

2018-05-16 10:19:23,912 | INFO : Start
2018-05-16 10:19:49,860 | INFO : Graph
2018-05-16 10:19:49,860 | INFO : Length: 1	Nodes: 4
2018-05-16 10:19:49,860 | INFO : Length: 2	Nodes: 4
2018-05-16 10:19:49,865 | INFO : Length: 3	Nodes: 4
2018-05-16 10:19:49,867 | INFO : Length: 4	Nodes: 4
2018-05-16 10:19:49,870 | INFO : Length: 5	Nodes: 4
Wall time: 26 s


In [84]:
%time G = get_repos_for_pair(user, 'kaz', 'tur',  list(range(1,6)))

2018-05-16 10:21:21,185 | INFO : Start
2018-05-16 10:21:45,652 | INFO : Graph
2018-05-16 10:21:45,652 | INFO : Length: 1	Nodes: 14
2018-05-16 10:21:45,652 | INFO : Length: 2	Nodes: 85
2018-05-16 10:21:45,667 | INFO : Length: 3	Nodes: 142
2018-05-16 10:21:45,670 | INFO : Length: 4	Nodes: 146
2018-05-16 10:21:45,672 | INFO : Length: 5	Nodes: 146
Wall time: 24.5 s


** Final loading **

In [9]:
def get_repos_for_pair(user, l1, l2, n=[2]):
    logging.info('Start')
    pair_list = []
    G = nx.DiGraph()
    lg = (l(l1,3), l(l2,3))
    for name in repo_names(user):
        pair_list.append(name)
        w = re.findall('.*?-([a-zA-Z_]{2,7})-([a-zA-Z_]{2,7})$', name)[0]
        w = (l(w[0],3), l(w[1],3))
        if w[0] == lg[0] or w[1] == lg[1]: G.add_edge(w[0],w[1])
        elif w[0] == lg[1] or w[1] == lg[0]: G.add_edge(w[1],w[0])
        else:
            G.add_edge(w[0],w[1])
            G.add_edge(w[1],w[0])
    if (lg[0], lg[1]) in G.edges():
        G.remove_edge(lg[0], lg[1])
    if (lg[1], lg[0]) in G.edges():
        G.remove_edge(lg[1], lg[0])
    logging.info('Built graph')
    
    for i in n:
        w = nx.single_source_shortest_path_length(G, lg[0], cutoff=i)
        v = nx.single_source_shortest_path_length(G, lg[1], cutoff=i)
        nodes = list((set(w.keys())&set(w.keys()) )| set([lg[0],lg[1]]))
        H = G.subgraph(nodes)
        logging.info('Length: {}\tNodes: {}'.format(i, len(nodes)))
    
    number = int(input('What graph to choose?\t'))
    w = nx.single_source_shortest_path_length(G, lg[0], cutoff=number)
    v = nx.single_source_shortest_path_length(G, lg[1], cutoff=number)
    nodes = list((set(w.keys())&set(w.keys()) )| set([lg[0],lg[1]]))
    return set(nodes), pair_list

In [10]:
def filter_names(nodes:set, pair_list):
    for repo in pair_list:
        langs = re.findall('apertium-(.*?)-(.*?)$', repo)[0]
        if l(langs[0]) in nodes and l(langs[1]) in nodes:
            yield repo

In [66]:
def download_language_pair_support(user, l1, l2, n=[2]):
    nodes, pair_list = get_repos_for_pair(user, l1, l2, n)
    logging.info('Started loading')
    for repo_name in filter_names(nodes, pair_list):
        #print(user.name+'/'+repo_name)
        url = bidix_url(github.get_repo(user.name+'/'+repo_name))
        if url:
            lang = re.findall('.*?\.([a-zA-Z_]{2,7})-([a-zA-Z_]{2,7})\.dix$', url)
            if lang:
                l1, l2 = lang[0][0], lang[0][1]
                try:
                    t = tree(url)
                    yield t
                except:
                    #print(l1, l2)
                    pass
            else:
                print (url)
    logging.info('Finished')
        

In [36]:
list(download_language_pair_support(user, 'urd','hin', n=[2,3,4]))

2018-05-16 22:30:22,741 | INFO : Start
2018-05-16 22:30:48,962 | INFO : Built graph
2018-05-16 22:30:48,977 | INFO : Length: 2	Nodes: 4
2018-05-16 22:30:48,977 | INFO : Length: 3	Nodes: 4
2018-05-16 22:30:48,981 | INFO : Length: 4	Nodes: 4
What graph to choose?	2
2018-05-16 22:30:50,981 | INFO : Started loading
2018-05-16 22:30:59,286 | INFO : Finished


[<Element 'dictionary' at 0x000001CCDE33CEA8>,
 <Element 'dictionary' at 0x000001CCDE33CA98>,
 <Element 'dictionary' at 0x000001CCDE33CEF8>,
 <Element 'dictionary' at 0x000001CCDF85AE58>]

In [38]:
list(download_language_pair_support(user, 'tat','rus', n=[2,3,4]))[0]

2018-05-16 22:36:52,278 | INFO : Start
2018-05-16 22:37:20,187 | INFO : Built graph
2018-05-16 22:37:20,187 | INFO : Length: 2	Nodes: 70
2018-05-16 22:37:20,187 | INFO : Length: 3	Nodes: 134
2018-05-16 22:37:20,196 | INFO : Length: 4	Nodes: 145
What graph to choose?	4
2018-05-16 22:37:22,564 | INFO : Started loading
eo bg
eo fa
eo pl
fin fra
pl lv
sah eng
2018-05-16 22:51:21,585 | INFO : Finished


<Element 'dictionary' at 0x000001CCDF868188>

In [26]:
len('apertium-ur-pa.ur-pa.dix')

24

In [46]:
github.get_repo('apertium/apertium-as-hi').get_dir_contents('/')

[ContentFile(path=".gitattributes"),
 ContentFile(path=".gitignore"),
 ContentFile(path="ACKNOWLEDGEMENTS.txt"),
 ContentFile(path="AUTHORS"),
 ContentFile(path="Makefile"),
 ContentFile(path="apertium-as-hi.as-hi.dix"),
 ContentFile(path="apertium-as-hi.as-hi.t1x"),
 ContentFile(path="apertium-as-hi.as.dix"),
 ContentFile(path="apertium-as-hi.hi.dix"),
 ContentFile(path="as-hi.prob"),
 ContentFile(path="hi-as.prob"),
 ContentFile(path="modes.xml"),
 ContentFile(path="new")]

In [63]:
def bidix_url(repo):
    #print(repo)
    for i in sorted(repo.get_dir_contents('/'), key = lambda x: (len(x.path), 1000-ord(('   '+x.path)[-3])), reverse=True):
        #print (i.path)
        if re.match('apertium-.*?\.[a-z]{2,3}(_[a-zA-Z]{2,3})?-[a-z]{2,3}(_[a-zA-Z]{2,3})?.dix', i.path):
            return i.download_url
        elif len(i.path) < 23:
            return None

In [33]:
list(download_language_pair_support(user, 'urd','hin', n=[2,3,4]))[0]

2018-05-16 23:38:04,132 | INFO : Start
2018-05-16 23:38:30,729 | INFO : Built graph
2018-05-16 23:38:30,744 | INFO : Length: 2	Nodes: 4
2018-05-16 23:38:30,744 | INFO : Length: 3	Nodes: 4
2018-05-16 23:38:30,744 | INFO : Length: 4	Nodes: 4
What graph to choose?	4
2018-05-16 23:38:40,387 | INFO : Started loading
2018-05-16 23:38:45,419 | INFO : Finished


<Element 'dictionary' at 0x00000283861D7F48>

In [54]:
github.get_repo('Apertium/apertium-arg-cat').get_dir_contents('/')

[ContentFile(path=".gitattributes"),
 ContentFile(path=".gitignore"),
 ContentFile(path="AUTHORS"),
 ContentFile(path="COPYING"),
 ContentFile(path="ChangeLog"),
 ContentFile(path="Makefile.am"),
 ContentFile(path="NEWS"),
 ContentFile(path="README"),
 ContentFile(path="apertium-arg-cat.arg-cat.dix"),
 ContentFile(path="apertium-arg-cat.arg-cat.lrx"),
 ContentFile(path="apertium-arg-cat.arg-cat.t1x"),
 ContentFile(path="apertium-arg-cat.cat-arg.lrx"),
 ContentFile(path="apertium-arg-cat.cat-arg.t1x"),
 ContentFile(path="autogen.sh"),
 ContentFile(path="configure.ac"),
 ContentFile(path="dev"),
 ContentFile(path="genvrdix.py"),
 ContentFile(path="modes.xml"),
 ContentFile(path="t"),
 ContentFile(path="texts")]

In [58]:
for i in sorted(github.get_repo('Apertium/apertium-arg-cat').get_dir_contents('/'),key = lambda x: (len(x.path), 1000-ord(('   '+x.path)[-3])), reverse=True):
    print (i)

ContentFile(path="apertium-arg-cat.arg-cat.dix")
ContentFile(path="apertium-arg-cat.arg-cat.lrx")
ContentFile(path="apertium-arg-cat.cat-arg.lrx")
ContentFile(path="apertium-arg-cat.arg-cat.t1x")
ContentFile(path="apertium-arg-cat.cat-arg.t1x")
ContentFile(path=".gitattributes")
ContentFile(path="configure.ac")
ContentFile(path="Makefile.am")
ContentFile(path="genvrdix.py")
ContentFile(path="autogen.sh")
ContentFile(path=".gitignore")
ContentFile(path="ChangeLog")
ContentFile(path="modes.xml")
ContentFile(path="COPYING")
ContentFile(path="AUTHORS")
ContentFile(path="README")
ContentFile(path="texts")
ContentFile(path="NEWS")
ContentFile(path="dev")
ContentFile(path="t")


In [None]:
list(download_language_pair_support(user, 'tat','rus', n=[2,3,4]))[0]

2018-05-16 23:58:17,225 | INFO : Start
2018-05-16 23:58:47,891 | INFO : Built graph
2018-05-16 23:58:47,891 | INFO : Length: 2	Nodes: 70
2018-05-16 23:58:47,891 | INFO : Length: 3	Nodes: 134
2018-05-16 23:58:47,905 | INFO : Length: 4	Nodes: 145
What graph to choose?	4
2018-05-16 23:58:52,678 | INFO : Started loading
https://raw.githubusercontent.com/apertium/apertium-eo-cs/master/apertium-eo-cs.eo-cs.dix2
https://raw.githubusercontent.com/apertium/apertium-hbs-slv/master/apertium-hbs-slv.hbs-slv.dix.old


In [30]:
tree('https://raw.githubusercontent.com/apertium/apertium-urd-hin/master/apertium-urd-hin.urd-hin.dix')

<Element 'dictionary' at 0x000001CCDB4114A8>

## Object classes

** Word **

- lemma : lemma
- lang : language
- pos : part of speech

In [None]:
class Word:
    def __init__(self, lemma, lang, pos, add=[]):
        self.lemma = lemma
        self.lang = lang
        self.pos = pos
        self.add = add
    
    def __str__(self):
        return (str(self.lang)+'_'+str(self.lemma)+'_'+str(self.pos))+'_'+str('-'.join(self.add))
    
    __repr__ = __str__
    
    def __eq__(self, other):
        return self.lemma == other.lemma and self.lang == other.lang and self.pos == other.pos
    
    def __hash__(self):
        return hash(str(self))

In [None]:
#