# GSoC Coding challenge

Current

In [1]:
import logging
import sys

# Route INFO-and-above log records to stdout, prefixed with a timestamp and level.
logging.basicConfig(
    stream=sys.stdout,
    level=logging.INFO,
    format='%(asctime)s | %(levelname)s : %(message)s',
)

In [46]:
import json

In [48]:
import numpy as np

Future

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
import networkx as nx
import matplotlib.pyplot as plt

In [None]:
import xml.etree.ElementTree as ET
import requests

## Loading dictionaries

### Git

In [2]:
import git

In [3]:
def clone_folder(name='apertium-trunk', target='./data/'):
    """Clone an Apertium repository and every submodule it reports.

    ``https://github.com/apertium/<name>/`` is cloned into ``target``. Each
    submodule entry GitPython lists for that clone is then cloned from the
    same GitHub organisation into ``<target>/<submodule name>``, and its name
    is logged at INFO level once that clone has finished.

    Args:
        name: Repository name under the ``apertium`` GitHub organisation.
        target: Directory receiving the main clone and the submodule clones.
    """
    import os  # local import so the cell does not depend on another cell's imports

    base_url = 'https://github.com/apertium/'
    repo = git.Repo.clone_from(base_url + name + '/', target)
    for submodule in git.objects.submodule.root.RootModule(repo).list_items(repo):
        # Place submodules under ``target`` as well; the destination used to be
        # hard-coded to './data/', silently ignoring a caller-supplied target.
        destination = os.path.join(target, submodule.name)
        git.Repo.clone_from(base_url + submodule.name, destination)
        logging.info(submodule.name)

In [None]:
clone_folder()

2018-05-14 20:51:12,126 | INFO : apertium-en-gl
2018-05-14 20:54:12,730 | INFO : apertium-eo-fr
2018-05-14 20:54:17,434 | INFO : apertium-crh-tur
2018-05-14 20:57:49,735 | INFO : apertium-hbs-slv
2018-05-14 20:58:24,846 | INFO : apertium-swe-dan
2018-05-14 21:21:52,622 | INFO : apertium-br-fr
2018-05-14 21:23:38,604 | INFO : apertium-oc-ca
2018-05-14 21:56:55,938 | INFO : apertium-sme-nob
2018-05-14 21:59:23,435 | INFO : apertium-eo-ca
2018-05-14 22:02:04,268 | INFO : apertium-eu-es
2018-05-14 22:03:48,823 | INFO : apertium-srd-ita
2018-05-14 22:06:30,437 | INFO : apertium-spa-ita
2018-05-14 22:10:13,711 | INFO : apertium-eo-es
2018-05-14 22:11:33,539 | INFO : apertium-rus-ukr
2018-05-14 22:11:39,270 | INFO : apertium-pt-ca
2018-05-14 22:16:13,588 | INFO : apertium-dan-nor
2018-05-14 22:16:53,887 | INFO : apertium-oc-es
2018-05-14 22:17:49,422 | INFO : apertium-bel-rus
2018-05-14 22:26:17,121 | INFO : apertium-isl-eng
2018-05-14 22:27:33,579 | INFO : apertium-hbs-eng
2018-05-14 22:27:3

### PyGithub

In [None]:
from github import Github

In [None]:
# Load the credentials dictionary from the local secure.json file.
with open('secure.json') as secret_file:
    SECRET = json.load(secret_file)

In [None]:
# Create the PyGithub client from the USER / PASSWORD entries of SECRET.
# NOTE(review): GitHub has retired password authentication for its API, so this
# presumably needs a personal access token instead — confirm before running.
g = Github(SECRET['USER'], SECRET['PASSWORD'])

[Downloading file](https://sookocheff.com/post/tools/downloading-directories-of-code-from-github-using-the-github-api/)

[Docs](https://media.readthedocs.org/pdf/pygithub/stable/pygithub.pdf)

## Language codes

In [17]:
from numpy import nan

In [1]:
import pandas as pd

In [43]:
# Read the language-code table, keep only the alpha3-b and alpha2 columns,
# and drop every row in which either of them is missing.
lang_codes = (
    pd.read_csv('./files/language-codes-full_csv.csv', na_values=0)
    [['alpha3-b', 'alpha2']]
    .dropna()
)

In [44]:
lang_codes.head()

Unnamed: 0,alpha3-b,alpha2
0,aar,aa
1,abk,ab
8,afr,af
10,aka,ak
12,alb,sq


In [50]:
# Turn the two-column frame into a lookup keyed by the second column (alpha2)
# whose values come from the first column (alpha3-b).
lang_codes = {alpha2: alpha3 for alpha3, alpha2 in lang_codes.values}

In [54]:
# Persist the code mapping to disk as JSON.
with open('./files/lang_codes.json', 'w') as out:
    out.write(json.dumps(lang_codes))

In [None]:
# Reload the language-code mapping from disk.
# The file must be opened for reading: mode 'w' truncates it before anything
# can be read. json.load takes a file object; json.loads only accepts text.
with open('./files/lang_codes.json', 'r') as f:
    lang_codes = json.load(f)

In [None]:
def l(lang):
    """Translate a two-character language code through ``lang_codes``.

    A code whose length is not exactly two, or a two-character code with no
    entry in ``lang_codes``, is returned unchanged.
    """
    if len(lang) != 2:
        return lang
    return lang_codes[lang] if lang in lang_codes else lang

## Object classes

**Word**

- lemma : lemma
- lang : language
- pos : part of speech

In [None]:
class Word:
    """A word identified by its lemma, language and part of speech.

    Attributes:
        lemma: Lemma of the word.
        lang: Language of the word.
        pos: Part of speech.
        add: Extra items appended (joined with '-') to the string form. They
            take no part in equality or hashing.
    """

    def __init__(self, lemma, lang, pos, add=None):
        self.lemma = lemma
        self.lang = lang
        self.pos = pos
        # Build a fresh list per instance: a shared ``[]`` default would leak
        # items appended on one Word into every Word created without ``add``.
        self.add = [] if add is None else add

    def __str__(self):
        """Return ``<lang>_<lemma>_<pos>_<add items joined by '-'>``."""
        return '_'.join(
            [str(self.lang), str(self.lemma), str(self.pos), '-'.join(self.add)]
        )

    __repr__ = __str__

    def __eq__(self, other):
        """Compare by lemma, language and part of speech only."""
        if not isinstance(other, Word):
            # Let Python fall back to its default handling instead of raising
            # AttributeError on unrelated objects.
            return NotImplemented
        return (
            self.lemma == other.lemma
            and self.lang == other.lang
            and self.pos == other.pos
        )

    def __hash__(self):
        # Hash exactly the fields __eq__ compares, so equal words always share
        # a hash even when their ``add`` lists differ.
        return hash((self.lemma, self.lang, self.pos))

In [None]:
class Bidix:
    """Holds a file reference together with two normalised language codes.

    Both language codes are passed through ``l`` before being stored; ``file``
    is stored exactly as given.
    """

    def __init__(self, lang1, lang2, file):
        self.file = file
        self.lang1, self.lang2 = l(lang1), l(lang2)

In [None]:
def parse_and_add(file, lang1, lang2, G):
    