In [1]:
from tool.evaluation import *
from tool.functions import *
import random
from tqdm import tqdm_notebook as tqdm
import numpy as np, scipy.stats as st
from itertools import islice

In [2]:
# Language pair used by the rest of the notebook; n is forwarded to load_file below.
lang1, lang2, n = 'spa','eng', 20
# Data preparation, each step timed on its own. The helpers are not defined in
# this notebook — presumably they come from the `tool` star imports above.
%time get_relevant_languages(lang1, lang2)
%time load_file(lang1, lang2, n=n)
%time change_encoding('{}-{}'.format(lang1,lang2))
# G is the graph every later cell queries; in the recorded run this is by far
# the slowest setup step (about two minutes).
%time G = built_from_file('{}-{}'.format(lang1,lang2))
# l1 / l2: the two objects returned by dictionaries() — presumably the
# dictionaries for lang1 and lang2 in that order; confirm in the tool package.
%time l1, l2 = dictionaries(lang1, lang2)

Wall time: 943 ms
Wall time: 931 ms
Wall time: 1.6 s
Wall time: 1min 57s
Wall time: 7.41 s


In [22]:
def evaluation(G, word, candidates, mode='exp', cutoff=4):
    """Score every candidate translation of ``word``.

    ``metric(G, word, candidate, cutoff=cutoff, mode=mode)`` is called once
    per item of ``candidates``.

    Args:
        G: graph passed through to ``metric``.
        word: source node whose translations are being scored.
        candidates: iterable of candidate translation nodes.
        mode: scoring mode forwarded to ``metric``.
        cutoff: path cutoff forwarded to ``metric``.

    Returns:
        dict mapping each candidate to the value ``metric`` returned for it.
        A candidate listed more than once keeps the score of its last
        occurrence.
    """
    return {
        candidate: metric(G, word, candidate, cutoff=cutoff, mode=mode)
        for candidate in candidates
    }

def lemma_search (G, lemma, d_l1, l2, cutoff, n, metric='exp'):
    """Score candidate translations for every graph entry of ``lemma``.

    ``d_l1.lemma(lemma)`` supplies the entries for the lemma; only those that
    are nodes of ``G`` are kept. For each kept entry the candidates come from
    ``possible_translations(G, entry, l2, cutoff=cutoff, n=n)`` and are scored
    with ``evaluation``.

    Args:
        G: graph to search.
        lemma: lemma to look up.
        d_l1: object exposing ``lemma(lemma)`` — presumably the source-language
            dictionary; confirm against the caller.
        l2: forwarded to ``possible_translations`` — presumably the target
            language code.
        cutoff: forwarded to ``possible_translations`` and ``evaluation``.
        n: forwarded to ``possible_translations``.
        metric: scoring mode handed to ``evaluation`` as ``mode``. Note that
            inside this function the name shadows the module-level ``metric``
            function.

    Returns:
        dict mapping each kept entry to its ``{candidate: score}`` dict.
    """
    entries = list(filter(lambda entry: entry in G.nodes(), d_l1.lemma(lemma)))
    scores_by_entry = {}
    for entry in entries:
        scores_by_entry[entry] = evaluation(
            G,
            entry,
            possible_translations(G, entry, l2, cutoff=cutoff, n=n),
            mode=metric,
            cutoff=cutoff,
        )
    return scores_by_entry

def metric(G, word, translation, cutoff, mode='exp'):
    """Score ``translation`` for ``word`` from the simple paths joining them.

    Every path produced by ``nx.all_simple_paths(G, word, translation,
    cutoff=cutoff)`` is measured with ``len``, i.e. by the number of nodes on
    it, and the lengths are tallied before scoring.

    Args:
        G: graph handed to ``nx.all_simple_paths``.
        word: source node.
        translation: target node.
        cutoff: forwarded to ``nx.all_simple_paths`` as its ``cutoff``.
        mode: ``'exp'`` sums ``exp(-length)`` over all paths, so shorter paths
            contribute more; ``'len'`` sums the path lengths themselves.

    Returns:
        The accumulated score; ``0`` when no path is found.

    Raises:
        ValueError: if ``mode`` is neither ``'exp'`` nor ``'len'``.
    """
    # Validate up front: an unknown mode used to fall through and return None.
    if mode not in ('exp', 'len'):
        raise ValueError("unknown mode {!r}; expected 'exp' or 'len'".format(mode))

    # Map path length -> number of paths of that length.
    length_counts = Counter(
        len(path)
        for path in nx.all_simple_paths(G, word, translation, cutoff=cutoff)
    )
    if mode == 'exp':
        return sum(exp(-length) * count for length, count in length_counts.items())
    return sum(count * length for length, count in length_counts.items())

In [25]:
# Rank candidates for the lemma 'star' with the 'exp' metric.
# Positional arguments map onto lemma_search(G, lemma, d_l1, l2, cutoff, n):
# the dictionary is l2, the target language is 'spa', cutoff is 4, n is 40.
%time print_lemma_results(lemma_search (G, 'star', l2, 'spa', 4, 40, metric='exp'), n = 5)

		 eng$star$[n_n-ND]
spa$estrella$[n-f_n_n-f-sg]	3.487083686952825
spa$astro$[n_n-m]	0.3445470134576176
spa$lucero$[n-m_n]	0.2963380437905006
spa$estelo$[n-m]	0.2712844579026809
spa$sentido$[n-m_n_n-m-ND]	0.15791432376100678

		 eng$star$[adj]
spa$estelar$[adj_adj-mf]	0.14032192213801603
spa$estupendo$[adj-GD_adj-GD-ND]	0.006737946999085467
spa$magnífico$[adj_adj-GD-ND_adj-GD]	0.006737946999085467

		 eng$star$[vblex]
spa$protagonizar$[vblex]	0.5861943804290437
spa$jugar$[vblex_vblex-vbact]	0.49363670887250133
spa$tocar$[vblex]	0.11938484387501619
spa$presentar$[vblex]	0.11074869476740849
spa$constar$[vblex]	0.09727280076923756

Wall time: 746 ms


In [24]:
# Same query for 'star' but scored with the 'len' metric, for comparison with
# the 'exp' ranking. Positional arguments: dictionary l2, target 'spa',
# cutoff 4, n 40.
%time print_lemma_results(lemma_search (G, 'star', l2, 'spa', 4, 40, metric='len'), n = 5)

		 eng$star$[n_n-ND]
spa$estrella$[n-f_n_n-f-sg]	1637
spa$astro$[n_n-m]	145
spa$lucero$[n-m_n]	138
spa$estelo$[n-m]	129
spa$sentido$[n-m_n_n-m-ND]	98

		 eng$star$[adj]
spa$estelar$[adj_adj-mf]	51
spa$estupendo$[adj-GD_adj-GD-ND]	5
spa$magnífico$[adj_adj-GD-ND_adj-GD]	5

		 eng$star$[vblex]
spa$jugar$[vblex_vblex-vbact]	294
spa$protagonizar$[vblex]	158
spa$tocar$[vblex]	79
spa$presentar$[vblex]	63
spa$actuar$[vblex]	60

Wall time: 776 ms


In [41]:
# Rank candidates for the lemma 'include' with the 'exp' metric
# (dictionary l2, target 'spa', cutoff 4, n 40); print_lemma_results gets n = 10.
%time print_lemma_results(lemma_search (G, 'include', l2, 'spa', 4, 40, metric='exp'), n = 10)

		 eng$include$[n]
spa$inclusión$[n_n-f_n-f-sg]	0.09505449525293999

		 eng$include$[vblex-ger]
spa$implicación$[n-f_n_n-f-ND]	0.06548126788233244
spa$participación$[n-f_n_n-f-ND]	0.006737946999085467
spa$inclusión$[n_n-f_n-f-sg]	0.006737946999085467

		 eng$include$[vblex_vblex-pprs]
spa$comprender$[vblex]	1.0936909502681387
spa$incluir$[vblex]	1.0675870152154083
spa$coger$[vblex]	0.5389041357575773
spa$tomar$[vblex_vblex-inf]	0.24550763474911783
spa$añadir$[vblex]	0.1916040587564341
spa$llevar$[vblex]	0.18676431386587083
spa$dar$[vblex]	0.16949201565065547
spa$hacer$[vblex_vblex-inf]	0.14633663187135804
spa$agregar$[vblex]	0.1354821772474528
spa$participar$[vblex]	0.11938484387501619

Wall time: 1.44 s


In [43]:
# Rank candidates for the lemma 'convergence' with the 'exp' metric
# (dictionary l2, target 'spa', cutoff 4, n 40); print_lemma_results gets n = 10.
%time print_lemma_results(lemma_search (G, 'convergence', l2, 'spa', 4, 40, metric='exp'), n = 10)

		 eng$convergence$[n_n-ND]
spa$convergencia$[n_n-f_n-f-ND]	0.663656474058993
spa$acuerdo$[n-m_n_n-m-sg]	0.0202138409972564
spa$confluencia$[n_n-f]	0.0202138409972564
spa$unión$[n_n-f_n-f-ND]	0.0202138409972564
spa$concordia$[n_n-f]	0.013475893998170934
spa$asociación$[n-f_n_n-f-sg]	0.013475893998170934
spa$armonía$[n-f_n_n-f-ND]	0.006737946999085467
spa$complicidad$[n_n-f_n-f-ND]	0.006737946999085467
spa$compromiso$[n-m_n_n-m-ND]	0.006737946999085467
spa$consenso$[n_n-m_n-m-sg]	0.006737946999085467

Wall time: 260 ms


In [44]:
import random
def get_evaluation_pairs(G, dictionary, target, n=500):
    """Randomly collect ``n`` words that have exactly one ``target`` neighbour.

    A random sample of ``dictionary`` is drawn, starting at ``4 * n`` entries.
    A sampled word qualifies when it is a node of ``G`` and
    ``FilteredList(list(G.neighbors(word))).lang(target)`` has exactly one
    element (presumably the neighbours belonging to language ``target`` —
    confirm in the tool package). If too few words qualify, a fresh and larger
    sample (``5 * n``, ``6 * n``, ...) is drawn and the search starts over.

    The sample size is capped at ``len(dictionary)``. Without the cap
    ``random.sample`` raised "Sample larger than population" as soon as the
    oversampling factor outgrew the dictionary, even when enough qualifying
    words existed.

    Args:
        G: graph providing ``nodes()`` and ``neighbors(word)``.
        dictionary: population of words accepted by ``random.sample``.
        target: passed to ``FilteredList.lang``.
        n: number of pairs to return.

    Returns:
        list of ``n`` tuples ``(word, single_neighbour, n)``. The third
        element is the requested ``n`` itself, kept as in the original code
        (NOTE(review): confirm this is what ``evaluate`` expects). An empty
        list is returned when ``n <= 0``.

    Raises:
        ValueError: if the whole dictionary yields fewer than ``n`` pairs.
    """
    if n <= 0:
        return []

    population = len(dictionary)
    graph_nodes = G.nodes()  # loop-invariant: fetch once instead of per word
    k = 4
    while True:
        sample_size = min(k * n, population)
        pairs = []
        for word in random.sample(dictionary, sample_size):
            if word in graph_nodes:
                s = FilteredList(list(G.neighbors(word))).lang(target)
                if len(s) == 1:
                    pairs.append((word, s[0], n))
        # Progress line: how many words were sampled and how many qualified.
        print (sample_size, len(pairs))
        if len(pairs) >= n:
            return pairs[:n]
        if sample_size == population:
            # Every entry has been examined; a bigger sample cannot help.
            raise ValueError(
                'only {} of the {} requested pairs exist among the {} dictionary entries'
                .format(len(pairs), n, population)
            )
        k += 1

In [45]:
# Draw 1000 evaluation pairs from dictionary l1 with 'eng' as the target.
# get_evaluation_pairs uses random.sample and no seed is set in the visible
# cells, so the sample (and the scores below) will differ between runs.
%time s = get_evaluation_pairs(G, l1, 'eng', n=1000)

4000 845
5000 1037
Wall time: 647 ms


In [47]:
# Score the sampled pairs; evaluate is not defined in this notebook, and the
# trailing 4 is presumably the path cutoff — confirm in tool.evaluation.
%time result = evaluate(G, s, 'spa', 'eng', 4)
# NOTE(review): /10 looks like sum / 1000 pairs * 100, i.e. a percentage that
# only holds for n=1000 in the sampling cell — confirm and derive it from len(result).
sum(result)/10

Wall time: 2min 25s


46.37972547103744

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Same evaluation with the trailing argument raised from 4 to 6 (presumably a
# larger path cutoff — confirm in tool.evaluation). This overwrites `result`.
%time result = evaluate(G, s, 'spa', 'eng', 6)
# NOTE(review): /10 assumes the 1000 sampled pairs above — confirm.
sum(result)/10

In [None]:
bins = 20
# Draw the histogram once. plt.hist returns (counts, bin_edges, patches), so
# the return value is reused for the labels; the earlier second call drew a
# duplicate set of bars on the same axes.
arr = plt.hist(result, bins=bins)
counts, edges = arr[0], arr[1]
# Label every bar with its count, anchored at the bar's left edge. zip stops
# after the last count, so the final right-hand edge is ignored.
for count, left_edge in zip(counts, edges):
    plt.text(left_edge, count, str(count))
plt.show()