# Imports

In [1]:
import pandas as pd
import numpy as np
from bokeh.plotting import ColumnDataSource, figure, output_file, show
from bokeh.io import output_notebook

In [2]:
# Configure Bokeh to render figures inline in the notebook when show() is called.
output_notebook()

# Load Data

In [3]:
# Base frame with the Qid and the EN/PT sentences.
# NOTE(review): unpickling can execute arbitrary code; only load this file from a trusted source.
df = pd.read_pickle('first_two_sentences_EN_and_PT_POB.pkl')

# Per-language result tables; the first CSV column is used as the row index.
df_PT, df_EN = (
    pd.read_csv(csv_path, index_col=[0])
    for csv_path in ('PT_result.csv', 'EN_result.csv')
)

In [4]:
# Pull the three per-class probability columns out of each result table.
prob_cols = ('0_prob', '1_prob', '2_prob')
PT_0_prob, PT_1_prob, PT_2_prob = (df_PT[col] for col in prob_cols)
EN_0_prob, EN_1_prob, EN_2_prob = (df_EN[col] for col in prob_cols)

In [5]:
# Wikidata statement text for each language, taken from the result tables.
Wikidata_PT, Wikidata_EN = df_PT['wikidata'], df_EN['wikidata']

In [6]:
# Attach the Portuguese class probabilities to the main frame.
# NOTE(review): Series assignment aligns on the index; this presumes `df` and
# `df_PT` share the same row index - TODO confirm, otherwise NaNs appear.
pt_columns = {
    'PT_0_prob': PT_0_prob,
    'PT_1_prob': PT_1_prob,
    'PT_2_prob': PT_2_prob,
}
for column_name, column_values in pt_columns.items():
    df[column_name] = column_values

In [7]:
# Attach the English class probabilities to the main frame.
# NOTE(review): Series assignment aligns on the index; this presumes `df` and
# `df_EN` share the same row index - TODO confirm, otherwise NaNs appear.
en_columns = {
    'EN_0_prob': EN_0_prob,
    'EN_1_prob': EN_1_prob,
    'EN_2_prob': EN_2_prob,
}
for column_name, column_values in en_columns.items():
    df[column_name] = column_values

In [8]:
# Add the Wikidata statement text of both languages next to the probabilities.
df['Wikidata_PT'], df['Wikidata_EN'] = Wikidata_PT, Wikidata_EN

In [9]:
# Sanity check: (rows, columns) of the combined frame.
n_rows, n_cols = df.shape
print((n_rows, n_cols))

(30002, 11)


In [10]:
# Preview the first five rows of the combined frame.
df.head(5)

Unnamed: 0,Qid,sentence_EN,sentence_PT,PT_0_prob,PT_1_prob,PT_2_prob,EN_0_prob,EN_1_prob,EN_2_prob,Wikidata_PT,Wikidata_EN
0,Q1015531,ludmila manicler born 6 july 1987 is an argent...,ludmila manicler 6 de julho de 1987 é uma fut...,0.015466,0.829742,0.154792,0.961972,0.008174,0.029854,local de nascimento San Pedro (Buenos Aires),place of birth San Pedro
1,Q1050388,was a japanese general who served during the s...,foi um general do exército imperial japonês ...,0.002883,0.009499,0.987618,0.00072,0.000418,0.998862,local de nascimento Kagoshima,place of birth Kagoshima
2,Q10526787,is a japanese football player currently playin...,"wataru endo ibaraki, 9 de fevereiro de 1993 é...",0.004639,0.986745,0.008616,0.000745,0.000572,0.998683,local de nascimento Iocoama,place of birth Yokohama
3,Q105695,"annasophia robb born december 8, 1993 is an am...","annasophia robb denver, colorado, 8 de dezemb...",0.691228,0.009083,0.299689,0.907669,0.017064,0.075267,local de nascimento Denver,place of birth Denver
4,Q1087146,"christos dimitriou papakyriakopoulos , commonl...","christos dimitriou papakyriakopoulos, mais co...",0.875275,0.007232,0.117493,0.965165,0.007697,0.027139,local de nascimento Atenas,place of birth Athens


# Plot

In [11]:
# Target file for the standalone HTML copy of this figure.
output_file("EN_PT_inconsistent_Prob_Plot.html")

# Hover fields: point index, cursor coordinates, and the Qid of the hovered row.
TOOLTIPS = [("index", "$index"), ("(x,y)", "($x, $y)"), ("desc", "@desc")]

# Class-1 probabilities: Portuguese on the x axis, English on the y axis.
source = ColumnDataSource(data={
    'x': df['PT_1_prob'],
    'y': df['EN_1_prob'],
    'desc': df['Qid'],
})

p = figure(
    title="EN_PT_inconsistent_Prob_Plot",
    tooltips=TOOLTIPS,
    plot_width=2000,
    plot_height=2000,
)
p.xaxis.axis_label = 'PT_inconsistent_prob'
p.yaxis.axis_label = 'EN_inconsistent_prob'

# One small marker per row of `df`.
p.circle('x', 'y', size=3, source=source)

In [12]:
# Render the PT-vs-EN "inconsistent" probability scatter plot held in `p`.
show(p)

In [13]:
# Target file for the standalone HTML copy of this figure.
output_file("EN_PT_consistent_Prob_Plot.html")

# Hover fields: point index, cursor coordinates, and the Qid of the hovered row.
TOOLTIPS = [("index", "$index"), ("(x,y)", "($x, $y)"), ("desc", "@desc")]

# Class-0 probabilities: Portuguese on the x axis, English on the y axis.
source = ColumnDataSource(data={
    'x': df['PT_0_prob'],
    'y': df['EN_0_prob'],
    'desc': df['Qid'],
})

p0 = figure(
    title="EN_PT_consistent_Prob_Plot",
    tooltips=TOOLTIPS,
    plot_width=2000,
    plot_height=2000,
)
p0.xaxis.axis_label = 'PT_consistent_prob'
p0.yaxis.axis_label = 'EN_consistent_prob'

# One small marker per row of `df`.
p0.circle('x', 'y', size=3, source=source)

In [14]:
# Render the PT-vs-EN "consistent" probability scatter plot held in `p0`.
show(p0)

In [15]:
# Target file for the standalone HTML copy of this figure.
output_file("EN_PT_irrelevant_Prob_Plot.html")

# Class-2 probabilities: Portuguese on the x axis, English on the y axis;
# `desc` carries the Qid so it can be shown on hover.
source = ColumnDataSource(data=dict(
    x=df['PT_2_prob'],
    y=df['EN_2_prob'],
    desc=df['Qid'],
))

# Hover fields: point index, cursor coordinates, and the Qid of the hovered row.
TOOLTIPS = [
    ("index", "$index"),
    ("(x,y)", "($x, $y)"),
    ("desc", "@desc"),
]

# Title now matches the output file name and the naming used by the
# "consistent" / "inconsistent" plots (it previously lacked the "_Prob" part).
p2 = figure(plot_width=2000, plot_height=2000, tooltips=TOOLTIPS, title="EN_PT_irrelevant_Prob_Plot")
p2.xaxis.axis_label = 'PT_irrelevant_prob'
p2.yaxis.axis_label = 'EN_irrelevant_prob'

# One small marker per row of `df`.
p2.circle('x', 'y', size=3, source=source)

In [16]:
# Render the PT-vs-EN "irrelevant" probability scatter plot held in `p2`.
show(p2)

# Predicted Labels

In [17]:
# Stack the three class-probability columns of each language into a 2-D array;
# the column position equals the class id (0, 1, 2).
df_PT_prob = df.loc[:, ['PT_0_prob', 'PT_1_prob', 'PT_2_prob']].to_numpy()
df_EN_prob = df.loc[:, ['EN_0_prob', 'EN_1_prob', 'EN_2_prob']].to_numpy()

In [18]:
# Predicted class per row = index of the largest probability in that row.
pred_PT = df_PT_prob.argmax(axis=1)
pred_EN = df_EN_prob.argmax(axis=1)

In [19]:
# Count how many rows fall into each predicted class for Portuguese.
labels_PT, counts_PT = np.unique(pred_PT, return_counts=True)
print("Frequency of label of PT:")
# Row 0: distinct labels; row 1: number of rows with that label.
print(np.array([labels_PT, counts_PT]))

Frequency of label of PT:
[[    0     1     2]
 [17649  8022  4331]]


In [20]:
# Count how many rows fall into each predicted class for English.
labels_EN, counts_EN = np.unique(pred_EN, return_counts=True)
print("Frequency of label of EN:")
# Row 0: distinct labels; row 1: number of rows with that label.
print(np.array([labels_EN, counts_EN]))

Frequency of label of EN:
[[    0     1     2]
 [18769    93 11140]]


In [21]:
# Persist the combined frame (sentences, probabilities, Wikidata text) for later use.
df.to_pickle(path='Final_result_EN_PT.pkl')

# Check Top K

In [22]:
# Rank rows from highest to lowest Portuguese class-1 probability.
df.sort_values('PT_1_prob', ascending=False)

Unnamed: 0,Qid,sentence_EN,sentence_PT,PT_0_prob,PT_1_prob,PT_2_prob,EN_0_prob,EN_1_prob,EN_2_prob,Wikidata_PT,Wikidata_EN
24830,Q552925,"arlo davy guthrie born july 10, 1947 is an ame...",thumb|right|250px|o músico em 2007 thumb|righ...,0.004106,0.992869,0.003025,0.387842,0.071424,0.540734,local de nascimento Coney Island,place of birth Coney Island
29458,Q204018,"jermaine lamarr cole born january 28, 1985 is ...","jermaine lamarr cole frankfurt, , mais conhec...",0.003926,0.992803,0.003270,0.962977,0.008027,0.028996,local de nascimento Francoforte do Meno,place of birth Frankfurt am Main
668,Q390491,"roger frederick wicker born july 5, 1951 is an...",200px|thumb|roger frederick wicker em foto of...,0.003910,0.992709,0.003381,0.963770,0.008068,0.028162,local de nascimento Pontotoc,place of birth Pontotoc
570,Q881196,"paul richard lepage ; born october 9, 1948 is ...",200px|thumb|paul lepage paul r lepage nascido...,0.003787,0.992706,0.003506,0.966344,0.008210,0.025446,local de nascimento Lewiston,place of birth Lewiston
12443,Q381880,"sherrod campbell brown , born november 9, 1952...",200px|thumb|sherrod brown sherrod campbell br...,0.003652,0.992694,0.003654,0.965382,0.007710,0.026907,local de nascimento Mansfield,place of birth Mansfield
...,...,...,...,...,...,...,...,...,...,...,...
19218,Q378003,"david da-i ho ; born november 3, 1952 is a tai...",david da-i ho ; 3 de novembro de 1952sen russ...,0.003036,0.002548,0.994415,0.960661,0.007806,0.031533,local de nascimento Taichung,place of birth Taichung
21332,Q317947,crates gave away his money to live a life of p...,"crates ; c 365 – c 285 bctiziano dorandi, cha...",0.002927,0.002513,0.994559,0.000895,0.000383,0.998723,local de nascimento Tebas,place of birth Thebes
12455,Q470531,"encyclopaedia of the history of science, techn...",",selin, helaine 1997 encyclopaedia of the his...",0.003238,0.002485,0.994277,0.000770,0.000555,0.998676,local de nascimento Fujioka,place of birth Fujioka
11905,Q725400,"july 21, 1929 – july 16, 2006 was an american ...","se quiser ver o filho de bob orton, sr consul...",0.002230,0.002437,0.995333,0.000691,0.000387,0.998922,local de nascimento Kansas City,place of birth Kansas City
