# Read, join and clean Data

In [193]:
import pandas as pd

#Dict to transform labels to a continius number   
labelmap = {'not related':0,'overlap':.5,'same':1}

#Read all spreadsheets, and give standard names to columns (Original names are language specific)
allLabels = pd.read_excel('Synonym mapping.xlsx',sheetname=None)
for k in allLabels.keys():
        allLabels[k].columns = ['sec1','sec2','relation']

#Join labels and precomputed features (check Synonym.ipynb)
for k in allLabels.keys():
    lang = k[0:2]
    features = pd.read_excel('%sSynonyms_Stratified.xls' % lang,index=False)
    allLabels[k] = pd.merge(allLabels[k], features, how='left', left_on=['sec1','sec2'],right_on=['Sec_A', 'Sec_B'],suffixes=['',''])
    allLabels[k] = allLabels[k] [['sec1','sec2','relation', 'coOccurs','editDistance','isSubSet','tfIdfSimilarity','vectorDistance']]
    allLabels[k]['lang']  = lang

    #allLabels[k]['relation']  = allLabels[k]['relation'].map(labelmap)

#Aggregate all data in one dataframe
allData = pd.concat(allLabels.values())
allData['binaryRelation'] = allData.relation.map({'not related':'not related','overlap':'related','same':'related'})

#drop pairs without label
allData.dropna(inplace=True)

#get all langs
langs = list(allData.lang.unique())

In [194]:
#we remove duplicates 
allData = allData.drop_duplicates() 
#We have disgrament in:
allData.duplicated(subset=['sec1','sec2','lang']).value_counts()

False    3441
True      221
dtype: int64

For simplicity I keep all labels. For future experiments this should be improved.

# Classification Experiments

In [201]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=3, include_bias=False)





## Within Languages
We train and test on the same language

In [234]:
from random import choice
import random
random.seed(2)

In [237]:
test_size =.3
for lang in langs:
    data = allData[allData.lang == lang]
    Y = data['relation']
    X = data[['coOccurs','editDistance','isSubSet','tfIdfSimilarity','vectorDistance']]
    X2 = poly.fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=1)
    model = RandomForestClassifier(n_estimators=100,random_state=1)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # evaluate predictions
    accuracy = accuracy_score(y_test, y_pred)
    randomClass = [choice(['not related','overlap','same']) for _ in range(0,len(y_pred))]
    print(lang)
    print("Accuracy: %.2f%%" % (accuracy * 100.0))
    print("Accuracy: %.2f%%" % (accuracy_score(y_test, randomClass) * 100.0))
    print("F1 Score: %.2f%%" % (100*f1_score(y_test, y_pred,average="weighted")))
    print("F1 Score Random: %.2f%%" % (100*f1_score(y_test, randomClass,average="weighted")))
    print(confusion_matrix(y_test, y_pred))
    
    


en
Accuracy: 73.23%
Accuracy: 32.28%
F1 Score: 71.38%
F1 Score Random: 35.57%
[[82  3  4]
 [10 10  6]
 [ 8  3  1]]
ar
Accuracy: 94.86%
Accuracy: 37.71%
F1 Score: 93.92%
F1 Score Random: 51.20%
[[159   0]
 [  9   7]]
es
Accuracy: 76.62%
Accuracy: 28.57%
F1 Score: 75.05%
F1 Score Random: 32.40%
[[46  2  0]
 [ 5 10  2]
 [ 0  9  3]]


  'recall', 'true', average, warn_for)


fr
Accuracy: 85.53%
Accuracy: 31.06%
F1 Score: 84.49%
F1 Score Random: 40.34%
[[182   4   4]
 [ 14  12   3]
 [  5   4   7]]
ja
Accuracy: 75.12%
Accuracy: 40.30%
F1 Score: 75.79%
F1 Score Random: 48.22%
[[142  13   6]
 [ 11   3   8]
 [  4   8   6]]
ru
Accuracy: 69.47%
Accuracy: 28.77%
F1 Score: 68.46%
F1 Score Random: 32.61%
[[184  15   1]
 [ 21   5  20]
 [  4  26   9]]


In [191]:
## Repeat same experiment by colapsing overlap and same as one 

In [240]:
test_size =.3
for lang in langs:
    data = allData[allData.lang == lang]
    Y = data['binaryRelation']
    X = data[['coOccurs','editDistance','isSubSet','tfIdfSimilarity','vectorDistance']]
    X2 = poly.fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(X2, Y, test_size=test_size, random_state=1)
    model = RandomForestClassifier(n_estimators=40,random_state=1)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # evaluate predictions
    accuracy = accuracy_score(y_test, y_pred)
    randomClass = [choice(['not related','related']) for _ in range(0,len(y_pred))]
    print(lang)
    print("Accuracy: %.2f%%" % (accuracy * 100.0))
    print("AccuracyRandom: %.2f%%" % (accuracy_score(y_test, randomClass) * 100.0))
    print("F1 Score: %.2f%%" % (100*f1_score(y_test, y_pred,average="weighted")))
    print("F1 Score Random: %.2f%%" % (100*f1_score(y_test, randomClass,average="weighted")))
    print(confusion_matrix(y_test, y_pred))
    

en
Accuracy: 80.31%
AccuracyRandom: 44.09%
F1 Score: 79.47%
F1 Score Random: 46.20%
[[81  8]
 [17 21]]
ar
Accuracy: 95.43%
AccuracyRandom: 48.57%
F1 Score: 94.72%
F1 Score Random: 58.59%
[[159   0]
 [  8   8]]
es
Accuracy: 88.31%
AccuracyRandom: 63.64%
F1 Score: 88.27%
F1 Score Random: 64.19%
[[44  4]
 [ 5 24]]
fr
Accuracy: 89.79%
AccuracyRandom: 50.21%
F1 Score: 89.39%
F1 Score Random: 55.32%
[[182   8]
 [ 16  29]]
ja
Accuracy: 83.58%
AccuracyRandom: 49.25%
F1 Score: 84.06%
F1 Score Random: 54.05%
[[141  20]
 [ 13  27]]
ru
Accuracy: 84.91%
AccuracyRandom: 53.33%
F1 Score: 84.72%
F1 Score Random: 55.32%
[[182  18]
 [ 25  60]]


# Train on K-1 lang, test on the remaining

In [260]:
improvement = {'lang':[],'f1Score':[],'accuracy':[],'oneFeatureAcc':[],'oneFeatureF1':[]}
test_size =.3
for lang in langs:
    Y = allData['relation']
    X = allData[['coOccurs','editDistance','isSubSet','tfIdfSimilarity','vectorDistance']]
    X_train = X[allData.lang != lang]
    X_test = X[allData.lang == lang]
    y_train = Y[allData.lang != lang]
    y_test =  Y[allData.lang == lang]
    model = RandomForestClassifier(n_estimators=40,random_state=1)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # evaluate predictions
    accuracy = accuracy_score(y_test, y_pred)
    #create Baselines
    randomClass = [choice(['not related','overlap','same']) for _ in range(0,len(y_pred))]
    randAccuracy = accuracy_score(y_test, randomClass)

    modelOneFeature = RandomForestClassifier(n_estimators=40,random_state=1)
    modelOneFeature.fit(X_train[['vectorDistance']], y_train)
    y_predOneFeature = modelOneFeature.predict(X_test[['vectorDistance']])
    oneFeatureAccuracy = accuracy_score(y_test, randomClass)

    print(lang)
    print("Accuracy: %.2f%%" % (accuracy * 100.0))
    print("Accuracy: %.2f%%" % (randAccuracy * 100.0))
    print("F1 Score: %.2f%%" % (100*f1_score(y_test, y_pred,average="weighted")))
    print("F1 Score Random: %.2f%%" % (100*f1_score(y_test, randomClass,average="weighted")))
    print(confusion_matrix(y_test, y_pred))
    improvement['lang'].append(lang)
    improvement['f1Score'].append(accuracy-randAccuracy)
    improvement['accuracy'].append(f1_score(y_test, y_pred,average="weighted")-f1_score(y_test, randomClass,average="weighted"))
    improvement['oneFeatureAcc'].append(accuracy-oneFeatureAccuracy)
    improvement['oneFeatureF1'].append(f1_score(y_test, y_pred,average="weighted")-f1_score(y_test, y_predOneFeature,average="weighted"))
improvement = pd.DataFrame(improvement).set_index('lang')

en
Accuracy: 78.01%
Accuracy: 32.15%
F1 Score: 75.90%
F1 Score Random: 38.32%
[[298  11   6]
 [ 22  18  18]
 [ 28   8  14]]
ar
Accuracy: 75.64%
Accuracy: 33.28%
F1 Score: 82.23%
F1 Score Random: 45.92%
[[426  56  52]
 [  1   0   1]
 [ 13  19  15]]


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


es
Accuracy: 77.56%
Accuracy: 31.89%
F1 Score: 74.27%
F1 Score Random: 34.08%
[[166   1   1]
 [ 31  18  13]
 [  5   6  13]]
fr
Accuracy: 86.33%
Accuracy: 34.36%
F1 Score: 86.12%
F1 Score Random: 43.20%
[[620  22  15]
 [ 29  29  25]
 [ 11   5  27]]
ja
Accuracy: 58.21%
Accuracy: 35.22%
F1 Score: 60.60%
F1 Score Random: 41.71%
[[360  13 142]
 [ 65   9  25]
 [ 30   5  21]]
ru
Accuracy: 77.77%
Accuracy: 31.30%
F1 Score: 74.59%
F1 Score Random: 36.27%
[[654   8   6]
 [ 84  52  18]
 [ 42  53  32]]


  'precision', 'predicted', average, warn_for)


In [264]:
print(improvement.round(2)[['accuracy','f1Score']].to_latex())

\begin{tabular}{lrr}
\toprule
{} &  accuracy &  f1Score \\
lang &           &          \\
\midrule
en   &      0.38 &     0.46 \\
ar   &      0.36 &     0.42 \\
es   &      0.40 &     0.46 \\
fr   &      0.43 &     0.52 \\
ja   &      0.19 &     0.23 \\
ru   &      0.38 &     0.46 \\
\bottomrule
\end{tabular}



In [265]:
print(improvement.round(2)[['f1Score']].to_latex())

\begin{tabular}{lr}
\toprule
{} &  f1Score \\
lang &          \\
\midrule
en   &     0.46 \\
ar   &     0.42 \\
es   &     0.46 \\
fr   &     0.52 \\
ja   &     0.23 \\
ru   &     0.46 \\
\bottomrule
\end{tabular}



In [268]:
print(improvement.round(1).to_latex())

\begin{tabular}{lrrrr}
\toprule
{} &  accuracy &  f1Score &  oneFeatureAcc &  oneFeatureF1 \\
lang &           &          &                &               \\
\midrule
en   &       0.4 &      0.5 &            0.5 &           0.0 \\
ar   &       0.4 &      0.4 &            0.4 &          -0.0 \\
es   &       0.4 &      0.5 &            0.5 &           0.1 \\
fr   &       0.4 &      0.5 &            0.5 &           0.0 \\
ja   &       0.2 &      0.2 &            0.2 &           0.1 \\
ru   &       0.4 &      0.5 &            0.5 &           0.0 \\
\bottomrule
\end{tabular}



In [255]:
# repeat previous but with two classes
test_size =.3
improvement2 = {'lang':[],'f1Score':[],'accuracy':[]}

for lang in langs:
    Y = allData['binaryRelation']
    X = allData[['coOccurs','editDistance','isSubSet','tfIdfSimilarity','vectorDistance']]
    X2 = poly.fit_transform(X)

    X_train = X2[allData.lang != lang]
    X_test = X2[allData.lang == lang]
    y_train = Y[allData.lang != lang]
    y_test =  Y[allData.lang == lang]
    model = RandomForestClassifier(n_estimators=40,random_state=1)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # evaluate predictions
    accuracy = accuracy_score(y_test, y_pred)
    randomClass = [choice(['not related','related']) for _ in range(0,len(y_pred))]

    print(lang)
    print("Accuracy: %.2f%%" % (accuracy * 100.0))
    print("Accuracy: %.2f%%" % (randAccuracy * 100.0))
    print("F1 Score: %.2f%%" % (100*f1_score(y_test, y_pred,average="weighted")))
    print("F1 Score Random: %.2f%%" % (100*f1_score(y_test, randomClass,average="weighted")))
    print(confusion_matrix(y_test, y_pred))
    improvement2['lang'].append(lang)
    improvement2['f1Score'].append(accuracy-randAccuracy)
    improvement2['accuracy'].append(f1_score(y_test, y_pred,average="weighted")-f1_score(y_test, randomClass,average="weighted"))
improvement2 = pd.DataFrame(improvement2).set_index('lang')

en
Accuracy: 83.45%
Accuracy: 32.77%
F1 Score: 82.41%
F1 Score Random: 52.61%
[[296  19]
 [ 51  57]]
ar
Accuracy: 79.76%
Accuracy: 32.77%
F1 Score: 83.74%
F1 Score Random: 61.11%
[[428 106]
 [ 12  37]]
es
Accuracy: 84.65%
Accuracy: 32.77%
F1 Score: 83.61%
F1 Score Random: 48.13%
[[164   4]
 [ 35  51]]
fr
Accuracy: 90.29%
Accuracy: 32.77%
F1 Score: 90.23%
F1 Score Random: 55.10%
[[621  36]
 [ 40  86]]
ja
Accuracy: 70.90%
Accuracy: 32.77%
F1 Score: 72.59%
F1 Score Random: 54.03%
[[380 135]
 [ 60  95]]
ru
Accuracy: 86.09%
Accuracy: 32.77%
F1 Score: 85.04%
F1 Score Random: 51.32%
[[653  15]
 [117 164]]


In [277]:
print(improvement2.join(improvement,lsuffix='2_class').round(2)[['f1Score2_class','f1Score']].to_latex())

\begin{tabular}{lrr}
\toprule
{} &  f1Score2\_class &  f1Score \\
lang &                 &          \\
\midrule
en   &            0.51 &     0.46 \\
ar   &            0.47 &     0.42 \\
es   &            0.52 &     0.46 \\
fr   &            0.58 &     0.52 \\
ja   &            0.38 &     0.23 \\
ru   &            0.53 &     0.46 \\
\bottomrule
\end{tabular}



# All toghether (not considering languages)`

In [241]:
    test_size =.3
    Y = allData['relation']
    X = allData[['coOccurs','editDistance','isSubSet','tfIdfSimilarity','vectorDistance']]
    X2 = poly.fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(X2, Y, test_size=test_size, random_state=1)

    model = RandomForestClassifier(n_estimators=40,random_state=1)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # evaluate predictions
    accuracy = accuracy_score(y_test, y_pred)
    randomClass = [choice(['not related','overlap','same']) for _ in range(0,len(y_pred))]
    print(lang)
    print("Accuracy: %.2f%%" % (accuracy * 100.0))
    print("Accuracy: %.2f%%" % (accuracy_score(y_test, randomClass) * 100.0))
    print("F1 Score: %.2f%%" % (100*f1_score(y_test, y_pred,average="weighted")))
    print("F1 Score Random: %.2f%%" % (100*f1_score(y_test, randomClass,average="weighted")))
    print(confusion_matrix(y_test, y_pred))

ru
Accuracy: 80.44%
Accuracy: 34.12%
F1 Score: 79.36%
F1 Score Random: 41.80%
[[816  36  21]
 [ 63  31  29]
 [ 34  32  37]]


In [244]:
    test_size =.3
    Y = allData['binaryRelation']
    X = allData[['coOccurs','editDistance','isSubSet','tfIdfSimilarity','vectorDistance']]
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=1)
    model = RandomForestClassifier(n_estimators=40,random_state=1)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # evaluate predictions
    accuracy = accuracy_score(y_test, y_pred)
    randomClass = [choice(['not related','related']) for _ in range(0,len(y_pred))]
    print("Accuracy: %.2f%%" % (accuracy * 100.0))
    print("AccuracyRandom: %.2f%%" % (accuracy_score(y_test, randomClass) * 100.0))
    print("F1 Score: %.2f%%" % (100*f1_score(y_test, y_pred,average="weighted")))
    print("F1 Score Random: %.2f%%" % (100*f1_score(y_test, randomClass,average="weighted")))
    print(confusion_matrix(y_test, y_pred))

Accuracy: 86.35%
AccuracyRandom: 49.14%
F1 Score: 86.03%
F1 Score Random: 53.93%
[[811  62]
 [ 88 138]]
