In [11]:
import pandas as pd
import numpy as np

In [76]:
# Load the train and test sets from their tab-separated files.
train, test = (pd.read_csv(path, sep='\t') for path in ('train.tsv', 'test.tsv'))

train.head()

Unnamed: 0,embedding_x,embedding_y,equal
0,"0.35198048,0.23884374,0.20589112,0.033522803,0...","0.124149986,0.14133812,0.16054663,0.38359353,0...",1
1,"0.3492841,-0.18005244,0.46332806,-0.2122336,0....","-0.7460901,-0.21427503,-0.14653064,-0.22343078...",0
2,"0.5642844,-0.08399645,-0.018092873,0.18461637,...","-0.30552676,0.10498648,0.40437204,0.25560898,-...",0
3,"0.36132073,-0.20658037,0.4909494,0.12876663,0....","0.25936005,-0.15755327,0.4929783,-0.11315008,0...",1
4,"-0.039740197,-0.11732757,-0.07907649,0.1649697...","0.26061502,-0.31516105,-0.04165541,0.16747917,...",1


In [95]:
# Count how many rows carry each value of the `equal` label.
train['equal'].value_counts()

1    7837
0    7748
Name: equal, dtype: int64

Here we can see that the dataset is balanced, so we will use accuracy as the metric for evaluating threshold performance.  

In [77]:
# cast embeddings from comma-separated strings to np.arrays
def _parse_embedding(raw):
    """Return `raw` as a 1-D float array.

    Strings are parsed as comma-separated floats. Values that are not
    strings (e.g. arrays produced by a previous run of this cell) are passed
    through `np.asarray`, so re-running the cell does not fail.
    """
    if isinstance(raw, str):
        return np.fromstring(raw, dtype=float, sep=',')
    return np.asarray(raw, dtype=float)


# Same conversion for both embedding columns of both frames.
for frame in (train, test):
    for column in ('embedding_x', 'embedding_y'):
        frame[column] = frame[column].apply(_parse_embedding)

In [78]:
# Preview the first rows to inspect the embedding columns.
train.head()

Unnamed: 0,embedding_x,embedding_y,equal
0,"[0.35198048, 0.23884374, 0.20589112, 0.0335228...","[0.124149986, 0.14133812, 0.16054663, 0.383593...",1
1,"[0.3492841, -0.18005244, 0.46332806, -0.212233...","[-0.7460901, -0.21427503, -0.14653064, -0.2234...",0
2,"[0.5642844, -0.08399645, -0.018092873, 0.18461...","[-0.30552676, 0.10498648, 0.40437204, 0.255608...",0
3,"[0.36132073, -0.20658037, 0.4909494, 0.1287666...","[0.25936005, -0.15755327, 0.4929783, -0.113150...",1
4,"[-0.039740197, -0.11732757, -0.07907649, 0.164...","[0.26061502, -0.31516105, -0.04165541, 0.16747...",1


In [79]:
# Euclidean distance and cosine similarity between the two embeddings of each row
def _euclidean_distance(row):
    """L2 norm of the difference between the row's two embeddings."""
    return np.linalg.norm(row.embedding_x - row.embedding_y)


def _cosine_similarity(row):
    """Dot product of the row's two embeddings divided by the product of their norms."""
    x, y = row.embedding_x, row.embedding_y
    return np.dot(x, y)/(np.linalg.norm(x)*np.linalg.norm(y))


# Add both features to train first, then to test.
for frame in (train, test):
    frame['lin_dist'] = frame.apply(_euclidean_distance, axis=1)
    frame['cos_sim'] = frame.apply(_cosine_similarity, axis=1)

In [80]:
# Preview the first rows, including the lin_dist and cos_sim columns.
train.head()

Unnamed: 0,embedding_x,embedding_y,equal,lin_dist,cos_sim
0,"[0.35198048, 0.23884374, 0.20589112, 0.0335228...","[0.124149986, 0.14133812, 0.16054663, 0.383593...",1,0.941026,0.557235
1,"[0.3492841, -0.18005244, 0.46332806, -0.212233...","[-0.7460901, -0.21427503, -0.14653064, -0.2234...",0,1.629176,-0.327108
2,"[0.5642844, -0.08399645, -0.018092873, 0.18461...","[-0.30552676, 0.10498648, 0.40437204, 0.255608...",0,1.598868,-0.27819
3,"[0.36132073, -0.20658037, 0.4909494, 0.1287666...","[0.25936005, -0.15755327, 0.4929783, -0.113150...",1,0.674878,0.77227
4,"[-0.039740197, -0.11732757, -0.07907649, 0.164...","[0.26061502, -0.31516105, -0.04165541, 0.16747...",1,1.053113,0.445477


Here we have two ways to compare embeddings; we will proceed with cosine similarity.
To estimate the best threshold, we will choose the one that maximizes accuracy (i.e. minimizes -accuracy) on the train dataset.

In [88]:
# Import the submodule explicitly: a bare `import scipy` does not guarantee
# that `scipy.optimize` is loaded. The name `scipy` is still bound as before.
import scipy.optimize
from sklearn.metrics import accuracy_score

# function that maps threshold to -accuracy
def thr_to_accuracy(thr, Y_test, predictions):
    """Return the negated accuracy of thresholding `predictions` at `thr`.

    Parameters
    ----------
    thr : scalar or array broadcastable against `predictions`
        Decision threshold; entries with `predictions > thr` are labelled 1,
        the rest 0.
    Y_test : array-like
        True labels passed to `accuracy_score`.
    predictions : array-like
        Scores compared against `thr`.

    Returns
    -------
    The accuracy multiplied by -1, so that a minimizer maximizes accuracy.
    """
    return -accuracy_score(Y_test, np.array(predictions > thr, dtype=int))

In [91]:
# Pick the cosine-similarity threshold that maximizes accuracy on the train set.
# NOTE(review): fmin is a downhill-simplex search and the objective only changes
# when thr crosses a cos_sim value, so the result may depend on x0 — consider
# confirming it with a sweep over candidate thresholds.
best_thr = scipy.optimize.fmin(
    thr_to_accuracy,
    x0=0.5,
    args=(train['equal'].values, train['cos_sim'].values),
)
best_thr

Optimization terminated successfully.
         Current function value: -0.883029
         Iterations: 14
         Function evaluations: 32


array([0.27460937])

In [99]:
# Evaluate the chosen threshold on the test dataset (accuracy).
test_pred = (test['cos_sim'].values > best_thr).astype(int)
test_acc = accuracy_score(test['equal'], test_pred)

test_acc

0.8927655207798871

In [101]:
from sklearn.metrics import f1_score

# Let's look at the F1 score as well, to ensure that the result is valid.
test_pred_f1 = (test['cos_sim'].values > best_thr).astype(int)
test_f1 = f1_score(test['equal'], test_pred_f1)

test_f1

0.8928754484879547