In [23]:
# %%
import numpy as np
import pandas as pd
import textdistance

# %%
# Load the preprocessed hand-coded pairs (skips first block of code in R file)
# and discard the 'Unnamed: 0' column that comes along in the CSV.
handcoded = pd.read_csv('handcoded.csv').drop(columns=['Unnamed: 0'])

# %%
# Key for every correct pair: the amicus value (coerced to str), an
# underscore, then the corresponding bonica value.
amicus_as_text = handcoded['amicus'].map(str)
handcoded_vec = amicus_as_text + '_' + handcoded['bonica']

# %%
# Create sets of incorrect matches
# A seeded generator keeps the shuffles (and therefore the training set)
# reproducible under Restart & Run All.
MISMATCH_SEED = 42
mismatch_rng = np.random.default_rng(MISMATCH_SEED)


def _shuffled_mismatches(matches, known_pairs, rng):
    """Build one batch of incorrect (amicus, bonica) pairs.

    Parameters
    ----------
    matches : pd.DataFrame
        Frame with 'amicus' and 'bonica' columns. It is copied, not modified.
    known_pairs : pd.Series
        'amicus_bonica' key strings of the correct matches.
    rng : np.random.Generator
        Source of randomness for the shuffle.

    Returns
    -------
    pd.DataFrame
        Copy of `matches` with the 'amicus' column shuffled, rows whose key
        is still one of `known_pairs` removed, and 'match' set to 0.
    """
    shuffled = matches.copy()
    # Shuffle the amicus column - makes most of them mismatched
    shuffled['amicus'] = rng.permutation(shuffled['amicus'].values)
    # For any that might still be correct matches, filter them out by exact
    # membership of the concatenated key in the correct keys. Exact matching
    # (isin) avoids interpreting names as regular expressions and avoids
    # dropping rows whose key is merely a substring of a correct key.
    pair_keys = shuffled['amicus'].map(str) + '_' + shuffled['bonica']
    is_known = pair_keys.isin(known_pairs)
    # assign() returns a new frame, so no write happens on a filtered slice.
    return shuffled.loc[~is_known].assign(match=0)


tmp = _shuffled_mismatches(handcoded, handcoded_vec, mismatch_rng)

# %%
# Get one more batch of incorrect matches (a fresh draw from the same generator)
tmp2 = _shuffled_mismatches(handcoded, handcoded_vec, mismatch_rng)

# %%
# Report (rows, columns) for both mismatch batches and the correct matches.
for frame in (tmp, tmp2, handcoded):
    print(frame.shape)

# %%
# Stack both mismatch batches and remove repeated rows, then put the correct
# matches on top to form the training frame.
tmp_full = pd.concat([tmp, tmp2]).drop_duplicates()
train = pd.concat([handcoded, tmp_full])
# Lowercase both name columns.
for name_col in ('amicus', 'bonica'):
    train[name_col] = train[name_col].str.lower()


# %%
# Add more distance metrics?

# Score every (amicus, bonica) pair in `train` with textdistance.cosine and
# textdistance.jaccard, in row order.
# NOTE(review): the stored output shows values between 0 and 1, which look
# like similarities rather than distances - confirm against textdistance.
cos_list = []
jaccard_list = []
for first, second in zip(train['amicus'], train['bonica']):
    cos_list.append(textdistance.cosine(first, second))
    jaccard_list.append(textdistance.jaccard(first, second))

# Display the scores as a two-column frame; they are not attached to `train` here.
pd.DataFrame({'cosine': cos_list, 'jaccard': jaccard_list})
    

(191, 3)
(188, 3)
(231, 3)


Unnamed: 0,cosine,jaccard
0,0.790912,0.653846
1,0.622376,0.451613
2,0.709575,0.521739
3,0.724882,0.566667
4,0.639602,0.450000
5,0.724882,0.566667
6,0.678401,0.500000
7,0.762770,0.615385
8,0.825723,0.681818
9,0.681818,0.517241
