In [221]:
import numpy as np
import pandas as pd
import textdistance
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
import fuzzywuzzy
from fuzzywuzzy import fuzz
from collections import defaultdict

In [222]:
def match_name(list1, list2):
    out = []
    for i in list1:
        for j in list2:
            score = fuzz.ratio(i,j)
            out.append([i,j,score])
    return out

In [223]:
amicus = pd.read_csv('amicus_org_names.csv').drop(['Unnamed: 0'], axis=1).rename(columns={'x': 'amicus'})
bonica = pd.read_csv('bonica_orgs_reduced.csv', header=None, names=['index', 'bonica']).drop(['index'], axis=1)
amicus['amicus'] = amicus['amicus'].apply(lambda x: x.lower())
bonica['bonica'] = bonica['bonica'].apply(lambda x: x.lower())
print('Starting length of Amicus dataset: {} rows'.format(len(amicus)))
print('Starting length of Bonica dataset: {} rows'.format(len(bonica)))

Starting length of Amicus dataset: 13939 rows
Starting length of Bonica dataset: 1332470 rows


In [224]:
print('Amicus dataset has {} unique elements'.format(len(sorted(list(set(amicus['amicus']))))))
print('Bonica dataset has {} unique elements'.format(len(sorted(list(set(bonica['bonica']))))))
print('There are {} exact matches between the Amicus and Bonica datasets'.format(len(set(amicus['amicus']).intersection(bonica['bonica']))))
total_set = set(amicus['amicus']).union(set(bonica['bonica']))
print('The union set contains {} elements'.format(len(sorted(list(total_set)))))

Amicus dataset has 13939 unique elements
Bonica dataset has 1332470 unique elements
There are 1767 exact matches between the Amicus and Bonica datasets
The union set contains 1344642 elements


In [225]:
handcoded = pd.read_csv('handcoded.csv')
handcoded = handcoded.drop(['Unnamed: 0'], axis=1)
handcoded['amicus'] = handcoded['amicus'].apply(lambda x: x.lower())
handcoded['bonica'] = handcoded['bonica'].apply(lambda x: x.lower())
handcoded_subset = set(handcoded['amicus']).union(set(handcoded['bonica']))
print('Starting length of handcoded Amicus-Bonica dataset: {} rows'.format(len(handcoded)))
print('Handcoded dataset has {} unique elements'.format(len(handcoded_subset)))

Starting length of handcoded Amicus-Bonica dataset: 231 rows
Handcoded dataset has 279 unique elements


In [226]:
unmatched_set = total_set - handcoded_subset
print('Set of Bonica elements that have not been matched to Amicus elements contains {} rows'.format(len(unmatched_set)))

Set of Bonica elements that have not been matched to Amicus elements contains 1344497 rows


In [227]:
amicus_updated = amicus[~amicus['amicus'].isin(sorted(list(handcoded_subset)))]
bonica_updated = bonica[~bonica['bonica'].isin(sorted(list(handcoded_subset)))]

In [228]:
print('Length of Amicus dataset post-removal: {}'.format(len(amicus_updated)))
print('Length of Bonica dataset post-removal: {}'.format(len(bonica_updated)))

Length of Amicus dataset post-removal: 13860
Length of Bonica dataset post-removal: 1332364


In [229]:
amicus_updated_shuffled = amicus_updated.sample(frac=1).reset_index(drop=True)
bonica_updated_shuffled = bonica_updated.sample(frac=1).reset_index(drop=True)

In [234]:
combiner = []
for i in range(len(amicus_updated_shuffled)):
    combiner.append([amicus_updated_shuffled.iloc[i].values[0], bonica_updated_shuffled.iloc[i].values[0]])

In [235]:
combined = pd.DataFrame(combiner, columns=['amicus', 'bonica'])
combined['match'] = np.nan

In [237]:
strays = []
for i in sorted(list(handcoded_subset)):
    if (i in combined['amicus']) or (i in combined['bonica']): 
        strays.append(i)
if len(strays) != 0: print('Stray matched string found!')
else: print('Handcoded strings successfully removed!')

Handcoded strings successfully removed!


In [239]:
for i in combined['bonica']:
    if (i in amicus['amicus']) or (i in amicus_updated['amicus']) or (i in amicus_updated_shuffled['amicus']):
        print('Error: string originally from bonica found in amicus column')
for i in combined['amicus']:
    if (i in bonica['bonica']) or (i in bonica_updated['bonica']) or (i in bonica_updated_shuffled['bonica']):
        print('Error: string originally from amicus found in bonica column')

In [240]:
combined.to_csv('train_handcoded.csv')

In [242]:
full_dataset = combined.append(handcoded)

In [None]:
full_dataset.to_csv('all_data.csv')

In [256]:
print('Number of rows in viable validation/test data: {}'.format(len(combined)))
print('Number of rows in hand-matched training data: {}'.format(len(handcoded)))
print('Number of rows in the full dataset: {}'.format(len(full_dataset)))

Number of rows in viable validation/test data: 13860
Number of rows in hand-matched training data: 231
Number of rows in the full dataset: 14091
