In [2]:
# Imports
import numpy as np
import pandas as pd
import textdistance
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
import fuzzywuzzy
from fuzzywuzzy import fuzz
from collections import defaultdict

# Read full amicus, 'reduced' bonica and print initial stats
amicus = pd.read_csv('amicus_org_names.csv').drop(['Unnamed: 0'], axis=1).rename(columns={'x': 'amicus'})
bonica = pd.read_csv('bonica_orgs_reduced.csv', header=None, names=['index', 'bonica']).drop(['index'], axis=1)
amicus['amicus'] = amicus['amicus'].apply(lambda x: x.lower())
bonica['bonica'] = bonica['bonica'].apply(lambda x: x.lower())
print('Starting length of Amicus dataset: {} rows'.format(len(amicus)))
print('Starting length of Bonica dataset: {} rows'.format(len(bonica)))
print('Amicus dataset has {} unique elements'.format(len(sorted(list(set(amicus['amicus']))))))
print('Bonica dataset has {} unique elements'.format(len(sorted(list(set(bonica['bonica']))))))
print('There are {} exact matches between the Amicus and Bonica datasets'.format(len(set(amicus['amicus']).intersection(bonica['bonica']))))
total_set = set(amicus['amicus']).union(set(bonica['bonica']))
print('The union set contains {} elements'.format(len(sorted(list(total_set)))))

# Read in handcoded subset (matches between amicus, bonica) and print initial stats
handcoded = pd.read_csv('handcoded.csv')
handcoded = handcoded.drop(['Unnamed: 0'], axis=1)
handcoded['amicus'] = handcoded['amicus'].apply(lambda x: x.lower())
handcoded['bonica'] = handcoded['bonica'].apply(lambda x: x.lower())
handcoded_subset = set(handcoded['amicus']).union(set(handcoded['bonica']))
print('Starting length of handcoded Amicus-Bonica dataset: {} rows'.format(len(handcoded)))
print('Handcoded dataset has {} unique elements'.format(len(handcoded_subset)))

# Get set of elements not contained in handcoded subset
unmatched_set = total_set - handcoded_subset
print('Set of Bonica elements that have not been matched to Amicus elements contains {} rows'.format(len(unmatched_set)))

# Update amicus and bonica by removing handcoded strings
amicus_updated = amicus[~amicus['amicus'].isin(sorted(list(handcoded_subset)))]
bonica_updated = bonica[~bonica['bonica'].isin(sorted(list(handcoded_subset)))]
print('Length of Amicus dataset post-removal: {}'.format(len(amicus_updated)))
print('Length of Bonica dataset post-removal: {}'.format(len(bonica_updated)))

# Shuffle and reset index, then combine
amicus_updated_shuffled = amicus_updated.sample(frac=1).reset_index(drop=True)
bonica_updated_shuffled = bonica_updated.sample(frac=1).reset_index(drop=True)

combiner = []
for i in range(len(amicus_updated_shuffled)):
    combiner.append([amicus_updated_shuffled.iloc[i].values[0], bonica_updated_shuffled.iloc[i].values[0]])
combined = pd.DataFrame(combiner, columns=['amicus', 'bonica'])
combined['match'] = np.nan

# Check for stray handcoded strings, and confirm that strings come from correct sources
strays = []
for i in sorted(list(handcoded_subset)):
    if (i in combined['amicus']) or (i in combined['bonica']): 
        strays.append(i)
if len(strays) != 0: print('Stray matched string found!')
else: print('Handcoded strings successfully removed!')

for i in combined['bonica']:
    if (i in amicus['amicus']) or (i in amicus_updated['amicus']) or (i in amicus_updated_shuffled['amicus']):
        print('Error: string originally from bonica found in amicus column')
for i in combined['amicus']:
    if (i in bonica['bonica']) or (i in bonica_updated['bonica']) or (i in bonica_updated_shuffled['bonica']):
        print('Error: string originally from amicus found in bonica column')

Starting length of Amicus dataset: 13939 rows
Starting length of Bonica dataset: 1332470 rows
Amicus dataset has 13939 unique elements
Bonica dataset has 1332470 unique elements
There are 1767 exact matches between the Amicus and Bonica datasets
The union set contains 1344642 elements
Starting length of handcoded Amicus-Bonica dataset: 231 rows
Handcoded dataset has 279 unique elements
Set of Bonica elements that have not been matched to Amicus elements contains 1344497 rows
Length of Amicus dataset post-removal: 13860
Length of Bonica dataset post-removal: 1332364
Handcoded strings successfully removed!


In [13]:
import random

match_index = random.randint(0,len(combined))
print(match_index)

5009


In [48]:
num_todo = input('How many samples would you like to label: ')
if not (num_todo.isdigit()) and not (int(num_todo) >= 0):
    input('Please enter a number between 0 and {}: '.format(len(combined)))

How many samples would you like to label: -1
Please enter a number between 0 and 13860: r


In [43]:
amicus_todo = combined.iloc[match_index].values[0]
bonica_todo = combined.iloc[match_index].values[1]
match_todo = ''
print('If String #1 and String #2 match, please enter "1"\nIf they do not match, please enter "0"\nIf you are unsure, please enter "?"\n')
print('String #1: \''+amicus_todo+'\'\nString #2: \''+bonica_todo+'\'')

match = input('Match?')
if match == 'y':
    match_todo = 1
if match == 'n':
    match_todo = 0

If String #1 and String #2 match, please enter "1"
If they do not match, please enter "0"
If you are unsure, please enter "?"

String #1: 'california alliance for retired americans'
String #2: 'craftsman homes inc'
Match?y


KeyboardInterrupt: 

In [41]:
print(match_todo)




In [26]:
import sys
from sys import stdin

In [3]:
intersected = pd.merge(combined, handcoded, how='inner')
print(intersected)

Empty DataFrame
Columns: [amicus, bonica, match]
Index: []


In [None]:
# Write CSV of new viable validation and test data
combined.to_csv('data_viable_test.csv')

# Rename/rewrite handcoded match data to CSV
handcoded.to_csv('data_viable_train.csv')

# Append handcoded and new cleaned data to create CSV of full dataset
full_dataset = combined.append(handcoded)
full_dataset.to_csv('data_all.csv')
print('Number of rows in viable validation/test data: {}'.format(len(combined)))
print('Number of rows in hand-matched training data: {}'.format(len(handcoded)))
print('Number of rows in the full dataset: {}'.format(len(full_dataset)))