<h3> Data Cleaning </h3>

In [1]:
import numpy as np
import pandas as pd

In [2]:
delegates = pd.read_excel("Data/Delegates/constitutional_convention_1787.xlsx", header = 2)

In [3]:
print("dimensions:", delegates.shape)
delegates.head()

dimensions: (55, 4)


Unnamed: 0,first name,last name,state,sign?
0,William Samuel,Johnson,Connecticut,yes
1,Roger,Sherman,Connecticut,yes
2,Oliver,Ellsworth (Elsworth),Connecticut,no
3,George,Read,Delaware,yes
4,Gunning,Bedford Jr.,Delaware,yes


In [4]:
#quick exploratory analysis
delegates = delegates.rename(columns={"sign?":"sign", 
                                      "first name":"first",
                                      "last name":"last"})

states = delegates.state.unique()
print("list of states:",states)
print("number of states:",len(states),"\n")

state_counts = delegates.state.value_counts()
print(state_counts, "\n")

responses = delegates.sign.unique()
print("response types:", responses)
print("num response types:",len(responses), "\n")

firstnamelen = pd.Series([len(x.split(" ")) for x in delegates['first']])
print("first name length distribution")
print(firstnamelen.value_counts())

lastnamelen = pd.Series([len(x.split(" ")) for x in delegates['last']])
print("last name length distribution")
print(lastnamelen.value_counts())

list of states: [' Connecticut' ' Delaware' ' Georgia' ' Maryland' ' Massachusetts'
 ' New Hampshire' ' New Jersey' ' New York' ' North Carolina'
 ' Pennsylvania' ' South Carolina' ' Virginia']
number of states: 12 

 Pennsylvania      8
 Virginia          7
 North Carolina    5
 Maryland          5
 Delaware          5
 New Jersey        5
 Massachusetts     4
 Georgia           4
 South Carolina    4
 New York          3
 Connecticut       3
 New Hampshire     2
Name: state, dtype: int64 

response types: [' yes' 'no' ' no']
num response types: 3 

first name length distribution
1    47
2     7
4     1
dtype: int64
last name length distribution
2    47
3     7
4     1
dtype: int64


In [5]:
#might want to separate names with parentheses into last name 1/2
sumaltfirst = sum(["(" in x for x in delegates['first']])
print("number of alt first names:", sumaltfirst)

sumaltlast = sum(["(" in x for x in delegates['last']])
print("number of alt last names:", sumaltlast)

number of alt first names: 0
number of alt last names: 4


In [6]:
#cleaning strings
delegates['state'] = [x.strip() for x in delegates.state]
delegates['sign'] = [x.strip() for x in delegates.sign]
delegates['first'] = [x.strip() for x in delegates['first']]
delegates['last'] = [x.strip() for x in delegates['last']]

In [7]:
delegates.head()

Unnamed: 0,first,last,state,sign
0,William Samuel,Johnson,Connecticut,yes
1,Roger,Sherman,Connecticut,yes
2,Oliver,Ellsworth (Elsworth),Connecticut,no
3,George,Read,Delaware,yes
4,Gunning,Bedford Jr.,Delaware,yes


In [9]:
responses = delegates.sign.unique()
print("response types:", responses)
print("num response types:",len(responses), "\n")

response types: ['yes' 'no']
num response types: 2 



In [23]:
loans = pd.read_csv("Data/Pre1790/loan_office_certificates_9_states_cleaned.csv")

In [25]:
loans.drop('Unnamed: 0', axis = 1)

Unnamed: 0,State,Year,Month,Day,Title 1,First Name 1,Last Name 1,Title 2,First Name 2,Last Name 2,Title 3,First Name 3,Last Name 3,Face Value,Specie Value,notes,original text
0,1,1778,3,13.0,Col,Joshua,Wentworth,,,,,,,200,108.27780,,
1,1,1777,9,2.0,,Charles,Treadwell,,,,,,,200,199.37780,,
2,1,1777,9,10.0,,Stephen,Cleverly,,,,,,,200,194.51110,,
3,1,1777,9,13.0,,David,Griffith,,,,,,,200,192.71110,,
4,1,1777,9,15.0,,John,Mansfield,,,,,,,200,191.52220,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80908,9,1780,1,1.0,,John,Hay,,,,,,,400,13.61667,,
80909,9,1779,5,25.0,,Isaac,Smith,,Thoroughgood,Smith,,,,300,22.86250,Isaac & Thoroughgood Smith,Isaac & Thoroughgood Smith
80910,9,1779,5,25.0,,Isaac,Smith,,Thoroughgood,Smith,,,,800,60.96667,Isaac & Thoroughgood Smith,Isaac & Thoroughgood Smith
80911,9,1779,3,13.0,,Samuel,Oldham,,,,,,,500,48.11806,,


In [24]:
loans.head()

SyntaxError: invalid character in identifier (<ipython-input-24-9c11566b4167>, line 1)

In [12]:
loans.columns

Index(['Unnamed: 0', 'State', 'Year', 'Month', 'Day', 'Title 1',
       'First Name 1 ', 'Last Name 1 ', 'Title 2', 'First Name 2',
       'Last Name 2', 'Title 3', 'First Name 3', 'Last Name 3', 'Face Value',
       'Specie Value ', 'notes', 'original text'],
      dtype='object')

In [14]:
loans = loans.rename(columns = {
                                "First Name 1 ":"First Name 1",
                                "Last Name 1 ":"Last Name 1"})
loans.head()

Unnamed: 0.1,Unnamed: 0,State,Year,Month,Day,title_1,first_name_1,last_name_1,title_2,first_name_2,last_name_2,Title 3,First Name 3,Last Name 3,face_value,specie_value,notes,original text
0,0,1,1778,3,13.0,Col,Joshua,Wentworth,,,,,,,200,108.2778,,
1,1,1,1777,9,2.0,,Charles,Treadwell,,,,,,,200,199.3778,,
2,2,1,1777,9,10.0,,Stephen,Cleverly,,,,,,,200,194.5111,,
3,3,1,1777,9,13.0,,David,Griffith,,,,,,,200,192.7111,,
4,4,1,1777,9,15.0,,John,Mansfield,,,,,,,200,191.5222,,


<h3> Testing FuzzyWuzzy </h3>

In [113]:
import fuzzywuzzy
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [121]:
#initial test run
delegate_names = [str(f) + " " + str(l) for f,l in 
                  zip(delegates['first'], delegates['last'])]
delegate_names[:5]

['William Samuel Johnson',
 'Roger Sherman',
 'Oliver Ellsworth (Elsworth)',
 'George Read',
 'Gunning Bedford Jr.']

In [132]:
#Ratio
print("Testing using ratio")
print(delegate_names[2])
ratio1 = fuzz.ratio(delegate_names[2].lower(), "Oliver Ellsworth".lower())
ratio2 = fuzz.ratio(delegate_names[2].lower(), "Oliver Elsworth".lower())
print('Similarity score 1: {}'.format(ratio1))
print('Similarity score 2: {}\n'.format(ratio2))

print(delegate_names[0])
ratio1 = fuzz.ratio(delegate_names[0].lower(), "William Samuel Johnson".lower())
ratio2 = fuzz.ratio(delegate_names[0].lower(), "William Johnson".lower())
print('Similarity score 1: {}'.format(ratio1))
print('Similarity score 2: {}\n'.format(ratio2))

Testing using ratio
Similarity score 1: 74
Similarity score 2: 71

Similarity score 1: 100
Similarity score 2: 81



In [135]:
#Partial Ratio
print("Testing using partial ratio")
print(delegate_names[2])
ratio1 = fuzz.partial_ratio(delegate_names[2].lower(), "Oliver Ellsworth".lower())
ratio2 = fuzz.partial_ratio(delegate_names[2].lower(), "Oliver Elsworth".lower())
print('Similarity score 1: {}'.format(ratio1))
print('Similarity score 2: {}\n'.format(ratio2))

print(delegate_names[0])
ratio1 = fuzz.partial_ratio(delegate_names[0].lower(), "William Samuel Johnson".lower())
ratio2 = fuzz.partial_ratio(delegate_names[0].lower(), "William Johnson".lower())
print('Similarity score 1: {}'.format(ratio1))
print('Similarity score 2: {}\n'.format(ratio2))

Testing using partial ratio
Oliver Ellsworth (Elsworth)
Similarity score 1: 100
Similarity score 2: 93

William Samuel Johnson
Similarity score 1: 100
Similarity score 2: 67



In [136]:
#Token Sort Ratio
print("Testing using token sort ratio")
print(delegate_names[2])
ratio1 = fuzz.token_sort_ratio(delegate_names[2].lower(), "Oliver Ellsworth".lower())
ratio2 = fuzz.token_sort_ratio(delegate_names[2].lower(), "Oliver Elsworth".lower())
print('Similarity score 1: {}'.format(ratio1))
print('Similarity score 2: {}\n'.format(ratio2))

print(delegate_names[0])
ratio1 = fuzz.token_sort_ratio(delegate_names[0].lower(), "William Samuel Johnson".lower())
ratio2 = fuzz.token_sort_ratio(delegate_names[0].lower(), "William Johnson".lower())
print('Similarity score 1: {}'.format(ratio1))
print('Similarity score 2: {}\n'.format(ratio2))

Testing using token sort ratio
Oliver Ellsworth (Elsworth)
Similarity score 1: 78
Similarity score 2: 75

William Samuel Johnson
Similarity score 1: 100
Similarity score 2: 81



In [192]:
delegates['full_name'] = delegates['first'] + " " + delegates['last']
loan_names = []
for x, y in zip(loans['first_name_1'],loans['last_name_1']):
    try:
        math.isnan(y)
        try:
            math.isnan(x)
            loan_names.append(None)
        except:
            loan_names.append(x)
    except:
        loan_names.append(str(x) + " " + str(y))
loans['full_name_1'] = loan_names

In [187]:
def fuzzy_merge(df_1, df_2, key1, key2, threshold=90, limit=2):
    """
    :param df_1: the left table to join
    :param df_2: the right table to join
    :param key1: key column of the left table
    :param key2: key column of the right table
    :param threshold: how close the matches should be to return a match, based on Levenshtein distance
    :param limit: the amount of matches that will get returned, these are sorted high to low
    :return: dataframe with boths keys and matches
    """
    s = df_2[key2].tolist()
    
    m = df_1[key1].apply(lambda x: process.extract(x, s, limit=limit))    
    df_1['matches'] = m
    
    m2 = df_1['matches'].apply(lambda x: ', '.join([i[0] for i in x if i[1] >= threshold]))
    df_1['matches'] = m2

    return df_1

In [210]:
loans

Unnamed: 0,State,Year,Month,Day,title_1,first_name_1,last_name_1,title_2,first_name_2,last_name_2,face_value,specie_value,full_name_1
0,1,1778,3,13.0,Col,Joshua,Wentworth,,,,200,108.27780,Joshua Wentworth
1,1,1777,9,2.0,,Charles,Treadwell,,,,200,199.37780,Charles Treadwell
2,1,1777,9,10.0,,Stephen,Cleverly,,,,200,194.51110,Stephen Cleverly
3,1,1777,9,13.0,,David,Griffith,,,,200,192.71110,David Griffith
4,1,1777,9,15.0,,John,Mansfield,,,,200,191.52220,John Mansfield
...,...,...,...,...,...,...,...,...,...,...,...,...,...
80908,9,1780,1,1.0,,John Hay,,,,,400,13.61667,John Hay
80909,9,1779,5,25.0,,Isaac & Thoroughgood Smith,,,,,300,22.86250,Isaac & Thoroughgood Smith
80910,9,1779,5,25.0,,Isaac & Thoroughgood Smith,,,,,800,60.96667,Isaac & Thoroughgood Smith
80911,9,1779,3,13.0,,Samuel Oldham,,,,,500,48.11806,Samuel Oldham


In [209]:
fuzzy_merge(delegates, loans, "full_name", "full_name_1", threshold = 90, limit = 5)

Unnamed: 0,first,last,state,sign,full_name,matches
0,William Samuel,Johnson,Connecticut,yes,William Samuel Johnson,"William Johnson, William Johnson, William John..."
1,Roger,Sherman,Connecticut,yes,Roger Sherman,"Roger Sherman, Roger Sherman, Roger Sherman, R..."
2,Oliver,Ellsworth (Elsworth),Connecticut,no,Oliver Ellsworth (Elsworth),"Oliver ElsworthEllsworth, Oliver Ellsworth"
3,George,Read,Delaware,yes,George Read,"George Read, George Read, George George, Georg..."
4,Gunning,Bedford Jr.,Delaware,yes,Gunning Bedford Jr.,"Gunning Bedford, Gunning Bedford, Gunning Bedf..."
5,John,Dickinson,Delaware,yes,John Dickinson,"John Dickinson, John Dickinson, John Dickinson..."
6,Richard,Bassett,Delaware,yes,Richard Bassett,Richard
7,Jacob,Broom,Delaware,yes,Jacob Broom,Jacob Broom
8,William,Few,Georgia,yes,William Few,"William, William"
9,Abraham,Baldwin,Georgia,yes,Abraham Baldwin,
