In [20]:
import numpy as np
import pandas as pd
import fuzzywuzzy
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import time

In [4]:
#setting header = 2 starts reading at line 3 so you ignore the junk before

In [11]:
df1 = pd.read_excel('Data/constitutional_convention_1787.xlsx', header = 2)
df2 = pd.read_csv('loan_office_certificates_9_states_cleaned.csv').drop('Unnamed: 0', axis = 1)

In [12]:
df1.head()

Unnamed: 0,first name,last name,state,sign?
0,William Samuel,Johnson,Connecticut,yes
1,Roger,Sherman,Connecticut,yes
2,Oliver,Ellsworth (Elsworth),Connecticut,no
3,George,Read,Delaware,yes
4,Gunning,Bedford Jr.,Delaware,yes


### Optimization 1: Loops

In [13]:
#right now you are running four for loops, each one embedded in the other
#instead of running four you can run two, by using full name (first name + last name)
#this way you are only making one score comparison, not two

In [16]:
df1['full name 1'] = df1['first name'] + " " + df1['last name']
df2['full name 1'] = df2['First Name 1 '] + " " + df2['Last Name 1 ']

In [17]:
#reduces your code to the below

In [23]:
start_time = time.time()
for fullname1 in df1['full name 1']:
    for fullname2 in df2['full name 1']:
        if fuzz.token_set_ratio(fullname1, fullname2) > 90:
            print(fullname1, "might be the same person as ", fullname2)
print("This took", time.time() - start_time, "to run")

William Samuel  Johnson might be the same person as  Samuel ??
William Samuel  Johnson might be the same person as  Samuel Johnson
William Samuel  Johnson might be the same person as  Samuel Johnson
William Samuel  Johnson might be the same person as  Samuel Johnson
William Samuel  Johnson might be the same person as  Samuel Johnson
William Samuel  Johnson might be the same person as  Samuel Johnson
William Samuel  Johnson might be the same person as  Samuel Johnson
William Samuel  Johnson might be the same person as  William Johnson
William Samuel  Johnson might be the same person as  William Johnson
William Samuel  Johnson might be the same person as  William Johnson
William Samuel  Johnson might be the same person as  William Johnson
William Samuel  Johnson might be the same person as  William Johnson
William Samuel  Johnson might be the same person as  William Johnson
William Samuel  Johnson might be the same person as  William Johnson
William Samuel  Johnson might be the same pers

### Optimization 2: Number of names

In [19]:
#instead of iterating through every name, just iterate through a set of unique names - that way we avoid repetitive comparisons
#if you look at the results from above you see a lot of repetition

In [24]:
delegate_names = df1['full name 1'].unique()
loan_names = df2['full name 1'].unique()

In [25]:
start_time = time.time()
for fullname1 in delegate_names:
    for fullname2 in loan_names:
        if fuzz.token_set_ratio(fullname1, fullname2) > 90:
            print(fullname1, "might be the same person as ", fullname2)
print("This took", time.time() - start_time, "to run")

William Samuel  Johnson might be the same person as  Samuel ??
William Samuel  Johnson might be the same person as  Samuel Johnson
William Samuel  Johnson might be the same person as  William Johnson
Roger  Sherman might be the same person as  Roger Sherman
Oliver  Ellsworth (Elsworth) might be the same person as  Oliver ElsworthEllsworth
Oliver  Ellsworth (Elsworth) might be the same person as  Oliver Ellsworth
George  Read might be the same person as  George George
George  Read might be the same person as  George Reed
George  Read might be the same person as  George Read
George  Read might be the same person as  George Reid
Gunning  Bedford Jr. might be the same person as  Gunning Bedford
John  Dickinson might be the same person as  John Dickinson
John  Dickinson might be the same person as  John Dickenson
Jacob  Broom might be the same person as  Jacob Broom
William L.  Pierce might be the same person as  William Pierce
James  McHenry might be the same person as  Henry James
James  

### Optimization 3: Vectorized for loops

In [26]:
#you can also use vectorization - by calling functions like apply on the pandas series itself instead of iterating through the series/list
#this optimization is minimal - goes from 22.1 to 21.9 seconds

In [27]:
def similarNames(x, lst):
    for name in lst:
        if fuzz.token_set_ratio(x, name) > 90:
            print(x, "might be the same person as ", name)

In [29]:
start_time = time.time()
delegate_series = pd.Series(delegate_names)
delegate_series.apply(lambda x: similarNames(x, loan_names))
print("This took", time.time() - start_time, "to run")

William Samuel  Johnson might be the same person as  Samuel ??
William Samuel  Johnson might be the same person as  Samuel Johnson
William Samuel  Johnson might be the same person as  William Johnson
Roger  Sherman might be the same person as  Roger Sherman
Oliver  Ellsworth (Elsworth) might be the same person as  Oliver ElsworthEllsworth
Oliver  Ellsworth (Elsworth) might be the same person as  Oliver Ellsworth
George  Read might be the same person as  George George
George  Read might be the same person as  George Reed
George  Read might be the same person as  George Read
George  Read might be the same person as  George Reid
Gunning  Bedford Jr. might be the same person as  Gunning Bedford
John  Dickinson might be the same person as  John Dickinson
John  Dickinson might be the same person as  John Dickenson
Jacob  Broom might be the same person as  Jacob Broom
William L.  Pierce might be the same person as  William Pierce
James  McHenry might be the same person as  Henry James
James  