# Cleaning Names

The purpose of this notebook is to clean the names of individuals. All the problems that we aim to fix in this notebook are listed [here](https://docs.google.com/document/d/1pcSQfWNll6K9tl-_rB4lztN0TsZsclU9vOnbyQob-Zs/edit).

## Cleaning Table Columns

Not all tables share the same columns, so it's important to clean them first. We do this by standardizing the column names across all the debt tables. Next, we add a state column to each table. This column will be useful when we merge all these tables together at the end.

In [2]:
# import all the necessary packages
import pandas as pd 
import numpy as np
import re
from nameparser import HumanName

In [211]:
def clean_table(table, drp_cols):
    """Drop bookkeeping columns and flatten a two-row header into lowercase names.

    Each remaining column label is expected to be a ``(top, sub)`` tuple coming
    from a two-level header. It becomes ``"top | sub"`` (lowercased, stripped),
    or just ``"top"`` when the sub-header is empty or is one of pandas'
    auto-generated ``"Unnamed: ..."`` placeholders.

    Parameters
    ----------
    table : pandas.DataFrame
        Frame to clean. It is modified in place.
    drp_cols : list
        Column labels to drop before flattening.

    Returns
    -------
    pandas.DataFrame
        The same ``table`` object, for convenient reassignment by callers.
    """
    # `columns=` already selects the axis, so no separate `axis` argument is needed.
    table.drop(columns=drp_cols, inplace=True)

    def _flatten(label):
        # Tolerate plain (non-tuple) labels by treating them as having no sub-header.
        if isinstance(label, tuple):
            top = label[0]
            sub = label[1] if len(label) > 1 else ''
        else:
            top, sub = label, ''

        # Header cells are not guaranteed to be strings (e.g. a numeric cell),
        # so convert before using string operations on them.
        top = str(top).lower().strip()
        sub = '' if pd.isna(sub) else str(sub)
        if 'Unnamed' in sub:
            sub = ''
        if sub == '':
            return top

        name = top + ' | ' + sub.lower().strip()
        # A whitespace-only sub-header on the state column collapses to 'state'.
        return 'state' if name == 'state | ' else name

    # Build every new name in one pass instead of renaming column by column.
    table.columns = [_flatten(label) for label in table.columns.to_flat_index()]
    return table

In [228]:
# Mapping from the flattened column names found in individual spreadsheets to the
# standard names shared by all debt tables. Misspelled source headers
# ('thorugh') are kept as keys on purpose: they are what the files contain.
# NOTE(review): later cells reassign the name `changes` to unrelated dictionaries,
# so re-running the column-cleaning cell after them would use the wrong mapping.
changes = {
    # second holder of a certificate
    'to whom due (if second name) | first name':'to whom due | first name.1',
    'to whom due (if second name) | last name':'to whom due | last name.1',
    'to whom due (if second name) | title':'to whom due | title.1', 
    # amount and date columns filed under the wrong top-level header
    'time when the debt became due | dollars':'amount | dollars',
    'time when the debt became due | 90th':'amount | 90th',
    'time when the debt became due | date':'time when the debt became due | day',
    # line-strike columns, including the misspelled header
    'line strike thorugh? | yes?':'line strike through? | yes?',
    'line strike thorugh? | notes':'notes',
    'line strike through? | notes':'notes',
    'line strike thorugh? | note':'line strike through? | note',
    'line strike through?' : 'line strike through? | note',
    'date of the certificate | date':'date of the certificate | day',
    # "to whom issued" is treated as "to whom due"
    'to whom issued | title':'to whom due | title',
    'to whom issued | first name':'to whom due | first name',
    'to whom issued | last name':'to whom due | last name',
    'comm of interest | year':'time when the debt became due | year',
    'comm of interest | month':'time when the debt became due | month',
    'comm of interest | date':'time when the debt became due | day',
    'comm of interest | dollars':'amount | dollars',
    'comm of interest | 90th':'amount | 90th',
    'comm of interest | 10th':'amount | 10th',
    'w | dollars':'amount | dollars',
    'w | 90th':'amount | 90th',
    'w | 8th':'amount | 8th',
    # single-row headers
    'year':'date of the certificate | year', 
    'month':'date of the certificate | month',
    'day':'date of the certificate | day', 
    'title 1':'to whom due | title', 
    'first name 1':'to whom due | first name',
    'last name 1':'to whom due | last name',
    'title 2':'to whom due | title.1', 
    'first name 2':'to whom due | first name.1',
    'last name 2':'to whom due | last name.1',
    'specie value':'amount in specie | dollars',
    'face value':'amount | dollars'
}

In [229]:
# Read every raw workbook. The liquidated-debt sheets carry a two-row header
# (hence the pair of header row numbers); the loan-office sheet has a single row.
ct_debt = pd.read_excel("../../data_raw/pre1790/liquidated_debt_certificates_CT.xlsx", header=[10,11])
de_debt = pd.read_excel("../../data_raw/pre1790/liquidated_debt_certificates_DE.xlsx", header=[9,10])
ma_debt = pd.read_excel("../../data_raw/pre1790/liquidated_debt_certificates_MA.xlsx", header=[10,11])
nh_debt = pd.read_excel("../../data_raw/pre1790/liquidated_debt_certificates_NH.xlsx", header=[10,11])
nj_debt = pd.read_excel("../../data_raw/pre1790/liquidated_debt_certificates_NJ.xlsx", header=[9,10])
ny_debt = pd.read_excel("../../data_raw/pre1790/liquidated_debt_certificates_NY.xlsx", header=[10,11])
pa_stelle_debt = pd.read_excel("../../data_raw/pre1790/liquidated_debt_certificates_PA_stelle.xlsx", header=[10,11])
pa_story_debt = pd.read_excel("../../data_raw/pre1790/liquidated_debt_certificates_PA_story.xlsx", header=[10,11])
ri_debt = pd.read_excel("../../data_raw/pre1790/liquidated_debt_certificates_RI.xlsx", header=[10,11])
loan_9_debt = pd.read_excel("../../data_raw/pre1790/loan_office_certificates_9_states.xlsx", header=0)
marine_debt = pd.read_excel("../../data_raw/pre1790/Marine_Liquidated_Debt_Certificates.xlsx", header=[10, 11])

# Tag each single-state table with its state abbreviation. The loan-office and
# marine tables are not tagged here.
for state_frame, state_code in (
    (ct_debt, 'ct'),
    (de_debt, 'de'),
    (ma_debt, 'ma'),
    (nh_debt, 'nh'),
    (nj_debt, 'nj'),
    (ny_debt, 'ny'),
    (pa_stelle_debt, 'pa'),
    (pa_story_debt, 'pa'),
    (ri_debt, 'ri'),
):
    state_frame['state'] = state_code

def _rename_by_position(frame, names_by_position):
    """Replace the column labels of `frame` at the given integer positions.

    A fresh label list is assigned instead of writing into
    ``frame.columns.values``: an Index is meant to be immutable, and mutating
    its backing array in place bypasses pandas' own bookkeeping for the labels.
    """
    labels = list(frame.columns)
    for position, name in names_by_position.items():
        labels[position] = name
    frame.columns = labels


def _finish_debt_table(frame, org_file, names_by_position=None):
    """Apply the shared `changes` renames, record the source file, print the dtypes.

    `names_by_position` optionally maps column positions to labels that must be
    set by position after the name-based renames have run.
    """
    # manual fixes
    frame.rename(columns=changes, inplace=True)
    if names_by_position:
        _rename_by_position(frame, names_by_position)
    frame['org_file'] = org_file
    print(frame.dtypes)
    print()
    return frame


# new york: only the header is flattened here; the `changes` renames are not applied
ny_drp_cols = ['Page', 'JPEG number', 'Number']
ny_debt = clean_table(ny_debt, ny_drp_cols)

# connecticut 
ct_drp_cols = ['Register Page', 'JPEG number', 'Number']
ct_debt = _finish_debt_table(clean_table(ct_debt, ct_drp_cols),
                             'liquidated_debt_certificates_CT.xlsx')

# pennsylvania: stelle
pa_stelle_drp = ['Register Page', 'JPEG number', 'No.']
pa_stelle_debt = _finish_debt_table(clean_table(pa_stelle_debt, pa_stelle_drp),
                                    'liquidated_debt_certificates_PA_stelle.xlsx')

# pennsylvania: story (columns 14 and 15 are relabelled by position)
pa_story_drp = ['Register Page', 'JPEG number', 'No.']
pa_story_debt = _finish_debt_table(
    clean_table(pa_story_debt, pa_story_drp),
    'liquidated_debt_certificates_PA_story.xlsx',
    names_by_position={14: 'amount in specie | dollars', 15: 'amount in specie | cents'},
)

# rhode island 
ri_drp = ['Register Page', 'JPEG number', 'Number']
ri_debt = _finish_debt_table(clean_table(ri_debt, ri_drp),
                             'liquidated_debt_certificates_RI.xlsx')

# 9 states loan certificates: single-row header, and the state is stored as a
# number, so clean_table is skipped and the code is translated instead.
state_nums = {
    1: 'nh', 2: 'ma', 3: 'ct', 4: 'ny', 5: 'nj', 6: 'pa', 7: 'de', 8: 'md', 9: 'va'
}
# An unknown state number raises KeyError rather than passing through silently.
loan_9_debt['State'] = loan_9_debt['State'].apply(lambda state_num: state_nums[state_num])
loan_9_debt.rename(columns=lambda x: x.lower().strip(), inplace=True)
loan_9_debt = _finish_debt_table(loan_9_debt, 'loan_office_certificates_9_states.xlsx')

# marine debt (columns 12 and 13 are relabelled by position)
marine_drp = ['Page', 'JPEG number', 'Number']
marine_debt = _finish_debt_table(
    clean_table(marine_debt, marine_drp),
    'Marine_Liquidated_Debt_Certificates.xlsx',
    names_by_position={12: 'total dollars | notes', 13: 'total dollars | notes.1'},
)


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


letter                                    object
date of the certificate | month          float64
date of the certificate | day            float64
date of the certificate | year           float64
to whom due | first name                  object
to whom due | last name                   object
to whom due | title                       object
to whom due | first name.1                object
to whom due | last name.1                 object
to whom due | title.1                    float64
time when the debt became due | month    float64
time when the debt became due | day      float64
time when the debt became due | year     float64
amount | dollars                         float64
amount | 90th                            float64
line strike through? | yes?              float64
line strike through? | note               object
notes                                     object
state                                     object
org_file                                  object
dtype: object

lette

  warn(msg)
  table.drop(columns=drp_cols, inplace=True, axis=1)
  table.drop(columns=drp_cols, inplace=True, axis=1)
  table.drop(columns=drp_cols, inplace=True, axis=1)
  table.drop(columns=drp_cols, inplace=True, axis=1)
  table.drop(columns=drp_cols, inplace=True, axis=1)
  table.drop(columns=drp_cols, inplace=True, axis=1)


In [230]:
# Tables that go into the aggregate. ny_debt and the de/ma/nh/nj tables are not in this list.
debt_files = [ct_debt, pa_stelle_debt, pa_story_debt, ri_debt, loan_9_debt, marine_debt]

In [231]:
# Keep each row's index within its own table; the concat below renumbers the rows.
for debt_table in debt_files:
    debt_table['org_index'] = debt_table.index

In [232]:
# Stack all tables; ignore_index=True gives the result a fresh 0..n-1 index.
agg_debt = pd.concat(debt_files, ignore_index=True)

In [233]:
# List the merged column names, one per line.
for column_name in agg_debt.columns:
    print(column_name)

letter
date of the certificate | month
date of the certificate | day
date of the certificate | year
to whom due | first name
to whom due | last name
to whom due | title
to whom due | first name.1
to whom due | last name.1
to whom due | title.1
time when the debt became due | month
time when the debt became due | day
time when the debt became due | year
amount | dollars
amount | 90th
line strike through? | yes?
line strike through? | note
notes
state
org_file
org_index
amount | 10th
exchange
amount in specie | dollars
amount in specie | cents
amount | 8th
delivered | month
delivered | day
delivered | year
total dollars | notes
total dollars | notes.1


In [234]:
# Write the merged table relative to the notebook's working directory.
# NOTE(review): assumes the clean_cols/ directory already exists — confirm.
agg_debt.to_csv('clean_cols/agg_debt_david.csv')

In [44]:
# Blank out 'Unnamed' sub-headers on ct_debt and print the labels that were changed.
# column[1] is the sub-header only while labels are (top, sub) tuples; on plain
# string labels it is simply the second character.
# NOTE(review): this cell's execution count (44) predates the cleaning cells above,
# so it looks like leftover exploration — confirm it is still needed.
for column in ct_debt.columns:
    if 'unnamed' in column[1].lower():
        ct_debt.rename(columns={column:(column[0], '')}, inplace=True)
        print(column)
        
print(ct_debt.columns)

Index(['letter', 'date of the certificate | month',
       'date of the certificate | day', 'date of the certificate | year',
       'to whom due | first name', 'to whom due | last name',
       'to whom due | title', 'to whom due | first name.1',
       'to whom due | last name.1', 'to whom due | title.1',
       'time when the debt became due | month',
       'time when the debt became due | day',
       'time when the debt became due | year', 'amount | dollars',
       'amount | 90th', 'line strike thorugh? | yes?',
       'line strike thorugh? | note', 'line strike thorugh? | notes', 'state'],
      dtype='object')


In [6]:
# NOTE(review): the saved output shows flattened names, but no visible cell runs
# clean_table on nj_debt — presumably done in a cell that was since removed; verify.
print(nj_debt.columns)

Index(['date of the certificate | month', 'date of the certificate | day',
       'date of the certificate | year', 'to whom due | title',
       'to whom due | first name', 'to whom due | last name',
       'to whom due | title.1', 'person 2 (if two owners) | first name',
       'person 2 (if two owners) | last name',
       'time when the debt became due | day',
       'time when the debt became due | month',
       'time when the debt became due | year',
       'time when the debt became due | dollar ',
       'time when the debt became due | 90th', 'state'],
      dtype='object')


In [7]:
# Check the column types of the New York table after header cleaning.
print(ny_debt.dtypes)

date of the certificate | month          float64
date of the certificate | day            float64
date of the certificate | year           float64
to whom due | first name                  object
to whom due | last name                   object
to whom due | title                       object
to whom due | first name.1                object
to whom due | last name.1                 object
to whom due | title.1                    float64
time when the debt became due | month    float64
time when the debt became due | day      float64
time when the debt became due | year      object
amount | dollars                         float64
amount | 90th                             object
state                                     object
dtype: object


## Company Names

There are multiple kinds of companies. 

```James Vernon & Co.``` These are pretty simple to deal with. If the first name column contains '& co' or '& others' anywhere in the string, the entry is most likely a company. We drop that marker and parse what remains as a person's name.

In [8]:
# dictionary of manual changes i have to make 
# Manual corrections applied to the first-name cell before company detection.
# NOTE: this reuses (and so replaces) the name `changes` that the column-cleaning
# section uses for its rename mapping.
changes = {
    'Henry Mc Clellen & Henry & co' : 'Henry Mc Clellen & Co'
}

In [9]:
def handle_comp_name(row):    
    """Turn a company-style entry into the name of the person it is held under.

    The first-name cell is first corrected via the module-level `changes`
    mapping. If it then contains '& co', '& others' or '& several others'
    (case-insensitive), those markers are removed, the remainder is parsed with
    HumanName, and the row's first name, last name and 'under company' flag are
    overwritten. The parsed name is lowercase because parsing is done on the
    lowercased string. Rows without a marker are returned unchanged.

    Parameters
    ----------
    row : pandas.Series
        Must contain 'to whom due | first name'.

    Returns
    -------
    pandas.Series
        The same row object, possibly modified.
    """
    fname = row['to whom due | first name']
    
    if fname in changes:
        print(fname)
        fname = changes[fname]
    
    fname_c = str(fname).lower()
    # Longest marker first: '& several others' contains neither '& co' nor
    # '& others', so it has to be stripped explicitly.
    markers = ('& several others', '& others', '& co')
    if not any(marker in fname_c for marker in markers):
        return row

    for marker in markers:
        fname_c = fname_c.replace(marker, '')

    name = HumanName(fname_c)
    row['to whom due | first name'] = name.first
    row['to whom due | last name'] = name.last
    row['under company'] = True # note that the original debt entry was held by a company 
    print(row)

    return row

# Run the company clean-up over the three columns it reads and writes, and store
# the result back on ny_debt.
company_cols = ['to whom due | first name', 'to whom due | last name', 'under company']
ny_debt['under company'] = np.nan
ny_debt[company_cols] = ny_debt[company_cols].apply(handle_comp_name, axis=1)

to whom due | first name     henry
to whom due | last name     wisner
under company                 True
Name: 491, dtype: object
to whom due | first name     henry
to whom due | last name     wisner
under company                 True
Name: 492, dtype: object
to whom due | first name     henry
to whom due | last name     wisner
under company                 True
Name: 493, dtype: object
to whom due | first name     henry
to whom due | last name     wisner
under company                 True
Name: 494, dtype: object
to whom due | first name     henry
to whom due | last name     wisner
under company                 True
Name: 495, dtype: object
to whom due | first name     henry
to whom due | last name     wisner
under company                 True
Name: 496, dtype: object
to whom due | first name     henry
to whom due | last name     wisner
under company                 True
Name: 497, dtype: object
to whom due | first name     henry
to whom due | last name     wisner
under company       

## Cleaning Entries with Two Names

There are debt entries that have two names in a single cell: ```NY_2422: Messes Williamson & Beckman```. The plan is to split the name across the first name and last name columns.  

In [10]:
# Manual overrides for two-name entries, keyed by the lowercased cell text.
# Each value is [first name, last name] to write instead of the parsed result.
# NOTE: this again replaces the earlier dictionaries bound to `changes`.
changes = {
    'van zandt & kittletas' : ['', 'van zandt | kittletas'],
    'trustees of & davids church':['trustees of & davids church', '']
}

In [11]:
def handle_two_name(row):
    """Split a first-name cell that holds two names joined by '&' or 'and'.

    The lowercased cell is split on a whitespace-delimited '&' or 'and'; the
    first two pieces are parsed with HumanName and their first and last names
    are written back joined by ' | ' (or on their own when only one side has
    that part). Entries listed in the module-level `changes` mapping use its
    [first name, last name] pair instead. Rows that are changed also get
    'multiple persons' set to True.

    Parameters
    ----------
    row : pandas.Series
        Must contain 'to whom due | first name'.

    Returns
    -------
    pandas.Series
        The same row object, possibly modified.
    """
    name = str(row['to whom due | first name']).lower()
    if (' & ' not in name) and (' and ' not in name):
        return row

    def _join(left, right):
        # ' | ' only between two non-empty parts; otherwise whichever part exists.
        return ' | '.join(part for part in (left, right) if part != '')

    if name in changes:
        row['to whom due | first name'] = changes[name][0]
        row['to whom due | last name'] = changes[name][1]
    else:
        # Require whitespace around the separator so that 'and' inside a word
        # (e.g. 'zandt', 'alexander') does not split the name.
        pieces = re.split(r'\s+(?:&|and)\s+', name)
        human_name_1 = HumanName(pieces[0].strip())
        human_name_2 = HumanName(pieces[1].strip())
        row['to whom due | first name'] = _join(human_name_1.first, human_name_2.first)
        row['to whom due | last name'] = _join(human_name_1.last, human_name_2.last)

    # Flag this row only; assigning to the whole frame would mark every entry.
    row['multiple persons'] = True

    print("old: " + name)
    print("new fn: " + row['to whom due | first name'])
    print("new ln: " + row['to whom due | last name'] +"\n")

    return row

# Add the flag column, then run the two-name split over every row.
# NOTE(review): the frame returned by apply is only displayed, not assigned back,
# so ny_debt keeps its original name columns — confirm whether that is intended.
ny_debt['multiple persons'] = np.nan
ny_debt.apply(lambda row: handle_two_name(row), axis=1)

old: harsen & ham
new fn: harsen | ham
new ln: 

old: harsen & ham
new fn: harsen | ham
new ln: 

old: messes smith & van buren
new fn: messes | van
new ln: smith | buren

old: messes bogart & van beuren
new fn: messes | van
new ln: bogart | beuren

old: messes smith & van buren
new fn: messes | van
new ln: smith | buren

old: messes smith & van buren
new fn: messes | van
new ln: smith | buren

old: messes williamson & beckman
new fn: messes | beckman
new ln: williamson

old: robinson & hale
new fn: robinson | hale
new ln: 

old: melgret & george ox
new fn: melgret | george
new ln: ox

old: robinson & hale
new fn: robinson | hale
new ln: 

old: robinson & hale
new fn: robinson | hale
new ln: 

old: robinson & hale
new fn: robinson | hale
new ln: 

old: robinson & hale
new fn: robinson | hale
new ln: 

old: bagart & dawrv
new fn: bagart | dawrv
new ln: 

old: quackenbush & dowe
new fn: quackenbush | dowe
new ln: 

old: whitbeck & fonda
new fn: whitbeck | fonda
new ln: 

old: van zandt &

Unnamed: 0,date of the certificate | month,date of the certificate | day,date of the certificate | year,to whom due | first name,to whom due | last name,to whom due | title,to whom due | first name.1,to whom due | last name.1,to whom due | title.1,time when the debt became due | month,time when the debt became due | day,time when the debt became due | year,amount | dollars,amount | 90th,state,under company,multiple persons
0,2.0,20.0,1784.0,John,Newkuk,,,,,4.0,20.0,1780,2.000000e+00,,ny,,
1,2.0,20.0,1784.0,Mathias,Teller,,,,,4.0,11.0,1780,8.000000e+00,87,ny,,
2,2.0,20.0,1784.0,Seth,Marvin,,,,,4.0,14.0,1780,1.800000e+01,,ny,,
3,2.0,20.0,1784.0,James,Gallaway,,,,,5.0,31.0,1780,2.800000e+01,80,ny,,
4,2.0,20.0,1784.0,Israel,Rogers,,,,,5.0,17.0,1780,1.500000e+01,,ny,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7306,3.0,31.0,1787.0,John,Myers,,,,,1.0,1.0,1777,2.190000e+02,33,ny,,
7307,3.0,31.0,1787.0,Luther,Halsey,,,,,1.0,1.0,1777,3.900000e+01,52,ny,,
7308,,,,,,,,,,,,,,,ny,,
7309,,,,,,,,,,,,,1.232992e+06,,ny,,


## Handle Abbreviations of a Name

There are individuals who have a handwritten abbreviation of a name in their debt entry. Fix these names. There will be a dictionary of abbreviations. Just check if any of the debt entries are in the dictionary and change it if needed. 

In [12]:
# Abbreviated first name -> full first name. Lookups are exact and case-sensitive.
abbreviations = {
    'And':'Andrew', 'Ant':'Anthony', 'Bart':'Bartholomew', 'Cha':'Charles', 'Dor':'Dorothy', 'Dot':'Dorothy', 'Doth':'Dorothy',
    'Edw':'Edward', 'Eliz':'Elizabeth', 'Geo':'George', 'H':'Henry', 'Herb':'Herbert', 'Ja':'James', 'Jn':'John', 'Marg':'Margaret', 
    'Mich':'Michael', 'Pat': 'Patrick', 'Rich':'Richard', 'Tho':'Thomas', 'W':'William'
}

In [13]:
def handle_abbreviations(row):
    """Expand an abbreviated first name using the `abbreviations` mapping.

    The first-name cell is converted to a string and looked up as-is; when it is
    a key of `abbreviations`, the cell is replaced by the full name. The row is
    returned in either case.
    """
    first_name = str(row['to whom due | first name'])
    if first_name not in abbreviations:
        return row

    row['to whom due | first name'] = abbreviations[first_name]
    return row

# Trial run on the New Jersey table only; the expanded frame is displayed, not stored.
nj_debt.apply(handle_abbreviations, axis=1)

Unnamed: 0,date of the certificate | month,date of the certificate | day,date of the certificate | year,to whom due | title,to whom due | first name,to whom due | last name,to whom due | title.1,person 2 (if two owners) | first name,person 2 (if two owners) | last name,time when the debt became due | day,time when the debt became due | month,time when the debt became due | year,time when the debt became due | dollar,time when the debt became due | 90th,state
0,12.0,23.0,1783.0,,Joseph,Milnor,,,,3.0,1.0,1782.0,217.000000,15.0,nj
1,12.0,23.0,1783.0,,Joseph,Milnor,,,,8.0,1.0,1781.0,1108.000000,60.0,nj
2,12.0,23.0,1783.0,,Joseph,Milnor,,,,2.0,1.0,1780.0,339.000000,6.0,nj
3,12.0,23.0,1783.0,,Furman & Hunt,,,,,7.0,15.0,1780.0,146.000000,66.0,nj
4,12.0,23.0,1783.0,,Furman & Hunt,,,,,1.0,30.0,1780.0,277.000000,48.0,nj
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5105,12.0,27.0,1786.0,,Peter,Wanamaker,,,,9.0,7.0,1779.0,5.000000,72.0,nj
5106,12.0,27.0,1786.0,,Richard,Wanamaker,,,,9.0,7.0,1779.0,4.000000,85.0,nj
5107,12.0,27.0,1786.0,Esq,Peter,Ward,,,,19.0,12.0,1781.0,7.000000,10.0,nj
5108,,,,,Amound Certificates Issued 917.966.74 D,,,,,,,,,,nj


## Standardizing Names

Multiple different spellings of a name can refer to the same person. We will use a phonetics library and Ancestry to fix this. 

In [14]:
# import necessary fuzzy string libraries 
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.expected_conditions import element_to_be_clickable, presence_of_element_located
from selenium.webdriver.support.wait import WebDriverWait
from phonetics import metaphone
from fuzzywuzzy import fuzz
from jellyfish import soundex
import getpass
import time

In [15]:
# options
# Chrome launch flags for the scraping session.
# NOTE(review): the user-data-dir flag points at one specific Windows profile.
options = Options()
for chrome_flag in (
    '--headless',
    "--window-size=1000,1000",
    '--disable-blink-features=AutomationControlled',
    '--no-sandbox',
    r'--user-data-dir=C:/Users/david/AppData/Local/Google/Chrome/User Data',
):
    options.add_argument(chrome_flag)

In [16]:
# install driver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
wait = WebDriverWait(driver, 30)

[WDM] - Downloading: 100%|████████████████████████████████████████████████████████| 6.30M/6.30M [00:00<00:00, 37.6MB/s]


In [17]:
# login to emory ancestry 
driver.get('https://guides.libraries.emory.edu/ALE')
login_btn_xpath = '/html/body/main/div/div/div/a'
wait.until(element_to_be_clickable((By.XPATH, login_btn_xpath))).click()

# input login information and click 'login'
netid_xpath = '/html/body/div[1]/div[2]/section/div[1]/div/form/fieldset/div[1]/input'
password_xpath = '/html/body/div[1]/div[2]/section/div[1]/div/form/fieldset/div[2]/input'
username = input('username: ')
password = getpass.getpass(prompt='password: ')
netid_input = wait.until(element_to_be_clickable((By.XPATH, netid_xpath)))
netid_input.click()
netid_input.send_keys(username)
pass_input = wait.until(element_to_be_clickable((By.XPATH, password_xpath)))
pass_input.click()
pass_input.send_keys(password)

login_btn_xpath = '/html/body/div[1]/div[2]/section/div[1]/div/form/fieldset/div[3]/button'
wait.until(element_to_be_clickable((By.XPATH, login_btn_xpath))).click()
time.sleep(1)

driver.get('https://www.ancestrylibrary.com/search/collections/5058/')

username: dcho52
password: ········


In [23]:
# Upper-case state abbreviation -> full state name. Only New York is listed, so
# looking up any other state raises KeyError.
abbrev_to_us_state = {
    'NY':'New York'
}
    
# invert the dictionary

In [28]:
def access_ancestry(fn1, ln1, fn2, ln2, state):
    """Search Ancestry for two spellings of a name and report the top hit of each.

    Uses the module-level `driver` and `wait`. For each spelling the collection
    search page is opened and the name shown in the first result row is read.
    The results and visited URLs are printed, followed by a line saying whether
    both searches returned the first spelling or both returned the second.

    `state` is accepted but not used: the residence filter in the URL is fixed
    to New York.

    Returns None.
    """
    search_prefix = 'https://www.ancestrylibrary.com/search/collections/5058/?name='
    search_suffix = '&residence=_new+york-usa_35&residence_x=_1-0'

    def text_at(xpath):
        # Wait for the element to be present and return its text.
        return wait.until(presence_of_element_located((By.XPATH, xpath))).text

    name1 = fn1 + ' ' + ln1
    name2 = fn2 + ' ' + ln2

    # First spelling: first and last name are read from separate spans.
    driver.get(search_prefix + fn1 + '_' + ln1 + search_suffix)
    top_first = text_at('/html/body/div[3]/div/div/div/section[1]/div[1]/table/tbody/tr[2]/td[2]/span/span[1]')
    top_last = text_at('/html/body/div[3]/div/div/div/section[1]/div[1]/table/tbody/tr[2]/td[2]/span/span[2]')
    result1 = top_first + ' ' + top_last
    print('first ancestry result: ' + result1)
    print(driver.current_url)

    # Second spelling: the whole name is read from the enclosing span.
    driver.get(search_prefix + fn2 + '_' + ln2 + search_suffix)
    print(driver.current_url)
    result2 = text_at('/html/body/div[3]/div/div/div/section[1]/div[1]/table/tbody/tr[2]/td[2]/span')

    print('second ancestry result: ' + result2) 

    both_name1 = result1 == name1 and result2 == name1
    both_name2 = result1 == name2 and result2 == name2
    if both_name1:
        print('result1 == name1 and result2 == name1')
    elif both_name2:
        print('result1 == name2 and result2 == name2')

def only_f(fn, ln, crow):
    """Return `crow` if its first and last name start with the same letters as `fn`/`ln`.

    Parameters
    ----------
    fn, ln : str
        First and last name to compare against.
    crow : mapping
        Candidate row with 'to whom due | first name' and
        'to whom due | last name' entries.

    Returns
    -------
    The `crow` object itself when both initials match, otherwise None.
    """
    # Slicing with [:1] yields '' for an empty name instead of raising IndexError;
    # an empty name therefore only matches another empty name.
    same_first_initial = str(crow['to whom due | first name'])[:1] == str(fn)[:1]
    same_last_initial = str(crow['to whom due | last name'])[:1] == str(ln)[:1]
    if same_first_initial and same_last_initial:
        return crow
    return None

def fuzzy_similarity(fn, ln, row):
    """Report a candidate row whose name is a near match for `fn ln`.

    Two scores are computed between 'fn ln' and the row's full name: the fuzz
    ratio of the raw strings and the fuzz ratio of their metaphone codes. When
    both exceed 90 and the names are not identical, the pair is printed and
    looked up on Ancestry via access_ancestry. Returns None.
    """
    other_fn = row['to whom due | first name']
    other_ln = row['to whom due | last name']
    name = fn + ' ' + ln
    cname = other_fn + ' ' + other_ln

    score = fuzz.ratio(metaphone(name), metaphone(cname))
    ratio = fuzz.ratio(name, cname)

    # Identical names and weak matches need no further checking.
    if name == cname or score <= 90 or ratio <= 90:
        return

    print('Name: ' + name)
    print('Cname: ' + cname)
    print('M Score: ' + str(score))
    print('Ratio: ' + str(ratio))
    full_state = abbrev_to_us_state[row['state'].upper()]
    access_ancestry(other_fn, other_ln, fn, ln, full_state)
    print('--------------------------------------------------')
    
def determine_similarities(df, row, compare=None):
    """Compare `row`'s name with every row of `df` that shares both initials.

    Candidates are the rows of `df` whose first and last names start with the
    same letters as `row`'s; `row` itself is among them when it is part of
    `df`. Each candidate is passed to `compare(fn, ln, candidate)`.

    A boolean mask selects the candidates rather than a row-wise ``apply``
    returning either a row or None: the container pandas builds from such an
    apply depends on what the first row returns, so some candidates could be
    dropped before they were compared.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'to whom due | first name' and 'to whom due | last name'.
    row : pandas.Series
        Row supplying the reference first and last name.
    compare : callable, optional
        Called as ``compare(fn, ln, candidate_row)``. Defaults to the
        module-level `fuzzy_similarity`.

    Returns
    -------
    None
    """
    if compare is None:
        compare = fuzzy_similarity

    fn = row['to whom due | first name']
    ln = row['to whom due | last name'] 

    # [:1] instead of [0] so that empty names do not raise.
    first_initials = df['to whom due | first name'].astype(str).str[:1]
    last_initials = df['to whom due | last name'].astype(str).str[:1]
    candidates = df[(first_initials == str(fn)[:1]) & (last_initials == str(ln)[:1])]

    for _, candidate in candidates.iterrows():
        compare(fn, ln, candidate)
    
# Make both name columns plain strings (missing values become text), then check
# every New York row against the rest of the table.
name_cols = ['to whom due | first name', 'to whom due | last name']
ny_debt[name_cols] = ny_debt[name_cols].astype(str)
ny_debt.apply(lambda row: determine_similarities(ny_debt, row), axis=1)

Name: James Gallaway
Cname: James Galloway
M Score: 100
Ratio: 93
first ancestry result: James Galloway
https://www.ancestrylibrary.com/search/collections/5058/?name=James_Galloway&residence=_new+york-usa_35&residence_x=_1-0
https://www.ancestrylibrary.com/search/collections/5058/?name=James_Gallaway&residence=_new+york-usa_35&residence_x=_1-0
second ancestry result: James Galloway
result1 == name1 and result2 == name1
--------------------------------------------------
Name: James Gallaway
Cname: James Galloway
M Score: 100
Ratio: 93
first ancestry result: James Galloway
https://www.ancestrylibrary.com/search/collections/5058/?name=James_Galloway&residence=_new+york-usa_35&residence_x=_1-0
https://www.ancestrylibrary.com/search/collections/5058/?name=James_Gallaway&residence=_new+york-usa_35&residence_x=_1-0
second ancestry result: James Galloway
result1 == name1 and result2 == name1
--------------------------------------------------
Name: Nathniel Tuttle
Cname: Nathaniel Tulttle
M Sco

KeyboardInterrupt: 