## Ancestry Search

<b>Goal: </b>Multiple different spellings of a name can be referring to the same identity. We will use a phonetics library and Ancestry to fix this. An example: ```David Schaffer``` and ```David Schafer``` from `MA`. 

<b>Steps: </b>
1. Login to Emory's Ancestry subscription 
2. Iterate through ```agg_debt```, through each debt entry. 
3. Use a combination of phonetics fuzzy string matching and normal fuzzy string matching to determine if two names from a state are similar.  
4. Search each name in Ancestry: Edit URL (state and person's name). 
5. Check whether Ancestry returns any results for both people's names:
    - Yes: Check if one spelling of the name appears for both individuals (that's most likely the correct spelling of that name) 
    - No: Leave entries as two separate entries. 
6. Record name change in ```fixes``` list (save ```fixes``` as ```out.csv``` too). 
7. Run ```agg_debt``` through ```fixes```, making changes as necessary. 
8. Save ```agg_debt``` as a new .csv file.

<b style="color: red;">Note: Runtime is long. This is due to the fact there are over 200,000 debt entries and accessing Ancestry takes time too. </b>

In [None]:
# import necessary fuzzy string, scraping and data libraries
import ast
import getpass
import pickle
import time
from itertools import zip_longest

import numpy as np
import pandas as pd
from joblib import Parallel, delayed, cpu_count
from phonetics import metaphone
from rapidfuzz import fuzz
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.expected_conditions import element_to_be_clickable, presence_of_element_located
from selenium.webdriver.support.wait import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

In [None]:
# load the grouped debt entries and make sure both name columns hold strings
agg_debt = pd.read_csv('data/agg_debt_grouped.csv')
agg_debt['to whom due | first name'] = agg_debt['to whom due | first name'].astype(str)
# cast the last-name column from itself; it must not be overwritten with the first-name column
agg_debt['to whom due | last name'] = agg_debt['to whom due | last name'].astype(str)

In [None]:
# (re)cast each name column to str from its own values
for _name_col in ('to whom due | first name', 'to whom due | last name'):
    agg_debt[_name_col] = agg_debt[_name_col].astype(str)

In [None]:
# load the name-change records stored in data/name_changes_david.csv
name_changes = pd.read_csv('data/name_changes_david.csv')

In [None]:
# preview the first rows of the loaded debt entries
agg_debt.head()

In [None]:
# check to make sure pierce certificate first name and last name columns are properly swapped:
# show the first rows whose source file is the Pierce certificates workbook for a visual inspection
agg_debt.loc[agg_debt['org_file'] == 'Pierce_Certs_cleaned_2019.xlsx'].head()

In [None]:
# preview the last rows of the name-change records
name_changes.tail()

In [None]:
# Chrome options passed to every webdriver.Chrome(...) created in this notebook
options = Options()
for _flag in (
    '--headless',
    "--window-size=1000,1000",
    '--disable-blink-features=AutomationControlled',
    '--no-sandbox',
):
    options.add_argument(_flag)

In [None]:
# list the distinct values of the 'state' column
agg_debt.state.unique()

In [None]:
# voter records and censuses available for every state, expressed as Ancestry collection ids;
# the full collection URLs are generated from the ids below
_COLLECTION_URL = 'https://www.ancestrylibrary.com/search/collections/{}/'
_collection_ids = {
    'nh': (5058,),
    'nj': (2234, 3562),
    'ny': (5058,),
    'ma': (5058,),
    'ct': (5058,),
    'va': (2234, 3578),
    'pa': (2702, 2234, 3570),
    'md': (3552,),
    'nc': (3005, 2234),
    'ga': (2234,),
    'ri': (3571,),
}
records = {
    state_code: [_COLLECTION_URL.format(collection_id) for collection_id in ids]
    for state_code, ids in _collection_ids.items()
}

# ancestry has unique urls for each state: this is the residence fragment of the search query
residence_urls = dict(
    nh='_new+hampshire-usa_32',
    nj='_new+jersey-usa_33',
    ny='_new+york-usa_35',
    ma='_massachusetts-usa_24',
    ct='_connecticut-usa_9',
    va='_virginia-usa_49',
    pa='_pennsylvania-usa_41',
    md='_maryland-usa_23',
    nc='_north+carolina-usa_36',
    ga='_georgia-usa_13',
    ri='_rhode+island-usa_42',
)

In [None]:
# remove 'cs' (congress) and 'f' (foreign officers); these are not states, but specific regiments / types of officers.
# 'de' is removed as well.
_excluded_states = ['cs', 'f', 'de']
agg_debt_copy = agg_debt[~agg_debt['state'].isin(_excluded_states)]

# split the dataframe into one group per state; makes searching faster
agg_debt_sp = agg_debt_copy.groupby('state')
agg_debts_st = [agg_debt_sp.get_group(state_code) for state_code in agg_debt_sp.groups]

# print the state codes that remain
for state_code in agg_debt_sp.groups:
    print(state_code)

In [None]:
# absolute XPaths of the elements used while logging in
login_btn0_xpath = '/html/body/main/div/div/div/a'
netid_xpath = '/html/body/div[1]/div[2]/section/div[1]/div/form/fieldset/div[1]/input'
password_xpath = '/html/body/div[1]/div[2]/section/div[1]/div/form/fieldset/div[2]/input'
login_btn1_xpath = '/html/body/div[1]/div[2]/section/div[1]/div/form/fieldset/div[3]/button'

# ask for username and password (the password is read without echo)
username = input('username: ')
password = getpass.getpass(prompt='password: ')

# driver_objs maps state -> [chrome driver, WebDriverWait with a 30 second timeout]
driver_objs = {}
# first create every driver ...
for st in agg_debt_sp.groups:
    driver_objs[st] = [webdriver.Chrome(service=Service(executable_path='chromedriver.exe'), options=options)]
# ... then attach a wait object to each of them
for _pair in driver_objs.values():
    _pair.append(WebDriverWait(_pair[0], 30))

# log every driver in through emory's library page, then open an ancestry collection
for _browser, _waiter in driver_objs.values():
    # go to emory's library and start the login flow
    _browser.get('https://guides.libraries.emory.edu/ALE')
    _waiter.until(element_to_be_clickable((By.XPATH, login_btn0_xpath))).click()

    # fill in netid, then password, and click 'login'
    for _field_xpath, _value in ((netid_xpath, username), (password_xpath, password)):
        _field = _waiter.until(element_to_be_clickable((By.XPATH, _field_xpath)))
        _field.click()
        _field.send_keys(_value)
    _waiter.until(element_to_be_clickable((By.XPATH, login_btn1_xpath))).click()
    time.sleep(1)

    _browser.get('https://www.ancestrylibrary.com/search/collections/5058/')
    print(_browser.current_url)

    print(_browser)

## New Ancestry Code

In [None]:
# First-run initialisation, kept inert inside a string: run these assignments only when nothing
# has been saved with %store yet. All three must be lists, because the search code calls
# .append() on each of them.
'''
ancestry_name_changes = []
rerun_rows = []
checked0 = []
'''

In [None]:
# reload the three accumulators that the search functions save with %store
%store -r ancestry_name_changes
%store -r rerun_rows
%store -r checked0

In [None]:
def access_ancestry(row0, state, driver, wait):
    fn0 = row0['to whom due | first name']
    ln0 = row0['to whom due | last name'] 
    matches = ast.literal_eval(row0['matches'])

    if (fn0, ln0, state) not in checked0:
        print(matches)
        for match in matches:
            row1 = agg_debt.loc[[match[2]]] 
            fn1 = row1['to whom due | first name'].values[0]
            ln1 = row1['to whom due | last name'].values[0]
            search_ancestry(fn0, ln0, fn1, ln1, row0, row1, state, driver, wait)
        checked0.append((fn0, ln0, state))
        %store checked0

In [None]:
# Look up both names in Ancestry's database
def search_ancestry(fn0, ln0, fn1, ln1, row0, row1, state, driver, wait):
    name0 = str(fn0) + ' ' + str(ln0)
    name1 = str(fn1) + ' ' + str(ln1)
    
    # Loop through state urls 
    for url in records[state]:        
        try:
            # Search person 0
            url0 = url + '?name=' + fn0 + '_' + ln0 + '&name_x=ps&residence=1780' + residence_urls[state] + '&residence_x=10-0-0_1-0'
            driver.get(url0) 
            # results were found for person0
            try:
                # use class_name to find result text
                result0 = wait.until(presence_of_element_located((By.CLASS_NAME, 'srchHit'))).text
            # no results were found; keep entries separate  
            except:
                result0 = ''
            
            # search person 1
            url1 = url + '?name=' + fn1 + '_' + ln1 + '&name_x=ps&residence=1780' + residence_urls[state] + '&residence_x=10-0-0_1-0'
            driver.get(url1) 
            # results were found for person1
            try: 
                # use class_name to find result text
                result1 = wait.until(presence_of_element_located((By.CLASS_NAME, 'srchHit'))).text
            # no results were found; keep entries separate
            except:
                result1 = ''    

            '''
            compare results:
            if both results are empty, do not add to fixes dict 
            if both results are different, do not add to fixes dict
            if both results are the same, add to fixes dict
                find correct name
                if name0 = result0 and result1 : {name1 : name0}
                if name1 = result1 and result0 : {name0 : name0} 
            '''
            print('---------------------------+')
            if result0 == result1 and result0 != '' and result1 != '':
                if name0 == result0 and name0 == result1: # name0 must be the correct version of the name 
                    # record change
                    title1 = row1['to whom due | title']
                    org_file1 = row1['org_file']
                    org_index1 = row1['org_index'] 
                    ancestry_name_changes.append([title1, title1, fn1, ln1, fn0, ln0, 6, org_file1, org_index1, state])
                    %store ancestry_name_changes

                elif name1 == result0 and name1 == result1: # name1 must be the correct version of the name 
                    # record change
                    title0 = row0['to whom due | title']
                    org_file0 = row0['org_file']
                    org_index0 = row0['org_index'] 
                    ancestry_name_changes.append([title0, title0, fn0, ln0, fn1, ln1, 6, org_file0, org_index0, state])
                    %store ancestry_name_changes

            print('Summary')
            print('name0=' + str(name0))
            print('name1=' + str(name1))
            print('url-0=' + str(url0))
            print('url-1=' + str(url1))
            print('result0=' + str(result0))
            print('result1=' + str(result1))
            print('state=' + str(state))

            '''
            print('fn0=' + str(fn0))
            print('ln0=' + str(ln0))
            print('fn1=' + str(fn1))
            print('ln1=' + str(ln1))
            '''
            print('index=' + str(row0['index']))
            print('rerun_rows length=' + str(len(rerun_rows)))
            print('name_changes length=' + str(len(ancestry_name_changes)))
            '''
            print('file_name0=' + str(org_file0))
            print('file_name1=' + str(org_file1))
            '''
            print('---------------------------+')
        
        # there was error trying to access ancestry's records
        except Exception as e:
            print('---------------------------+')
            print('Error')
            print(e)
            '''
            print('name0=' + str(name0))
            print('name1=' + str(name1))
            print('title1=' + str(type(title1)))
            print('title0=' + str(type(title0)))
            print('title1=' + str(title1))
            print('title0=' + str(title0))
            print('org_file1=' + str(org_file1))
            print('org_file0=' + str(org_file0))
            print('index=' + str(index))
            '''
            print('---------------------------+')
            rerun_rows.append([fn0, ln0, fn1, ln1, name0, name1, state]) 
            %store rerun_rows

In [None]:
def ancestry_wrap(similar_names, state):
    """Call access_ancestry on every row of similar_names, using the driver/wait pair stored for state."""
    def handle_row(row0):
        # index 0 of driver_objs[state] is the driver, index 1 its wait object
        browser = driver_objs[state][0]
        waiter = driver_objs[state][1]
        return access_ancestry(row0, state, browser, waiter)

    similar_names.apply(handle_row, axis=1)

# one dataframe of similar names per state, read from data/similar names/
similar_names_dfs = {
    state_code: pd.read_csv(f'data/similar names/similar_names_{state_code}.csv')
    for state_code in agg_debt_sp.groups
}

# Initialize a parallelization job: one delayed ancestry_wrap call per state, run on 3 threads
ancestry_calls = [
    delayed(ancestry_wrap)(similar_names_dfs[state_code], state_code)
    for state_code in agg_debt_sp.groups
]
Parallel(n_jobs=3, backend="threading")(ancestry_calls)

# Operate without parallelization
# ancestry_wrap(similar_names_dfs['ct'], 'ct')

## Old Ancestry Code

In [None]:
def ancestry_cleaning(agg_debt_st, state):
    """Run compare_strings_vect over every row of one state's debt entries.

    agg_debt_st is the dataframe of that state's entries; state selects the
    [driver, wait] pair from driver_objs. Nothing is returned.
    """
    # selenium driver and wait object associated with that state
    browser, waiter = driver_objs[state][0], driver_objs[state][1]

    # name columns are compared as strings
    first_names = agg_debt_st['to whom due | first name'].astype(str)
    last_names = agg_debt_st['to whom due | last name'].astype(str)

    # vectorized call: one compare_strings invocation per row
    vectorized_result = compare_strings_vect(
        agg_debt_st.index,
        agg_debt_st['to whom due | title'],
        first_names,
        last_names,
        agg_debt_st['org_index'],
        agg_debt_st['org_file'],
        state,
        browser,
        waiter,
    )

In [None]:
%%capture
# loop through the state agg_debt one more time; compare row0 (original row) with all the other rows (row1)
def compare_strings(index, title0, fn0, ln0, org_index0, org_file0, state, st_driver, wait_driver):
    # make sure we haven't checked this name before (handles people who share the same fn & ln & live in same state) 
    # name0 = row0['to whom due | first name'] + ' ' + row0['to whom due | last name'] # uncomment when using apply
    agg_debt_st = agg_debt_sp.get_group(state)
    
    if (fn0, ln0, state) not in checked0:
        # compare both strings 
        # agg_debt_st.swifter.apply(lambda row1: fuzzy_comparison(fn0, ln0, row1['to whom due | first name'], row1['to whom due | last name'], state, row0, row1), axis=1) # using apply
        fuzzy_comparison_vect(index, title0, fn0, ln0, org_index0, org_file0, agg_debt_st['to whom due | title'],
                              agg_debt_st['to whom due | first name'].astype(str), agg_debt_st['to whom due | last name'].astype(str), 
                              agg_debt_st['org_index'], agg_debt_st['org_file'], state, st_driver, wait_driver) # using vectorization
        checked0.append((fn0, ln0, state))
        %store checked0

In [None]:
%%capture
# compare two strings using fuzzy string matching 
def fuzzy_comparison(index, title0, fn0, ln0, org_index0, org_file0, title1, fn1, ln1, org_index1, org_file1, 
                     state, st_driver, wait_driver):
    if (fn1, ln1, state) not in checked1:
        
        name0 = fn0 + ' ' + ln0
        name1 = fn1 + ' ' + ln1

        # use phonetic similarity (compares similar sounding names)
        meta0 = metaphone(name0.lower()) 
        meta1 = metaphone(name1.lower())
        phonetic_score = fuzz.ratio(meta0, meta1)

        # use fuzzy string similarity (compares similar spellings between names)
        fuzz_score = fuzz.ratio(name0, name1) 

        # check if phonetic score and fuzzy string score both meet threshold, both names are not the same  
        if phonetic_score > 90 and fuzz_score > 90 and name0 != name1:
            search_ancestry(index, title0, fn0, ln0, org_index0, org_file0, title1, fn1, ln1, org_index1, org_file1, name0, 
                            name1, state, st_driver, wait_driver) 
            checked1.append((fn1, ln1, state)) # record that we have checked this name
            %store checked1

In [None]:
# look up both names in ancestry's database
def search_ancestry(index, title0, fn0, ln0, org_index0, org_file0, title1, fn1, ln1, org_index1, org_file1, name0, name1, 
                    state, driver, wait):
    # loop through state urls 
    for url in records[state]:        
        try:
            # search person-0
            url0 = url + '?name=' + fn0 + '_' + ln0 + '&name_x=ps&residence=1780' + residence_urls[state] + '&residence_x=10-0-0_1-0'
            driver.get(url0) 
                
            # results were found for person0
            try:
                # use xpath to find result text
                # result0 = wait.until(presence_of_element_located((By.XPATH, '/html/body/div[3]/div/div/div/section[1]/div[1]/table/tbody/tr[2]/td[2]/span'))).text
                # use class_name to find result text
                result0 = wait.until(presence_of_element_located((By.CLASS_NAME, 'srchHit'))).text
            # no results were found; keep entries separate  
            except:
                result0 = ''
            
            # search person1
            url1 = url + '?name=' + fn1 + '_' + ln1 + '&name_x=ps&residence=1780' + residence_urls[state] + '&residence_x=10-0-0_1-0'
            driver.get(url1)
                        
            # results were found for person1
            try: 
                # use xpath to find result text
                # result1 = wait.until(presence_of_element_located((By.XPATH, '/html/body/div[3]/div/div/div/section[1]/div[1]/table/tbody/tr[2]/td[2]/span'))).text
                # use class_name to find result text
                result1 = wait.until(presence_of_element_located((By.CLASS_NAME, 'srchHit'))).text
            # no results were found; keep entries separate
            except:
                result1 = ''
            
            '''
            compare results:
            if both results are empty, do not add to fixes dict 
            if both results are different, do not add to fixes dict
            if both results are the same, add to fixes dict
                find correct name
                if name0 = result0 and result1 : {name1 : name0}
                if name1 = result1 and result0 : {name0 : name0} 
            '''
            print('---------------------------+')
            if result0 == result1 and result0 != '' and result1 != '':
                if name0 == result0 and name0 == result1: # name0 must be the correct version of the name 
                    # fixes[state][(fn1, ln1, name1)] = (fn0, ln0, name0) # convert name1 to name0  
                    # record change
                    ancestry_name_changes.append([title1, title1, fn1, ln1, fn0, ln0, 6, org_file1, org_index1, state])
                    %store ancestry_name_changes

                    
                elif name1 == result0 and name1 == result1: # name1 must be the correct version of the name 
                    # fixes[state][(fn0, ln0, name0)] = (fn1, ln1, name1) # convert name0 to name1
                    # record change
                    ancestry_name_changes.append([title0, title0, fn0, ln0, fn1, ln1, 6, org_file0, org_index0, state])
                    %store ancestry_name_changes

            print('Summary')
            print('name0=' + str(name0))
            print('name1=' + str(name1))
            print('fn0=' + str(fn0))
            print('ln0=' + str(ln0))
            print('fn1=' + str(fn1))
            print('ln1=' + str(ln1))
            print('url-0=' + str(url0))
            print('url-1=' + str(url1))
            print('result0=' + str(result0))
            print('result1=' + str(result1))
            print('state=' + str(state))
            print('rerun rows length=' + str(len(rerun_rows)))
            print('name changes length=' + str(len(ancestry_name_changes)))
            print('index=' + str(index))
            print('file_name0=' + str(org_file0))
            print('file_name1=' + str(org_file1))
            print('---------------------------+')
        
        # there was error trying to access ancestry's records
        except Exception as e:
            print('---------------------------+')
            print('Error')
            print(e)
            print('name0=' + str(name0))
            print('name1=' + str(name1))
            print('title1=' + str(type(title1)))
            print('title0=' + str(type(title0)))
            print('title1=' + str(title1))
            print('title0=' + str(title0))
            print('org_file1=' + str(org_file1))
            print('org_file0=' + str(org_file0))
            print('index=' + str(index))
            print('---------------------------+')
            rerun_rows.append([fn0, ln0, fn1, ln1, name0, name1, state]) 
            %store rerun_rows

In [None]:
# record how long it takes to run ancestry search; useful information to see effectiveness of different methods 

# vectorize our functions 
compare_strings_vect = np.vectorize(compare_strings)
fuzzy_comparison_vect = np.vectorize(fuzzy_comparison)

# initialize a parallelization job; the idea is to have one core work on one state's agg debt file
ancestry_calls = [delayed(ancestry_cleaning)(agg_debt_sp.get_group(st), st) for st in agg_debt_sp.groups]
%timeit results = Parallel(n_jobs=-1, backend="threading")(ancestry_calls) 