In [88]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd
import numpy as np
from selenium.webdriver.common.keys import Keys

In [89]:
user = "Chris" # Miguel

## Preprocess New York Data

In [90]:
statedict = {'PA':'Pennsylvania', 'CT':'Connecticut', 'MA':'Massachusetts', 'NH':'New Hampshire', 'DE':'Delaware',
             'NC':'North Carolina', 'GA':'Georgia', 'NY':'New York', 'NJ': 'New Jersey', 'RI':'Rhode Island',
             'VA':'Virginia', 'MD':'Maryland', 'SC':'South Carolina', 'VT':'Vermont'}

In [91]:
def deNaN(series):
    return series.apply(lambda x: "" if pd.isnull(x) else x)

In [92]:
CD_merged = pd.read_csv("../../Data/Post1790/Aggregated/aggregated_CD_final.csv")
CD_merged['Name'] = CD_merged['Name'].apply(lambda x: x.replace('\'','').replace('\"','').strip('][').split(', '))

In [93]:
# number of names
np.sum(CD_merged['Name'].apply(lambda x: len(x) > 0))

4377

In [94]:
# remove duplicated names
names = CD_merged['Name'][~CD_merged['Name'].astype(str).duplicated(keep='first')]
names = names[names.apply(lambda x: x[0] != '')]
names = names[names.apply(lambda x: len(list(x)) > 0)]

In [95]:
# create name df
CD_merged2 = CD_merged.copy()
CD_merged2['Name_str'] = CD_merged2['Name'].astype(str)
namedict = dict(zip(CD_merged2['Name_str'], CD_merged2['Name']))
names = CD_merged2[['Name_str', 'town', 'county', 'state', 'name_type']].drop_duplicates()
names['Name'] = names['Name_str'].apply(lambda x: namedict[x])
names.drop('Name_str', axis = 1, inplace = True)
names = names[names['Name'].apply(lambda x: len(x) != 0)]

In [96]:
# cleaning names - identify names to be cleaned
names_fix = names['Name'].copy()
names_fix = names_fix.apply(lambda set: {el.lower().replace("and sons", "").replace("and son", "").replace("and co", "").replace("and others", "").replace(" mpany", "").replace(" mpaney", "").replace('as guardian','').strip().title().replace("And", "and") for el in set})

name_fix_inds = names_fix[names_fix.astype(str).apply(lambda x:  ' and ' in  x.lower() or
                                                                 ' of ' in  x.lower() or
                                                                 'treas' in x.lower() or
                                                                 ' to ' in x.lower() or
                                                                 'adm ' in x.lower() or
                                                                 ' exec ' in x.lower() or
                                                                 'agents' in x.lower() or
                                                                 ' no ' in x.lower() or
                                                                 ' comm' in x.lower() or
                                                                 ' for ' in x.lower())].index
names_fix.loc[name_fix_inds]

18                             {John and James Davenport}
113                         {Eunice and Betsey Wadsworth}
114                       {Joshua Belden and James Wells}
118     {Wheeler Coit, Wheeler Coit Treasurer Of Prest...
155     {Nehemiah Child, Nehemiah Child Society Treasu...
                              ...                        
4191             {Henry Laurens and Arnoldus Vanderhorst}
4254                               {Wadsworth and Turpin}
4330                        {The State Of South Carolina}
4342                        {Love Stone and Joseph Vesey}
4357                                   {Cross and Crowly}
Name: Name, Length: 203, dtype: object

In [97]:
# split two people with same last name
def simpleSplit(name):
    name2 = name.split(" and ")[1]
    lname2 = name2.split(" ")[-1] if "Van" not in name2 else " ".join(name.split(" ")[-2:])
    name1 = name.split(" and ")[0] + " " + lname2
    return set([name1, name2])

simplefix = ['John and James Davenport','Eunice and Betsey Wadsworth', 'Daniel and Elijah Boardman', 'Samuel and Timothy Burr',
             'Michael and Thomas Bull','Elias and Jeremiah Cowles', 'Amasa and Elnathan Keyes','Horace and Seth Johnson',
             'Richard and James Potter','Elizabeth and John Grover', 'Nicholas and Hannah Cooke', 'Moses and Nicholas Brown',
             'Samuel and Charles Sampson', 'John and Hugh Irvin', 'Jonathan and Mariamne Williams']
names_fix = names_fix.apply(lambda x: simpleSplit(list(x)[0]) if list(x)[0] in simplefix else x)

In [98]:
# split two people separated by "and"
def simpleSplit2(name):
    try:
        name2 = name.split(" and ")[1]
        name1 = name.split(" and ")[0]
    except:
        name2 = name.split(" And ")[1]
        name1 = name.split(" And ")[0]
    return set([name1, name2])

simplefix2 = ['Joshua Belden and James Wells','William Joseph and Richard Hart','Anthony Bradford and Stephen Hall', 'John Dodd and John Porter',
              'Uriah Forrest and Benjamin Stoddert','Samuel John and Thomas Snowden',
              'John Laird and Thomas Dick','James Boyd and Jonathan B Smith', 'Nathan Waterman and Robert Newell', 'Joseph Jenckes and David L Barnes',
              'Henry Laurens and Arnoldus Vanderhorst','Love Stone and Joseph Vesey', 'Charles Stuart and James Mcculloch']
names_fix = names_fix.apply(lambda x: simpleSplit2(list(x)[0]) if list(x)[0] in simplefix2 else x)

In [99]:
# fix people with "Treasurer in name
t_ind = names_fix[names_fix.astype(str).apply(lambda x: 'treasurer' in x.lower() and 'cincinnati' not in x.lower())].index
newvals = names_fix.loc[t_ind].apply(lambda x: set(list([ele.replace('Society', '').replace('Proprietors', '').split("Treasurer")[0].strip() for ele in x])))
names_fix.loc[t_ind] = newvals

In [100]:
# fix people with "transfer" and "of"" in name
trans_ind = names_fix[names_fix.apply(lambda x: all(["transfer" in ele.lower() and "from" in ele.lower() for ele in list(x)]))].index
newvals = names_fix.loc[trans_ind].apply(lambda x: set([ele.lower().split("transfer")[0].strip().title() for ele in list(x)]))
newvals = newvals.apply(lambda x: set([ele.replace(" And","").split("In Trust")[0].strip() for ele in list(x)]))
names_fix.loc[trans_ind] = newvals

In [101]:
# fix people with "guardian" in name
guard_ind = names_fix[names_fix.astype(str).apply(lambda x: "guard" in x.lower())].index
newvals = names_fix.loc[guard_ind].apply(lambda x: set([ele.lower().split("guard")[0].strip().title() for ele in list(x)]))
names_fix.loc[guard_ind] = newvals

In [102]:
# fix people with "administrator" in name
adm_ind = names_fix[names_fix.astype(str).apply(lambda x: "adm" in x.lower())].index
newvals = names_fix.loc[adm_ind].apply(lambda x: set([ele.lower().split("adm")[0].strip().title() for ele in list(x)]))
newvals.loc[[3940, 3941, 3943]] = [set(['Stephen Davis']),
                                     set(['James Manning', 'Phillip N Brown', 'Stephen Davis']),
                                     set(['James Burrill', 'John Brown', 'Mehitable Davis'])]
names_fix.loc[adm_ind] = newvals

In [103]:
# fix people with "of" in name
of_ind = names_fix[names_fix.astype(str).apply(lambda x: " of " in x.lower() and "state" not in x.lower()
                                                         and "the" not in x.lower() \
                                                         and "bank" not in x.lower()
                                                         and "town" not in x.lower())].index
newvals = names_fix.loc[of_ind].apply(lambda x: set([ele.lower().replace("execr", "").replace("excr", "").replace("executor", "") for ele in list(x)]))
newvals = newvals.apply(lambda x: set([ele.lower().split(" of ")[0].replace("(","").replace("son", "").strip().title() if "robert pinkney" not in ele.lower() else "Johnathan Pinkney" for ele in list(x)]))
names_fix.loc[of_ind] = newvals

In [104]:
# fix people with "school" in name
s_ind = names_fix[names_fix.astype(str).apply(lambda x: "school" in x.lower() and "com" in x.lower())].index
newvals = names_fix.loc[s_ind].apply(lambda x: set([ele.lower().split("school")[0].replace("hon", "").replace("society committee", "").strip().title() for ele in list(x)]))
names_fix.loc[s_ind] = newvals

In [105]:
# change names for companies and other changes - warning - this is long
company_names = [['{Clark and Nightingale}', set(['Joseph Nightingale', 'Joseph Innes Clark'])],
                 ['{Jon and Jacob Starr, Jonathan and Jared Starr}', set(['Jonathan Starr', 'Jacob Starr'])],
                 ['{John Williams Of Wethersfield, John Williams}', set(['John Williams'])],
                 ['{Wallace and Muir}', set(['Charles Wallace', 'John Muir'])],
                 ['{Zealor E Fisher, Henry Zealor and Margaret Fisher}', set(['Zealor E Fisher', 'Henry Zealor', 'Margaret Fisher'])],
                 ['{Wallace Muir, Wallace and Muir}', set(['Charles Wallace', 'John Muir'])],
                 ['{The Estate Of John Jordon, John Jordon, John Jordans Estate}', set(['John Jordon'])],
                 ['{Lawrence Brangle and Jacob Gombar, Lawrence Brangle and Jacob Gombare, Lawrance Brangle and Jacob Gombare}', set(['Lawrence Brangle', 'Jacob Gombar', 'Jacob Gombare'])],
                 ['{Thomas and Benjamin Harwood, William Harwood}', set(['Thomas Harwood', 'Benjamin Harwood', 'William Harwood'])],
                 ['{Thomas and Benjamin Harwood, Thomas Harwwod and Benjamin Harwood}', set(['Thomas Harwood', 'Benjamin Harwood'])],
                 ['{Wallce and Muir, Wallace and Muir}', set(['Charles Wallace', 'John Muir'])],
                 ['{Stewart and Plunkett, Stewart Plunkett, Stewart E Plunkett}', set(['David Stewart', 'David Plunkett'])],
                 ['{The Estate Of General Otto H Williams}', set(['Otto H Williams'])],
                 ['{Hartshorne and Lindley}', set(['William Hartshorne', 'Joseph Lindley'])],
                 ['{Willing Morris and Swanwick, William Morris and Swanwick}', set(['Thomas Willing', 'Robert Morris', 'John Swanwick'])],
                 ['{Mccrea and Mease}', set(['Robert McCrea', 'Robert Mease'])],
                 ['{Thomas Mifflin  Ex, His Executor Thomas Mifflin Francis Johnston David Lenox Charles Smith Ex To Ex Lawrence Keene, Thomas Mifflin and Francis Johnston David Lenax and Charles Smith}', set(['Thomas Mifflin', 'Francis Johnston', 'David Lenox', 'David Lenax', 'Charles Smith', 'Lawrence Keene'])],
                 ['{Morris and Swanwick Willing, William Morris and Swanwick}', set(['Thomas Willing', 'Robert Morris', 'John Swanwick'])],
                 ['{Wharton and Lewis}', set(['Isaac Wharton', 'David Lewis'])],
                 ['{Bernard and Malfeson}', set(['Bernard Malfeson'])],
                 ['{James Johnston John Mcdowell and Samuel Strawbridge, James Johnston John Mcdowell and James Strawbridge}', set(['James Johnston', 'John McDowell', 'James Strawbridge', 'Samuel Strawbridge'])],
                 ['{William Sherer and John Mcdowell, William Sherer and Mcdowell}', set(['William Sherer', 'John McDowell'])],
                 ['{Jennet Grier James Riddle and Henry Miller, Jennet Grier James Riddle}', set(['Jennet Grier', 'James Riddle', 'Henry Miller'])],
                 ['{George Leib, George Leib Of The N L}', set(['George Leib'])],
                 ['{Josiah Lockhart Treasury, Josiah Lockhart}', set(['Josiah Lockhart'])],
                 ['{Alexander Fullerton, Alexander Fullerton and William Honeyman Deceased}', set(['Alexander Fullerton', 'William Honeyman'])],
                 ['{Catharine and William Coleman Executor Of Jacob Coleman Deceased, Catherine and William, Cath and William Coleman}', set(['Catherine Coleman', 'Catharine Coleman', 'William Coleman'])],
                 ['{Thomas Folwell And Joseph Hart, Thomas Folwell And Jo Hart}', set(['Thomas Folwell', 'Joseph Hart'])],
                 ['{Rebecca and Deborah Wharton, Rebecca Wharton and Deborah Wharton}', set(['Rebecca Wharton', 'Deborah Wharton'])],
                 ['{andrew Clow  Agents To James Brown Executor To The Est Of Hugh Patton, andrew Clow, andrew Clow  Agents To James Brown}', set(['Andrew Clow', 'James Brown', 'Hugh Patton'])],
                 ['{Reed and Forde, Reide and Forde}', set(['John Reed', 'Standish Forde'])],
                 ['{John Butler, John Butler From Treasury}', set(['John Butler'])],
                 ['{Reed and Forde}', set(['John Reed', 'Standish Forde'])],
                 ['{Joseph Stiles For Society, Society For The Relief Of Poor and Distressed Masters Of Ships Their Widows}', 'Joseph Stiles'],
                 ['{Willing Morris and Swanwick, Willing Morries and Swanwick}', set(['Thomas Willing', 'Robert Morris', 'John Swanwick'])],
                 ['{Budd Pryor, Budd and Pryor, Berdd and Pryor}', set(['John Budd', 'Norton Pryor'])],
                 ['{Wilhem and Jan Willink and William Bingham, William and Jan Willink and William Bingham}', set(['William Willink', 'Jan Willink', 'William Bingham'])],
                 ['{Reed and Forde, Reede and Ford}',set(['John Reed', 'Standish Forde'])],
                 ['{Nicholas And Hannah Coske, Nicholas And Hannah Cooke, Robert Crooke}', set(['Nicholas Coske', 'Hannah Coske', 'Nicholas Cooke', 'Hannah Cooke', 'Robert Crooke'])],
                 ['{Nathan Waterman And Robert Newell, N Waterman And R Newell}', set(['Nathan Waterman', 'Robert Newell'])],
                 ['{Nathan and Fredrick Williams, Nathan and Frederick Williams}', set(['Nathan Williams', 'Frederick Williams'])],
                 ['{Clark and Hammond}', set(['Ethan Clark'])],
                 ['{Peleg Shearman Transfer From The Register Of The Treasury, Peleg Shearman Transfer The Register, Peleg Shearman Transferred From The Register Of The Treasury}', set(['Peleg Shearman'])],
                 ['{Clark and Nightingale Transferred, Clark and Nightingale Transferred From Register}', set(['Joseph Innes Clark', 'Joseph Nightingale'])],
                 ['{Mess and Nicholas Brown Transferred, Moses and Nicholas Brown Transferred From Treasury, Moses and Nicholas Brown Transferred}', set(['Moses and Nicholas Brown'])],
                 ['{Benjamin Brown Transferred From Treasury, Benjamin Brown Transfer From Treasury, Benjamin Bowen}', set(['Benjamin Brown'])],
                 ['{Samuel Eddy Transferred From Treasury, Samuel Eddy Transfer From Treasury, Samuel Eddy}', set(['Samuel Eddy'])],
                 ['{Clark and Nightingale, Clark and Nightingale Transferred To Treasury}', set(['Joseph Innes Clark', 'Joseph Nightingale'])],
                 ['{James Manning Transferred From The Treasury, James Manning, James Manning Transferred From Treasury}', set(['James Manning'])],
                 ['{Philip Allen Transferred To Massachusetts, Philip Allen, Philip Allen Transferred From Massachusetts}', set(['Philip Allen '])],
                 ['{Clarkand Nightingale Transferred From The Register, Clark and Nightingale Transferred From The Reg, Clark and Nightingale Register}', set(['Joseph Innes Clark', 'Joseph Nightingale'])],
                 ['{Hopestill Mcneal, Hopestill Mcneal Transferred From The Register Of The Treasury}', set(['Hopestill McNeal'])],
                 ['{John Smith, John Smith Transferred From The Register Of The Treasury}', set(['John Smith'])],
                 ['{Nehemiah Rhodes Transferred From The Register Of The Treasury, Nehemiah Rhodes}', set(['Nehemiah Rhodes'])],
                 ['{Thomas Fry, Thomas Fry  Transferred From The Register Of The Treasury}', set(['Thomas Fry'])],
                 ['{John Mumford Transferred From The Treasury, John Mumford Transferred From Treasury, John Mumford Journal}', set(['John Mumford'])],
                 ['{Clark and Nightingale, Nicholas Banney Transfer From Register, Nathaniel Barney Transfer From Register}', set(['Joseph Innes Clark', 'Joseph Nightingale', 'Nathaniel Barney', 'Nicholas Banney'])],
                 ['{Clark and Nightingale Transfer From Register, William Larned}', set(['Joseph Innes Clark', 'Joseph Nightingale', 'William Larned'])],
                 ['{Matthew Watson Transferred From The Office In Connecticut, Christopher Hill Transferred From The Register, The President  Of Bank Of Providence From The Books In Massachusetts}', set(['Matthew Watson', 'Christopher Hill'])],
                 ['{William Littlefeild Transferred From The Office Of The Register, The President  Of Bank Of Providence From The Books In Massachusetts, John Warren Transferred From The Register}', set(['William Littlefeild', 'John Warren'])],
                 ['{John Mawney Transferred From The Register, Matthew Watson From The Office Of William Imlay Connecticut, Sarah Ward Transferred From The Register Of The Treasury}', set(['John Mawney', 'Matthew Watson', 'Sarah Ward', 'William Imlay'])],
                 ['{William Littlefield From The Office Of The Register Of Treasury, John Rogers Transferred From The Register Of The Treasury, Elisha Aldrich Transferred From The Register Of The Treasury}', set(['William Littlefield', 'John Rogers', 'Elisha Aldrich'])],
                 ['{William Littlefield From The Office Of The Register Of Treasury, John Rogers Transferred From The Register Of The Treasury, Elisha Aldrich Transferred From The Register Of The Treasury}', set(['William Littlefield', 'John Rogers', 'Elisha Aldrich'])],
                 ['{Dr William Handy In Trust For His Niece Mehetable Handy Transferred From The Books At New York, Warrant For Transferring Into The Books Of This Office Vizt, Warrant For Transfering Into the Books Of This Office Vizt}', set(['William Handy', 'Mehetable Handy'])],
                 ['{10926 In Favor Johh Brown From The Register Of The Treasury, John Mawney Transferred From The Register Of The Treasury, No 10925 In Favor Of John Brown From The Register Of The Treasury}', set(['John Brown', 'John Mawney'])],
                 ['{Warrats For Transfering Into The Books Of This Office Vizt, Warrants From The Secretary Of The Treasury For Transferring To The Books Of This Office Vizt, Jn Rogers Transfer From Register}', set(['Jn Rogers'])],
                 ['{John Eddy Transfer From Register, No 11749 In Fovar Christopher Smith From Register Of The Treasury, Christopher Smith No 11750}', set(['John Eddy', 'Christopher Smith'])],
                 ['{No 11723 In Favor Of James Bursill Jun From Register Of Treasury, Rev Ja Manning No 10315, Warrant For Transferring Into The Books Of This Office From The Register Of The Treasury Viz}', set(['James Bursill', 'Ja Manning'])],
                 ['{John Bowen No 11747}', set(['John Bowen'])],
                 ['{Willard Eddy No 11753}', set(['Willard Eddy'])],
                 ['{Benjamin Stelle No 9201}', set(['Benjamin Stelle'])],
                 ['{The Heirs Of George Guerin}', set(['George Guerin'])],
                 ['{Nicholas Low and Horace and S Johnson}', set(['Nicholas Low', 'Horace Johnson', 'S Johnson'])],
                 ['{Allexander Gillon and Robert Smith}', set(['Allexander Gillon', ' Robert Smith'])],
                 ['{Allexander Gillon and Robert Smith}', set(['Allexander Gillon', ' Robert Smith'])],
                 ['{Presstman and Calhoun}', set(['William Presstman'])],
                 ['{Wadsworth and Turpin}', set(['Thomas Wadsworth', 'William Turpin'])],
                 ['{Cross and Crowly}', set(['William George Cross', 'Charles Crowley'])],
                 ['{Barnabas Deane and Jeremiah Wadsworth, Barnabas Deane and J Wadsworth}', set(['Barnabas Deane', 'Jeremiah Wadsworth'])],
                 ['{Dan and E Boardman, Daniel and Elijah Boardman}', set(['Daniel Boardman', 'Elijah Boardman'])],
                 ['{Charles Stuart and James Mcculloch}', set(['Charles Stuart', 'James McCulloch'])],
                 ['{Willing Morris and Swanwick, Morris and Swanwick Willing, William Morris and Swanwick}', set(['Thomas Willing', 'Robert Morris', 'John Swanwick'])],
                 ['{Moses and Nicholas Brown}', set(['Moses Brown', 'Nicholas Brown'])],
                 ['{Jonathan and Jared Starr, Jon and Jacob Starr}', set(['Jonathan Starr', 'Jared Starr', 'Jacob Starr'])],
                 ['{Barnabas Deane and J Wadsworth, Barnabas Deane and Jeremiah Wadsworth}', set(['Barnabas Deane', 'Jeremiah Wadsworth'])],
                 ['{Daniel And Elijah Bor, Daniel And Elijah Boardman, Dan And Elijah Boardman}', set(['Daniel Boardman', 'Elijah Boardman'])],
                 ['{John Williams, John Williams Of Wethersfield}', set(['John Williams'])],
                 ['{Henry Zealor and Margaret Fisher, Zealor E Fisher}', set(['Henry Zealor', 'Margaret Fisher'])],
                 ['{Wallace and Muir, Wallace Muir}', set(['Charles Wallace','John Muir'])],
                 ['{The Estate Of John Jordon, John Jordans Estate, John Jordon}', set(['John Jordon'])],
                 ['{Lawrance Brangle and Jacob Gombare, Lawrence Brangle and Jacob Gombare, Lawrence Brangle and Jacob Gombar}', set(['Lawrence Brangle', 'Jacob Gombare'])],
                 ['{William Harwood, Thomas and Benjamin Harwood}', set(['William Harwood', 'Thomas Harwood', 'Benjamin Harwood'])],
                 ['{Stewart E Plunkett, Stewart and Plunkett, Stewart Plunkett}', set(['David Stewart', 'David Plunkett'])],
                 ['{William Morris and Swanwick, Willing Morris and Swanwick}', set(['Thomas Willing', 'Robert Morris', 'John Swanwick'])],
                 ['{John Ewing, John Ewing For Corp, Anthony Weiss}', set(['John Ewing', 'Anthony Weiss'])],
                 ['{Thomas Mifflin  Ex, Thomas Mifflin and Francis Johnston David Lenax and Charles Smith, His Executor Thomas Mifflin Francis Johnston David Lenox Charles Smith Ex To Ex Lawrence Keene}', set(['Thomas Mifflin', 'Francis Johnston', 'David Lenax', 'David Lenox', 'Charles Smith', 'Lawrence Keene'])],
                 ['{William Morris and Swanwick, Morris and Swanwick Willing}', set(['Thomas Willing', 'Robert Morris', 'John Swanwick'])],
                 ['{William Morris and Swanwick, Willing Morris and Swanwick, Morris and Swanwick Willing}', set(['Thomas Willing', 'Robert Morris', 'John Swanwick'])],
                 ['{William Sherer and Mcdowell, William Sherer and John Mcdowell}', set(['William Sherer', 'John McDowell'])],
                 ['{George Leib Of The N L, George Leib}', set(['George Leib'])],
                 ['{Catherine and William, Cath and William Coleman, Catharine and William Coleman Executor Of Jacob Coleman Deceased}', set(['Catherine Coleman', 'Catharine Coleman', 'William Coleman', 'Jacob Coleman'])],
                 ['{andrew Clow  Agents To James Brown, andrew Clow  Agents To James Brown Executor To The Est Of Hugh Patton, andrew Clow}', set(['Andrew Clow', 'James Brown', 'Hugh Patton'])],
                 ['{John Butler From Treasury, John Butler}', set(['John Butler'])],
                 ['{Ditta In Trust For Jonnet Thompson}', set(['Jonnet Thompson'])],
                 ['{Peter Vanderveer, Peter Vanderveer In Trust For Jacobus Van Eis}', set(['Peter Vanderveer', 'Jacobus Van Eis'])],
                 ['{N Waterman And R Newell, Nathan Waterman And Robert Newell}', set(['Nathan Waterman', 'Robert Newell'])],
                 ['{Nathan and Frederick Williams, Nathan and Fredrick Williams}', set(['Nathan Williams', 'Frederick Williams'])],
                 ['{Peleg Shearman Transfer From The Register Of The Treasury, Peleg Shearman Transferred From The Register Of The Treasury, Peleg Shearman Transfer The Register}', set(['Peleg Shearman'])],
                 ['{Clark and Nightingale Transferred From Register, Clark and Nightingale Transferred}', set(['Joseph Innes Clark', 'Joseph Nightingale'])],
                 ['{Mess and Nicholas Brown Transferred, Moses and Nicholas Brown Transferred, Moses and Nicholas Brown Transferred From Treasury}', set(['Moses Brown', 'Nicholas Brown'])],
                 ['{Benjamin Brown Transferred From Treasury, Benjamin Bowen, Benjamin Brown Transfer From Treasury}', set(['Benjamin brown', 'Benjamin Bowen'])],
                 ['{Samuel Eddy, Samuel Eddy Transferred From Treasury, Samuel Eddy Transfer From Treasury}', set(['Samuel Eddy'])],
                 ['{Clark and Nightingale Transferred From Register, Clark and Nightingale Transferred}', set(['Joseph Innes Clark', 'Joseph Nightingale'])],
                 ['{Philip Allen, Philip Allen Transferred From Massachusetts, Philip Allen Transferred To Massachusetts}', set(['Philip Allen'])],
                 ['{Clark and Nightingale Transferred From The Reg, Clarkand Nightingale Transferred From The Register, Clark and Nightingale Register}',
                  set(['Joseph Innes Clark', 'Joseph Nightingale'])],
                 ['{Hopestill Mcneal Transferred From The Register Of The Treasury, Hopestill Mcneal}', set(['Hopestill Mcneal'])],
                 ['{Nehemiah Rhodes, Nehemiah Rhodes Transferred From The Register Of The Treasury}', set(['Nehemiah Rhodes'])],
                 ['{The President  Of Bank Of Providence From The Books In Massachusetts, Matthew Watson Transferred From The Office In Connecticut, Christopher Hill Transferred From The Register}', set(['Matthew Watson', 'Christopher Hill'])],
                 ['{The President  Of Bank Of Providence From The Books In Massachusetts, John Warren Transferred From The Register, William Littlefeild Transferred From The Office Of The Register}', set(['William Littlefeild'])],
                 ['{Matthew Watson From The Office Of William Imlay Connecticut, John Mawney Transferred From The Register, Sarah Ward Transferred From The Register Of The Treasury}', set(['Matthew Watson', 'William Imlay', 'John Mawney', 'Sarah Ward'])],
                 ['{John Rogers Transferred From The Register Of The Treasury, William Littlefield From The Office Of The Register Of Treasury, Elisha Aldrich Transferred From The Register Of The Treasury}', set(['John Rogers', 'William Littlefield', 'Elisha Aldrich'])],
                 ['{Dr William Handy In Trust For His Niece Mehetable Handy Transferred From The Books At New York, Warrant For Transfering Into The Books Of This Office Vizt, Warrant For Transferring Into The Books Of This Office Vizt}', set(['William Handy', 'Mehetable Handy'])],
                 ['{10926 In Favor Johh Brown From The Register Of The Treasury, No 10925 In Favor Of John Brown From The Register Of The Treasury, John Mawney Transferred From The Register Of The Treasury}', set(['William Handy', 'Mehetable Handy'])],
                 ['{Warrants From The Secretary Of The Treasury For Transferring To The Books Of This Office Vizt, Jn Rogers Transfer From Register, Warrats For Transfering Into The Books Of This Office Vizt}', set(['John Rogers'])],
                 ['{Rev Ja Manning No 10315, No 11723 In Favor Of James Bursill Jun From Register Of Treasury, Warrant For Transferring Into The Books Of This Office From The Register Of The Treasury Viz}', set(['Ja Manning', 'James Bursill'])],
                 ['{James Manning, James Manning Transferred From Treasury, James Manning Transferred From The Treasury}', set(['James Manning'])],
                 ['{William Larned, Clark and Nightingale Transfer From Register}', set(['William Larned', 'Joseph Innes Clark', 'Joseph Nightingale'])],
                 ['{Society For The Relief Of Poor and Distressed Masters Of Ships Their Widows, Joseph Stiles For Society}', set(['Joseph Stiles'])],
                 ['{Appha Loomis, Appha Loomis Executor, Alpha Lommis Executor}', set(['Alpha Loomis', 'Appha Loomis'])],
                 ['{Oliver Mather, Oliver Mather Executor, Oliver Mather Executive}', set(['Oliver Mather'])],
                 ['{John Jordans Estate}', set(['John Jordan'])],
                 ['{Richard Downing  Jennings}', set(['Richard Downing Jennings'])],
                 ['{Avery  Hall}', set(['Avery Hall'])],
                 ['{Samuel Athenton Transfer, Samuel Atherton Transfer}', set(['Samuel Athenton', 'Samuel Atherton'])]]
company_names = dict(zip([c[0] for c in company_names], [c[1] for c in company_names]))
names_fix = names_fix.apply(lambda x: company_names.get(str(x).replace('\'',''), x))
names_fix.loc[3936] = set(['William Handy','Mehetable Handy'])

In [106]:
# remove names with transfer
names_fix = names_fix.apply(lambda x: set([ele.lower().split("transfer")[0].title().strip() for ele in list(x)]) if 'transfer' in str(x).lower() else x)

In [107]:
# final name list
name_df = names
name_df['Fixed Name'] = names_fix
# fix state
ind = name_df.query('name_type == "other" and not state.isna() and state != "BVI" and state != "BM" and state != "GB" and state != "VI" and state != "FR" and state != "US"').index
name_df.loc[ind, 'name_type'] = 'state'
name_df.loc[name_df[name_df['name_type'] == 'town]'].index, 'name_type'] = 'town'
set_conv = dict(zip(name_df['Fixed Name'].astype(str), name_df['Fixed Name']))
name_conv = dict(zip(name_df['Name'].astype(str), name_df['Name']))
name_df

Unnamed: 0,town,county,state,name_type,Name,Fixed Name
0,Hartford,Hartford County,CT,town,[Samuel W Pomeroy],{Samuel W Pomeroy}
1,Bolton,Tolland County,CT,town,[Benjamin Trumbull],{Benjamin Trumbull}
2,,,RI,state,[Richard Green],{Richard Green}
3,Hartford,Hartford County,CT,town,[Thomas Hopkins],{Thomas Hopkins}
4,Hartford,Hartford County,CT,town,[John Morgan],{John Morgan}
...,...,...,...,...,...,...
4370,Charleston,Charleston County,SC,town,[William Graham],{William Graham}
4371,,,RI,state,[John Updike],{John Updike}
4373,,,MD,state,[George Parker],{George Parker}
4374,,,MD,state,[Edward Ireland],{Edward Ireland}


In [108]:
# store numeric suffix for location
#locationsuffix = dict()

## Scrape Data From Ancetry

In [109]:
# store data
#df_list = []

In [110]:
# main function that obtains data on individuals
# basically this function just takes a link and outputs data on them
def listPeople(driver, fn, ln, samelocation = False, expandGeography = False, expandNameMatch = False):
    retVal =  ["end"]
    # if no matches found, loosen the restrictions on gegraphy and name
    try:
        count_text = driver.find_element(By.XPATH, "//*[@id=\"results-footer\"]/h3").text
    except:
        # feel free to swap the order, depending on your preferences
        # here i say we loosen name match restrictions before we expand geography
        if expandNameMatch:
            return "name"
        if expandGeography:
            return "geography"
        retVal.append("No Match: No Match Found")
        return retVal

    count = int(count_text.split(" of ")[1])
    # if multiple matches found, see if any of them are all in the same place
    # we can categorize this as a "location match"
    if count > 1:
        if samelocation:
            locationlist = []
            if count < 5: # likelihood of same location for over 5 individuals = rare
                for i in range(count):
                    location = driver.find_element(By.XPATH, f"//*[@id=\"sRes-{i}\"]/td[3]").text.split(",")[1]
                    locationlist.append(location)
                if len(set(locationlist)) == 1:
                    name = fn + " " + ln
                    retVal.extend([f"Name: {name}", f"Location: {locationlist[0]}"])
                else:
                    retVal.append(f"No Match: Too Many Potential Matches Found {count}")
            else:
                retVal.append(f"No Match: Too Many Potential Matches Found {count}")
        else:
            retVal.append(f"No Match: Too Many Potential Matches Found")
        return retVal
    # if only one name is found then we categorize this as a person match
    else:
        fname = driver.find_element(By.XPATH, f"//*[@id=\"sRes-0\"]/td[2]/span/span[1]").text
        try:
            lname  = driver.find_element(By.XPATH, f"//*[@id=\"sRes-0\"]/td[2]/span/span[2]").text
        except:
            try:
                lname = driver.find_element(By.XPATH, f"//*[@id=\"sRes-0\"]/td[2]/span/span/span[2]").text
            except:
                lname = driver.find_element(By.XPATH, "//*[@id=\"sRes-0\"]/td[2]/span/span/text()").text
        name = fname + " " + lname
        location = driver.find_element(By.XPATH, f"//*[@id=\"sRes-0\"]/td[3]").text
        slavecount = driver.find_element(By.XPATH, f"//*[@id=\"sRes-0\"]/td[4]").text
        familysize = driver.find_element(By.XPATH, f"//*[@id=\"sRes-0\"]/td[5]").text
        # print output
        retVal.extend([f"Name: {name}", f"Location: {location}", f"Family Size: {familysize}", f"Slavecount: {slavecount}"])
        return retVal

In [111]:
def processLocationString(name_type, town, county, state):
    if name_type == "town":
        return town + ", " + county + ", " + statedict[state]
    elif name_type == "county":
        return county + ", " + statedict[state]
    elif name_type == "state" or name_type == "state_flag":
        return statedict[state]
    else:
        return "United States"

In [112]:
def searchLocationString(name_type, town, county, state):
    if not pd.isnull(county):
        county = county.replace('County', '').strip().replace(' ', '+').replace('\'', '+').lower()
    if name_type == "town":
        return town.lower() + "-" + county + "-" + statedict[state].lower() + "-usa"
    elif name_type == "county":
        return county + "-" + statedict[state].lower() + "-usa"
    elif name_type == "state" or name_type == "state_flag":
        return statedict[state]+ "-usa"
    else:
        return "usa"

In [113]:
def navigateTo(fn, ln, driver, namesuffix , locsuffix, town, county, state, name_type, initial = False):
    fn = fn.replace(" ", "+")
    ln = ln.replace(" ", "+")

    if initial:
        locationstr = processLocationString(name_type, town, county, state)
        if locationstr in locationsuffix.keys():
            locationnum = locationsuffix[locationstr]
            searchstr = searchLocationString(name_type, town, county, state)
            url = f"https://www-ancestrylibrary-com.proxy.uchicago.edu/search/collections/5058/?name={fn}_{ln}&name_x={namesuffix}&residence=_{searchstr}_{locationnum}&residence_x={locsuffix}"
            driver.get(url)
            time.sleep(3)
        else:
            driver.get('https://www-ancestrylibrary-com.proxy.uchicago.edu/search/collections/5058/')
            time.sleep(8)
            val = driver.find_element(by = By.XPATH, value ="//*[@id=\"sfs_ContentBased\"]/div[1]/div/fieldset[1]/div[2]/label").get_attribute("for").split("Place_")[1]

            driver.find_element(by=By.XPATH, value=f"//*[@id=\"sfs__SelfResidencePlace_{val}\"]").send_keys(locationstr)
            time.sleep(2)
            driver.find_element(by=By.XPATH, value=f"//*[@id=\"sfs__SelfResidencePlace_{val}Autocomplete0\"]").click()
            time.sleep(1)
            driver.find_element(by=By.XPATH, value=f"//*[@id=\"sfs__SelfResidencePlace_{val}\"]").send_keys(Keys.ENTER)
            time.sleep(7)
            currurl = driver.current_url

            try:
                code = currurl.split("usa_")[1]
                locationsuffix[locationstr] = code
            except:
                print(f"error saving {locationstr}")

            url = currurl.split("?")[0] + f"?name={fn}_{ln}&name_x={namesuffix}&" + currurl.split("?")[1] + f"&residence_x={locsuffix}"
            print(url)
            driver.get(url)
            time.sleep(3)
    else:
        currurl = driver.current_url
        url = currurl.split("&name_x")[0] + f"&name_x={namesuffix}" + "&residence=_" + currurl.split("&residence=_")[1].split("&residence_x=")[0] + f"&residence_x={locsuffix}"
        driver.get(url)
        time.sleep(1)
    return driver, url

In [114]:
# function that controls settings for strictness of search and returns final data for each individual
def findMatches(fn, ln, driver, locationmatchlist , namematchlist, town, county, state, name_type):
    locationmatchind = 0
    namematchind = 0
    geobool = True
    namebool = True

    # navigate to original url
    driver, og_url = navigateTo(fn, ln, driver, namematchlist[namematchind] , locationmatchlist[locationmatchind], town, county, state, name_type, initial = True)
    time.sleep(1)

    # see if there are any matches using initial strict settings
    val = listPeople(driver, fn, ln, samelocation = True, expandGeography = geobool, expandNameMatch = namebool)
    # if we have found a match or we have exhausted all possible geo/name restriction loosenings, end search
    while type(val) != list and (geobool or namebool):
        # command to do a geographic expansion
        if val == "geography":
            if namematchind == len(namematchlist) - 1 and locationmatchind == 0:
                namematchind = 0
                namebool = True
            locationmatchind+=1
            if locationmatchind < len(locationmatchlist):
                # navigate to url with new settings
                driver, og_url = navigateTo(fn, ln, driver, namematchlist[namematchind] , locationmatchlist[locationmatchind], town, county, state, name_type)
                time.sleep(1)
                val = listPeople(driver, fn, ln, samelocation = True, expandGeography = geobool, expandNameMatch = namebool)
            else:
                # if we have exhausted all options, prevent future geographic expansions
                locationmatchind -= 1 # reset it to most loose setting
                geobool = False
                val = listPeople(driver, fn, ln, samelocation = True, expandGeography = geobool, expandNameMatch = namebool)
        # command to do name expansion
        elif val == "name":
            namematchind += 1
            if namematchind < len(namematchlist):
                # navigate to url with new settings
                driver, og_url = navigateTo(fn, ln, driver, namematchlist[namematchind] , locationmatchlist[locationmatchind], town, county, state, name_type)
                time.sleep(1)
                val = listPeople(driver, fn, ln, samelocation = True, expandGeography = geobool, expandNameMatch = namebool)
            else:
                # if we have exhausted all options, prevent future name expansions
                namematchind -= 1 # reset it to most loose setting
                namebool = False
                val = listPeople(driver, fn, ln, samelocation = True, expandGeography = geobool, expandNameMatch = namebool)
    val.extend([og_url])
    return val

In [115]:
# split result of scraping, depending on how much info was obtained
def parseScrapeResult(res, i, index, og_name, name,):
    remdata = []
    if len(res) == 3: # nomatch
        matchstatus = res[1].split(": ")[0]
        matchreason = res[1].split(": ")[1]
        remdata.extend([matchstatus, matchreason, np.nan, np.nan, np.nan, res[2]])
    elif len(res) == 4: # location match
        matchstatus = "Match"
        matchreason = "Only Location Found"
        location = res[2].split(": ")[1]
        remdata.extend([matchstatus, matchreason, location, np.nan, np.nan, res[3]])
    else:
        matchstatus = "Match"
        matchreason = "Full Match"
        name = res[1].split(": ")[1]
        location = res[2].split(": ")[1]
        familycnt = res[3].split(": ")[1]
        slavecnt = res[4].split(": ")[1]
        remdata.extend([matchstatus, matchreason, location, familycnt, slavecnt, res[5]])
    data = [index, og_name, name, i]
    data.extend(remdata)
    return data

In [116]:
def determineMatchList(name_type, townmatchlist, countymatchlist, statematchlist, stateflagmatchlist, othermatchlist):
    if name_type == "town":
        return townmatchlist
    elif name_type == "county":
        return countymatchlist
    elif name_type == "state":
        return statematchlist
    elif name_type == "state_flag":
        return stateflagmatchlist
    else:
        return othermatchlist

In [120]:
# iterate through all individuals and try to find data for them
assert(user == "Chris")
driver = webdriver.Safari(executable_path=r'/usr/bin/safaridriver')#set driver
file = pd.read_csv('~/Desktop/login_details.txt')
username = file.columns[0]
password = file[username].tolist()[0]
driver.set_window_size(1400,1000)

driver.get("https://www-ancestrylibrary-com.proxy.uchicago.edu/search/collections/5058/")#get website from nyu database

WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.ID, "okta-signin-username"))).send_keys(username)
time.sleep(1)
driver.find_element(by=By.ID, value="okta-signin-password").send_keys(password)
time.sleep(1)
driver.find_element(by=By.ID, value="okta-signin-submit").click() #sign in

WebDriverWait(driver, 20).until(EC.frame_to_be_available_and_switch_to_it((By.XPATH,"//*[@id=\"form62\"]/div/div[2]/iframe")))
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//*[@id=\"auth_methods\"]/fieldset/div[1]/button"))).click()

time.sleep(17)

# specify namematch and locationmatch options
townmatchlist = ["1-0_1-0", "1-0_1-1"]
countymatchlist = ["1-0_1-0"]
statematchlist = ["1-0_1-0"]
stateflagmatchlist = ["1-0_1-0-a"]
othermatchlist = ["1-0_1-0"]
namematchlist = ["1_1", "s_s", "ps_ps"]
for index in name_df.index:
    if index % 100 == 0:
        df = pd.DataFrame(columns = ['Index', 'Original Name', 'Search Name', 'Name #', 'Match Status', 'Match Reason', 'Location', 'Family Size', 'Slavecount', 'url', 'town', 'county', 'state', 'name_type'],
                          data = df_list)
        df.to_csv('CD_results.csv')
    if index in [2272, 2559, 2708, 3005]: #1528 is invalid name
        og_name = list(name_df.loc[index]['Name'])
        if og_name == ['']:
            continue
        fixed_name = list(name_df.loc[index]['Fixed Name'])
        name_type = name_df.loc[index]['name_type']
        town = name_df.loc[index]['town']
        county = name_df.loc[index]['county']
        state = name_df.loc[index]['state']
        locationmatchlist = determineMatchList(name_type, townmatchlist, countymatchlist, statematchlist, stateflagmatchlist, othermatchlist)

        # distinguish between names that correspond to multiple real names vs. just one name
        if len(fixed_name)>1:
            print(f"Original Name: {og_name}, Search Names:{fixed_name}, Index: {index}")
            i = 0
            for name in fixed_name:
                i += 1
                print(f"Name {i}: {name}")
                fn = " ".join(name.split(" ")[:-1])  if ("Van" not in name or "Ten" not in name) else " ".join(name.split(" ")[:-2])
                ln = name.split(" ")[-1] if ("Van" not in name or "Ten" not in name or "Ii" not in name) else " ".join(name.split(" ")[-2:])
                res = findMatches(fn, ln, driver, locationmatchlist, namematchlist, town, county, state, name_type)
                data = parseScrapeResult(res, 1, index, og_name, name)
                data.extend([town, county, state, name_type])
                df_list.append(data)
                print([val + ": " + str(d) for val, d in zip(df.columns, data)])
                print("")
        else:
            print(f"Original Name: {og_name}, Search Name:{fixed_name}, Index: {index}")
            name = fixed_name[0]
            fn = " ".join(name.split(" ")[:-1])  if ("Van" not in name or "Ten" not in name) else " ".join(name.split(" ")[:-2])
            ln = name.split(" ")[-1] if ("Van" not in name or "Ten" not in name or "Ii" not in name) else " ".join(name.split(" ")[-2:])
            res = findMatches(fn, ln, driver, locationmatchlist, namematchlist, town, county, state, name_type)
            data = parseScrapeResult(res, 1, index, og_name, fixed_name)
            data.extend([town, county, state, name_type])
            df_list.append(data)
            print([val + ": " + str(d) for val, d in zip(df.columns, data)])
            print("")

Original Name: ['John Otto'], Search Name:['John Otto'], Index: 2272
error saving Reading Berks, Berks County, Pennsylvania
https://www-ancestrylibrary-com.proxy.uchicago.edu/search/collections/5058/?name=John_Otto&name_x=1_1&residence=_Reading+Berks-Berks+County-Pennsylvania&residence_x=1-0_1-0
['Index: 2272', "Original Name: ['John Otto']", 'Search Name: John Otto', 'Name #: 1', 'Match Status: Match', 'Match Reason: Full Match', 'Location: Reading, Berks, Pennsylvania', 'Family Size: 10', 'Slavecount: \xa0', 'url: https://www-ancestrylibrary-com.proxy.uchicago.edu/search/collections/5058/?name=John_Otto&name_x=1_1&residence=_Reading+Berks-Berks+County-Pennsylvania&residence_x=1-0_1-0', 'town: Reading Berks', 'county: Berks County', 'state: PA', 'name_type: town']

Original Name: ['Andrew Douglass'], Search Name:['andrew Douglass'], Index: 2559
['Index: 2559', "Original Name: ['Andrew Douglass']", 'Search Name: Andrew Douglass', 'Name #: 1', 'Match Status: Match', 'Match Reason: Full 

In [166]:
# export data
df = pd.DataFrame(columns = ['Index', 'Original Name', 'Search Name', 'Name #', 'Match Status', 'Match Reason', 'Location', 'Family Size', 'Slavecount', 'url', 'town', 'county', 'state', 'name_type'],
                  data = df_list)

In [174]:
df.loc[df[df['Slavecount'].apply(lambda x: x == '\xa0')].index, 'Slavecount'] = np.nan
df['Search Name'] = df['Search Name'].apply(lambda x: [x.split('\xa0')[0]] if type(x) != list else x)

In [182]:
df.to_csv('CD_results.csv')