# CLEANING DELEGATES

In [1]:
import numpy as np
import pandas as pd
import re

In [3]:
delegates = pd.read_excel("../../Data/Delegates/constitutional_convention_1787.xlsx", header = 2)

#cleaning strings for constitutional convention delegates
delegates['state'] = [x.strip() for x in delegates['state']]
delegates['sign'] = [x.strip() for x in delegates['sign?']]
delegates['first name'] = [x.strip() for x in delegates['first name']]
delegates['last name'] = [x.strip() for x in delegates['last name']]

#might want to separate names with parentheses into last name 1/2
sumaltfirst = sum(["(" in x for x in delegates['first name']])
print("number of alt first names:", sumaltfirst)

sumaltlast = sum(["(" in x for x in delegates['last name']])
print("number of alt last names:", sumaltlast)


#construct full names for delegate files
delegates['last name 2'] = [np.nan if "(" not in x else  x[x.find("(")+1:].replace(")","")  for x in delegates['last name']]
#handle special case
delegates.loc[40, 'last name 2'] = 'Fitzsimmons'

delegates['last name'] = delegates['last name'].apply(lambda x: re.sub(r'\([^)]*\)', '', x).strip())
delegates['full name 1'] = delegates['first name'] + " " + delegates['last name']
delegates['full name 2'] = delegates['first name'] + " " + delegates['last name 2']

number of alt first names: 0
number of alt last names: 4


In [4]:
state_delegates = pd.read_excel("../../Data/Delegates/State Delegates.xlsx", header = 2)

#cleaning strings for state convention delegates
missing_ind = state_delegates[state_delegates['First Name'].apply(lambda x: type(x) != str)].index
state_delegates.loc[missing_ind, 'First Name'] = ['William', 'Ebenezer']
# Rhode Island last & first name columns are flipped
RI_index = state_delegates[state_delegates['State'] == 'RI'].index
ln = state_delegates.loc[RI_index, 'Last Name']
state_delegates.loc[RI_index, 'Last Name'] = state_delegates.loc[RI_index, 'First Name']
state_delegates.loc[RI_index, 'First Name'] = ln

state_delegates['full name 1'] = state_delegates['First Name'] + " " + state_delegates['Last Name']


In [5]:
sd_vc = state_delegates['full name 1'].value_counts()
dups = sum(state_delegates['full name 1'].value_counts()) - sd_vc.shape[0]

sds_vc = state_delegates[['full name 1','State']].value_counts()
dups_state = sum(state_delegates['full name 1'].value_counts()) - sds_vc.shape[0]

print("{} duplicated names".format(dups))
print("{} duplicated names after also grouping by state".format(dups_state))
# tells us that even at best, we won't be able to match 15 of the delegates from state conventions

sd_names = sd_vc[sd_vc == 2].index
sds_names = sds_vc[sds_vc == 2].index

# label duplicated names
state_delegates['name duplication'] = pd.to_numeric(state_delegates['full name 1'].apply(lambda x: x in sd_names))
state_delegates['state-name duplication'] = pd.to_numeric([(name, state) in sds_names for state, name in zip(state_delegates['State'], state_delegates['full name 1'])])

56 duplicated names
15 duplicated names after also grouping by state


In [6]:

delegates.to_csv('../Data/Delegates/cleaned/constitutional_convention_delegates_cleaned.csv')
state_delegates.to_csv('../Data/Delegates/cleaned/State_Delegates_cleaned.csv')

# CLEAN LOAN CERTIFICATES

In [7]:
import math
import spacy

In [8]:
delegates_og = pd.read_csv("../Data/Delegates/cleaned/constitutional_convention_delegates_cleaned.csv", index_col = 0)
delegates = delegates_og.copy()

## Clean Loan Office Certificates from the 9 States

In [9]:
loans_og = pd.read_excel("../Data/Pre1790/loan_office_certificates_9_states.xlsx")

In [10]:
# create copy
loans = loans_og.copy()

In [11]:
loans

Unnamed: 0,State,Year,Month,Day,Title 1,First Name 1,Last Name 1,Title 2,First Name 2,Last Name 2,Face Value,Specie Value
0,1,1778,3,13.0,Col,Joshua,Wentworth,,,,200,108.27780
1,1,1777,9,2.0,,Charles,Treadwell,,,,200,199.37780
2,1,1777,9,10.0,,Stephen,Cleverly,,,,200,194.51110
3,1,1777,9,13.0,,David,Griffith,,,,200,192.71110
4,1,1777,9,15.0,,John,Mansfield,,,,200,191.52220
...,...,...,...,...,...,...,...,...,...,...,...,...
80908,9,1780,1,1.0,,John Hay,,,,,400,13.61667
80909,9,1779,5,25.0,,Isaac & Thoroughgood Smith,,,,,300,22.86250
80910,9,1779,5,25.0,,Isaac & Thoroughgood Smith,,,,,800,60.96667
80911,9,1779,3,13.0,,Samuel Oldham,,,,,500,48.11806


In [12]:
#create column for third name 
loans['Title 3'] = pd.Series(dtype = 'object')
loans['First Name 3'] = pd.Series(dtype = 'object')
loans['Last Name 3'] = pd.Series(dtype = 'object')

#create column to store original text stored in first name column, and notes on what I changed
loans['notes'] = pd.Series(dtype = 'object')
loans['original text'] = pd.Series(dtype = 'object')

In [13]:
#reorganize columns
loans = loans[list(loans.columns[:10]) + list(loans.columns[12:15]) + list(loans.columns[10:12]) + ['notes','original text']]

In [14]:
# function to handle isNan and other datatypes
def betterIsNan(x):
    try: 
        return math.isnan(x)
    except:
        return False

print("checking if there are any columns with valid last name 1 and first name 2")
# this would be a problem, we wouldn't know if one or two individuals owned each stock
any([betterIsNan(x) and betterIsNan(y) and not betterIsNan(w) and not betterIsNan(z) for w, x, y, z in zip(loans['First Name 1 '], loans['Last Name 1 '], loans['First Name 2'], loans['Last Name 2'])])

checking if there are any columns with valid last name 1 and first name 2


False

In [15]:
print("checking if there are any columns with valid last name 2 and first name 1")
# this would be a problem, we wouldn't know if one or two individuals owned each stock
any([betterIsNan(w) and betterIsNan(z) and not betterIsNan(x) and not betterIsNan(y) for w, x, y, z in zip(loans['First Name 1 '], loans['Last Name 1 '], loans['First Name 2'], loans['Last Name 2'])])

checking if there are any columns with valid last name 2 and first name 1


False

In [16]:
def replaceWhitespace(x):
    return " ".join(x.split()) if type(x) != float else x

In [17]:
loans['First Name 1 '] = loans['First Name 1 '].apply(lambda x: replaceWhitespace(x))

In [18]:
# define situations where certain certificates cannot be matched

In [19]:
# null value
loans.loc[[pd.isnull(fn) and pd.isnull(ln) for fn, ln in zip(loans['First Name 1 '], loans['Last Name 1 '])], 'match eligible'] = False
# no first name, yes last name
loans.loc[[pd.isnull(fn) and not pd.isnull(ln) for fn, ln in zip(loans['First Name 1 '], loans['Last Name 1 '])], ['notes', 'match eligible']] = [['possibly Pierce Long', False], [np.nan, False], ['possibly John Witherspoon', False]]
# 1 character first name, no last name
loans.loc[[(len(fn.split())==1 if type(fn) != float else False) and pd.isnull(ln) for fn, ln in zip(loans['First Name 1 '], loans['Last Name 1 '])], 'match eligible'] = False

In [20]:
# define "weird first names" as names with more than one character in their first name, and no last name

In [21]:
weird_first = [(len(fn.split())>1 if type(fn) != float else False) and pd.isnull(ln) for fn, ln in zip(loans['First Name 1 '], loans['Last Name 1 '])]
weird_first_names = loans.loc[weird_first]
weird_first_names

Unnamed: 0,State,Year,Month,Day,Title 1,First Name 1,Last Name 1,Title 2,First Name 2,Last Name 2,Title 3,First Name 3,Last Name 3,Face Value,Specie Value,notes,original text,match eligible
2428,2,1777,3,10.0,,Lee & Jones,,,,,,,,600,600.00000,,,
2759,2,1777,4,11.0,,Clark & Nightingale,,,,,,,,400,400.00000,,,
2760,2,1777,4,11.0,,Clark & Nightingale,,,,,,,,400,400.00000,,,
2761,2,1777,4,11.0,,Clark & Nightingale,,,,,,,,400,400.00000,,,
2762,2,1777,4,11.0,,Clark & Nightingale,,,,,,,,400,400.00000,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80908,9,1780,1,1.0,,John Hay,,,,,,,,400,13.61667,,,
80909,9,1779,5,25.0,,Isaac & Thoroughgood Smith,,,,,,,,300,22.86250,,,
80910,9,1779,5,25.0,,Isaac & Thoroughgood Smith,,,,,,,,800,60.96667,,,
80911,9,1779,3,13.0,,Samuel Oldham,,,,,,,,500,48.11806,,,


<h2> Handle Corrections of Executors </h2>

How do we want to consider executors? Are they irrelevant (ie: do we only care about the person who they are executing for, or do we also care about them - and just add a note saying "executor")? 

Empirically, what interest do the executors have? They probably don't have any direct financial interest in the debt but there could definitely be incentives for them to be interested in the stability of the debt, such as shared business interests, desire to properly help execute a will etc?

What do we do when we cannot properly identify executors? Or trustees?

In [22]:
# I use the title executor to generaly indicate someone keeping money for another - could also be trustee etc

In [23]:
# trend below - if its Executors just get Atkinsons
# if its David Mackey and Debt Executor just do David Mackey
# for jacob lentz and phillip fisher lentz I kept jacob fisher and removed philip lentz
# there are records for jacob fisher being a soldier but no records for philip lentz
# otherwise find the word "to" or "of" and get the name right after that

In [24]:
weird_first_names[weird_first_names['First Name 1 '].apply(lambda x: "exec" in x.lower() or "executor" in x.lower() or "ex." in x.lower())].shape

(40, 18)

In [25]:
executor = weird_first_names[weird_first_names['First Name 1 '].apply(lambda x: "exec" in x.lower() or "executor" in x.lower() or "ex." in x.lower())]
executor.shape

(40, 18)

In [26]:
#executor['First Name 1 '].unique().tolist()

In [27]:
for x in executor.loc[[39712, 39713, 40030]]['First Name 1 ']:
    print(x)

Robert Morris and John Simon executor to the estate of
Robert Morris and John Simon executor to the estate of
Robert Morris and John Simon executor to the estate of R Gerraty


In [28]:
executor_names = executor['First Name 1 ']
#handle special cases
executor_names = executor_names.replace("Atkinsons Executors", "Atkinsons")
executor_names = executor_names.replace("David Mackey and Debt Executor", "David Mackey")
executor_names = executor_names.replace("Jacob Fisher Philip Lentz Ex.", "Jacob Fisher")
executor_names = executor_names.replace("Robert Morris and John Simon executor to the estate of", "Robert Morris and John Simon executor to the estate of R Gerraty")
spec_status_names = ("David Mackey", "Atkinsons", "Jacob Fisher", "Robert Morris") 
#find string after the words to or of
post_of = executor_names.apply(lambda x: x[x.lower().find(' to '):] if x not in spec_status_names else x)
post_to = executor_names.apply(lambda x: x[x.lower().find(' of '):] if x not in spec_status_names else x)
#merge results - we keep the result that occurred later, because we wan tot capture everything before of/to
pre_processing_executor_names = pd.Series(x if len(x) >= len(y) else y for x, y in zip(post_of, post_to))
#remove stop words
pre_processing_executor_names = pre_processing_executor_names.apply(lambda x: x.replace("to", "").replace("To", "").replace("of", "").replace("Of"," ").strip())
pre_processing_executor_names = pre_processing_executor_names.apply(lambda x: x.replace("the", "").replace("  ", " ").strip())
pre_processing_executor_names.unique()

array(['Sam Hasting', 'B Winchester', 'Will Thomas Hubbard Esq deceased',
       'Benj Winchester', 'estate R Gerraty',
       'estate Jacob Carver Deceased', 'Stephen Boyd', 'David Mackey',
       'Leon Ampach', 'Conaard Teulps', 'Estale Ju Eighter',
       'Jacob Fisher', 'Mat Mease and Co.', 'Jacob Calvert',
       'Christ V. Cross', 'Atkinsons'], dtype=object)

In [29]:
executor_names = dict(zip(pre_processing_executor_names, pre_processing_executor_names))

In [30]:
executor_names['estate Jacob Carver Deceased'] = 'Jacob Carver'
executor_names['estate R Gerraty'] = 'R Gerraty'
executor_names['Will Thomas Hubbard Esq deceased'] = 'Will Thomas Hubbard'
executor_names['Mat Mease and Co.'] = 'Mat Mease'
post_processing_executor_names = [executor_names[x] for x in pre_processing_executor_names]
post_processing_executor_index = executor['First Name 1 '].index
#post_processing_executor_names

In [31]:
#changing recipient of all the money
executor['original text'] = executor['First Name 1 '].copy()
executor['First Name 1 '] = post_processing_executor_names
executor['Last Name 1 '] = [x.split(" ")[-1] for x in executor['First Name 1 ']]
executor['First Name 1 '] = [" ".join(x.split(" ")[:-1]) for x in executor['First Name 1 ']]
#executor

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  executor['original text'] = executor['First Name 1 '].copy()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  executor['First Name 1 '] = post_processing_executor_names
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  executor['Last Name 1 '] = [x.split(" ")[-1] for x in executor['First Name 1 ']]
A va

In [32]:
executor_names = executor['original text']
#handle special cases
executor_names = executor_names.replace("Atkinsons Executors", "")
executor_names = executor_names.replace("David Mackey and Debt Executor", "David Mackey")
executor_names = executor_names.replace("Jacob Fisher Philip Lentz Ex.", "Philip Lentz")
executor_names = executor_names.replace("Robert Morris and John Simon executor to the estate of", "Robert Morris and John Simon")
executor_names = executor_names.replace("William Allison Ex. Of Mat Mease and Co.", "William Allison")
spec_status_names = ("David Mackey", "", "Philip Lentz", "Robert Morris and John Simon", "William Allison") 
#find string after the words to or of
post_exec = executor_names.apply(lambda x: x[:x.lower().find(' exec')] if x not in spec_status_names else x)
post_exec_to = executor_names.apply(lambda x: x[:x.lower().find(' to')] if x not in spec_status_names else x)
#merge results
pre_processing_executors = pd.Series(x if len(x) < len(y) else y for x, y in zip(post_exec, post_exec_to)).apply(lambda x: x.replace("Ex.", "").strip())
#remove stop words
#pre_processing_executors

In [33]:
names1 = []
names2 = []
for name in pre_processing_executors:
    names = name.split("and")
    if len(names) > 1:
        names1.append(names[0].strip())
        names2.append(names[1].strip())
    else:
        names2.append(np.nan)
        names1.append(name.strip())
print(names1)
print(names2)

['Sam Austin', 'Benj Allen', 'Joseph Allen', 'William B', 'William B', 'William B', 'William B', 'William B', 'William B', 'William B', 'William B', 'William B', 'William B', 'To Allen', 'To Allen', 'Robert Morris', 'Robert Morris', 'Robert Morris', 'Isaac Roush', 'John Edwards', 'David Mackey', 'David Mackey', 'George Liller', 'George Kelcher', 'George Kelcher', 'John Penny Cacker', 'Philip Lentz', 'Philip Lentz', 'Philip Lentz', 'William Allison', 'William Allison', 'Isaac Rauch', 'Jacob Fisher', 'John Edwards', '', '', '', '', '', '']
[nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, 'John Simon', 'John Simon', 'John Simon', 'Mary Utrey', nan, nan, nan, nan, 'Peter Sherer', 'Peter Sherer', nan, nan, nan, nan, nan, nan, 'Mary Sallie', 'Chris Lentz', nan, nan, nan, nan, nan, nan, nan]


In [34]:
#manually adjust william b to william b townsend because cod did not process it correctly
names1 = [name  if name != 'William B' else 'William B Townsend' for name in names1]

In [35]:
#changing recipient of all the money
executor['First Name 2'] = names1
executor['Last Name 2'] = [x.split(" ")[-1] if x != '' else x for x in names1]
executor['First Name 2'] = [" ".join(x.split(" ")[:-1]) if x != '' else x for x in executor['First Name 2'] ]
executor['Title 2'] = ["executor" if x != '' else np.nan for x in executor['First Name 2']]

executor['First Name 3'] = names2
executor['Last Name 3'] = [x.split(" ")[-1]  if not pd.isnull(x) else x for x in executor['First Name 3']]
executor['First Name 3'] = [" ".join(x.split(" ")[:-1])  if not pd.isnull(x) else x for x in executor['First Name 3']]
executor['Title 3'] = ["executor" if not pd.isnull(x) else np.nan for x in executor['First Name 3']]
#executor

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  executor['First Name 2'] = names1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  executor['Last Name 2'] = [x.split(" ")[-1] if x != '' else x for x in names1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  executor['First Name 2'] = [" ".join(x.split(" ")[:-1]) if x != '' else x for x in executor[

In [36]:
executor['notes'][[x == "Jacob" and y == "Carver" for x, y in zip(executor['First Name 1 '], executor['Last Name 1 '])]] = "Deceased, Estate"
executor['notes'][[x == "R" and y == "Gerraty" for x, y in zip(executor['First Name 1 '], executor['Last Name 1 '])]] = "Estate"
executor['notes'][[x == "Will Thomas" and y == "Hubbard" for x, y in zip(executor['First Name 1 '], executor['Last Name 1 '])]] = "Deceased"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  executor['notes'][[x == "Jacob" and y == "Carver" for x, y in zip(executor['First Name 1 '], executor['Last Name 1 '])]] = "Deceased, Estate"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  executor['notes'][[x == "R" and y == "Gerraty" for x, y in zip(executor['First Name 1 '], executor['Last Name 1 '])]] = "Estate"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  executor['notes'][[x == "Will Thomas" and y == "Hubbard" for x, y in zip(executor['First Name 1 '], execu

<h3> Modify weird_first_names to include corrections </h3>

In [37]:
#make adjustments here
weird_first_names.loc[executor.index] = executor

In [38]:
executor_cleaned = weird_first_names.loc[executor.index]
filter_out_names = executor_cleaned.index.tolist()
#executor_cleaned

In [39]:
#trend below - if its two and's deal with first case below
#if its two &'s deal with second case below
#otherwise just split the names

In [40]:
two_names = weird_first_names[weird_first_names['First Name 1 '].apply(lambda x: ("&" in x or " and " in x.lower()))]
new_index = [x not in filter_out_names for x in two_names.index]
two_names = two_names.loc[new_index]
two_names.shape

(2261, 18)

<h2> Handle corrections of names with weird combinators </h2> 

In [41]:
multiple_and = two_names[two_names['First Name 1 '].apply(lambda x: x.count('and') > 1)]
multiple_and['original text'] = multiple_and['First Name 1 ']
#multiple_and

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  multiple_and['original text'] = multiple_and['First Name 1 ']


In [42]:
#add sarah charman to name 3, executor
#add john cavott and jacob aivl as executors?? in name 3
#add oiriginal to the present string

In [43]:
multiple_and['First Name 1 '] = multiple_and['First Name 1 '].replace('Sarah Charman for the use of Gilbert Hammond and Cornelius Tommand', 
                                                                      'Gilbert Hammond and Cornelius Tommand')
multiple_and['First Name 1 '] = multiple_and['First Name 1 '].replace('John Cavott and Jacob Aivl for Moses Dichey and George Dukey', 
                                                                      'Moses Dichey and George Dukey')
multiple_and['First Name 1 '] = multiple_and['First Name 1 '].replace('Jn Nixon I M Nerbitt and Alexander Forster', 
                                                                      'Jn Nixon and I M Nerbitt and Alexander Forster')
multiple_and['First Name 1 '] = multiple_and['First Name 1 '].replace('Samuel Ely and Michael Gellington Esq and Co',
                                                                     'Samuel Ely and Michael Gellington Esq')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  multiple_and['First Name 1 '] = multiple_and['First Name 1 '].replace('Sarah Charman for the use of Gilbert Hammond and Cornelius Tommand',
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  multiple_and['First Name 1 '] = multiple_and['First Name 1 '].replace('John Cavott and Jacob Aivl for Moses Dichey and George Dukey',
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-d

In [44]:
tempindex = multiple_and[multiple_and['First Name 1 '].apply(lambda x: x == 'Gilbert Hammond and Cornelius Tommand')].index
multiple_and.loc[tempindex, ['First Name 3', 'Last Name 3', 'Title 3']] = ['Sarah', 'Charman', 'executor']
tempindex = multiple_and[multiple_and['First Name 1 '].apply(lambda x: x == 'John Cavott and Jacob Aivl')].index
multiple_and.loc[tempindex, ['First Name 3', 'Last Name 3', 'Title 3']] = ['John', 'Cavott', 'executor']
multiple_and.loc[tempindex, 'notes'] = ['Jacob Aivl executor']

<h3> Fix by replacing in two_names, not superior weird_first_names </h3> 

In [45]:
two_names.loc[multiple_and.index] = multiple_and
multiple_and_cleaned = two_names.loc[multiple_and.index]
multiple_and_cleaned['original text'] = multiple_and_cleaned['First Name 1 ']
#multiple_and_cleaned

In [46]:
multiple_and_symbol = two_names[two_names['First Name 1 '].apply(lambda x: x.count('&') > 1)]
#executive decision made - nic jacobs is executor of peter brown
#add that nicholas jacobs is executor
multiple_and_symbol['original text'] = multiple_and_symbol['First Name 1 ']
multiple_and_symbol['First Name 1 '] = multiple_and_symbol['First Name 1 '].replace('Peter Brown & Nic Jacobs Ex & …….', 
                                                                                    'Peter Brown')
tempindex = multiple_and_symbol[multiple_and_symbol['First Name 1 '].apply(lambda x: x == 'Peter Brown')].index
multiple_and_symbol.loc[tempindex, ['First Name 2', 'Last Name 2', 'Title 2']] = ['Nic', 'Jacobs', 'executor']
tempindex = multiple_and_symbol[multiple_and_symbol['First Name 1 '].apply(lambda x: x == 'Hre Rest & Ch & St Pr Chumess')].index
multiple_and_symbol.loc[tempindex, ['First Name 1 ', 'Last Name 1 ', 
                                    'Last Name 2', 'First Name 3', 'Last Name 3']] = ['Hre', 'Rest', 'Ch', 'St Pr', 'Chumess']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  multiple_and_symbol['original text'] = multiple_and_symbol['First Name 1 ']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  multiple_and_symbol['First Name 1 '] = multiple_and_symbol['First Name 1 '].replace('Peter Brown & Nic Jacobs Ex & …….',


In [47]:
two_names.loc[multiple_and_symbol.index] = multiple_and_symbol
two_names.loc[multiple_and_symbol.index]

Unnamed: 0,State,Year,Month,Day,Title 1,First Name 1,Last Name 1,Title 2,First Name 2,Last Name 2,Title 3,First Name 3,Last Name 3,Face Value,Specie Value,notes,original text,match eligible
57469,6,1779,8,14.0,,Hre,Rest,,,Ch,,St Pr,Chumess,500,29.41667,,Hre Rest & Ch & St Pr Chumess,
57680,6,1779,9,2.0,,Peter Brown,,executor,Nic,Jacobs,,,,400,22.13333,,Peter Brown & Nic Jacobs Ex & …….,


In [48]:
#checking to make sure no string has both "and" and "&" as split symbol
two_names[(two_names['First Name 1 '].apply(lambda x: "&" in x and " and " in x))]

Unnamed: 0,State,Year,Month,Day,Title 1,First Name 1,Last Name 1,Title 2,First Name 2,Last Name 2,Title 3,First Name 3,Last Name 3,Face Value,Specie Value,notes,original text,match eligible


In [49]:
more_complicated = two_names[two_names['First Name 1 '].apply(lambda x: " in " in x or " of " in x or "to " in x or "for" in x)]
more_complicated_names = more_complicated['First Name 1 '].unique().tolist()

In [50]:
more_complicated['original text'] = more_complicated['First Name 1 ']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  more_complicated['original text'] = more_complicated['First Name 1 ']


In [51]:
print(len(more_complicated_names))
print(more_complicated_names)

30
['John Gray &Tho Dawes Fws.to Sarah Green', 'Jn Gray & Thomas Dawes Trustees to Sach Green', 'Nathaniel Appleton and other trustees of Judah Monis Legasy', 'John Barrett & Sons Trustees to Creditors of John Elsworth', 'Society for Relief of Poor Masters of ships widows and children', 'Corporation for the Relief of Poor and Distressed Presbyterian Ministers', 'Jeremiah Halsey and Sarah Gaston for the Estate of Gaston Dec', 'Wm Allison & Sam Caldwell Ex to the est of David Caldwell for children', 'Jacob Brush & Mary Wroop to the est of Jacob Carver Ded', 'Mathew Greer & Mathew Greer for the heirs of Thomas Jones Deceased', 'Joseph Jacket and Anthany Jacket Trustees of Presbyterian Congregation N town', 'Isaac Roush & Mary Bldney of Jacob Carver', 'Samuel Johnston Inna Hanna and I Johnston in trust for the Hierrs of Phil Johnston', 'Michele Shubart For Corporation of Michelle & Zion Churches', 'Corporation for relief of poor and distressed presbyterian ministers', 'Ed Keasby & J Bilber

In [52]:
#add notes saying that its an organization
#add trustees/people for - label as executor
#create dictionary corresponding to object that dictates how cases are tobe handled

In [53]:
#this code will be very long because its manual

In [54]:
tempindex = more_complicated[more_complicated['First Name 1 '].apply(lambda x: x == 'John Gray &Tho Dawes Fws.to Sarah Green')].index
results = ['Sarah', 'Green', 'executor', 'John', 'Gray', 'executor', 'Tho', 'Dawes']
more_complicated.loc[tempindex, ['First Name 1 ', 'Last Name 1 ', 'Title 2', 'First Name 2', 'Last Name 2', 'Title 3', 'First Name 3', 'Last Name 3']] = results

tempindex = more_complicated[more_complicated['First Name 1 '].apply(lambda x: x == 'Jn Gray & Thomas Dawes Trustees to Sach Green')].index
results = ['Sarah', 'Green', 'executor', 'Jn', 'Gray', 'executor', 'Thomas', 'Dawes']
more_complicated.loc[tempindex, ['First Name 1 ', 'Last Name 1 ', 'Title 2', 'First Name 2', 'Last Name 2', 'Title 3', 'First Name 3', 'Last Name 3']] = results

tempindex = more_complicated[more_complicated['First Name 1 '].apply(lambda x: x ==  'Nathaniel Appleton and other trustees of Judah Monis Legasy')].index
results = ['Judah', 'Monis', 'legacy', 'executor', 'Nathaniel', 'Appleton']
more_complicated.loc[tempindex, ['First Name 1 ', 'Last Name 1 ', 'notes', 'Title 2', 'First Name 2', 'Last Name 2']] = results

tempindex = more_complicated[more_complicated['First Name 1 '].apply(lambda x: x ==  'John Barrett & Sons Trustees to Creditors of John Elsworth')].index
results = ['John', 'Elsworth', 'creditors', 'executor', 'John', 'Barrett']
more_complicated.loc[tempindex, ['First Name 1 ', 'Last Name 1 ', 'notes', 'Title 2', 'First Name 2', 'Last Name 2']] = results

tempindex = more_complicated[more_complicated['First Name 1 '].apply(lambda x: x ==  'Society for Relief of Poor Masters of ships widows and children')].index
results = ['organization', 'Society for Relief of Poor Masters of ships widows and children']
more_complicated.loc[tempindex, ['Title 1', 'First Name 1 ']] = results

tempindex = more_complicated[more_complicated['First Name 1 '].apply(lambda x: x ==  'Corporation for the Relief of Poor and Distressed Presbyterian Ministers')].index
results = ['organization', 'Corporation for the Relief of Poor and Distressed Presbyterian Ministers']
more_complicated.loc[tempindex, ['Title 1', 'First Name 1 ']] = results

tempindex = more_complicated[more_complicated['First Name 1 '].apply(lambda x: x == 'Jeremiah Halsey and Sarah Gaston for the Estate of Gaston Dec')].index
results = ['Gaston', 'Dec', 'estate', 'executor', 'Jeremiah', 'Halsey', 'executor', 'Sarah', ' ']
more_complicated.loc[tempindex, ['First Name 1 ', 'Last Name 1 ', 'notes', 'Title 2', 'First Name 2', 'Last Name 2', 'Title 3', 'First Name 3', 'Last Name 3']] = results

tempindex = more_complicated[more_complicated['First Name 1 '].apply(lambda x: x ==  'Wm Allison & Sam Caldwell Ex to the est of David Caldwell for children')].index
results = ['David', 'Caldwell', 'estate for children', 'executor', 'Sam', 'Caldwell', 'executor', 'Wm', 'Allison']
more_complicated.loc[tempindex, ['First Name 1 ', 'Last Name 1 ', 'notes', 'Title 2', 'First Name 2', 'Last Name 2', 'Title 3', 'First Name 3', 'Last Name 3']] = results

tempindex = more_complicated[more_complicated['First Name 1 '].apply(lambda x: x ==  'Jacob Brush & Mary Wroop to the est of Jacob Carver Ded')].index
results = ['Jacob', 'Carver', 'estate, deceased', 'executor', 'Jacob', 'Brush', 'executor', 'Mary', 'Wroop']
more_complicated.loc[tempindex, ['First Name 1 ', 'Last Name 1 ', 'notes', 'Title 2', 'First Name 2', 'Last Name 2', 'Title 3', 'First Name 3', 'Last Name 3']] = results

tempindex = more_complicated[more_complicated['First Name 1 '].apply(lambda x: x ==   'Mathew Greer & Mathew Greer for the heirs of Thomas Jones Deceased')].index
results = ['Thomas', 'Jones', 'heirs, deceased', 'executor', 'Mathew', 'Greer']
more_complicated.loc[tempindex, ['First Name 1 ', 'Last Name 1 ', 'notes', 'Title 2', 'First Name 2', 'Last Name 2']] = results

In [55]:
tempindex = more_complicated[more_complicated['First Name 1 '].apply(lambda x: x ==  'Joseph Jacket and Anthany Jacket Trustees of Presbyterian Congregation N town')].index
results = ['organization', 'Presbyterian Congregation N town', 'executor', 'Joseph', 'Jacket', 'executor', 'Anthany', 'Jacket']
more_complicated.loc[tempindex, ['Title 1', 'First Name 1 ', 'Title 2', 'First Name 2', 'Last Name 2', 'Title 3', 'First Name 3', 'Last Name 3']] = results

tempindex = more_complicated[more_complicated['First Name 1 '].apply(lambda x: x ==  'Isaac Roush & Mary Bldney of Jacob Carver')].index
results = ['Jacob', 'Carver', 'executor', 'Mary', 'Bldney', 'executor', 'Isaac', 'Roush']
more_complicated.loc[tempindex, ['First Name 1 ', 'Last Name 1 ', 'Title 2', 'First Name 2', 'Last Name 2', 'Title 3', 'First Name 3', 'Last Name 3']] = results

tempindex = more_complicated[more_complicated['First Name 1 '].apply(lambda x: x ==  'Samuel Johnston Inna Hanna and I Johnston in trust for the Hierrs of Phil Johnston')].index
results = ['Phil', 'Johnston', 'heirs', 'executor', 'Samuel', 'Johnston', 'executor', 'Inna Hanna', 'Johnston']
more_complicated.loc[tempindex, ['First Name 1 ', 'Last Name 1 ', 'notes', 'Title 2', 'First Name 2', 'Last Name 2', 'Title 3', 'First Name 3', 'Last Name 3']] = results

tempindex = more_complicated[more_complicated['First Name 1 '].apply(lambda x: x ==  'Michele Shubart For Corporation of Michelle & Zion Churches')].index
results = ['organization', 'Corporation of Michelle & Zion Churches', 'executor', 'Michele', 'Shubart']
more_complicated.loc[tempindex, ['Title 1', 'First Name 1 ', 'Title 2', 'First Name 2', 'Last Name 2']] = results

tempindex = more_complicated[more_complicated['First Name 1 '].apply(lambda x: x ==  'Corporation for relief of poor and distressed presbyterian ministers')].index
results = ['organization', 'Corporation for relief of poor and distressed presbyterian ministers']
more_complicated.loc[tempindex, ['Title 1', 'First Name 1 ']] = results

tempindex = more_complicated[more_complicated['First Name 1 '].apply(lambda x: x ==  'Ed Keasby & J Bilber by the Ers of J Dickinson')].index
results = ['J', 'Dickinson', 'executor', 'Ed', 'Keasby', 'executor', 'J', 'Bilber']
more_complicated.loc[tempindex, ['First Name 1 ', 'Last Name 1 ', 'Title 2', 'First Name 2', 'Last Name 2', 'Title 3', 'First Name 3', 'Last Name 3']] = results

tempindex = more_complicated[more_complicated['First Name 1 '].apply(lambda x: x ==  'Gen. Kelehner and Pr. Sheser Esq. to State of Conrad Scheele')].index
results = ['Conrad', 'Scheele', 'estate', 'executor', 'Gen.', 'Kelehner', 'executor', 'Pr. Sheser', 'Esq']
more_complicated.loc[tempindex, ['First Name 1 ', 'Last Name 1 ', 'notes', 'Title 2', 'First Name 2', 'Last Name 2', 'Title 3', 'First Name 3', 'Last Name 3']] = results

tempindex = more_complicated[more_complicated['First Name 1 '].apply(lambda x: x ==  'Andrew Hagenbach and Catherine Brobst Guardians of John Brobst')].index
results = ['John', 'Brobst', 'executor', 'Andrew', 'Hagenbach', 'executor', 'Catherine', 'Brobst']
more_complicated.loc[tempindex, ['First Name 1 ', 'Last Name 1 ', 'Title 2', 'First Name 2', 'Last Name 2', 'Title 3', 'First Name 3', 'Last Name 3']] = results

tempindex = more_complicated[more_complicated['First Name 1 '].apply(lambda x: x ==  'Robert Patterson Guardin for Sarah and Mary Stewart')].index
results = ['Sarah', 'Stewart', 'Mary', 'Stewart', 'executor', 'Robert', 'Patterson']
more_complicated.loc[tempindex, ['First Name 1 ', 'Last Name 1 ', 'First Name 2', 'Last Name 2', 'Title 3', 'First Name 3', 'Last Name 3']] = results

tempindex = more_complicated[more_complicated['First Name 1 '].apply(lambda x: x ==  'Peter Knight & Sus Woodrow Ex to the Estate of H Woodrow')].index
results = ['H', 'Woodrow', 'estate', 'executor', 'Peter', 'Knight', 'executor', 'Sus', 'Woodrow']
more_complicated.loc[tempindex, ['First Name 1 ', 'Last Name 1 ', 'notes', 'Title 2', 'First Name 2', 'Last Name 2', 'Title 3', 'First Name 3', 'Last Name 3']] = results

In [56]:
tempindex = more_complicated[more_complicated['First Name 1 '].apply(lambda x: x ==  'James Camble NO 7015 is not on file & in supposed to the number not delivered')].index
results = ['problem', 'James Camble NO 7015 is not on file & in supposed to the number not delivered']
more_complicated.loc[tempindex, ['Title 1', 'First Name 1 ']] = results

tempindex = more_complicated[more_complicated['First Name 1 '].apply(lambda x: x ==  'The rector of christ and st. peters churches')].index
results = ['organization', 'The rector of christ and st. peters churches']
more_complicated.loc[tempindex, ['Title 1', 'First Name 1 ']] = results

tempindex = more_complicated[more_complicated['First Name 1 '].apply(lambda x: x ==  'Jere Kalbey & S Gaston to the Estate John Gaston de')].index
results = ['John', 'Gaston de', 'estate', 'executor', 'Jere', 'Kalbey', 'executor', 'S', 'Gaston']
more_complicated.loc[tempindex, ['First Name 1 ', 'Last Name 1 ', 'notes', 'Title 2', 'First Name 2', 'Last Name 2', 'Title 3', 'First Name 3', 'Last Name 3']] = results

tempindex = more_complicated[more_complicated['First Name 1 '].apply(lambda x: x ==   'Jer Hals by & I Gaston Ex for the Estate of Jn Gaster')].index
results = ['Jn', 'Gaster', 'estate', 'executor', 'Jer', 'Hals', 'executor', 'I', 'Gaston']
more_complicated.loc[tempindex, ['First Name 1 ', 'Last Name 1 ', 'notes', 'Title 2', 'First Name 2', 'Last Name 2', 'Title 3', 'First Name 3', 'Last Name 3']] = results

tempindex = more_complicated[more_complicated['First Name 1 '].apply(lambda x: x ==  'John Smith and James McDonald Guardian of the Heirs of John Gibson')].index
results = ['John', 'Gibson', 'heirs', 'executor', 'John', 'Smith', 'executor', 'James', 'McDonald']
more_complicated.loc[tempindex, ['First Name 1 ', 'Last Name 1 ', 'notes', 'Title 2', 'First Name 2', 'Last Name 2', 'Title 3', 'First Name 3', 'Last Name 3']] = results

tempindex = more_complicated[more_complicated['First Name 1 '].apply(lambda x: x ==  'Ex of William Stadlerman and Pk Lickon Ex of William Stadlerman')].index
results = ['William', 'Stadlerman', 'executor', 'Pk', 'Lickon']
more_complicated.loc[tempindex, ['First Name 1 ', 'Last Name 1 ', 'Title 2', 'First Name 2', 'Last Name 2']] = results

tempindex = more_complicated[more_complicated['First Name 1 '].apply(lambda x: x ==  'Michael Schubert for St Michael and Zeus Corporation')].index
results = ['organization', 'St Michael and Zeus Corporation', 'executor', 'Michael', 'Schubert']
more_complicated.loc[tempindex, ['Title 1', 'First Name 1 ', 'Title 2', 'First Name 2', 'Last Name 2']] = results

tempindex = more_complicated[more_complicated['First Name 1 '].apply(lambda x: x == 'John Steinmetz for Corperetion of F Mich & Lions Cer')].index
results = ['organization', 'Corperetion of F Mich & Lions Cer', 'executor', 'John', 'Steinmetz']
more_complicated.loc[tempindex, ['Title 1', 'First Name 1 ', 'Title 2', 'First Name 2', 'Last Name 2']] = results

tempindex = more_complicated[more_complicated['First Name 1 '].apply(lambda x: x ==  'A and J J Caldwell for EvMitchell')].index
results = ['Ev', 'Mitchell', 'executor', 'A', 'Caldwell', 'executor', 'J J', 'Caldwell']
more_complicated.loc[tempindex, ['First Name 1 ', 'Last Name 1 ', 'Title 2', 'First Name 2', 'Last Name 2', 'Title 3', 'First Name 3', 'Last Name 3']] = results

tempindex = more_complicated[more_complicated['First Name 1 '].apply(lambda x: x == 'Jon Steinmetz for St Michaels and Zions Cerpreatere')].index
results = ['organization', 'St Michaels and Zions Cerpreatere', 'executor', 'Jon', 'Steinmetz']
more_complicated.loc[tempindex, ['Title 1', 'First Name 1 ', 'Title 2', 'First Name 2', 'Last Name 2']] = results

In [57]:
two_names.loc[more_complicated.index] = more_complicated
more_complicated_cleaned = two_names.loc[more_complicated.index]
#two_names.loc[more_complicated.index]

<h2> Handle corrections of and co. </h2>

In [58]:
has_co = two_names[two_names['First Name 1 '].apply(lambda x: x[-2:] == "Co" or "co." in x.lower() or " others " in x.lower() and not betterIsNan(x))]
has_co_names = pd.Series(has_co['First Name 1 '].unique())

In [59]:
has_co['original text'] = has_co['First Name 1 ']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  has_co['original text'] = has_co['First Name 1 ']


In [60]:
capt_sam_filter = [x == 'Capt Samuel Wilman and Co.' for x in has_co['First Name 1 ']]
has_co['First Name 1 '][capt_sam_filter] = "Samuel"
has_co['Last Name 1 '][capt_sam_filter] = "Wilman"
has_co['Title 1'][capt_sam_filter] = "Capt"

paris_bro_filter = [x == 'Paris Brothers and co.' for x in has_co['First Name 1 ']]
has_co['First Name 1 '][paris_bro_filter] = math.nan
has_co['Last Name 1 '][paris_bro_filter] = "Paris"
has_co['notes'][paris_bro_filter] = "brothers"

gs_dewint_filter = [x == 'GS.Dewint & Co' for x in has_co['First Name 1 ']]
has_co['Last Name 1 '][gs_dewint_filter] = "Dewint"
has_co['First Name 1 '][gs_dewint_filter] = "GS."

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  has_co['First Name 1 '][capt_sam_filter] = "Samuel"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  has_co['Last Name 1 '][capt_sam_filter] = "Wilman"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  has_co['Title 1'][capt_sam_filter] = "Capt"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  has_co['First Name 1 

In [61]:
spec_status_names = ['Capt Samuel Wilman and Co.', 'Paris Brothers and co.', 'GS.Dewint & Co']
spec_status_first_names = ['Samuel', math.nan, 'GS.']
post_and_sym = has_co_names.apply(lambda x: x[:x.lower().find('&')] if x not in spec_status_names else x)
post_and = has_co_names.apply(lambda x: x[:x.lower().find('and')] if x not in spec_status_names else x)
#merge results
pre_processing_co_names = pd.Series(y if len(x) >= len(y) else y for x, y in zip(post_and_sym, post_and))
#remove stop words
pre_processing_co_names = pre_processing_co_names.apply(lambda x: x.replace("&", "").replace("Co.", "").replace("co."," ").strip())
pre_processing_co_names = pre_processing_co_names.apply(lambda x: x.replace(" C", "").replace(" other", "").strip())

In [62]:
replace_co = dict(pd.Series(pre_processing_co_names.tolist(), index = has_co_names))
del replace_co[spec_status_names[0]]
del replace_co[spec_status_names[1]]
del replace_co[spec_status_names[2]]

In [63]:
change_names = [x not in spec_status_first_names for x in has_co['First Name 1 ']]

In [64]:
has_co.loc[change_names, 'First Name 1 '] = has_co['First Name 1 '][change_names].apply(lambda x: replace_co[x])
has_co.loc[change_names, 'Last Name 1 '] = [x.split(" ")[-1] for x in has_co['First Name 1 '][change_names]]
has_co.loc[change_names, 'First Name 1 '] = [" ".join(x.split(" ")[:-1]) for x in has_co['First Name 1 '][change_names]]

In [65]:
#special correction
has_co.loc[has_co[has_co['original text'] == 'George Hand & Co'].index, 'Last Name 1 '] = 'Hand'

In [66]:
two_names.loc[has_co.index] = has_co

<h2> Handling Heirs </h2>

In [67]:
def heirPresent(x):
    try:
        return "heir" in x.lower()
    except:
        return False

In [68]:
heirs = two_names[two_names['First Name 1 '].apply(lambda x: heirPresent(x))]
heirs_names = heirs['First Name 1 '].unique()

In [69]:
heirs

Unnamed: 0,State,Year,Month,Day,Title 1,First Name 1,Last Name 1,Title 2,First Name 2,Last Name 2,Title 3,First Name 3,Last Name 3,Face Value,Specie Value,notes,original text,match eligible
40272,6,1777,5,8.0,,Margarett Grant herself and heirs,,,,,,,,500,500.0,,,


In [70]:
heirs.loc[40272, 'First Name 1 '] = "Margaret"
heirs.loc[40272, 'Last Name 1 '] = "Grant"
heirs.loc[40272, 'notes'] = "heirs"

In [71]:
two_names.loc[heirs.index] = heirs
two_names.loc[heirs.index]

Unnamed: 0,State,Year,Month,Day,Title 1,First Name 1,Last Name 1,Title 2,First Name 2,Last Name 2,Title 3,First Name 3,Last Name 3,Face Value,Specie Value,notes,original text,match eligible
40272,6,1777,5,8.0,,Margaret,Grant,,,,,,,500,500.0,heirs,,


In [72]:
filter_out_names.extend(has_co.index.tolist())
filter_out_names.extend(heirs.index.tolist())
filter_out_names.extend(more_complicated.index.tolist())

In [73]:
filter_out_names = list(set(filter_out_names))

<h2> Handling names separated by & and "and" </h2>

In [74]:
and_sep_filter = [x not in filter_out_names for x in two_names.index]

In [75]:
def splitNames(x):
    x = x.replace(" And ", " and ")
    if "&" in x:
        return x.split("&")
    return x.split(" and ")

In [76]:
normal_join = two_names.loc[and_sep_filter]
normal_join['First Name 1 '].apply(lambda x: splitNames(x)).apply(lambda x: len(x)).value_counts()

2    1717
1       2
3       1
Name: First Name 1 , dtype: int64

In [77]:
normal_join['original text'] = normal_join['First Name 1 ']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  normal_join['original text'] = normal_join['First Name 1 ']


In [78]:
joinedNames = normal_join['First Name 1 '].apply(lambda x: splitNames(x))

In [79]:
print(len(normal_join['First Name 1 '].unique()))
#normal_join['First Name 1 '].unique()

139


In [80]:
def parseNames(ind, val):
    if len(val) == 1:
        val = val[0]
        normal_join.loc[ind, 'Last Name 1 '] = val.split(" ")[-1].strip()
        normal_join.loc[ind, 'First Name 1 '] = " ".join(val.split(" ")[:-1]).strip()
    elif len(val) == 2:
        name1 = val[0].strip().split(" ")
        name2 = val[1].strip().split(" ")
        #normal handling
        if len(name1) == 1 and len(name2) == 1:
            
            normal_join.loc[ind, 'notes'] = normal_join.loc[ind, 'First Name 1 ']
            
            normal_join.loc[ind, 'Last Name 1 '] = name1[0]
            normal_join.loc[ind, 'First Name 1 '] = math.nan

            normal_join.loc[ind, 'Last Name 2'] = name2[0]
            normal_join.loc[ind, 'First Name 2'] = math.nan
        elif len(name1) == 1:
            normal_join.loc[ind, 'notes'] = normal_join.loc[ind, 'First Name 1 ']
            if 'Van ' in name2:
                normal_join.loc[ind, 'Last Name 1 '] = name1[0]
                normal_join.loc[ind, 'First Name 1 '] = np.nan

                normal_join.loc[ind, 'Last Name 2'] = " ".join(name2)
            else:
                normal_join.loc[ind, 'Last Name 1 '] = name2[-1].strip()
                normal_join.loc[ind, 'First Name 1 '] = name1[0]

                normal_join.loc[ind, 'Last Name 2'] = name2[-1].strip()
                normal_join.loc[ind, 'First Name 2'] = " ".join(name2[:-1])
        elif len(name2) == 1:
            normal_join.loc[ind, 'notes'] = normal_join.loc[ind, 'First Name 1 ']
            endings = set(['Comp.', 'son', 'ex', 'others', 'Sons', 'Son'])
            if 'Van ' in name1:
                normal_join.loc[ind, 'Last Name 1 '] = " ".join(name1)
                normal_join.loc[ind, 'First Name 1 '] = math.nan
                normal_join.loc[ind, 'Last Name 2'] = " ".join(name2)
                normal_join.loc[ind, 'First Name 2'] = math.nan
            elif len(endings.intersection(set(name2))) > 0:
                normal_join.loc[ind, 'Last Name 1 '] = name1[-1].strip()
                normal_join.loc[ind, 'First Name 1 '] = " ".join(name1[:-1])
            else:
                normal_join.loc[ind, 'First Name 1 '] = math.nan
                normal_join.loc[ind, 'Last Name 1 '] = name1[0].strip()
                normal_join.loc[ind, 'Last Name 2'] = name1[1].strip()
                normal_join.loc[ind, 'Last Name 3'] = name2[0].strip()
        else:
            normal_join.loc[ind, 'Last Name 1 '] = name1[-1].strip()
            normal_join.loc[ind, 'First Name 1 '] = " ".join(name1[:-1])
            
            normal_join.loc[ind, 'Last Name 2'] = name2[-1].strip()
            normal_join.loc[ind, 'First Name 2'] = " ".join(name2[:-1])
    else:
        name1 = val[0].split(" ")
        name2 = val[1].split(" ")
        name3 = val[2].split(" ")
        
        normal_join.loc[ind, 'Last Name 1 '] = name1[-1].strip()
        normal_join.loc[ind, 'First Name 1 '] = " ".join(name1[:-1])
            
        normal_join.loc[ind, 'Last Name 2'] = name2[-1].strip()
        normal_join.loc[ind, 'First Name 2'] = " ".join(name2[:-1])
          
        normal_join.loc[ind, 'Last Name 3'] = name3[-1].strip()
        normal_join.loc[ind, 'First Name 3'] = " ".join(name3[:-1])

In [81]:
for ind, val in joinedNames.items():
    parseNames(ind, val)

In [82]:
two_names.loc[normal_join.index] = normal_join
normal_join_cleaned = two_names.loc[normal_join.index]
#two_names.loc[normal_join.index]

In [83]:
#manual edits

In [84]:
normal_join_cleaned.loc[normal_join_cleaned[normal_join_cleaned['original text'] == 'Sam Ely and Jn Gallaher Esq'].index,
                        ['Title 2','First Name 2', 'Last Name 2']] = ['Esq', 'Jn', 'Gallaher']
normal_join_cleaned.loc[normal_join_cleaned[normal_join_cleaned['original text'] == 'Simon Dreisbach For Estates and John fox'].index,
                        ['First Name 1 ', 'Last Name 1 ']] = ['Simon', 'Dreisbach']

In [85]:
weird_first_names.loc[two_names.index] = two_names

In [86]:
filter_out_names.extend(normal_join.index)

In [87]:
filter_out_names.extend(two_names.index.tolist())
filter_out_names = list(set(filter_out_names))

<h3> Handling treasurers </h3>

In [88]:
#society's treasurer lyme - just society 
#get organization name from words after treasurer

In [89]:
def testString(string, match):
    try:
        return match in string.lower()
    except:
        return False

In [90]:
treasurer = weird_first_names[weird_first_names['First Name 1 '].apply(lambda x: testString(x, "treas") and x not in filter_out_names)]
treasurer.shape

(70, 18)

In [91]:
filter_out_names.extend(treasurer['First Name 1 '].unique().tolist())
treasurer['First Name 1 '].unique()

array(['Ebenezer Storer Treasurer H College',
       'Job Cushing Treasurer for Shrewsbury',
       'Jos Richards Treas 2nd Precinct Roxbury',
       'Dan Thurston Treasurer Church in Franklin',
       'Simeon Howard Treasurer Convention of Ministers',
       'Eli Root Treasurer of Pittsfield', "Society's Treasurer Lyme"],
      dtype=object)

In [92]:
treasurer['original text'] = treasurer['First Name 1 ']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  treasurer['original text'] = treasurer['First Name 1 ']


In [93]:
#some long code handling treasurer replacements

In [94]:
tempindex = treasurer[treasurer['First Name 1 '].apply(lambda x: x ==  'Ebenezer Storer Treasurer H College')].index
results = ['organization', 'H College', 'treasurer', 'Ebenezer', 'Storer']
treasurer.loc[tempindex, ['Title 1', 'First Name 1 ', 'Title 2', 'First Name 2', 'Last Name 2']] = results

tempindex = treasurer[treasurer['First Name 1 '].apply(lambda x: x ==  'Job Cushing Treasurer for Shrewsbury')].index
results = ['organization', 'Shrewsbury', 'treasurer', 'Job', 'Cushing']
treasurer.loc[tempindex, ['Title 1', 'First Name 1 ', 'Title 2', 'First Name 2', 'Last Name 2']] = results

tempindex = treasurer[treasurer['First Name 1 '].apply(lambda x: x ==  'Jos Richards Treas 2nd Precinct Roxbury')].index
results = ['organization', '2nd Precinct Roxbury', 'treasurer', 'Jos', 'Richards']
treasurer.loc[tempindex, ['Title 1', 'First Name 1 ', 'Title 2', 'First Name 2', 'Last Name 2']] = results

tempindex = treasurer[treasurer['First Name 1 '].apply(lambda x: x ==  'Dan Thurston Treasurer Church in Franklin')].index
results = ['organization', 'Church in Franklin', 'treasurer', 'Dan', 'Thurston']
treasurer.loc[tempindex, ['Title 1', 'First Name 1 ', 'Title 2', 'First Name 2', 'Last Name 2']] = results

tempindex = treasurer[treasurer['First Name 1 '].apply(lambda x: x ==  'Simeon Howard Treasurer Convention of Ministers')].index
results = ['organization', 'Convention of Ministers', 'treasurer', 'Simeon', 'Howard']
treasurer.loc[tempindex, ['Title 1', 'First Name 1 ', 'Title 2', 'First Name 2', 'Last Name 2']] = results

tempindex = treasurer[treasurer['First Name 1 '].apply(lambda x: x ==  'Eli Root Treasurer of Pittsfield')].index
results = ['organization', 'Pittsfield', 'treasurer', 'Eli', 'Root']
treasurer.loc[tempindex, ['Title 1', 'First Name 1 ', 'Title 2', 'First Name 2', 'Last Name 2']] = results

tempindex = treasurer[treasurer['First Name 1 '].apply(lambda x: x ==  "Society's Treasurer Lyme")].index
results = ['organization', 'Society', 'treasurer', 'Lyme']
treasurer.loc[tempindex, ['Title 1', 'First Name 1 ', 'Title 2', 'Last Name 2']] = results

In [95]:
weird_first_names.loc[treasurer.index] = treasurer
treasurer_cleaned = weird_first_names.loc[treasurer.index]
#weird_first_names.loc[treasurer.index]

In [96]:
filter_out_names.extend(treasurer_cleaned.index.tolist())

<h2> Handling everything else </h2>

In [97]:
other_cases = weird_first_names[pd.Series(weird_first_names.index).apply(lambda x: x not in filter_out_names).values]
other_cases.shape

(1662, 18)

In [98]:
#len(other_cases['First Name 1 '].unique())

In [99]:
nlp = spacy.load("en_core_web_sm")
other_fixed_names = []
for entity in other_cases['First Name 1 ']:
    doc = nlp(entity)
    if len(doc.ents) == 1:
        ent = doc.ents[0]
        if ent.label_ == "PERSON" and ent.text == entity:
            other_fixed_names.append(ent.text)
        elif ent.label == "ORG" and ent.text == entity:
            other_fixed_names.append(ent.text)
        else:
            other_fixed_names.append(False)
    elif len(doc.ents) == 2:
        ent = doc.ents[0]
        ent2 = doc.ents[1]
        if ent.label_ == "Person" and ent2.label_ == "Person" and (ent.text + " " + ent2.text) == entity:
            other_fixed_names.append(ent.text + " " + ent2.text)
        else:
            other_fixed_names.append(False)
    else:
        other_fixed_names.append(False)

In [100]:
#what is william thomas exos james ecum
print(sum([1 if x is False else 0 for x in other_fixed_names]))
#set(other_fixed_names)

435


<h3> Proper names that are just misplaced </h3>

In [101]:
indices = [not x if x is False else False for x in other_fixed_names]
indices_neg = [not x for x in indices]

In [102]:
swap_names = pd.Series(other_fixed_names)[indices_neg]

In [103]:
other_cases.loc[indices_neg, 'First Name 1 '] = swap_names
other_cases.loc[indices_neg, 'Last Name 1 '] = [x.split(" ")[-1] for x in swap_names]
other_cases.loc[indices_neg, 'First Name 1 '] = [" ".join(x.split(" ")[:-1]) for x in swap_names]

In [104]:
def removeStringDuplicates(string):
    return " ".join(sorted(set(string.split()), key=string.split().index))

In [105]:
weird_first_names.loc[other_cases[indices_neg].index] = other_cases.loc[indices_neg]
weird_first_names.loc[other_cases[indices_neg].index, 'First Name 1 '] = weird_first_names.loc[other_cases[indices_neg].index]['First Name 1 '].apply(lambda x: removeStringDuplicates(x))

In [106]:
weird_first_names.loc[other_cases[indices_neg].index]

Unnamed: 0,State,Year,Month,Day,Title 1,First Name 1,Last Name 1,Title 2,First Name 2,Last Name 2,Title 3,First Name 3,Last Name 3,Face Value,Specie Value,notes,original text,match eligible
10301,2,1779,1,28.0,,John Davis,Williams,,,,,,,500,58.791670,,,
34954,5,1779,3,31.0,,John Treat,Crane,,,,,,,500,45.423610,,,
35442,5,1779,4,21.0,,Samuel Wright,Hortsham,,,,,,,400,33.983330,,,
35798,5,1779,5,4.0,,Abraham,Haring,,,,,,,400,32.600000,,,
35805,5,1779,5,4.0,,Abraham,Haring,,,,,,,300,24.450000,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80895,9,1780,2,1.0,,Jesse,Taylor,,,,,,,300,9.020833,,,
80906,9,1779,3,19.0,,William,Bradley,,,,,,,1000,94.402780,,,
80907,9,1779,3,19.0,,Samuel,Oldham,,,,,,,1000,94.402780,,,
80908,9,1780,1,1.0,,John,Hay,,,,,,,400,13.616670,,,


<h2> Handling other weird names/organizations </h2>

In [107]:
other_cases_fixed = pd.read_csv('../Data/export_weird_names.csv')

In [108]:
index_indices = other_cases[indices].index

In [109]:
for ind in index_indices:
    fname = other_cases.loc[ind, 'First Name 1 ']
    series = other_cases_fixed[other_cases_fixed['original name'].apply(lambda x: x == fname)].drop('original name', axis = 1).values.flatten().tolist()
    other_cases.loc[ind, ['Title 1','First Name 1 ','Last Name 1 ','Title 2','First Name 2','Last Name 2',
                          'Title 3','First Name 3','Last Name 3','notes','original text']] = series      

In [110]:
other_cases[indices]

Unnamed: 0,State,Year,Month,Day,Title 1,First Name 1,Last Name 1,Title 2,First Name 2,Last Name 2,Title 3,First Name 3,Last Name 3,Face Value,Specie Value,notes,original text,match eligible
5804,2,1778,1,1.0,organization,The Navy Board Eastern Department,,,,,,,,600,411.466700,,The Navy Board Eastern Department,
5805,2,1778,1,1.0,organization,The Navy Board Eastern Department,,,,,,,,600,411.466700,,The Navy Board Eastern Department,
5806,2,1778,1,1.0,organization,The Navy Board Eastern Department,,,,,,,,500,342.888900,,The Navy Board Eastern Department,
5807,2,1778,1,1.0,organization,The Navy Board Eastern Department,,,,,,,,500,342.888900,,The Navy Board Eastern Department,
5808,2,1778,1,1.0,organization,The Navy Board Eastern Department,,,,,,,,500,342.888900,,The Navy Board Eastern Department,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80855,9,1779,3,29.0,,Michl,Stump,,,,,,,300,27.429170,,Michl Stump,
80856,9,1779,3,29.0,,Michl,Stump,,,,,,,800,73.144440,,Michl Stump,
80858,9,1779,5,16.0,,Captn C,Mossby,,,,,,,49,3.843097,,Captn C Mossby,
80860,9,1779,3,9.0,,Fras,Hobday,,,,,,,140,13.646110,,Fras Hobday,


In [111]:
weird_first_names.loc[other_cases.index] = other_cases
other_cases_cleaned = weird_first_names.loc[other_cases.index]

## Move weird_first_names back to laons

In [112]:
loans.loc[weird_first_names.index] = weird_first_names

In [113]:
#add additional row that is the same as the first when last name 2 applies
#add index indicating shared

In [114]:
(loans[[not x for x in loans['Last Name 2'].apply(betterIsNan)]][['First Name 1 ', 'Last Name 1 ', 'First Name 2', 'Last Name 2']].drop_duplicates()).shape

(373, 4)

In [115]:
loans[[not x for x in loans['Last Name 2'].apply(betterIsNan)]][['First Name 1 ', 'Last Name 1 ', 'First Name 2', 'Last Name 2']].shape

(3109, 4)

In [116]:
#checking if there are any names in first name column that are two first names
#check confirmed - also shows that there are no NA first name or last name columns
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    display(loans[[not x for x in loans['Last Name 2'].apply(betterIsNan)]][['First Name 1 ', 'Last Name 1 ', 'First Name 2', 'Last Name 2']].drop_duplicates())

Unnamed: 0,First Name 1,Last Name 1,First Name 2,Last Name 2
2253,Abraham,Livingston,William,Turnbull
2428,,Lee,,Jones
2759,,Clark,,Nightingale
2822,,Pitts,,Call
3644,,Bryant,,Dennie
3749,Sam,Hasting,Sam,Austin
3807,,Hubbart,,Greene
4000,Sarah,Green,John,Gray
4017,John,Gray,Thomas,Dawes
4233,,Hunt,,Sherburne


## Final Manual Replacement

In [117]:
repl_index = loans[loans['First Name 1 '].apply(lambda x: not pd.isnull(x) and x == 'Peter & Anna Maricha')].index
loans.loc[repl_index, 'original text'] = 'Peter & Anna Maricha Jay'
loans.loc[repl_index, ['First Name 1 ', 'Last Name 1 ', 'First Name 2', 'Last Name 2']] = ['Peter', 'Jay', 'Anna', 'Maricha Jay']
repl_index = loans[loans['First Name 1 '].apply(lambda x: not pd.isnull(x) and x == 'Myndert & Mary')].index
loans.loc[repl_index, 'original text'] = 'Myndert & Mary Van Schaick'
loans.loc[repl_index, ['First Name 1 ', 'Last Name 1 ', 'First Name 2', 'Last Name 2']] = ['Myndert', 'Van Schaick', 'Mary', 'Van Schaick']

In [118]:
repl_index = loans[[not pd.isnull(x) and '&' in x and pd.isnull(y) for x, y in zip(loans['First Name 1 '], loans['Title 1'])]].index
loans.loc[repl_index, 'original text'] = [x + " " + y for x, y in zip(loans.loc[repl_index, 'First Name 1 '], loans.loc[repl_index, 'Last Name 1 '])]
loans.loc[repl_index, ['Last Name 2']] = loans.loc[repl_index, 'Last Name 1 ']
loans.loc[repl_index, ['Last Name 1 ']] = loans.loc[repl_index, 'First Name 1 '].apply(lambda x: x.replace("&", "")).tolist()
loans.loc[repl_index, ['First Name 1 ']] = [np.nan] * len(repl_index)

In [119]:
repl_index = loans[loans['Last Name 1 '].apply(lambda x: not pd.isnull(x) and '&' in x)].index
loans.loc[repl_index, 'original text'] = [x + " " + y for x, y in zip(loans.loc[repl_index, 'First Name 1 '], loans.loc[repl_index, 'Last Name 1 '])]
loans.loc[repl_index, ['First Name 1 ']] = 'Samuel'
loans.loc[repl_index, ['Last Name 1 ']] = 'Grant'
loans.loc[repl_index, ['notes']] = 'son'

In [120]:
repl_index = loans[loans['Last Name 1 '].apply(lambda x: x == 'Gordon Treasurer to the Convention of Ministers')].index
loans.loc[repl_index, ['Title 1', 'Last Name 1 ', 'notes', 'original text']] = ['Treasurer', 'Gordon', 'Convention of Ministers', 
                                                                               'William Gordon Treasurer to the Convention of Ministers']
repl_index = loans[loans['Last Name 1 '].apply(lambda x: x == 'Thurston Treas Church Wrentham')].index
loans.loc[repl_index, ['Title 1', 'Last Name 1 ', 'notes', 'original text']] = ['Treasurer', 'Thurston', 'Church Wrentham', 
                                                                                'Daniel Thurston Treas Church Wrentham']
repl_index = loans[loans['Last Name 1 '].apply(lambda x: x == 'Biglow Guardian to her Children')].index
loans.loc[repl_index, ['Title 1', 'Last Name 1 ', 'notes', 'original text']] = ['Guardian', 'Biglow', 'children', 
                                                                                'Anna Biglow Guardian to her Children']
repl_index = loans[loans['Last Name 1 '].apply(lambda x: x == 'Miller Treas Westminster')].index
loans.loc[repl_index, ['Title 1', 'Last Name 1 ', 'notes', 'original text']] = ['Treas', 'Miller', 'Westminster', 
                                                                                'Joseph Miller Treas Westminster']
repl_index = loans[loans['Last Name 1 '].apply(lambda x: x == 'Allen Executor to B Winchester')].index
loans.loc[repl_index, ['Title 1', 'Last Name 1 ', 'First Name 2', 'Last Name 2', 'original text']] = ['Executor', 'Allen', 'B', 'Winchester', 
                                                                                                      'Joseph Allen Executor to B Winchester']
repl_index = loans[loans['Last Name 1 '].apply(lambda x: x == 'Estates')].index 
loans.loc[repl_index, ['First Name 1 ', 'Last Name 1 ', 'notes']] = ['Simon', 'Dreisbach', 'Estates']

In [121]:
repl_index = loans[loans['original text'].apply(lambda x: x == 'Justus of the academy of New York')].index 
loans.loc[repl_index, 'First Name 1 '] = 'Justus of the academy of New York'
repl_index = loans[loans['First Name 1 '].apply(lambda x: x == 'Corperetion of F Mich & Lions Cer')].index 
loans.loc[repl_index, ['First Name 1 ', 'Last Name 1 ', 'Title 2', 'First Name 2', 'Last Name 2', 'Title 3', 'First Name 3', 'Last Name 3']] = ['F', 'Mich', np.nan, 'Lions', 'Cer', 'executor', 'John', 'Steinmetz']
repl_index = loans[loans['original text'].apply(lambda x: x == 'Gilbert Hammond and Cornelius Tommand')].index 
loans.loc[repl_index, ['First Name 1 ', 'Last Name 1 ', 'First Name 2', 'Last Name 2', 'original text']] = ['Gilbert', 'Hammond', 'Cornelius', 'Tommand', 'Sarah Charman executor for Gilbert Hammond and Cornelius Tommand']
repl_index = loans[loans['original text'].apply(lambda x: not pd.isnull(x) and 'Christian Weedman & Samuel Harris Esq Mathia Birkly' in x)].index
loans.loc[repl_index, ['First Name 2', 'Last Name 2']] = ['Samuel', 'Harris']

In [122]:
tempindex = loans[loans['First Name 1 '] == 'Catherine Elizabeth and Sarah'].index
loans.loc[tempindex, ['First Name 1 ', 'First Name 2', 
                      'Last Name 2', 'First Name 3', 
                      'Last Name 3', 'original text']] = ['Catherine', 'Elizabeth', 'Depeyster', 
                                                          'Sarah', 'Depeyster', 'Catherine Elizabeth and Sarah Depeyster']
tempindex = loans[loans['First Name 1 '] == 'Samuel and John'].index
loans.loc[tempindex, ['First Name 1 ', 'First Name 2', 
                      'Last Name 2', 'original text']] = ['Samuel', 'John', 'HansDelap', 
                                                          'Samuel and John']
tempindex = loans[loans['First Name 1 '] == 'Sam and Rob'].index
loans.loc[tempindex, ['First Name 1 ', 'First Name 2', 
                      'Last Name 2', 'original text']] = ['Sam', 'Rob', 'Paweance', 
                                                          'Sam and Rob']
tempindex = loans[loans['First Name 1 '] == 'Robert and John'].index
loans.loc[tempindex, ['First Name 1 ', 'First Name 2', 
                      'Last Name 2', 'original text']] = ['Robert', 'John', 'Wayne', 
                                                          'Robert and John']
tempindex = loans[loans['First Name 1 '] == 'Levin and Jos'].index
loans.loc[tempindex, ['First Name 1 ', 'First Name 2', 
                      'Last Name 2', 'original text']] = ['Levin', 'Jos', 'Derickson', 
                                                          'Levin and Jos']

## Handle Businesses

In [123]:
business_names_raw = loans[[pd.isnull(x) and pd.isnull(y) and pd.isnull(z) for x, y, z in zip(loans['First Name 1 '], loans['First Name 2'], loans['First Name 3'])]]

In [124]:
business_names_raw['original text'].apply(lambda x: x.replace(' and ', ' & ') if not pd.isnull(x) else x)

374                      NaN
414                      NaN
492                      NaN
523                      NaN
524                      NaN
                ...         
80901         Doerner & Mark
80902         Doerner & Mark
80903         Doerner & Mark
80904      Mitchell & Oldham
80912    Baker Blow & Oldham
Name: original text, Length: 1603, dtype: object

In [125]:
business_names_raw[['original text', 'Last Name 1 ', 'Last Name 2', 'Last Name 3']].drop_duplicates().to_csv("company_names_raw.csv", index = False)

In [126]:
business_names_cleaned = pd.read_csv("company_names_clean.csv")
business_names_cleaned = business_names_cleaned[business_names_cleaned.columns[:7]]

In [127]:
business_names_cleaned

Unnamed: 0,Record,Title,First Name 1,Last Name 1,First Name 2,Last Name 2,Notes
0,Col Pierce,Colonel,Samuel,Pierce,,,https://www.historicnewengland.org/explore/col...
1,Halsey,Colonel,Jeremiah,Halsey,,,https://www.geni.com/people/Col-Jeremiah-Halse...
2,Dr Wetherspoon,,John,Witherspoon,,,
3,Lee & Jones,,,Lee,,Jones,
4,Clark & Nightingale,,John,Clark,Joseph,Nightingale,https://www.brown.edu/academics/public-humanit...
...,...,...,...,...,...,...,...
83,Milner & Haynes,,,Milner,,Haynes,
84,Mitchell & Graham,,,Mitchell,,Graham,
85,Smith & Gatewood,,,Smith,,Gatewood,
86,Doerner & Mark,,,Doerner,,Mark,


In [128]:
for ind in business_names_cleaned.index:
    record = business_names_cleaned.loc[ind]['Record']
    loc_index = loans[loans['original text'] == record].index
    data = business_names_cleaned.loc[ind][['Title', 'First Name 1 ', 'Last Name 1 ', 'First Name 2 ', 'Last Name 2 ', 'Notes']].tolist()
    loans.loc[loc_index, ['Title 1', 'First Name 1 ', 'Last Name 1 ', 'First Name 2', 'Last Name 2', 'notes']] = data

In [129]:
loans

Unnamed: 0,State,Year,Month,Day,Title 1,First Name 1,Last Name 1,Title 2,First Name 2,Last Name 2,Title 3,First Name 3,Last Name 3,Face Value,Specie Value,notes,original text,match eligible
0,1,1778,3,13.0,Col,Joshua,Wentworth,,,,,,,200,108.27780,,,
1,1,1777,9,2.0,,Charles,Treadwell,,,,,,,200,199.37780,,,
2,1,1777,9,10.0,,Stephen,Cleverly,,,,,,,200,194.51110,,,
3,1,1777,9,13.0,,David,Griffith,,,,,,,200,192.71110,,,
4,1,1777,9,15.0,,John,Mansfield,,,,,,,200,191.52220,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80908,9,1780,1,1.0,,John,Hay,,,,,,,400,13.61667,,,
80909,9,1779,5,25.0,,Isaac,Smith,,Thoroughgood,Smith,,,,300,22.86250,Isaac & Thoroughgood Smith,Isaac & Thoroughgood Smith,
80910,9,1779,5,25.0,,Isaac,Smith,,Thoroughgood,Smith,,,,800,60.96667,Isaac & Thoroughgood Smith,Isaac & Thoroughgood Smith,
80911,9,1779,3,13.0,,Samuel,Oldham,,,,,,,500,48.11806,,,


In [130]:
loans.to_csv('../Data/Pre1790/cleaned/loan_office_certificates_9_states_cleaned.csv')

# CLEAN MARINE CERTIFICATES

In [131]:
marine = pd.read_excel("../Data/Pre1790/Marine_Liquidated_Debt_Certificates.xlsx", header = 11)
cols = marine.columns = ["Page","JPEG number","Number","Letter","Date of the Certificate: Month",
                         "Date of the Certificate: Day","Date of the Certificate: Year","First name","Last name",
                         "Title","Time when the debt became due: Month","Time when the debt became due: Day",
                         "Time when the debt became due: Year","Dollars","90th","Total Dollars_1","Total Dollars_2",
                         "Line Strike Thorugh: Yes?","Line Strike Thorugh: Note","Notes"]
marine.columns = cols

  warn(msg)


In [132]:
marine['original text'] = np.nan
marine['cleaning notes'] = np.nan
marine['first name 2'] = np.nan
marine['last name 2'] = np.nan

In [133]:
repl_index = marine[marine['First name'].apply(lambda x: not pd.isnull(x) and 'dee' in x.lower())].index
marine.loc[repl_index, ['First name', 'Last name', 'cleaning notes', 'original text']] = ['John', 'Young', 'estate, deceased', 
                                                                                          'The Estate of John Young dee The Estate of John Young Deceased']

In [134]:
repl_index = marine[marine['First name'].apply(lambda x: not pd.isnull(x) and 'deceased' in x.lower())].index
marine.loc[repl_index, ['cleaning notes']] = ['estate, deceased']
marine.loc[repl_index, ['original text']] = marine['First name']
marine.loc[repl_index, ['First name']] = marine.loc[repl_index, 'First name'].apply(lambda x: x.lower().replace("estate of", "").replace("deceased", "").strip())

marine.loc[repl_index, ['Last name']] = marine.loc[repl_index, 'First name'].apply(lambda x: x.split(" ")[1])
marine.loc[repl_index, ['First name']] = marine.loc[repl_index, 'First name'].apply(lambda x: x.split(" ")[0])

In [135]:
repl_index = marine[marine['First name'].apply(lambda x: not pd.isnull(x) and 'estate' in x.lower())].index
marine.loc[repl_index, ['cleaning notes']] = ['estate']
marine.loc[repl_index, ['original text']] = marine['First name']
marine.loc[repl_index, ['First name']] = marine.loc[repl_index, 'First name'].apply(lambda x: x.lower().replace("estate", "").replace("the", "").replace("of","").strip())
marine.loc[repl_index, ['Last name']] = marine.loc[repl_index, 'First name'].apply(lambda x: x.split(" ")[-1])
marine.loc[repl_index, ['First name']] = marine.loc[repl_index, 'First name'].apply(lambda x: " ".join(x.split(" ")[:-1]))

In [136]:
repl_index = marine[marine['First name'].apply(lambda x: x == 'Moses Bush & Sons')].index
marine.loc[repl_index, ['First name', 'Last name', 'cleaning notes', 'original text']] = ['Moses', 'Bush', 'sons', 'Moses Bush & Sons']
repl_index = marine[marine['First name'].apply(lambda x: x == 'J Mc Nesbitt & Co')].index
marine.loc[repl_index, ['First name', 'Last name', 'cleaning notes', 'original text']] = ['J Mc', 'Nesbitt', 'Co', 'J Mc Nesbitt & Co']

In [137]:
repl_index = marine[marine['First name'].apply(lambda x: x == 'Hoov and Harrison')].index
marine.loc[repl_index, ['First name', 'Last name', 'last name 2', 'original text']] = [np.nan, 'Hoov', 'Harrison', 'Hoov and Harrison']

In [138]:
repl_index = marine[marine['Last name'].apply(lambda x: not pd.isnull(x) and 'deceased' in x.lower())].index
marine.loc[repl_index, 'cleaning notes'] = 'deceased'
marine.loc[repl_index, ['Last name']] = marine.loc[repl_index, 'Last name'].apply(lambda x: x.lower().replace("Deceased", "").strip())

In [139]:
repl_index = marine[marine['Last name'].apply(lambda x: not pd.isnull(x) and 'Weaver deed' == x)].index
marine.loc[repl_index, 'cleaning notes'] = 'deed'
marine.loc[repl_index, ['Last name']] = ['Weaver']

In [140]:
marine.to_csv('../Data/Pre1790/cleaned/Marine_Liquidated_Debt_Certificates_cleaned.csv')

# CLEAN PIERCE CERTIFICATES

In [141]:
pierce = pd.read_excel("../Data/Pre1790/Pierce_Certs_cleaned_2019.xlsx")

In [142]:
pierce = pierce[pierce['First'].apply(lambda x: type(x) != bool)]

In [143]:
repl_index = pierce[pierce['Last'].apply(lambda x: type(x) == bool)].index
pierce.loc[repl_index, 'Last'] = "True"

In [144]:
pierce = pierce[pierce['Last'].apply(lambda x: not pd.isnull(x))]

In [145]:
pierce['First'] = pierce['First'].apply(lambda x: x.strip() if not pd.isnull(x) else x)
pierce['Last'] = pierce['Last'].apply(lambda x: x.strip() if not pd.isnull(x) else x)

In [146]:
pierce['Last 2'] = np.nan
pierce['First 2'] = np.nan
pierce['original text'] = np.nan

In [147]:
repl_index = pierce[pierce['First'].apply(lambda x: x == 'P. & J.')].index
pierce.loc[repl_index, ['First', 'Last', 'First 2', 'Last 2', 'original text']] = ['P.', 'Bemant', 'J.', 'Porter', 'P. & J. Bemant &. Porter']
repl_index = pierce[pierce['First'].apply(lambda x: x == 'S & Delano')].index
pierce.loc[repl_index, ['First', 'Last', 'First 2', 'Last 2', 'original text']] = ['S', 'Darting', 'Delano', 'Darting', 'S & Delano Darting']
repl_index = pierce[pierce['First'].apply(lambda x: x == 'Benjamin & Donnelly')].index
pierce.loc[repl_index, ['Last', 'Last 2', 'original text']] = ['Benjamin', 'Donnelly', 'Benjamin & Donnelly X']
repl_index = pierce[pierce['First'].apply(lambda x: x == 'Thomas G Jr.')].index
pierce.loc[repl_index, ['First', 'Last', 'original text']] = ['Thomas G', 'Alford Jr.', 'Thomas G Jr. Alford']
repl_index = pierce[pierce['First'].apply(lambda x: x == 'Thomas G Sr.')].index
pierce.loc[repl_index, ['First', 'Last', 'original text']] = ['Thomas G', 'Alford Sr.', 'Thomas G Sr. Alford']
repl_index = pierce[pierce['First'].apply(lambda x: x == 'Tho G Jr.')].index
pierce.loc[repl_index, ['First', 'Last', 'original text']] = ['Tho G', 'Alvord Jr.', 'Tho G Jr. Alvord']
repl_index = pierce[pierce['First'].apply(lambda x: x == 'Tho G Sr.')].index
pierce.loc[repl_index, ['First', 'Last', 'original text']] = ['Tho G', 'Alvord Sr.', 'Tho G Sr. Alvord']
repl_index = pierce[pierce['First'].apply(lambda x: x == 'John (see Berrick)')].index
pierce.loc[repl_index, ['First', 'Last', 'original text']] = ['John', 'Berrick', 'John (see Berrick) Benrick']
repl_index = pierce[pierce['First'].apply(lambda x: x == 'James (alias Cady)')].index
pierce.loc[repl_index, ['First', 'original text']] = ['James', 'James (alias Cady)']
repl_index = pierce[pierce['First'].apply(lambda x: x == 'John F (?)')].index
pierce.loc[repl_index, ['First', 'original text']] = ['John F', 'John F (?) Conrad']
repl_index = pierce[pierce['First'].apply(lambda x: x == 'M. for J. Jones')].index
pierce.loc[repl_index, ['First', 'original text']] = ['J.', 'M. for J. Jones Jones']
repl_index = pierce[pierce['First'].apply(lambda x: x == 'William And Lewis')].index
pierce.loc[repl_index, ['First', 'Last', 'First 2', 'Last 2','original text']] = ['William', 'Rice', 'Lewis', 'Rice', 'William And Lewis Rice']
repl_index = pierce[pierce['First'].apply(lambda x: x == 'alias Hodge S')].index
pierce.loc[repl_index, ['First', 'Last','original text']] = ['Hodge S', 'Rollins', 'alias Hodge S Rollins']
repl_index = pierce[pierce['CN'].apply(lambda x: x == 84601)].index
pierce.loc[repl_index, ['First', 'Last', 'Last 2','original text']] = [np.nan, 'Benjamin', 'Donnelly', 'Benjamin & Donnelly Benjamin']
repl_index = pierce[pierce['First'].apply(lambda x: x == 'John P Jr.')].index
pierce.loc[repl_index, ['First', 'Last','original text']] = ['John P', 'Salter Jr.', 'John P Jr. Salter']
repl_index = pierce[pierce['First'].apply(lambda x: x == 'John P Sr.')].index
pierce.loc[repl_index, ['First', 'Last','original text']] = ['John P', 'Salter Sr.', 'John P Sr. Salter']

In [148]:
pierce[pierce['Last'].apply(lambda x: not pd.isnull(x) and len(x.split(" ")) > 2)]

Unnamed: 0,CN,Last,First,Value,Group,To Whom Issued,State,Officer,Last 2,First 2,original text
7167,26818,Blanchard & Russell,,375.0,40.0,Officers paid to November 1783,,1.0,,,
16503,65707,Code (or Coad),William,40.6,81.0,First New York Regiment Col. G. Van Shaick; M...,NY,0.0,,,
21787,1682,De La Mater,John,300.0,9.0,Officers under Gen. Armand paid to December ...,,1.0,,,
21788,66542,De la Mater,John,112.6,81.0,First New York Regiment Col. G. Van Shaick; M...,NY,0.0,,,
21789,33666,De La Rouerie,A. Ma,3724.33,49.0,Officers foreign paid to November 15 1783,F,1.0,,,
21790,33618,De La Rouerie,Amand,1306.83,48.0,Officers foreign paid to November 15 1783,F,1.0,,,
21791,33642,De La Rouerie,B. G. A.,905.66,48.0,Officers foreign paid to November 15 1783,F,1.0,,,
64458,67902,Peffer (or Pepper),Ge,33.3,85.0,Tenth (old) Pennsylvania Regiment paid to Jan...,PA,0.0,,,
64654,67902,Pepper (or Peffer),Ge,33.3,85.0,Tenth (old) Pennsylvania Regiment paid to Jan...,PA,0.0,,,
84977,65918,Van De Bogart,Nichol,44.66,81.0,First New York Regiment Col. G. Van Shaick; M...,NY,0.0,,,


In [149]:
repl_index = pierce[pierce['Last'].apply(lambda x: x == 'Blanchard & Russell')].index
pierce.loc[repl_index, ['Last', 'Last 2', 'original text']] = ['Blanchard', 'Russell', 'Blanchard & Russell']
repl_index = pierce[pierce['Last'].apply(lambda x: x == 'Code (or Coad)')].index
pierce.loc[repl_index, ['Last', 'First 2', 'Last 2', 'original text']] = ['Code', 'William', 'Coad', 'William Code (or Coad)']
repl_index = pierce[pierce['Last'].apply(lambda x: x == 'Peffer (or Pepper)')].index
pierce.loc[repl_index, ['Last', 'First 2', 'Last 2', 'original text']] = ['Peffer', 'Ge', 'Pepper', 'Ge Peffer (or Pepper)']

In [150]:
ind = pierce[pierce['Last'] == 'Pepper (or Peffer)'].index
pierce.drop(ind, inplace = True)

In [151]:
pierce.to_csv('../Data/Pre1790/cleaned/Pierce_Certs_cleaned_2021.csv')