In [1]:
import csv
import pandas as pd
import numpy as np
import itertools
from rapidfuzz import fuzz
import re


## Create Final Table - process scraped data

In [2]:
def deNaN(series):
    """Return ``series`` with every null entry replaced by an empty string.

    Non-null entries are passed through unchanged.
    """
    def _blank_if_null(value):
        if pd.isnull(value):
            return ""
        return value

    return series.apply(_blank_if_null)

In [3]:
# Load the scraped results; the first CSV column becomes the row index.
df = pd.read_csv('CD_results.csv', index_col = 0)
# 'Search Name' holds either a plain name (anything after a non-breaking space is dropped)
# or the string form of a list of names; parse both into a list, then re-join with ", ".
df['Search Name'] = df['Search Name'].apply(lambda x: [x.split('\xa0')[0]] if '[\'' not in x else x.replace('\'','').replace('\"','').strip('][').split(', '))
df['Search Name'] = df['Search Name'].apply(lambda x: ", ".join(x))

# add occupations: the first parenthesised fragment of the name is moved into 'Title'
occ_ind = df[df['Search Name'].apply(lambda x: '(' in x)].index
# raw string literal: '\(' inside a normal string is an invalid escape sequence
df.loc[occ_ind, 'Title'] = df.loc[occ_ind, 'Search Name'].apply(lambda x: re.findall(r'\(.*?\)', x)[0])
df['Search Name'] = [sn_list.replace(title, '')  if not pd.isnull(title) else sn_list for sn_list, title in zip(df['Search Name'], df['Title'])]

# add esquire roles: strip the "Esq" suffix variants and record the title instead
ind = df[df['Search Name'].apply(lambda ele: ',' in ele and 'esq' in ele.lower())].index
df.loc[ind, 'Search Name'] = df.loc[ind, 'Search Name'].apply(lambda ele: ele.replace('Esquire', '').replace(', Esqr', '').replace(', Esq.', '').replace(', Esq','').replace('  ', ' ').strip())
df.loc[ind, 'Title'] = 'Esquire'

# resolve remaining: every name that still contains a comma is replaced by hand.
# 'Search Name' is a plain string at this point, so a substring test is sufficient.
ind = df[df['Search Name'].apply(lambda x: ',' in x)].index
# NOTE(review): the replacement list is positional and must contain exactly len(ind)
# entries in row order; re-check it whenever CD_results.csv is regenerated.
df.loc[ind, 'Search Name'] = ['Josiah Phelps Junior', 'John Johnson', 'Josiah Bartlett',
                              'David Lenox', 'David Lenox',
                              'Chrisr Marshall Junior', 'Jacob Laverswyler', 'Beriah Brown II']
df.loc[df[df['Search Name'] == 'Josiah Bartlett'].index, 'Title'] = 'Esquire'
df.loc[df[df['Search Name'] == 'David Lenox'].index, 'Title'] = 'Colonel'
# convert from list to string (leaves values that are already strings untouched)
df['Search Name'] = df['Search Name'].apply(lambda x: x[0] if type(x) == list else x)

In [4]:
# Manual correction: every row whose search name contains 'Andrew Douglass'
# is re-coded as a county-level entry for Berks County (town cleared).
is_douglass = df['Search Name'].apply(lambda name: 'Andrew Douglass' in name)
douglass_rows = df[is_douglass].index
df.loc[douglass_rows, ['town', 'county', 'name_type']] = [np.nan, 'Berks County', 'county']

In [5]:
# Manual correction: John Otto's town is set to Reading.
otto_rows = df[df['Search Name'] == "John Otto"].index
df.loc[otto_rows, 'town'] = 'Reading'

In [6]:
# Two-letter state abbreviation -> full state name.
statedict = dict([
    ('PA', 'Pennsylvania'),
    ('CT', 'Connecticut'),
    ('MA', 'Massachusetts'),
    ('NH', 'New Hampshire'),
    ('DE', 'Delaware'),
    ('NC', 'North Carolina'),
    ('GA', 'Georgia'),
    ('NY', 'New York'),
    ('NJ', 'New Jersey'),
    ('RI', 'Rhode Island'),
    ('VA', 'Virginia'),
    ('MD', 'Maryland'),
    ('SC', 'South Carolina'),
    ('VT', 'Vermont'),
])

In [7]:
def processLocationString(name_type, town, county, state):
    """Build a display location string at the granularity given by ``name_type``.

    - "town": "<town>, <county without 'County'>, <full state name>"
    - "county": "<county without 'County'>, <full state name>"
    - "state" / "state_flag": "<full state name>"
    - anything else: "United States"

    ``state`` is looked up in ``statedict``; an unknown abbreviation raises KeyError.
    """
    if name_type == "town":
        county_name = county.replace('County', '').strip()
        return town + ", " + county_name + ", " + statedict[state]
    if name_type == "county":
        county_name = county.replace('County', '').strip()
        return county_name + ", " + statedict[state]
    if name_type == "state" or name_type == "state_flag":
        return statedict[state]
    return "United States"

In [8]:
def _parse_name_list(raw):
    """Turn the string form of a list of names into a list of name strings."""
    return raw.replace('\'','').replace('\"','').strip('][').split(', ')


def _to_string(names):
    """String form of a parsed name list."""
    return str(names)


def _has_nbsp(value):
    """True when a non-null value contains a non-breaking space."""
    if pd.isnull(value):
        return False
    return '\xa0' in value


# Parsed list version of the names, plus a string twin in 'Original Name2'.
df['Original Name'] = df['Original Name'].apply(_parse_name_list)
df['Original Name2'] = df['Original Name'].apply(_to_string)

# Blank out 'Family Size' entries that contain a non-breaking space.
nbsp_rows = df[df['Family Size'].apply(_has_nbsp)].index
df.loc[nbsp_rows, 'Family Size'] = np.nan

In [9]:
# One formatted location string per row, built from the row's granularity and place columns.
location_parts = zip(df['Location'], df['name_type'], df['town'], df['county'], df['state'])
df['Combined Location'] = [
    processLocationString(kind, town_name, county_name, state_abbr)
    for _, kind, town_name, county_name, state_abbr in location_parts
]

In [10]:
def _strip_parens(title):
    """Remove '(' and ')' from a title; null titles are returned unchanged."""
    if pd.isnull(title):
        return title
    return title.replace('(', '').replace(')','')


df['Title'] = df['Title'].apply(_strip_parens)

In [11]:
# Lookup from the string form of a name list back to the list itself.
str_convert = dict(zip(df['Original Name2'], df['Original Name']))

# Collapse df to one row per (Index, Original Name2, Combined Location).
group_keys = ['Index', 'Original Name2', 'Combined Location']
df_reformat = df.groupby(group_keys)['Search Name'].unique().reset_index()
# For each remaining column gather the group's values into a list, de-duplicated with
# dict.fromkeys because that keeps first-seen order.
for col in ['Match Status', 'Match Reason', 'Location', 'Family Size', 'Slavecount', 'url', 'Title']:
    df_merge = df.groupby(group_keys).agg({col:list}).reset_index()
    df_merge[col] = df_merge[col].apply(lambda values: list(dict.fromkeys(values)))
    df_reformat = pd.merge(df_reformat, df_merge)

# Restore the list version of the names.
df_reformat['Original Name'] = df_reformat['Original Name2'].apply(lambda key: str_convert[key])
# Merge key: string form of the sorted, de-duplicated names with double quotes removed.
df_reformat['Original Name2'] = df_reformat['Original Name'].apply(
    lambda name_list: str(sorted({entry.replace('\"', '') for entry in name_list})))

In [12]:
# Occupation crosswalk: 'occupation' -> 'new occupation'.
occ_crosswalk_path = "../../S2022/occupational_analysis/avg_debt_occupation/occupation_crosswalk.csv"
occ_table = pd.read_csv(occ_crosswalk_path, index_col = 0)
occ_dict = {raw_occ: new_occ for raw_occ, new_occ in zip(occ_table['occupation'], occ_table['new occupation'])}

## Process aggregated CD file

In [13]:
# Aggregated CD debt file; 'Name' is parsed from the string form of a list into a set of names.
CD_raw = pd.read_csv("../../Data/Post1790/Aggregated/raw/aggregated_CD_final.csv")
CD_raw['Name'] = CD_raw['Name'].apply(
    lambda raw: set(raw.replace('\'','').replace('\"','').strip('][').split(', ')))

# String twin of the name set (sorted, so it is order-independent) and the reverse lookup.
CD_raw['Name_str'] = CD_raw['Name'].apply(lambda name_set: str(sorted(name_set)))
CD_dict = dict(zip(CD_raw['Name_str'], CD_raw['Name']))

# Per-certificate totals: dollar column plus cents column, missing values counted as 0.
# NOTE(review): the cents column is added as-is; confirm it is already expressed in dollars.
for prefix in ['6p', '6p_def', '3p']:
    CD_raw[prefix + '_Total'] = CD_raw[prefix + '_Dollar'].fillna(0) + CD_raw[prefix + '_Cents'].fillna(0)
CD_raw[['occupation']] = CD_raw[['occupation']].fillna('')
raw_location_parts = zip(CD_raw['name_type'], CD_raw['town'], CD_raw['county'], CD_raw['state'])
CD_raw['Raw Location'] = [processLocationString(kind, town_name, county_name, state_abbr)
                          for kind, town_name, county_name, state_abbr in raw_location_parts]
# NOTE(review): occ_table is the crosswalk DataFrame, so .get() here is DataFrame.get
# (a column lookup), not a lookup in occ_dict; for ordinary occupation strings it returns
# the value unchanged. Confirm whether occ_dict.get was intended before changing it,
# since later cells build 'Original Occupations' from this column.
CD_raw['occupation'] = CD_raw['occupation'].apply(lambda occ: occ_table.get(occ, occ))

# One row per (name set, location): summed totals, certificate count, list of occupations.
CD_grouped = CD_raw.groupby(['Name_str', 'Raw Location']).agg({
    '6p_Total': ['sum', 'count'],
    '6p_def_Total': 'sum',
    '3p_Total': 'sum',
    'occupation': list,
}).reset_index()
CD_grouped.columns = ['Name_str', 'Raw Location', '6p_Total', 'count', '6p_def_Total', '3p_Total', 'Occupation']

In [14]:
def _unique_in_order(values):
    """Drop repeated entries while keeping first-seen order."""
    return list(dict.fromkeys(values))


CD_grouped['Occupation'] = CD_grouped['Occupation'].apply(_unique_in_order)
CD_grouped

Unnamed: 0,Name_str,Raw Location,6p_Total,count,6p_def_Total,3p_Total,Occupation
0,[''],Maryland,0.0,1,0.0,0.0,[]
1,[''],Pennsylvania,0.0,10,0.0,0.0,[]
2,[''],Rhode Island,0.0,7,0.0,0.0,[]
3,['10926 in Favor Johh Brown from the Register ...,Rhode Island,84.0,1,197.0,307.0,[]
4,['Aaron Bourn'],"Bristol, Bristol, Rhode Island",243.0,1,121.0,284.0,[Baker]
...,...,...,...,...,...,...,...
3231,['Zebulon Waterman'],"Colchester, New London, Connecticut",244.0,1,123.0,244.0,[Farmer]
3232,['Zephaniah Andrews'],"Providence, Providence, Rhode Island",1301.0,3,699.0,832.0,[Mason]
3233,['Zephaniah Brown'],"Providence, Providence, Rhode Island",1715.0,2,906.0,744.0,[Merchant]
3234,['Zephaniah Davis'],"Hebron, Tolland, Connecticut",98.0,1,98.0,74.0,[Farmer]


## Merge Data

In [15]:
# Left-join the grouped debt data onto the scraped results by (name string, location).
CD_final = pd.merge(CD_grouped, df_reformat, how = 'left',
                    left_on = ['Name_str', 'Raw Location'],
                    right_on = ['Original Name2', 'Combined Location'])

# Rows whose name string is "['']" (blank name) get explicit placeholder values.
nanind = CD_final[CD_final['Name_str'] == '[\'\']'].index
n_blank = len(nanind)
CD_final.loc[nanind, 'Original Name2'] = CD_final.loc[nanind, 'Name_str'].tolist()
CD_final.loc[nanind, ['Combined Location']] = CD_final.loc[nanind, 'Raw Location'].tolist()
CD_final.loc[nanind, 'Search Name'] = CD_final.loc[nanind, 'Name_str'].tolist()
CD_final.loc[nanind, 'Original Name'] = [''] * n_blank
CD_final.loc[nanind, 'Family Size'] = [np.nan] * n_blank
CD_final.loc[nanind, 'Slavecount'] = [np.nan] * n_blank
CD_final.loc[nanind, 'Location'] = [np.nan] * n_blank
CD_final.loc[nanind, ['Match Reason']] = ['No Match Found'] * n_blank
CD_final.loc[nanind, ['Match Status']] = ['No Match'] * n_blank

In [16]:
def _as_list(value):
    """Return lists unchanged; wrap anything else in a one-element list."""
    return value if type(value) == list else [value]


# Make every cell of these columns a list so they can be concatenated later.
list_valued_columns = ['Match Status', 'Match Reason', 'Location', 'Family Size',
                       'Slavecount', 'url', 'Title', 'Occupation']
for col in list_valued_columns:
    CD_final[col] = CD_final[col].apply(_as_list)

In [17]:
def _restring_search_name(row):
    """String form of the sorted 'Search Name'; keep 'Name_str' when 'Search Name' is a float."""
    searched = row['Search Name']
    if type(searched) == float:
        return row['Name_str']
    return str(sorted(searched))


# Name set for each row, looked up by its string key ([''] when the key is unknown).
CD_final['Original Name'] = CD_final['Original Name2'].apply(lambda key: CD_dict.get(key, ['']))
# NOTE(review): this looks up 'Raw Location' values in the occupation crosswalk, so the new
# lower-case 'occupation' column normally just repeats the location. Confirm the intended
# source column; the column is not read again in the cells visible here.
CD_final['occupation'] = CD_final['Raw Location'].apply(lambda place: occ_dict.get(place, place))
CD_final = CD_final.drop(['Original Name2', 'Combined Location'], axis = 1)
# re-string Search Name Column
CD_final['Name_str'] = CD_final.apply(_restring_search_name, axis = 1)

In [18]:
# names that we will match
# Distinct values obtained by exploding the 'Search Name' column.
sn_series = CD_final['Search Name'].explode().drop_duplicates()
# 'Search Name' entries whose length is below 2 (at most one name).
sn_names = [sn for sn in CD_final['Search Name'].tolist() if len(sn) < 2]
# Keep a name x only if [x] compares equal to one of those entries; the result is a
# plain Python list (sn_series stops being a Series from here on).
sn_series = [x for x in sn_series[sn_series.apply(lambda x: [x] in sn_names)]]

In [19]:
# Fuzzy string matching: find names that appear more than once with slightly different spellings.
elements = sn_series
# One record per name: [name, names it matched, number of matches].
results = [[name, [], 0] for name in elements]

# Compare every unordered pair once and record the match on both sides.
for (i, element) in enumerate(elements):
    for (j, choice) in enumerate(elements[i+1:]):
        if not fuzz.ratio(element, choice, score_cutoff=85):
            continue
        first, second = results[i], results[j+i+1]
        first[2] += 1
        first[1].append(choice)
        second[2] += 1
        second[1].append(element)

match_df = pd.DataFrame(results, columns=['name', 'duplicates', 'duplicate_count'])
match_df['dup_list'] = [[name] + dup for name, dup in zip(match_df['name'], match_df['duplicates'])]
match_df['dup_list_str'] = match_df['dup_list'].apply(lambda group: str(sorted(group)))
# remove names with no matches
match_df = match_df[match_df['duplicate_count'] > 0]

# Lookups from a single search name to its match reason list and raw location, built
# from the rows of CD_final whose 'Search Name' holds exactly one name.
single_name_rows = CD_final[CD_final['Search Name'].apply(lambda searched: len(searched) == 1)]
single_names = single_name_rows['Search Name'].apply(lambda searched: searched[0])
mr_dict = dict(zip(single_names, single_name_rows['Match Reason']))
location_dict = dict(zip(single_names, single_name_rows['Raw Location']))
match_df['match reason'] = match_df['name'].apply(lambda name: mr_dict[name][0])
match_df['location'] = match_df['name'].apply(lambda name: location_dict[name])

In [20]:
# Hand-picked spellings that the fuzzy matcher paired and that really are the same person.
names = ['Benjn  Loxley','Benjamin Loxley','Elial Williams','Elias Williams','Joseph Greer','Joseph Grier',
         'Joseph R Tatem','Joseph  Tatem','Naphtali Phillips','Napthali Phillips','Robert Porter','Robt  Porter',
         'Edward Trescott','Edward Trescot', 'Archibald Harvey', 'Archibald Hervey', 'Andrew  Caldwell', 'andrew Caldwell',
         'Daniel Tefft', 'Daniel Tofft', 'Frantz Jacob Foltz', 'Franz Jacob Follz', 'John Bleakley','John Bleakly',
         'Mary Muncreef','Mary Muncrief', 'Peter Heisler','Peter Heister','John Bleakley','John Bleakly']
# .copy() gives an independent frame: the assignments below would otherwise write to a
# slice of match_df and trigger pandas' SettingWithCopyWarning.
grouped_df = match_df[match_df['name'].apply(lambda x: x in names)].copy()
# Manual overrides of the fuzzy result; a one-element list assigned to one cell stores
# the bare string, which is why the next statement re-wraps non-list values.
grouped_df.loc[grouped_df[grouped_df['name'] == 'Elias Williams'].index, 'duplicates'] = ['Elial Williams']
grouped_df.loc[grouped_df[grouped_df['name'] == 'Joseph Greer'].index, 'duplicates'] = ['Joseph Grier']
# Rebuild dup_list as duplicates followed by the name itself.
grouped_df['dup_list'] = [([dup] if type(dup) != list else dup) + [name] for dup, name in zip(grouped_df['duplicates'], grouped_df['name'])]
# Final set of spelling groups to merge.
final_merge = grouped_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  grouped_df['dup_list'] = [([dup] if type(dup) != list else dup) + [name] for dup, name in zip(grouped_df['duplicates'], grouped_df['name'])]


In [21]:
# Map the string form of a one-name list to the string form of its whole spelling group.
single_name_keys = final_merge['name'].apply(lambda name: str([name]))
group_strings = final_merge['dup_list_str'].astype(str)
fuzzy_merge_dict = dict(zip(single_name_keys, group_strings))
# add in merged data: rows without a spelling group keep their own name string
CD_final['Name_str_new'] = CD_final['Name_str'].apply(lambda key: fuzzy_merge_dict.get(key, key))

In [22]:
# Re-aggregate after the spelling merge: numeric columns are summed, the rest collected into lists.
summed_columns = ['count', '6p_Total', '6p_def_Total', '3p_Total']
listed_columns = ['Family Size', 'Location', 'Match Reason', 'Match Status', 'Slavecount', 'url',
                  'Name_str', 'Occupation', 'Title']
agg_spec = {**{column: 'sum' for column in summed_columns},
            **{column: list for column in listed_columns}}
CD_grouped = CD_final.groupby(['Name_str_new', 'Raw Location']).agg(agg_spec).reset_index()

# remove duplicates, flatten the list-of-lists cells (first-seen order is kept)
for cols in ['Family Size', 'Location', 'Match Reason', 'Match Status', 'Slavecount', 'url', 'Occupation', 'Title']:
    CD_grouped[cols] = CD_grouped[cols].apply(
        lambda nested: list(dict.fromkeys(itertools.chain.from_iterable(nested))))

# change to list format: replace each name string by the search names it stands for
namestr_dict = dict(zip(CD_final['Name_str'], CD_final['Search Name']))
CD_grouped['Name_str'] = CD_grouped['Name_str'].apply(
    lambda keys: list(dict.fromkeys(itertools.chain.from_iterable(namestr_dict[ele] for ele in keys))))
CD_grouped = CD_grouped.rename(columns = {'Name_str': 'Original Search Names'})

# aggregate data - group by search name: a merged spelling group is represented by its longest spelling
merged_groups = final_merge['dup_list'].tolist()
CD_grouped['Name_str_new'] = CD_grouped['Original Search Names'].apply(
    lambda group: [max(group, key=len)] if group in merged_groups else group)
CD_grouped = CD_grouped.rename(columns = {'Name_str_new': 'Search Names'})

## Eliminate duplicate names in a list of names

In [23]:
# Working copy; make sure every 'Search Names' cell is a list.
CD_table = CD_grouped.copy()
CD_table['Search Names'] = CD_grouped['Search Names'].apply(
    lambda searched: searched if type(searched) == list else [searched])
# Rows labelled 0 to 2 get a single empty name.
# NOTE(review): presumably these are the blank-name rows, which sort first - confirm
# after any change to the input data.
first_rows = CD_table.loc[0:2, 'Search Names']
CD_table.loc[0:2, 'Search Names'] = first_rows.apply(lambda _: [''])

In [24]:
# Resolve rows whose 'Search Names' list has exactly `name_length` (here 5) entries:
# the first listed name whose scraped record is a 'Full Match' becomes the row's 'Search Name'.
name_length = 5
# Name lists of that length and the raw location of each such row.
names = CD_table[CD_table['Search Names'].apply(lambda x: len(x) == name_length)]['Search Names']
locations = CD_table[CD_table['Search Names'].apply(lambda x: len(x) == name_length)]['Raw Location']
# Both lookups are keyed by the string form of the name list only, so the same list
# occurring at two locations keeps just the last location.
namedict = dict(zip([str(n) for n in names], names))
locationdict = dict(zip([str(n) for n in names], locations))

# Not used anywhere in this cell.
remlist = []
for name_str, name in namedict.items():
    match = False
    # Rows of CD_table with this name list at this location.
    ind = CD_table[[str(sn) == name_str and loc == locationdict[name_str] for sn, loc in zip(CD_table['Search Names'], CD_table['Raw Location'])]].index
    for n in name:
        if not match:
            # Match reason of the first scraped row for this name and location
            # (raises IndexError when no such row exists).
            mr = df[[sn == n and loc == locationdict[name_str] for sn, loc in zip(df['Search Name'], df['Combined Location'])]]['Match Reason'].tolist()[0]
            if mr == 'Full Match':
                CD_table.loc[ind, 'Search Name'] = n
                match = True

In [25]:
# Resolve rows whose 'Search Names' list has exactly `name_length` (here 4) entries.
# Preference order for the representative 'Search Name': a 'Full Match' name, then an
# "Only Location" name, then a "Too Many" name; the first listed name that qualifies wins.
name_length = 4
# Name lists of that length and the raw location of each such row.
names = CD_table[CD_table['Search Names'].apply(lambda x: len(x) == name_length)]['Search Names']
locations = CD_table[CD_table['Search Names'].apply(lambda x: len(x) == name_length)]['Raw Location']
# Both lookups are keyed by the string form of the name list only.
namedict = dict(zip([str(n) for n in names], names))
locationdict = dict(zip([str(n) for n in names], locations))

# Not used anywhere in this cell.
remlist = []
for name_str, name in namedict.items():
    match = False
    # Rows of CD_table with this name list at this location.
    ind = CD_table[[str(sn) == name_str and loc == locationdict[name_str] for sn, loc in zip(CD_table['Search Names'], CD_table['Raw Location'])]].index
    # All match reasons recorded on the first of those rows.
    mr_list = CD_table.loc[ind, 'Match Reason'].tolist()[0]
    for n in name:
        if not match:
            # Match reason of the first scraped row for this name and location
            # (raises IndexError when no such row exists).
            mr = df[[sn == n and loc == locationdict[name_str] for sn, loc in zip(df['Search Name'], df['Combined Location'])]]['Match Reason'].tolist()[0]
            if mr == 'Full Match':
                CD_table.loc[ind, 'Search Name'] = n
                match = True
            elif 'Full Match' not in mr_list and "Only Location" in mr:
                CD_table.loc[ind, 'Search Name'] = n
                match = True
            # NOTE(review): `"Only Location" not in mr_list` is an exact list-membership
            # test, while `"Only Location" in mr` above is a substring test - confirm intent.
            elif ('Full Match' not in mr_list and "Only Location" not in mr_list) and "Too Many" in mr:
                CD_table.loc[ind, 'Search Name'] = n
                match = True

In [26]:
# Manual clean-up of one four-name group: drop the 'Catharine Coleman' spelling.
coleman_group = "['Catherine Coleman', 'Catharine Coleman', 'William Coleman', 'Jacob Coleman']"
ind = CD_table[CD_table['Search Names'].apply(lambda searched: str(searched) == coleman_group)].index
coleman_cells = CD_table.loc[ind, 'Search Names']
CD_table.loc[ind, 'Search Names'] = coleman_cells.apply(
    lambda _: ['Catherine Coleman', 'William Coleman', 'Jacob Coleman'])

In [27]:
# Resolve rows whose 'Search Names' list has exactly `name_length` (here 3) entries.
# Preference order for the representative 'Search Name': a 'Full Match' name, then an
# "Only Location" name, then a "Too Many" name; if none qualifies, the longest name.
name_length = 3
# Name lists of that length and the raw location of each such row.
names = CD_table[CD_table['Search Names'].apply(lambda x: len(x) == name_length)]['Search Names']
locations = CD_table[CD_table['Search Names'].apply(lambda x: len(x) == name_length)]['Raw Location']
# Both lookups are keyed by the string form of the name list only.
namedict = dict(zip([str(n) for n in names], names))
locationdict = dict(zip([str(n) for n in names], locations))

# Not used anywhere in this cell.
remlist = []
for name_str, name in namedict.items():
    match = False
    # Rows of CD_table with this name list at this location.
    ind = CD_table[[str(sn) == name_str and loc == locationdict[name_str] for sn, loc in zip(CD_table['Search Names'], CD_table['Raw Location'])]].index
    # All match reasons recorded on the first of those rows.
    mr_list = CD_table.loc[ind, 'Match Reason'].tolist()[0]
    for n in name:
        if not match:
            # Match reason of the first scraped row for this name and location
            # (raises IndexError when no such row exists).
            mr = df[[sn == n and loc == locationdict[name_str] for sn, loc in zip(df['Search Name'], df['Combined Location'])]]['Match Reason'].tolist()[0]
            if mr == 'Full Match':
                CD_table.loc[ind, 'Search Name'] = n
                match = True
            elif 'Full Match' not in mr_list and "Only Location" in mr:
                CD_table.loc[ind, 'Search Name'] = n
                match = True
            # NOTE(review): `"Only Location" not in mr_list` is an exact list-membership
            # test, while `"Only Location" in mr` above is a substring test - confirm intent.
            elif ('Full Match' not in mr_list and "Only Location" not in mr_list) and "Too Many" in mr:
                CD_table.loc[ind, 'Search Name'] = n
                match = True
    # Fallback: no name qualified, so use the longest spelling.
    if not match:
        n = max(name, key = len)
        CD_table.loc[ind, 'Search Name'] = n

In [28]:
# Pairwise fuzzy scores inside every name list of length `name_length`.
# The indexing x[0], x[1], x[2] requires name_length to still be 3 from the previous cell.
scores = CD_table[CD_table['Search Names'].apply(lambda x: len(x) == name_length)]['Search Names'].apply(lambda x: [fuzz.ratio(x[0], x[1]),
                                                                                                                    fuzz.ratio(x[0], x[2]),
                                                                                                                    fuzz.ratio(x[1], x[2])])
# Side-by-side frame: rounded scores, the name list, and the representative name (twice).
score_names = pd.concat([scores.apply(lambda x: [round(ele, 2) for ele in x]),
                         CD_table[CD_table['Search Names'].apply(lambda x: len(x) == name_length)]['Search Names'],
                         CD_table[CD_table['Search Names'].apply(lambda x: len(x) == name_length)]['Search Name']], axis = 1)
score_names['Search Name2'] = CD_table['Search Name']
score_names.columns = ['Scores', 'Search Names', 'Search Name', 'Search Name2']

# classify names where all 3 are different (every pairwise score below 80)
nosame = score_names[score_names['Scores'].apply(lambda x: all([ele<80 for ele in x]))]
# nosame
# manually correct some names classified as different
score_names.loc[score_names[score_names['Search Name'] == 'Charles Lee Virginia'].index,
                ['Search Names', 'Search Name']] = [['Charles Lee', 'Richard Lee'], 'Charles Lee']
score_names.loc[score_names[score_names['Search Name'] == 'Ignatius Smith'].index, 'Search Names'] = [['Ignatius Smith', 'James Smith']]
# NOTE(review): the right-hand side holds two one-name lists, so this presumably expects
# exactly two rows with this 'Search Name' - confirm.
score_names.loc[score_names[score_names['Search Name'] == 'Trustees Of The Presbyterean Church Warwick Township Bucks'].index, 'Search Names'] = [['Trustees Of The Presbyterean Church Warwick Township Bucks'], ['Trustees Of The Presbyterean Church Warwick Township Bucks']]

# Groups in which at least one pair scores 80 or more.
somesame = score_names[score_names['Scores'].apply(lambda x: not all([ele<80 for ele in x]))]
#somesame
# Manual overrides: these groups really contain two distinct people.
score_names.loc[score_names[score_names['Search Name'] == 'Asa Gillet'].index,
                'Search Names'] = score_names.loc[score_names[score_names['Search Name'] == 'Asa Gillet'].index,
                                                    'Search Names'].apply(lambda x: ['Asa Gillet', 'Israel Gillet'])
score_names.loc[score_names[score_names['Search Name'] == 'Gasaway Watkins'].index, 'Search Names'] = score_names.loc[score_names[score_names['Search Name'] == 'Gasaway Watkins'].index, 'Search Names'].apply(lambda x: ['Gasaway Watkins', 'Tristram Bowdle'])
score_names.loc[score_names[score_names['Search Name'] == 'William Johnston'].index, 'Search Names'] = score_names.loc[score_names[score_names['Search Name'] == 'William Johnston'].index, 'Search Names'].apply(lambda x: ['William Waterman', 'William Johnston'])

# fix allsame: every other group with a pair scoring 80+ is collapsed to its representative name
allsame_ind = score_names[[not all([ele<80 for ele in s]) and
                           sn not in ['Asa Gillet', 'Gasaway Watkins', 'William Johnston'] for s, sn in zip(score_names['Scores'],
                                                                                                            score_names['Search Name'])]].index
score_names.loc[allsame_ind, 'Search Names'] = [[x] for x in score_names.loc[allsame_ind, 'Search Name']]

  return array(a, dtype, copy=False, order=order)


In [29]:
# Write the corrected name lists and representative names back into CD_table,
# aligned on score_names' row labels.
CD_table.loc[score_names.index, ['Search Names', 'Search Name']] = score_names[['Search Names', 'Search Name']].values

  return array(a, dtype, copy=False, order=order)


In [30]:
# Resolve rows whose 'Search Names' list has exactly two entries.
name_length = 2
# Two-name lists and the fuzzy score between the two names.
names = CD_table[CD_table['Search Names'].apply(lambda x: len(x) == name_length)]['Search Names']
# The length filter below (more than 1 and fewer than 3) selects the same rows, i.e. length 2.
scores = CD_table.loc[CD_table[CD_table['Search Names'].apply(lambda x: len(x)>1 and len(x) < 3)].index,
                      'Search Names'].apply(lambda x: fuzz.ratio(x[0], x[1]))
compare = pd.DataFrame([names, scores]).T
compare.columns = ['Search Names', 'Scores']
# Raw location of each two-name row.
locations = CD_table[CD_table['Search Names'].apply(lambda x: len(x) == name_length)]['Raw Location']
# Keyed by "<name list>_<location>", so the same names at different locations stay separate.
namedict = dict(zip([str(n) + "_" + l for n, l in zip(names, locations)], names))

# Not used anywhere in this cell.
remlist = []
for name_str_combo, name in namedict.items():
    match = False
    # Rows of CD_table with this name list at this location.
    ind = CD_table[[(str(sn) + "_" + loc) == name_str_combo for sn, loc in zip(CD_table['Search Names'], CD_table['Raw Location'])]].index
    # All match reasons recorded on the first of those rows.
    mr_list = CD_table.loc[ind, 'Match Reason'].tolist()[0]
    for n in name:
        if not match:
            # The location is recovered as the second "_"-separated piece of the key.
            # Match reason of the first scraped row for this name and location
            # (raises IndexError when no such row exists).
            mr = df[[sn == n and name_str_combo.split("_")[1] == loc for sn, loc in zip(df['Search Name'], df['Combined Location'])]]['Match Reason'].tolist()[0]
            if mr == 'Full Match':
                CD_table.loc[ind, 'Search Name'] = n
                match = True

            elif 'Full Match' not in mr_list and "Only Location" in mr:
                CD_table.loc[ind, 'Search Name'] = n
                match = True

            # NOTE(review): `"Only Location" not in mr_list` is an exact list-membership
            # test, while `"Only Location" in mr` above is a substring test - confirm intent.
            elif ('Full Match' not in mr_list and "Only Location" not in mr_list) and "Too Many" in mr:
                CD_table.loc[ind, 'Search Name'] = n
                match = True

    # Fallback: no name qualified, so use the longest spelling.
    if not match:
        n = max(name, key = len)
        CD_table.loc[ind, 'Search Name'] = n

In [31]:
# replace search names with just a list with one name
# Candidates: two-name rows that carry a single match reason.
candidate_index = CD_table[[len(sn) == 2 and len(mr) == 1 for sn, mr in zip(CD_table['Search Names'], CD_table['Match Reason'])]].index
# Of those, keep the pairs whose fuzzy score is above 73 but not exactly 75.
# NOTE(review): 73 and 75 are undocumented thresholds, presumably hand-tuned - confirm.
snames = compare[compare['Search Names'].apply(lambda x: x in CD_table.loc[candidate_index]['Search Names'].tolist())].query('Scores > 73 and Scores != 75')['Search Names'].tolist()
repindex = CD_table.loc[candidate_index][CD_table.loc[candidate_index, 'Search Names'].apply(lambda x: x in snames)].index
# Collapse each selected pair to a one-element list holding its representative name.
CD_table.loc[repindex, 'Search Names'] = CD_table.loc[repindex, 'Search Name'].apply(lambda x: [x])

In [32]:
# Every row whose 'Search Names' list holds exactly one name uses that name as 'Search Name'.
has_single_name = CD_table['Search Names'].apply(lambda searched: len(searched) == 1)
goodnames = CD_table[has_single_name].index
only_names = CD_table.loc[goodnames, 'Search Names'].apply(lambda searched: searched[0])
CD_table.loc[goodnames, 'Search Name'] = only_names

## Turn list format into normal datatypes

In [33]:
# Columns that currently hold lists and are reduced to a single value per row.
datacols = ['Family Size', 'Location', 'Match Reason', 'Match Status', 'Slavecount', 'url']
for ind in CD_table['Search Name'].index:
    search_name = CD_table.loc[ind, 'Search Name']
    loc = CD_table.loc[ind, 'Raw Location']
    search_namelist = CD_table.loc[ind, 'Search Names']
    if len(search_namelist) > 1:
        # Several names: take the values of the first distinct scraped row for the
        # representative name at this location (IndexError if there is no such row).
        CD_table.loc[ind, datacols] = df[[sn == search_name and rl == loc for sn, rl in zip(df['Search Name'],
                                                                                            df['Combined Location'])]][datacols].drop_duplicates().values[0].tolist()
    else:
        # Single name: keep the first element of each list.
        for col in datacols:
            CD_table.loc[ind, col] = CD_table.loc[ind, col][0]

In [34]:
# add original names
# Lookup "<search name>_<combined location>" -> list of original-name lists.
t = df[['Search Name', 'Combined Location', 'Original Name']].fillna('').groupby(['Search Name', 'Combined Location']).agg({'Original Name':list}).reset_index()
t['key'] = t['Search Name'] + "_" + t['Combined Location']
ogname_dict = dict(zip(t['key'], t['Original Name']))


def _location_or_blank(value):
    """Return the location unchanged, or '' when it is null."""
    return value if not pd.isnull(value) else ''


def _lookup_original_names(row):
    """Original names for each entry of the row's 'Search Names'.

    Each name is looked up under the matched 'Location' first and under 'Raw Location'
    as a fallback (None when neither key exists); an empty name maps to ''.
    """
    found = []
    for n in row['Search Names']:
        if n == '':
            found.append('')
            continue
        matched_key = n + "_" + _location_or_blank(row['Location'])
        raw_key = n + "_" + _location_or_blank(row['Raw Location'])
        found.append(ogname_dict.get(matched_key, ogname_dict.get(raw_key)))
    return found


def _reformat_too_many(reason):
    """Put the text following "Found " in parentheses for 'Too Many' match reasons."""
    if 'Too Many' not in reason:
        return reason
    return "Too Many Potential Matches Found (" + str(reason.split("Found ")[1]) + ")"


CD_table['Original Names'] = CD_table.apply(_lookup_original_names, axis = 1)

# Not idempotent: re-running this on already reformatted text wraps it a second time.
CD_table['Match Reason'] = CD_table['Match Reason'].apply(_reformat_too_many)

In [35]:
def _maybe_several_people(reason, n_certificates, searched):
    """True for a single-name row with a 'Too Many' match reason and more than one certificate."""
    return 'Too Many' in reason and n_certificates > 1 and len(searched) == 1


# Single-name rows that may actually cover several people.
several_people_mask = [_maybe_several_people(mr, count, snames)
                       for mr, count, snames in zip(CD_table['Match Reason'],
                                                    CD_table['count'],
                                                    CD_table['Search Names'])]
multiple_ind = CD_table[several_people_mask].index
CD_table.loc[multiple_ind, 'notes'] = 'Potentially Multiple People'

# Rows that still list more than one name are detailed in the supplementary table.
multiple_ind = CD_table[CD_table['Search Names'].apply(lambda searched: len(searched) > 1)].index
CD_table.loc[multiple_ind, 'notes'] = 'See Supplementary Table'

In [36]:
def _longest_location(matched, raw):
    """Longest non-null string of the two; the matched location wins a tie."""
    candidates = [x for x in [matched, raw] if not pd.isnull(x)]
    return max(candidates, key = len)


# define location column: prefer whichever of matched / raw location is longer
CD_table['Location'] = [_longest_location(l, rl)
                        for l, rl in zip(CD_table['Location'], CD_table['Raw Location'])]
# make some manual corrections: rows labelled 0 to 2 get a single empty original search name
first_rows = CD_table.loc[0:2, 'Original Search Names']
CD_table.loc[0:2, 'Original Search Names'] = first_rows.apply(lambda _: [''])

In [37]:
def _non_blank(values):
    """Drop null and empty-string entries, keeping order."""
    return [x for x in values if not pd.isnull(x) and x != '']


# Occupations followed by titles, with blanks removed.
CD_table['Original Occupations'] = [_non_blank(occ + title)
                                    for occ, title in zip(CD_table['Occupation'], CD_table['Title'])]

## Merge with New York Data

In [38]:
# New York table; the first CSV column becomes the row index.
ny_table_path = '../../Data/Post1790/Aggregated/NY/NY_table.csv'
NY_table = pd.read_csv(ny_table_path, index_col = 0)

In [39]:
# Both name columns are stored as the string form of a list; parse them back into lists.
for list_col in ['Search Names', 'Original Names']:
    NY_table[list_col] = NY_table[list_col].apply(
        lambda raw: raw.replace('\'','').replace('\"','').strip('][').split(', '))

## Create Supplementary Data

In [40]:
# create table with supplementary data: one row per name in 'Search Names' other than
# the row's representative 'Search Name'
CD_table2 = CD_table.copy()
CD_table2['Search Names2'] = CD_table2['Search Names']
supplementary_source_cols = ['Search Names', 'Original Names', 'Search Names2', 'Search Name',
                             'Raw Location', 'count', '6p_Total', '6p_def_Total', '3p_Total',
                             'Original Occupations']
CD_table2 = CD_table2[supplementary_source_cols].explode('Search Names2')
CD_table2 = CD_table2[CD_table2['Search Names2'] != CD_table2['Search Name']]
# The exploded name takes over the 'Search Name' column.
CD_table2 = (CD_table2
             .drop('Search Name', axis = 1)
             .rename({'Search Names2':'Search Name'}, axis = 1)
             .reset_index(drop = True))

In [41]:
# add data to CD_table2 - supplementary table
# For every supplementary row, copy the `datacols` values of the first distinct scraped
# row with the same search name and location (IndexError if there is no such row).
for ind in CD_table2.index:
    search_name = CD_table2.loc[ind, 'Search Name']
    # Read but not used below.
    search_namelist = CD_table2.loc[ind, 'Search Names']
    loc = CD_table2.loc[ind, 'Raw Location']
    CD_table2.loc[ind, datacols] =  df[[sn == search_name and rl == loc for sn, rl in zip(df['Search Name'],
                                                                                          df['Combined Location'])]][datacols].drop_duplicates().values[0].tolist()
# reformat match reason column: put the text after "Found " in parentheses
CD_table2['Match Reason'] = CD_table2['Match Reason'].apply(lambda x: x if 'Too Many' not in x else "Too Many Potential Matches Found (" + str(x.split("Found ")[1]) + ")")

In [42]:
# adding how many people were actually searched for - # unique people
# Lists cannot be group keys, so group on their string form kept in 'temp'.
CD_table2['temp'] = CD_table2['Search Names'].astype(str)
namecntdf = CD_table2.groupby('temp')['Search Names'].count().reset_index()
namecntdict = dict(zip(namecntdf['temp'], namecntdf['Search Names']))
CD_table['temp'] = CD_table['Search Names'].astype(str)


def _debtholder_count(list_key):
    """Supplementary rows for this name list, plus one for the representative name."""
    return namecntdict.get(list_key, 0) + 1


CD_table2['# debtholders'] = CD_table2['temp'].apply(_debtholder_count)
CD_table['# debtholders'] = CD_table['temp'].apply(_debtholder_count)

In [43]:
def _matched_or_raw(matched, raw):
    """Matched location when present, otherwise the raw location."""
    if pd.isnull(matched):
        return raw
    return matched


CD_table2['Location'] = [_matched_or_raw(l, rl)
                         for l, rl in zip(CD_table2['Location'], CD_table2['Raw Location'])]

In [44]:
# reorder columns; the supplementary table has the same layout minus 'notes'
table_columns = ['Search Names', 'Search Name', 'Original Names', 'Original Occupations',
                 '# debtholders', 'count', '6p_Total', '6p_def_Total', '3p_Total',
                 'Location', 'Family Size', 'Slavecount', 'Match Status', 'Match Reason',
                 'notes', 'url']
supp_table_columns = [column for column in table_columns if column != 'notes']
CD_table = CD_table[table_columns]
CD_table2 = CD_table2[supp_table_columns]

In [45]:
# New York supplementary table; the first CSV column becomes the row index.
ny_supp_table_path = '../../Data/Post1790/Aggregated/NY/NY_supp_table.csv'
NY_table2 = pd.read_csv(ny_supp_table_path, index_col = 0)

In [46]:
# Both name columns are stored as the string form of a list; parse them back into lists.
for supp_list_col in ['Search Names', 'Original Names']:
    NY_table2[supp_list_col] = NY_table2[supp_list_col].apply(
        lambda raw: raw.replace('\'','').replace('\"','').strip('][').split(', '))

In [47]:
# Supplementary rows from the two sources are not merged name-by-name; each table is
# tagged with its origin and the two are stacked.
# To inspect names present in both sources, run:
#   CD_table2[CD_table2['Search Name'].isin(NY_table2['Search Name'])]
CD_table2['origin'] = 'Aggregated CD debt files'
NY_table2['origin'] = 'NY debt file'
CD_table2 = pd.concat([CD_table2, NY_table2])

In [48]:
# Tag each main table with its source (both frames gain an 'origin' column), then stack them.
for source_table, source_label in ((CD_table, 'Aggregated CD debt files'),
                                   (NY_table, 'NY debt file')):
    source_table['origin'] = source_label
CD_table = pd.concat([CD_table, NY_table])

In [49]:
# The stacked table repeats row labels; renumber the rows from 0.
CD_table = CD_table.reset_index(drop = True)

In [50]:
def _is_missing(value):
    """True when the value is null."""
    return pd.isnull(value)


# Rows without a location are given 'New York'.
no_location_rows = CD_table[CD_table['Location'].apply(_is_missing)].index
CD_table.loc[no_location_rows, 'Location'] = 'New York'

## Fix Location Names

In [51]:
# Name lists that occur more than once among rows whose location mentions 'New York'.
# NOTE(review): 'Search Names' holds lists; value_counts needs hashable values, so
# confirm this runs on the pandas version in use.
in_new_york = CD_table['Location'].apply(
    lambda place: 'New York' in place if not pd.isnull(place) else False)
ny_name_counts = CD_table[in_new_york]['Search Names'].value_counts()
agg_names = ny_name_counts[ny_name_counts > 1].index

In [52]:
# Merge duplicate New York rows for the same name list: the first matching row receives
# the summed amounts and is marked as coming from both sources; the other rows are dropped.
for name in agg_names:
    try:
        # NOTE(review): in DataFrame.query, `'New York' in Location` is not a substring
        # test; confirm this selects the intended rows (a .str.contains filter may be meant).
        matches = CD_table[CD_table['Search Names'].apply(lambda x: x == name)].query("'New York' in Location")
        ind1 = matches.index[0]
    except Exception:
        # No usable match for this name (or the filter could not be evaluated): skip it.
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit still propagate.
        continue
    # Drop every matching row except the first: the first row (ind1) is the one that
    # receives the totals, so it must be the survivor.
    drops = matches.index[1:]
    if len(drops) != 0:
        for col in ['6p_Total', '6p_def_Total', 'count', '3p_Total']:
            CD_table.loc[ind1, col] = matches[col].sum()
        CD_table.loc[ind1, 'origin'] = 'Both Data Sources'
        CD_table.drop(drops, inplace = True)

## Fix Occupation Names

In [53]:
# Distinct occupation strings across the table. Null entries are filtered out here:
# list.remove(np.nan) raises ValueError when no NaN is present and removes at most one
# NaN object, which would leave others to break the string tests that follow.
occ_list = [occ for occ in set(CD_table['Original Occupations'].explode().tolist())
            if not pd.isnull(occ)]

In [54]:
# Add to the occupation dictionary (raw label -> standardized occupation).
# Later assignments overwrite earlier ones, so the order of the groups matters.

# labels selected from the data by substring
adm_list = [label for label in occ_list if 'adm' in label or 'Adm' in label]
occ_dict.update(dict.fromkeys(adm_list, 'administrator'))
ex_list = [label for label in occ_list if 'exe' in label.lower()]
occ_dict.update(dict.fromkeys(ex_list, 'executor'))

# hand-listed abbreviations
gentleman_list = ['G. Man', 'Genl Man','Genl man','Gent.', 'Gent. in the Court','Gentn']
gentlewoman_list = ['G. Woman','Gen. Woman', 'Genl Woman', 'Genl woman', 'Gent. w.',
                    'Gentle Wm','Gentlewoman','Gentlewomen','Gentw']
merchant_list = ['Merch.', 'Mercht','Mert', 'Mert.']
occ_dict.update(dict.fromkeys(gentleman_list, 'gentleman'))
occ_dict.update(dict.fromkeys(gentlewoman_list, 'gentlewoman'))
occ_dict.update(dict.fromkeys(merchant_list, 'merchant'))

# guardians first, then the "on behalf of" labels, which are filed as trustee
guard_list = [label for label in occ_list if 'guard' in label.lower()]
behalf_list = ['Deceased Estate of','Estate William Butler','Estate of Philip Boehm Deceased','Ex to the Estate Christopher Grover',
               'For the Estate Abraham Robinson', 'His estate','In Trust for the use of the Heris of Samuel McFarran Deceased',
               'Trustee', 'Trustee for the Use of the Legal Representatives of Thomas Albertson Deceased',
               'Trustee to Geo and William Peck','for the Estate of William Allison deceased', 'of Jas',
               'of Nicholas']
occ_dict.update(dict.fromkeys(guard_list, 'guardian'))
occ_dict.update(dict.fromkeys(behalf_list, 'trustee'))

# one-off corrections as (raw label, standardized value) pairs;
# np.nan means the label maps to no occupation
occ_pairs = [
    ('Acct', 'accountant'),
    ('At State House', 'doorkeeper'),
    ('Attory at Law', 'attorney'),
    ('Attory', 'attorney'),
    ('B.Smith', 'blacksmith'),
    ('Bl. Smith', 'blacksmith'),
    ('Board Buildr', 'board builder'),
    ('Book binder', 'book binder'),
    ('Carpentr', 'carpenter'),
    ('Carpr', 'carpenter'),
    ('Clk. Market', 'cloak maker'),
    ('Coach Maker', 'coach maker'),
    ('Colonel', 'colonel'),
    ('Consul of the French Nation', 'consul'),
    ('Currier', 'currier'),
    ('House Carp.', 'carpenter'),
    ('House Carpenter', 'carpenter'),
    ('Brees maker', 'baker'),
    ('Inn keeper', 'inn keeper'),
    ('Iron Monger', 'ironmonger'),
    ('Mill S. Makr', 'miller'),
    ('Minister Luth. Cong.', 'minister'),
    ('Ministers their Widows and Children', 'minister'),
    ('Rigger', 'rigger'),
    ('School Mastr', 'school master'),
    ('Ships Their Widows and Children', np.nan),
    ('Shoe Mak.', 'shoemaker'),
    ('Shoe Marker', 'shoemaker'),
    ('Shop Kr', 'shopkeeper'),
    ('Shop keeper', 'shopkeeper'),
    ('Skinner', 'skinner'),
    ('Sugar Baker', 'baker'),
    ('Surveyor', 'surveyor'),
    ('Tallow Chanr', 'candlekeeper'),
    ('Tavern k.', 'inn keeper'),
    ('Wd', 'widow'),
    ('Plane Maker', 'wooden planemaker'),
    ('Printer & Bookseller', 'printer'),
    ('Professor in the University', 'professor'),
    ('Providence', np.nan),
    ('Notary, Scrivener, & Broker', ['notary', 'scrivener', 'broker']),
    ('Governor of the State', 'governor'),
    ('Boardg School', 'boardinghouse keeper'),
    ('Boardg hous', 'boardinghouse keeper'),
    ('City Commr', 'city commissioner'),
    ('Clerk in Rec. Genl of Land Office', 'clerk'),
    ('Grocer', 'grocer'),
    ('Lab', np.nan),
    ('Negroe', 'negroe'),
    ('Plaisterer', 'plasterer'),
    ('Receiver Gen. of Land Office', 'receiver general of land office'),
    ('junior', np.nan),
]
for raw_label, standardized in occ_pairs:
    occ_dict[raw_label] = standardized

In [55]:
# correct the misspelled value 'marchant' wherever it appears in the mapping
misspelled_keys = [raw for raw, mapped in occ_dict.items() if mapped == 'marchant']
for key in misspelled_keys:
    occ_dict[key] = 'merchant'

In [56]:
# process occupations
def _as_list(value):
    """Return `value` if it is a list, otherwise an empty list (e.g. for NaN)."""
    return value if isinstance(value, list) else []


def _unique_in_order(values):
    """Drop repeated entries, keeping the first occurrence of each."""
    return list(dict.fromkeys(values))


def _standardize_occupations(raw_labels):
    """Map raw occupation labels through occ_dict.

    Labels missing from occ_dict are kept unchanged. A mapping whose value is
    a list contributes each of its entries, NaN mappings are skipped, and
    duplicates are removed. A result consisting solely of 'no occupation'
    becomes an empty list.
    """
    standardized = []
    for label in raw_labels:
        mapped = occ_dict.get(label, label)
        # one raw label may stand for several occupations
        candidates = mapped if isinstance(mapped, list) else [mapped]
        # filter NaN instead of list.remove(), which returns None
        standardized.extend(occ for occ in candidates if not pd.isnull(occ))
    standardized = _unique_in_order(standardized)
    return [] if standardized == ['no occupation'] else standardized


# non-list cells (missing values) become empty lists; repeated labels collapse
CD_table['Original Occupations'] = CD_table['Original Occupations'].apply(lambda x: _unique_in_order(_as_list(x)))
CD_table['Occupations'] = CD_table['Original Occupations'].apply(_standardize_occupations)

In [57]:
# two-column table of the occupation mapping: one row per raw label
occ_df = pd.DataFrame(list(occ_dict.items()), columns = ['original name', 'new name'])

In [58]:
cnty_cw = pd.read_csv('../../Data/AssetGeography/final_geographical_cw.csv', index_col = 0)


def _is_bare_place(place):
    """True for a Location with no ', ' separator that is not itself a statedict value."""
    parts = place.split(", ")
    return len(parts) == 1 and parts[0].strip() not in list(statedict.values())


# Locations given as a single name that is not a state
bare_places = CD_table[CD_table['Location'].apply(_is_bare_place)]['Location'].unique()

locdict = dict()

# first pass: the bare name matches a county in the crosswalk -> "<county>, <state>"
for place in bare_places:
    county_rows = cnty_cw[cnty_cw['county'] == (place.strip() + ' County')][['county', 'state']].drop_duplicates().values.tolist()
    if len(county_rows) != 0:
        county, state = county_rows[0]
        locdict[place] = county.replace("County", "").strip() + ", " + statedict[state]

# second pass: the bare name matches a town -> "<town>, <county>, <state>";
# this overwrites a county match for the same name
for place in bare_places:
    town_rows = cnty_cw[cnty_cw['town'] == (place.strip())][['town', 'county', 'state']].drop_duplicates().values.tolist()
    if len(town_rows) != 0:
        town, county, state = town_rows[0]
        locdict[place] = town + ", " + county.replace("County", "").strip() + ", " + statedict[state]

# these names are always filed under New York, overriding the matches above
for ny_place in ['Dutchess', 'Ulster', 'Kings', 'Queens', 'Westchester']:
    locdict[ny_place] = ny_place + ', New York'

## Fix Location Names (second pass)

In [59]:
def _resolve_location(raw_location):
    """Look a Location up in locdict: first by its stripped form, then the result again as-is."""
    first_pass = locdict.get(raw_location.strip(), raw_location)
    return locdict.get(first_pass, first_pass)


CD_table['Location'] = CD_table['Location'].apply(_resolve_location)

In [60]:
# manual override for one debtholder
is_hopson_pinckney = CD_table['Search Name'] == 'Hopson Pinckney'
CD_table.loc[is_hopson_pinckney, 'Location'] = 'Charleston, South Carolina'

In [61]:
# NOTE: from here on locdict maps the country codes found in df['state'] to
# country names; it replaces the place-name mapping built earlier.
locdict = dict(zip(['GB', 'BVI', 'BM', 'VI', 'FR'],['Great Britain', 'British Virgin Islands', 'Bermuda', 'British Virgin Islands', 'France']))

# Rows labelled 'United States' get their Location rebuilt from the town and
# country code recorded for the same Search Name in df.
for ind in CD_table[CD_table['Location'] == 'United States'].index:
    name = CD_table.loc[ind, 'Search Name']
    if name == 'Joseph Norse':
        # this name is deliberately left untouched
        continue
    # first df row for this name, looked up once instead of once per column
    source_row = df[df['Search Name'] == name].iloc[0]
    town = source_row['town']
    nation = locdict[source_row['state']]
    if pd.isnull(town) or town == nation:
        # no separate town to show (missing, or identical to the country name)
        CD_table.loc[ind, 'Location'] = nation
    else:
        CD_table.loc[ind, 'Location'] = town + ", " + nation

In [62]:
def _slavecount_or_zero(row):
    """Return 0 when Slavecount is missing but Family Size is present; otherwise Slavecount unchanged."""
    if pd.isnull(row['Slavecount']) and not pd.isnull(row['Family Size']):
        return 0
    return row['Slavecount']


CD_table['Slavecount'] = CD_table.apply(_slavecount_or_zero, axis = 1)

In [63]:
# keep Slavecount unless it is missing while Family Size is present; then use 0
CD_table2['Slavecount'] = CD_table2.apply(
    lambda row: row['Slavecount']
    if (not pd.isnull(row['Slavecount']) or pd.isnull(row['Family Size']))
    else 0,
    axis = 1,
)

In [64]:
# write the occupation mapping table (occ_df) to disk as CSV
occ_df.to_csv('../../Data/Post1790/Aggregated/occupation/occupation_mapping.csv')

In [65]:
# Group keys are cast to str so they are usable as keys whatever they hold
# (a missing Location becomes the string 'nan').
CD_table[['Search Names', 'Location']] = CD_table[['Search Names', 'Location']].astype(str)

# One row per (Search Names, Location): totals are summed, every other column
# is collected into a list of the group's values. 'sum' is passed by name
# because handing the builtin `sum` to agg is a deprecated alias for it.
CD_table_final = CD_table.groupby(['Search Names', 'Location']).agg({
    'Search Name': list,
    'Original Names': list,
    'Original Occupations': list,
    '# debtholders': list,
    'count': 'sum',
    '6p_Total': 'sum',
    '6p_def_Total': 'sum',
    '3p_Total': 'sum',
    'Family Size': list,
    'Slavecount': list,
    'Match Status': list,
    'Match Reason': list,
    'notes': list,
    'url': list,
    'origin': list,
    'Occupations': list,
}).reset_index()

In [66]:
def _split_search_names(joined):
    """Parse a stringified list of names back into a list of strings."""
    return joined.replace('\'','').replace('\"','').strip('][').split(', ')


def _first_entry(values):
    """Return the first element of a per-group list."""
    return values[0]


CD_table_final['Search Names'] = CD_table_final['Search Names'].apply(_split_search_names)

# the list-aggregated columns keep only the first value of each group
list_cols = ['Search Name', 'Original Names', 'Original Occupations', '# debtholders', 'Family Size', 'Slavecount',
             'Match Status', 'Match Reason', 'notes', 'url', 'origin', 'Occupations']
for col in list_cols:
    CD_table_final[col] = CD_table_final[col].apply(_first_entry)

In [67]:
# Group keys are cast to str so they are usable as keys whatever they hold
# (a missing Location becomes the string 'nan').
CD_table2[['Search Names', 'Location']] = CD_table2[['Search Names', 'Location']].astype(str)

# One row per (Search Names, Location): totals are summed, every other column
# is collected into a list of the group's values. 'sum' is passed by name
# because handing the builtin `sum` to agg is a deprecated alias for it.
CD_table2_final = CD_table2.groupby(['Search Names', 'Location']).agg({
    'Search Name': list,
    'Original Names': list,
    'Original Occupations': list,
    '# debtholders': list,
    'count': 'sum',
    '6p_Total': 'sum',
    '6p_def_Total': 'sum',
    '3p_Total': 'sum',
    'Family Size': list,
    'Slavecount': list,
    'Match Status': list,
    'Match Reason': list,
    'url': list,
    'origin': list,
}).reset_index()

In [68]:
def _split_supp_search_names(joined):
    """Parse a stringified list of names back into a list of strings."""
    unquoted = joined.replace('\'','').replace('\"','')
    return unquoted.strip('][').split(', ')


def _head_of_group(values):
    """Return the first element of a per-group list."""
    return values[0]


CD_table2_final['Search Names'] = CD_table2_final['Search Names'].apply(_split_supp_search_names)

# the list-aggregated columns keep only the first value of each group
supp_list_cols = ['Search Name', 'Original Names', 'Original Occupations', '# debtholders', 'Family Size', 'Slavecount',
                  'Match Status', 'Match Reason', 'url', 'origin']
for col in supp_list_cols:
    CD_table2_final[col] = CD_table2_final[col].apply(_head_of_group)

In [69]:
# turn the string 'nan' in Location back into a real missing value
for final_table in (CD_table_final, CD_table2_final):
    final_table['Location'] = final_table['Location'].replace('nan', np.nan)

In [70]:
# largest holdings first, renumbered from 0, written to the results folder
(
    CD_table_final
    .sort_values(['6p_Total', '3p_Total'], ascending = False)
    .reset_index(drop = True)
    .to_csv('../Results/CD_table.csv')
)

In [71]:
# largest holdings first, renumbered from 0, written to the results folder
(
    CD_table2_final
    .sort_values(['6p_Total', '3p_Total'], ascending = False)
    .reset_index(drop = True)
    .to_csv('../Results/CD_supp_table.csv')
)