In [313]:
import csv
import pandas as pd
import numpy as np
import itertools
from rapidfuzz import fuzz


## Create Final Table

In [314]:
def deNaN(series):
    """Return ``series`` with every missing value replaced by an empty string.

    Values that are not missing (per ``pd.isnull``) are passed through unchanged.
    """
    def _blank_if_missing(value):
        if pd.isnull(value):
            return ""
        return value

    return series.apply(_blank_if_missing)

In [315]:
# Load the scraped search results; the first CSV column is used as the row index.
df = pd.read_csv('NY_results.csv', index_col=0)


def _clean_search_name(raw):
    """Drop doubled brackets, keep the text before the first 'a0', then trim two characters from each end."""
    unbracketed = raw.replace('[[', '').replace(']]', '')
    return unbracketed.split('a0')[0][2:-2]


def _parse_original_name(raw):
    """Turn the stringified name list into a list of names (quotes and outer brackets removed)."""
    unquoted = raw.replace('\'', '').replace('\"', '')
    return unquoted.strip('][').split(', ')


# preprocess name columns
df['Search Name'] = df['Search Name'].apply(_clean_search_name)
df['Original Name'] = df['Original Name'].apply(_parse_original_name)
# string twin of the parsed list, usable as a groupby / dictionary key
df['Original Name2'] = df['Original Name'].map(str)

# preprocess slavecount columns
# any non-missing Slavecount value containing a non-breaking space (\xa0) is blanked out
has_nbsp = df['Slavecount'].apply(lambda x: False if pd.isnull(x) else '\xa0' in x)
df.loc[df[has_nbsp].index, 'Slavecount'] = np.nan

In [316]:
# dictionary to convert between string and list version of data
str_convert = {as_str: as_list for as_str, as_list in zip(df['Original Name2'], df['Original Name'])}

# One row per (Index, Original Name2) group, carrying the unique values of every
# column of interest - helps with consolidation.
group_keys = ['Index', 'Original Name2']
df_reformat = df.groupby(group_keys)['Search Name'].unique().reset_index()
for col in ['Match Status', 'Match Reason', 'Location', 'Family Size', 'Slavecount', 'url']:
    df_merge = df.groupby(group_keys).agg({col: list}).reset_index()
    # dict.fromkeys de-duplicates while preserving first-seen order
    df_merge[col] = df_merge[col].map(lambda values: list(dict.fromkeys(values)))
    # no `on=` given: the join uses the columns the two frames have in common
    df_reformat = df_reformat.merge(df_merge)

# convert back to original format
df_reformat['Original Name'] = df_reformat['Original Name2'].map(lambda key: str_convert[key])


# preprocess names to prepare for merging
def _merge_key(names):
    """String form of the sorted, de-duplicated names with double quotes removed."""
    return str(sorted({nm.replace('\"', '') for nm in names}))


df_reformat['Original Name2'] = df_reformat['Original Name'].map(_merge_key)

In [317]:
# NY loan data
NY_CD_raw = pd.read_excel(
    "../../Data/Post1790/NY/NY_1790_CD.xlsx",
    header=11,
    usecols='H, I, M, N, X, Y, AC, AD, AM, AN, AR, AS',
)
NY_CD_raw.columns = ['First Name', 'Last Name', '6p_Dollar', '6p_Cents',
                     'First Name.1', 'Last Name.1', '6p_def_Dollar', '6p_def_Cents',
                     'First Name.2', 'Last Name.2', '3p_Dollar', '3p_Cents']

# create name columns: 'Name 1'..'Name 3' hold "<first> <last>" for each of the
# three first/last name column pairs (missing parts become empty strings)
for position, suffix in enumerate(['', '.1', '.2'], start=1):
    first = deNaN(NY_CD_raw['First Name' + suffix])
    last = deNaN(NY_CD_raw['Last Name' + suffix])
    NY_CD_raw['Name ' + str(position)] = first + " " + last


def _name_set(*full_names):
    """Set of the non-blank names, with "  " replaced by " " and outer whitespace stripped."""
    return {nm.replace("  ", " ").strip() for nm in full_names if nm.strip() != ""}


NY_CD_raw['Name'] = [_name_set(n1, n2, n3)
                     for n1, n2, n3 in zip(NY_CD_raw['Name 1'], NY_CD_raw['Name 2'], NY_CD_raw['Name 3'])]
# create name column string equivalent and dictionary to convert
NY_CD_raw['Name_str'] = NY_CD_raw['Name'].map(lambda names: str(sorted(names)))
NY_dict = dict(zip(NY_CD_raw['Name_str'], NY_CD_raw['Name']))

# aggregate our results - create aggregated dataframe
# NOTE(review): the cents column is added to the dollar column unscaled - presumably
# the sheet already stores cents as a fraction of a dollar; confirm against the data.
for prefix in ['6p', '6p_def', '3p']:
    NY_CD_raw[prefix + '_Total'] = (NY_CD_raw[prefix + '_Dollar'].fillna(0)
                                    + NY_CD_raw[prefix + '_Cents'].fillna(0))

# per name set: summed totals, plus how many rows contributed (count of 6p_Total)
NY_grouped = NY_CD_raw.groupby('Name_str').agg({'6p_Total': ['sum', 'count'],
                                                '6p_def_Total': 'sum',
                                                '3p_Total': 'sum'}).reset_index()
NY_grouped.columns = ['Name_str', '6p_Total', 'count', '6p_def_Total', '3p_Total']


In [318]:
# merge NY asset data and scraped data (inner join on the sorted-name string key)
NY_final = NY_grouped.merge(df_reformat, left_on='Name_str', right_on='Original Name2')
# swap the string key for the corresponding set of names from the loan data
NY_final['Original Name'] = NY_final['Original Name2'].map(lambda key: NY_dict[key])
NY_final = NY_final.drop(columns=['Original Name2'])

# re-string Search Name Column: Name_str now keys on the search names instead
NY_final['Name_str'] = [str(sorted(found)) for found in NY_final['Search Name']]

In [319]:
# names that we will match
# Distinct individual search names, in first-seen order.
sn_series = NY_final['Search Name'].explode().drop_duplicates()
# Per-row search-name collections that hold fewer than two names.
sn_names = [sn for sn in NY_final['Search Name'].tolist() if len(sn) < 2]
# Names that stand alone on at least one row. A set lookup replaces the
# `[x] in sn_names` scan, which compared a list against every stored array
# (quadratic, and dependent on the truth value of a numpy comparison result).
lone_names = {sn[0] for sn in sn_names if len(sn) == 1}
sn_series = [x for x in sn_series if x in lone_names]

In [320]:
# use fuzzy string matching to see if the same name appears multiple times spelled slightly differently
elements = sn_series
# one record per name: [name, names it matched, number of matches]
results = [[name, [], 0] for name in elements]

# Every unordered pair is scored once and a hit is recorded on both names.
# fuzz.ratio is called with score_cutoff=85 and its return value is used as a truth value.
for (i, element), (k, choice) in itertools.combinations(enumerate(elements), 2):
    if not fuzz.ratio(element, choice, score_cutoff=85):
        continue
    for position, partner in ((i, choice), (k, element)):
        results[position][2] += 1
        results[position][1].append(partner)

match_df = pd.DataFrame(results, columns=['name', 'duplicates', 'duplicate_count'])
# create duplicate list: the name itself followed by everything it matched
match_df['dup_list'] = [[own] + partners
                        for own, partners in zip(match_df['name'], match_df['duplicates'])]
# convert to string format (sorted, so every member of a group shares the same key)
match_df['dup_list_str'] = match_df['dup_list'].map(lambda members: str(sorted(members)))
# remove names with no matches
match_df = match_df[match_df['duplicate_count'] > 0]

# create match reason column: look up the 'Match Reason' of the row on which the
# name is the only search name, and keep its first entry
single_rows = NY_final[NY_final['Search Name'].apply(lambda x: len(x) == 1)]
mr_dict = dict(zip(single_rows['Search Name'].apply(lambda x: x[0]),
                   single_rows['Match Reason']))
match_df['match reason'] = match_df['name'].apply(lambda x: mr_dict[x][0])

In [321]:
# see whether two matches are actually a match, using census data
def _one_usable_reason(reasons):
    """True when a group has exactly one distinct match reason and it is not a 'Too Many Potential Matches' one."""
    return len(reasons) == 1 and 'Too Many Potential Matches' not in reasons[0]


grouped_df = match_df.groupby(['dup_list_str'])['match reason'].unique().reset_index()
# .astype(bool): an .apply over zero rows comes back with object dtype, which is not
# used as a row mask; forcing bool keeps the empty case a valid (empty) row filter.
grouped_df = grouped_df[grouped_df['match reason'].apply(_one_usable_reason).astype(bool)]

# list to manually remove certain matches
remlist = []
# grouped_df already holds only single-reason groups, so it is merged as-is
final_merge = pd.merge(match_df.drop('match reason', axis=1), grouped_df, on='dup_list_str')
not_removed = final_merge['dup_list'].apply(lambda x: x not in remlist).astype(bool)
final_merge = final_merge[not_removed]

# map "['<name>']" (the Name_str of a single-name row) to the key of its fuzzy group
fuzzy_merge_dict = dict(zip(final_merge['name'].apply(lambda x: str([x])),
                            final_merge['dup_list_str'].astype(str)))
# add in merged data: rows outside any accepted group keep their own Name_str
NY_final['Name_str_new'] = NY_final['Name_str'].apply(lambda x: fuzzy_merge_dict.get(x, x))

In [322]:
def _flatten_unique(nested):
    """Flatten one level of nesting and drop repeats, keeping first-seen order."""
    return list(dict.fromkeys(itertools.chain.from_iterable(nested)))


# Collapse rows that share a (possibly fuzzy-merged) name key: amounts are summed,
# descriptive columns are collected into lists.
sum_cols = ['count', '6p_Total', '6p_def_Total', '3p_Total']
list_cols = ['Family Size', 'Location', 'Match Reason', 'Match Status', 'Slavecount', 'url']
agg_spec = {sum_col: 'sum' for sum_col in sum_cols}
agg_spec.update({list_col: list for list_col in list_cols + ['Name_str']})
NY_grouped = NY_final.groupby('Name_str_new').agg(agg_spec).reset_index()

# remove duplicates, flatten lists of lists
for cols in list_cols:
    NY_grouped[cols] = NY_grouped[cols].map(_flatten_unique)

# change to list format: replace each Name_str key by the search names it stands for
namestr_dict = dict(zip(NY_final['Name_str'], NY_final['Search Name']))
NY_grouped['Name_str'] = NY_grouped['Name_str'].map(
    lambda keys: _flatten_unique(namestr_dict[key] for key in keys))

# aggregate data - group by search name: a name list that equals an accepted fuzzy
# duplicate list is reduced to its longest spelling
accepted_dup_lists = final_merge['dup_list'].tolist()
NY_grouped['Name_str_new'] = NY_grouped['Name_str'].map(
    lambda x: [max(x, key=len)] if x in accepted_dup_lists else x)
NY_grouped = NY_grouped.rename(columns={'Name_str': 'Original Search Names',
                                        'Name_str_new': 'Search Names'})

In [323]:
# processing rows whose 'Search Names' list holds more than two names
NY_table = NY_grouped.copy()
has_many_names = NY_table['Search Names'].apply(lambda x: len(x) > 2)
many_idx = NY_table[has_many_names].index
# the longest spelling represents the whole group
longest_name = NY_table.loc[many_idx, 'Search Names'].apply(lambda x: max(x, key=len))
names = longest_name.tolist()
NY_table.loc[many_idx, 'Search Name'] = names
# ...and the list itself is reduced to that single name
NY_table.loc[many_idx, 'Search Names'] = longest_name.apply(lambda chosen: [chosen])

In [324]:
# processing search names length of 2
# Rows whose 'Search Names' holds exactly two names, and how similar the two are.
pair_idx = NY_table[NY_table['Search Names'].apply(lambda x: len(x) == 2)].index
snames = NY_table.loc[pair_idx, 'Search Names']
scores = snames.apply(lambda x: fuzz.ratio(x[0], x[1]))
compare = pd.DataFrame([snames, scores]).T
compare.columns = ['Search Names', 'Scores']

names = list(compare['Search Names'])
# string form of each pair -> the pair itself
namedict = dict(zip([str(pair) for pair in names], names))


def _rows_matching(pair):
    """Index of the NY_table rows whose 'Search Names' value equals ``pair``."""
    def _same(x):
        eq = x == pair
        # an element-wise comparison result must agree everywhere
        return eq if type(eq) == bool else all(eq)
    return NY_table[NY_table['Search Names'].apply(_same)].index


def _mentions(reason, codes):
    """True when ``reason`` is a string containing at least one of ``codes``."""
    return isinstance(reason, str) and any(code in reason for code in codes)


single_reason_codes = ['Full Match', 'Too Many Potential Matches Found', 'No Match Found']
pair_reason_codes = ['Full Match', 'Only Location Found', 'Too Many Potential Matches Found', 'No Match Found']

# pairs for which a 'Search Name' has been chosen; removed from namedict afterwards
remlist = []
for name_str, name in namedict.items():
    ind = NY_table[NY_table['Search Names'].apply(lambda x: str(x) == name_str)].index
    mr = NY_table.loc[ind, 'Match Reason'].tolist()[0]
    if len(mr) == 1 and _mentions(mr[0], single_reason_codes):
        # both spellings share one match reason: keep the longer spelling
        NY_table.loc[_rows_matching(name), 'Search Name'] = max(name, key=len)
        remlist.append(name_str)
    elif len(mr) == 2 and (_mentions(mr[0], pair_reason_codes) or _mentions(mr[1], pair_reason_codes)):
        # Each reason is tested explicitly: `code in mr[0] or mr[1]` would only
        # test the truthiness of mr[1], which holds for any non-empty string.
        mr1 = df[df['Search Name'] == name[0]]['Match Reason'].tolist()[0]
        # prefer the first name when df records it as a 'Full Match', else the second
        NY_table.loc[_rows_matching(name), 'Search Name'] = name[0] if mr1 == 'Full Match' else name[1]
        remlist.append(name_str)

# each key is appended at most once above, so every deletion succeeds
for n in remlist:
    del namedict[n]

In [325]:
# Rows with exactly two search names that share a single match reason.
is_candidate = [len(sn) == 2 and len(mr) == 1
                for sn, mr in zip(NY_table['Search Names'], NY_table['Match Reason'])]
candidate_index = NY_table[is_candidate].index
candidates = NY_table.loc[candidate_index]

# Of those, keep the pairs whose two spellings score above 80.
candidate_pairs = candidates['Search Names'].tolist()
is_candidate_pair = compare['Search Names'].apply(lambda x: x in candidate_pairs)
snames = compare[is_candidate_pair].query('Scores > 80')['Search Names'].tolist()
repindex = candidates[candidates['Search Names'].apply(lambda x: x in snames)].index

# replace search names with the chosen search name
# NOTE(review): this stores the bare 'Search Name' string, not a one-element list;
# later len(...) checks on 'Search Names' then see the string's length - confirm intended.
NY_table.loc[repindex, 'Search Names'] = NY_table.loc[repindex, 'Search Name']

In [327]:
# set search name to the only entry of 'Search Names' for every row whose
# 'Search Names' has length one
has_one_name = NY_table['Search Names'].apply(lambda x: len(x) == 1)
goodnames = NY_table[has_one_name].index
sole_name = NY_table.loc[goodnames, 'Search Names'].map(lambda entries: entries[0])
NY_table.loc[goodnames, 'Search Name'] = sole_name

In [328]:
# Turn the list-valued data columns of NY_table into single values.
datacols = ['Family Size', 'Location', 'Match Reason', 'Match Status', 'Slavecount', 'url']
for ind in NY_table.index:
    search_name = NY_table.loc[ind, 'Search Name']
    search_namelist = NY_table.loc[ind, 'Search Names']
    if len(search_namelist) > 1:
        # Several names were merged: take the data recorded in df for the chosen
        # search name, but only when df holds exactly one distinct row for it.
        person_rows = df.loc[df['Search Name'] == search_name, datacols].drop_duplicates()
        if len(person_rows) != 1:
            # no row or conflicting rows: leave the aggregated lists untouched
            # (explicit check instead of a bare `except` that hid every error)
            continue
        NY_table.loc[ind, datacols] = person_rows.iloc[0].tolist()
    else:
        # single name: keep the first entry of each aggregated list
        for col in datacols:
            NY_table.loc[ind, col] = NY_table.loc[ind, col][0]

Table 1: Indexed by the Original Names list
The Original Names list is created by mapping each name in the Search Names list to its Original Names in df
Other identifying columns are the Search Names and Search Name columns
The other columns with relevant information are also de-listed (converted from lists to single values)
If Match Reason (before cleaning) is only "Too Many Potential Matches Found", flag the row as potentially multiple people

Table 2: Contains data from df on the other people in Table 1's Search Names list who are not the Search Name
Indexed by Search Names and Search Name

In [338]:
# add original names
# search name -> list of the 'Original Name' lists recorded for it in df
originals_by_search = (df[['Search Name', 'Original Name']]
                       .groupby('Search Name')
                       .agg({'Original Name': list})
                       .reset_index())
ogname_dict = dict(zip(originals_by_search['Search Name'], originals_by_search['Original Name']))

# for every original search name take its first 'Original Name' list, then flatten
# and de-duplicate (first-seen order)
NY_table['Original Names'] = NY_table['Original Search Names'].map(
    lambda searched: list(dict.fromkeys(
        itertools.chain.from_iterable(ogname_dict[n][0] for n in searched))))


def _parenthesize_too_many(reason):
    """Rewrite 'Too Many ... Found <rest>' as 'Too Many Potential Matches Found (<rest>)'; other values pass through."""
    if 'Too Many' not in reason:
        return reason
    return "Too Many Potential Matches Found (" + str(reason.split("Found ")[1]) + ")"


NY_table['Match Reason'] = NY_table['Match Reason'].apply(_parenthesize_too_many)

In [339]:
def _maybe_several_people(reason, n_rows, search_names):
    """True for a 'Too Many' match reason on a single search name that has count > 1."""
    return 'Too Many' in reason and n_rows > 1 and len(search_names) == 1


# flag single-name rows that may cover more than one person
several_flags = [_maybe_several_people(mr, count, sn)
                 for mr, count, sn in zip(NY_table['Match Reason'],
                                          NY_table['count'],
                                          NY_table['Search Names'])]
multiple_ind = NY_table[several_flags].index
NY_table.loc[multiple_ind, 'notes'] = 'Potentially Multiple People'

# rows whose 'Search Names' has length above one are pointed at the supplementary
# table (this overwrites any earlier note on the same row)
multiple_ind = NY_table[NY_table['Search Names'].map(len) > 1].index
NY_table.loc[multiple_ind, 'notes'] = 'See Supplementary Table'

In [340]:
# create table with supplementary data
supp_cols = ['Search Names', 'Search Names2', 'Search Name', 'Original Names',
             'count', '6p_Total', '6p_def_Total', '3p_Total']
NY_table2 = NY_table.copy()
# duplicate the list column so one copy can be exploded while the other is kept whole
NY_table2['Search Names2'] = NY_table2['Search Names']
NY_table2 = NY_table2[supp_cols].explode('Search Names2')
# keep only the names that are NOT the row's chosen search name
is_other_person = NY_table2['Search Names2'] != NY_table2['Search Name']
NY_table2 = (NY_table2[is_other_person]
             .drop(columns='Search Name')
             .rename(columns={'Search Names2': 'Search Name'})
             .reset_index(drop=True))

In [341]:
# add data to NY_table2 - supplementary table
# each row receives the distinct data row recorded in df for its search name
for ind in NY_table2.index:
    search_name = NY_table2.loc[ind, 'Search Name']
    search_namelist = NY_table2.loc[ind, 'Search Names']
    person_rows = df[df['Search Name'] == search_name][datacols].drop_duplicates()
    NY_table2.loc[ind, datacols] = person_rows.squeeze().tolist()

# reformat match reason column: put the text after "Found " in parentheses
NY_table2['Match Reason'] = [
    reason if 'Too Many' not in reason
    else "Too Many Potential Matches Found (" + str(reason.split("Found ")[1]) + ")"
    for reason in NY_table2['Match Reason']
]

In [342]:
# adding how many people were actually searched for - # unique people
# string form of the name list serves as a key shared by both tables
NY_table2['temp'] = NY_table2['Search Names'].astype(str)
NY_table['temp'] = NY_table['Search Names'].astype(str)

# number of supplementary rows per name list
namecntdf = NY_table2.groupby('temp')['Search Names'].count().reset_index()
namecntdict = {key: n_rows for key, n_rows in zip(namecntdf['temp'], namecntdf['Search Names'])}


def _holder_count(key):
    """Supplementary rows for this name list, plus one for the main-table entry."""
    return namecntdict.get(key, 0) + 1


NY_table['# debtholders'] = NY_table['temp'].map(_holder_count)
NY_table2['# debtholders'] = NY_table2['temp'].map(_holder_count)

In [343]:
# reorder columns: both tables share the same leading columns; only the main
# table carries 'notes'
shared_cols = ['Search Names', 'Search Name', '# debtholders', 'Original Names',
               'count', '6p_Total', '6p_def_Total', '3p_Total',
               'Location', 'Family Size', 'Slavecount', 'Match Status', 'Match Reason']
NY_table = NY_table[shared_cols + ['notes', 'url']]
NY_table2 = NY_table2[shared_cols + ['url']]

In [344]:
# write the main table, largest 3p_Total first
NY_table.sort_values(by='3p_Total', ascending=False).to_csv(
    path_or_buf='../../Data/Post1790/Aggregated/NY/NY_table.csv')

In [345]:
# write the supplementary table, largest 3p_Total first
NY_table2.sort_values(by='3p_Total', ascending=False).to_csv(
    path_or_buf='../../Data/Post1790/Aggregated/NY/NY_supp_table.csv')