In [1]:
import numpy as np
import pandas as pd
from rapidfuzz import process

Helper function section

In [2]:
# helper function: turn np.nan into ""
def deNaN(series):
    return series.apply(lambda x: "" if type(x) != str else x)

In [3]:
# helper function to combine columns with data into one column with all unique values
# we use this to combine name, town, state and person columns into just one column
def combineCols(df, num=3, namenum=3):
    change_df_agg = pd.DataFrame(columns=['old', 'new', 'type'])
    for col in ['town', 'state', 'occupation']:
        if num == 3:
            df[col] = [set([" ".join(x.split()) for x in [t1, t2, t3] if not pd.isnull(x)]) for t1, t2, t3 in
                       zip(df[col + '1'],
                           df[col + '2'],
                           df[col + '3'])]
        else:
            df[col] = [set([" ".join(x.split()) for x in [t1, t2] if not pd.isnull(x)]) for t1, t2 in zip(df[col + '1'],
                                                                                                          df[
                                                                                                              col + '2'])]
        if not any(df[col].apply(lambda x: len(x) > 1).tolist()):
            # change set to string
            print("reformatting {}".format(col))
            df[col] = df[col].apply(lambda x: x.pop() if x != set() else np.nan)
        else:
            print("{} column has multiple unique entries".format(col))
            print("see table at end for new entries")
            # keep the value that has the most characters, otherwise change set to string
            old = df[df[col].apply(lambda x: len(x) > 1)][col]
            df[col] = df[col].apply(
                lambda x: x.pop() if len(x) == 1 else np.nan if x == set() else max(list(x), key=len))
            # new dataframe to keep track of changes
            # create copy of change_df that removes all duplicates by turning old column, which is
            # of type set to type string
            change_df = pd.DataFrame([old, df.loc[old.index][col]]).T
            change_df.columns = ['old', 'new']
            change_df['type'] = col
            change_df_str = change_df.copy()
            change_df_str['old'] = change_df_str['old'].astype(str)
            change_df_str = change_df_str.drop_duplicates()
            # add filtered database of changes to aggregate dataset
            change_df_agg = pd.concat([change_df_agg, change_df.loc[change_df_str.index]])

    # add functions to combine asset totals, handle missing debt values
    if namenum == 3:
        df['Name 1'] = deNaN(df['First Name']) + " " + deNaN(df['Last Name'])
        df['Name 2'] = deNaN(df['First Name.1']) + " " + deNaN(df['Last Name.1'])
        df['Name 3'] = deNaN(df['First Name.2']) + " " + deNaN(df['Last Name.2'])
        df['Name'] = list(
            list(set([x.replace("  ", " ").strip() for x in [name1, name2, name3] if x.strip() != ""])) for
            name1, name2, name3 in zip(df['Name 1'], df['Name 2'], df['Name 3']))
    return df[['6p_Dollar', '6p_Cents', '6p_def_Dollar', '6p_def_Cents', '3p_Dollar', '3p_Cents',
               'town', 'state', 'occupation', 'Name']], change_df_agg

raw data import section

### Connecticut

In [15]:
CD_all = pd.DataFrame(columns=['6p_Dollar', '6p_Cents', '6p_def_Dollar', '6p_def_Cents', '3p_Dollar', '3p_Cents',
                               'town', 'state', 'occupation', 'Name', 'state_data'])
ASD_all = pd.DataFrame(columns=['6p_Dollar', '6p_Cents', '6p_def_Dollar', '6p_def_Cents', '3p_Dollar', '3p_Cents',
                                'town', 'state', 'occupation', 'Name', 'state_data'])

In [16]:
# importing desired columns and rename
CT_CD_raw = pd.read_excel("../data_raw/post1790/CT/CT_post1790_CD_ledger.xlsx",
                          header=13, usecols='H, I, J, K, L, N, O, X, Y, Z, AA, AB, AD, AE, AN, AO, AP, AQ, AR, AT, AU')
CT_CD_raw.columns = ['First Name', 'Last Name', 'town1', 'state1', 'occupation1', '6p_Dollar', '6p_Cents',
                     'First Name.1', 'Last Name.1', 'town2', 'state2', 'occupation2', '6p_def_Dollar', '6p_def_Cents',
                     'First Name.2', 'Last Name.2', 'town3', 'state3', 'occupation3', '3p_Dollar', '3p_Cents', ]
# unify occupation, town and state columns
CT_CD, change_df = combineCols(CT_CD_raw)
CT_CD.loc[CT_CD.query('state.isna()').index, 'state'] = 'CT'
CT_CD['state_data'] = 'CT'
CD_all = pd.concat([CD_all, CT_CD])
change_df

town column has multiple unique entries
see table at end for new entries
reformatting state
reformatting occupation


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  CT_CD['state_data'] = 'CT'


Unnamed: 0,old,new,type
526,"{Milford, Miford}",Milford,town
799,"{Nowich, Norwich}",Norwich,town


In [17]:
# importing desired columns and rename
CT_ASD_raw = pd.read_excel("../data_raw/post1790/CT/CT_post1790_ASD_ledger.xlsx",
                       header = 13, usecols = 'H, I, J, K, L, N, O, X, Y, Z, AA, AB, AD, AE, AN, AO, AP, AQ, AR, AT, AU')
CT_ASD_raw.columns = ['First Name', 'Last Name', 'town1', 'state1', 'occupation1', '6p_Dollar', '6p_Cents',
                     'First Name.1', 'Last Name.1', 'town2', 'state2', 'occupation2', '6p_def_Dollar', '6p_def_Cents',
                     'First Name.2', 'Last Name.2', 'town3', 'state3', 'occupation3', '3p_Dollar', '3p_Cents', ]
# unify occupation, town and state columns
CT_ASD, change_df = combineCols(CT_ASD_raw)
CT_ASD.loc[CT_ASD.query('state.isna()').index, 'state'] = 'CT'
CT_ASD['state_data'] = 'CT'
ASD_all = pd.concat([ASD_all, CT_ASD])
change_df

town column has multiple unique entries
see table at end for new entries
reformatting state
occupation column has multiple unique entries
see table at end for new entries


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  CT_ASD['state_data'] = 'CT'


Unnamed: 0,old,new,type
125,"{Norwalk, Norwall}",Norwalk,town
811,"{1st Society in Lyme, Treasuer 1st Society in ...",Treasuer 1st Society in Lyme,occupation


### Georgia
No need to do additional cleaning because there's only one state/city/occupation column

In [18]:
# importing desired columns and rename
# prepare loan dataset
GA_CD_raw = pd.read_excel("../data_raw/post1790/GA/T694_GA_Loan_Office_CD.xlsx",
                      header = 10, usecols = 'Q, R, S, T, U, Z, AA, AB, AC, AD, AE')
# end of table
GA_CD_raw.drop([65, 66], inplace = True)
GA_CD_raw.columns = ['First Name', 'Last Name', 'town', 'state', 'occupation', '6p_Dollar', '6p_Cents',
                 '6p_def_Dollar', '6p_def_Cents', '3p_Dollar', '3p_Cents']
# convert name column types
GA_CD_raw['Name'] = deNaN(GA_CD_raw['First Name']) + " " + deNaN(GA_CD_raw['Last Name'])
GA_CD_raw['Name'] = GA_CD_raw['Name'].apply(lambda x: [x])
GA_CD = GA_CD_raw[['6p_Dollar', '6p_Cents', '6p_def_Dollar', '6p_def_Cents','3p_Dollar', '3p_Cents',
                   'town', 'state', 'occupation', 'Name']]
# impute state
GA_CD.loc[GA_CD.query('state.isna()').index, 'state'] = 'GA'
GA_CD['state_data'] = 'GA'
CD_all = pd.concat([CD_all, GA_CD])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  GA_CD['state_data'] = 'GA'


### Maryland

In [19]:
#prepare loan dataset
MD_CD_raw = pd.read_excel("../data_raw/post1790/MD/MD_post1790_CD.xlsx",
                      header = 11, usecols = 'G, H, I, J, K, L, M, U, V, W, X, Y, Z, AA, AI, AJ, AK, AL, AM, AN, AO')
MD_CD_raw.columns = ['First Name', 'Last Name', 'town1', 'state1', 'occupation1', '6p_Dollar', '6p_Cents',
                 'First Name.1', 'Last Name.1', 'town2', 'state2', 'occupation2', '6p_def_Dollar', '6p_def_Cents',
                 'First Name.2', 'Last Name.2', 'town3', 'state3', 'occupation3', '3p_Dollar', '3p_Cents', ]
# unify occupation, town and state columns
MD_CD , change_df = combineCols(MD_CD_raw)
MD_CD.loc[MD_CD.query('state.isna()').index, 'state'] = 'MD'
MD_CD['state_data'] = 'MD'
CD_all = pd.concat([CD_all, MD_CD])
change_df

reformatting town
reformatting state
reformatting occupation


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  MD_CD['state_data'] = 'MD'


Unnamed: 0,old,new,type


In [20]:
# prepare loan dataset
MD_ASD_raw = pd.read_excel("../data_raw/post1790/MD/MD_post1790_ASD.xlsx",
                      header = 11, usecols = 'G, H, I, J, K, L, M, U, V, W, X, Y, Z, AA, AI, AJ, AK, AL, AM, AN, AO')
MD_ASD_raw.columns = ['First Name', 'Last Name', 'town1', 'state1', 'occupation1', '6p_Dollar', '6p_Cents',
                 'First Name.1', 'Last Name.1', 'town2', 'state2', 'occupation2', '6p_def_Dollar', '6p_def_Cents',
                 'First Name.2', 'Last Name.2', 'town3', 'state3', 'occupation3', '3p_Dollar', '3p_Cents', ]
# unify occupation, town and state columns
MD_ASD , change_df = combineCols(MD_ASD_raw)
MD_ASD.loc[MD_ASD.query('state.isna()').index, 'state'] = 'MD'
MD_ASD['state_data'] = 'MD'
ASD_all = pd.concat([ASD_all, MD_ASD])
change_df

reformatting town
reformatting state
reformatting occupation


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  MD_ASD['state_data'] = 'MD'


Unnamed: 0,old,new,type


### North Carolina

In [21]:
NC_CD_raw = pd.read_excel("../data_raw/post1790/NC/T695_R4_NC_CD.xlsx",
                      header = 11, usecols = 'J, K, L, M, N, W, X, Z, AA, AC, AD ')
NC_CD_raw.drop([74, 75], inplace = True)
NC_CD_raw.columns = ['First Name', 'Last Name', 'town', 'state', 'occupation', '6p_Dollar', '6p_Cents',
                 '6p_def_Dollar', '6p_def_Cents', '3p_Dollar', '3p_Cents']
NC_CD_raw['Name'] = deNaN(NC_CD_raw['First Name'])  + " " + deNaN(NC_CD_raw['Last Name'])
NC_CD_raw['Name'] = NC_CD_raw['Name'].apply(lambda x: [x])
NC_CD = NC_CD_raw[['6p_Dollar', '6p_Cents', '6p_def_Dollar', '6p_def_Cents','3p_Dollar', '3p_Cents',
                   'town', 'state', 'occupation', 'Name']]
NC_CD.loc[NC_CD.query('state.isna()').index, 'state'] = 'NC'
NC_CD['state_data'] = 'NC'
CD_all = pd.concat([CD_all, NC_CD])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  NC_CD['state_data'] = 'NC'


In [22]:
NC_ASD_raw = pd.read_excel("../data_raw/post1790/NC/T695_R3_NC_ASD.xlsx", header = 10, usecols = 'H, I, J, K, L, P, Q, R, S, T, U')
NC_ASD_raw.columns = ['First Name', 'Last Name', 'town', 'state', 'occupation',  '6p_Dollar', '6p_Cents',
                      '6p_def_Dollar', '6p_def_Cents', '3p_Dollar', '3p_Cents']
NC_ASD_raw['Name'] = deNaN(NC_ASD_raw['First Name'])  + " " + deNaN(NC_ASD_raw['Last Name'])
NC_ASD_raw['Name'] = NC_ASD_raw['Name'].apply(lambda x: [x])
NC_ASD = NC_ASD_raw[['6p_Dollar', '6p_Cents', '6p_def_Dollar', '6p_def_Cents','3p_Dollar', '3p_Cents',
                   'town', 'state', 'occupation', 'Name']]
NC_ASD.loc[NC_ASD.query('state.isna()').index, 'state'] = 'NC'
NC_ASD['state_data'] = 'NC'
ASD_all = pd.concat([ASD_all, NC_ASD])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  NC_ASD['state_data'] = 'NC'


### New Hampshire

In [23]:
#prepare loan dataset
NH_CD_raw = pd.read_excel("../data_raw/post1790/NH/T652_R6_New_Hampshire_CD.xlsx",
                      header = 10, usecols = 'I, J, K, L, M, N, O, P, Q, R, S').drop([219, 220])
NH_CD_raw.columns = ['First Name', 'Last Name', 'town', 'state', 'occupation', '6p_Dollar', '6p_Cents',
                     '6p_def_Dollar', '6p_def_Cents', '3p_Dollar', '3p_Cents']
NH_CD_raw['Name'] = deNaN(NH_CD_raw['First Name'])  + " " + deNaN(NH_CD_raw['Last Name'])
NH_CD_raw['Name'] = NH_CD_raw['Name'].apply(lambda x: [x])
NH_CD = NH_CD_raw[['6p_Dollar', '6p_Cents', '6p_def_Dollar', '6p_def_Cents','3p_Dollar', '3p_Cents',
                     'town', 'state', 'occupation', 'Name']]
NH_CD.loc[NH_CD.query('state.isna()').index, 'state'] = 'NH'
NH_CD['state_data'] = 'NH'
CD_all = pd.concat([CD_all, NH_CD])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  NH_CD['state_data'] = 'NH'


In [24]:
NH_ASD_raw = pd.read_excel("../data_raw/post1790/NH/T652_New_Hampshire_ASD.xlsx", header = 12,
                       usecols = 'G, H, I, J, K, M, N, V, W, X, Y, Z, AA, AB, AK, AL, AM, AN')
NH_ASD_raw.columns = ['First Name', 'Last Name', 'town1', 'state1', 'occupation1', '6p_Dollar', '6p_Cents',
                      'First Name.1', 'Last Name.1', 'town2', 'state2', 'occupation2', '6p_def_Dollar', '6p_def_Cents',
                      'First Name.2', 'Last Name.2', '3p_Dollar', '3p_Cents']
# unify occupation, town and state columns
NH_ASD, change_df = combineCols(NH_ASD_raw, 2)
NH_ASD.loc[NH_ASD.query('state.isna()').index, 'state'] = 'NH'
NH_ASD['state_data'] = 'NH'
ASD_all = pd.concat([ASD_all, NH_ASD])
change_df

reformatting town
reformatting state
reformatting occupation


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  NH_ASD['state_data'] = 'NH'


Unnamed: 0,old,new,type


### New York
Doesn't have town/occupation/state

### Pennsylvania

In [26]:
#prepare loan dataset
PA_CD_raw = pd.read_excel("../data_raw/post1790/PA/PA_post1790_CD.xlsx",
                      header = 11, usecols = 'G, H, I, J, K, L, M, U, V, W, X, Y, Z, AA, AJ, AK, AL, AM, AN, AO, AP')
PA_CD_raw.columns = ['First Name', 'Last Name', 'town1', 'state1', 'occupation1', '6p_Dollar', '6p_Cents',
                 'First Name.1', 'Last Name.1', 'town2', 'state2', 'occupation2', '6p_def_Dollar', '6p_def_Cents',
                 'First Name.2', 'Last Name.2', 'town3', 'state3', 'occupation3', '3p_Dollar', '3p_Cents', ]
# unify occupation, town and state columns
PA_CD, change_df = combineCols(PA_CD_raw)
PA_CD.loc[PA_CD.query('state.isna()').index, 'state'] = 'PA'
PA_CD['state_data'] = 'PA'
CD_all = pd.concat([CD_all, PA_CD])
with pd.option_context('display.max_rows', None,
                       'display.max_columns', None,
                       'display.precision', 3,
                       ):
    display(change_df)

town column has multiple unique entries
see table at end for new entries
reformatting state
occupation column has multiple unique entries
see table at end for new entries


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  PA_CD['state_data'] = 'PA'


Unnamed: 0,old,new,type
14,"{Carlisle Pennsylvania, Carlisle}",Carlisle Pennsylvania,town
39,"{Conecticutt, Connecticut}",Conecticutt,town
40,"{New Castle Delaware, Newcastle Delaware}",New Castle Delaware,town
113,"{Philadelphia County, Philadelphia}",Philadelphia County,town
131,"{State of Delaware, State of Delawere}",State of Delaware,town
210,"{Connecticutt, Connecticut}",Connecticutt,town
221,"{Bucks County, Bucks County Pennsylvania}",Bucks County Pennsylvania,town
267,"{Reading Berks County, Reading}",Reading Berks County,town
276,"{Reading Pennsylvania, Reading Berks County}",Reading Pennsylvania,town
293,"{Northumberland, Northumb Country}",Northumb Country,town


### Rhode Island

In [30]:
#prepare loan dataset
RI_CD_raw = pd.read_excel("../data_raw/post1790/RI/T653_Rhode_Island_CD.xlsx",
                      header = 11, usecols = 'G, H, I, J, K, L, M, U, V, W, X, Y, Z, AA, AI, AJ, AK, AL, AM, AN, AO')
RI_CD_raw.columns = ['First Name', 'Last Name', 'town1', 'state1', 'occupation1', '6p_Dollar', '6p_Cents',
                     'First Name.1', 'Last Name.1', 'town2', 'state2', 'occupation2', '6p_def_Dollar', '6p_def_Cents',
                     'First Name.2', 'Last Name.2', 'town3', 'state3', 'occupation3', '3p_Dollar', '3p_Cents', ]
RI_CD, change_df = combineCols(RI_CD_raw)
RI_CD.loc[RI_CD.query('state.isna()').index, 'state'] = 'RI'
RI_CD['state_data'] = 'RI'
CD_all = pd.concat([CD_all, RI_CD])
change_df

town column has multiple unique entries
see table at end for new entries
reformatting state
reformatting occupation


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  RI_CD['state_data'] = 'RI'


Unnamed: 0,old,new,type
26,"{Gloucester, Glocester}",Gloucester,town
371,"{Chalestown, Charlestown}",Charlestown,town
435,"{North Kingston, North Kingstone}",North Kingstone,town
462,"{North Kingston, North Kingstown}",North Kingstown,town
499,"{Smithfeild, Smithfield}",Smithfeild,town


In [31]:
#prepare loan dataset
RI_ASD_raw = pd.read_excel("../data_raw/post1790/RI/T653_Rhode_Island_ASD.xlsx",
                          header = 11, usecols = 'H, I, J, K, L, N, O, X, Y, Z, AA, AB, AD, AE, AN, AO, AP, AQ, AR, AT, AU')
RI_ASD_raw.columns = ['First Name', 'Last Name', 'town1', 'state1', 'occupation1', '6p_Dollar', '6p_Cents',
                     'First Name.1', 'Last Name.1', 'town2', 'state2', 'occupation2', '6p_def_Dollar', '6p_def_Cents',
                     'First Name.2', 'Last Name.2', 'town3', 'state3', 'occupation3', '3p_Dollar', '3p_Cents', ]
RI_ASD, change_df = combineCols(RI_ASD_raw)
RI_ASD.loc[RI_ASD.query('state.isna()').index, 'state'] = 'RI'
RI_ASD['state_data'] = 'RI'
ASD_all = pd.concat([CD_all, RI_ASD])
change_df

reformatting town
reformatting state
reformatting occupation


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  RI_ASD['state_data'] = 'RI'


Unnamed: 0,old,new,type


### South Carolina

In [33]:
#prepare loan dataset
SC_CD_raw = pd.read_excel("../data_raw/post1790/SC/Post_1790_South_Carolina_CD.xlsx",
                      header = 11, usecols = 'D, E, F, G, H, M, N, S, T, U, V, W, AB, AC, AH, AI, AJ, AK, AL, AQ, AR')
SC_CD_raw.columns = ['First Name', 'Last Name', 'town1', 'state1', 'occupation1', '6p_Dollar', '6p_Cents',
                 'First Name.1', 'Last Name.1', 'town2', 'state2', 'occupation2', '6p_def_Dollar', '6p_def_Cents',
                 'First Name.2', 'Last Name.2', 'town3', 'state3', 'occupation3', '3p_Dollar', '3p_Cents', ]
SC_CD, change_df = combineCols(SC_CD_raw)
SC_CD.loc[SC_CD.query('state.isna()').index, 'state'] = 'SC'
SC_CD['state_data'] = 'SC'
CD_all = pd.concat([CD_all, SC_CD])
change_df

town column has multiple unique entries
see table at end for new entries
reformatting state
occupation column has multiple unique entries
see table at end for new entries


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  SC_CD['state_data'] = 'SC'


Unnamed: 0,old,new,type
50,"{Camden Planter, Camden}",Camden Planter,town
94,"{Peedee, Pee Dee}",Pee Dee,town
270,"{Long Cames, Long Cane}",Long Cames,town
292,"{Charleston, Charlestom}",Charleston,town
45,"{as Executor to Henry Coram, as Executor to Jo...",as Executor to John Couturier,occupation
50,"{as Executor Ely Kershaw, Planter as Executor ...",Planter as Executor Ely Kershaw,occupation
182,"{Philip Hawkins, Executor Philip Hawkins}",Executor Philip Hawkins,occupation
256,"{as Guardian to Mary Deborah L. Gowdey, Guardi...",as Guardian to Mary Deborah L. Gowdey,occupation
305,"{Merchants, Charleston Merchants}",Charleston Merchants,occupation
383,"{Physician, Assignee of James Simons}",Assignee of James Simons,occupation


In [37]:
#prepare loan dataset
SC_ASD_raw = pd.read_excel("../data_raw/post1790/SC/Post_1790_South_Carolina_ASD_transfers_removed.xlsx", header = 11,
                       usecols = 'D, E, F, G, H, M, N, O')
SC_ASD_raw.columns = ['First Name', 'Last Name', 'town', 'state', 'occupation', '6p_Dollar', '6p_def_Dollar','3p_Dollar']
for col in ['6p_', '6p_def_', '3p_']:
    SC_ASD_raw[col+'Cents'] = SC_ASD_raw[col+'Dollar'] - np.round(SC_ASD_raw[col+'Dollar'], 0)
    SC_ASD_raw[col+'Dollar'] = np.round(SC_ASD_raw['6p_Dollar'], 0)
SC_ASD = SC_ASD_raw[['6p_Dollar', '6p_Cents', '6p_def_Dollar', '6p_def_Cents', '3p_Dollar', '3p_Cents',
                     'town', 'state', 'occupation']]
SC_ASD.loc[SC_ASD.query('state.isna()').index, 'state'] = 'SC'
SC_ASD['state_data'] = 'SC'
ASD_all = pd.concat([ASD_all, SC_ASD])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  SC_ASD['state_data'] = 'SC'


### Virginia

In [76]:
"""
VA CD not possible atm
"""

'\nVA CD not possible atm\n'

In [39]:
#prepare loan dataset
VA_ASD_raw = pd.read_excel("../data_raw/post1790/VA/VA_ASD.xlsx", header = 11,
                       usecols = 'D, E, F, G, N, O, U, V, W, X, AE, AF, AL, AM, AN, AO, AW, AX')
VA_ASD_raw.columns = ['First Name', 'Last Name', 'town1', 'occupation1', '6p_Dollar', '6p_Cents',
                  'First Name.1', 'Last Name.1', 'town2', 'occupation2', '6p_def_Dollar', '6p_def_Cents',
                  'First Name.2', 'Last Name.2', 'town3', 'occupation3', '3p_Dollar', '3p_Cents']
VA_ASD_raw['state1'] = np.nan
VA_ASD_raw['state2'] = np.nan
VA_ASD_raw['state3'] = np.nan
VA_ASD, change_df = combineCols(VA_ASD_raw)
VA_ASD.loc[VA_ASD.query('state.isna()').index, 'state'] = 'VA'
VA_ASD['state_data'] = 'VA'
ASD_all = pd.concat([ASD_all, VA_ASD])
with pd.option_context('display.max_rows', None,
                       'display.max_columns', None,
                       'display.precision', 3,
                       ):
    display(change_df)

town column has multiple unique entries
see table at end for new entries
reformatting state
occupation column has multiple unique entries
see table at end for new entries


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  VA_ASD.loc[VA_ASD.query('state.isna()').index, 'state'] = 'VA'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  VA_ASD['state_data'] = 'VA'


Unnamed: 0,old,new,type
0,"{The Town Petersburg, Town of Petersburg}",The Town Petersburg,town
1,"{City of Richmond, Richmond}",City of Richmond,town
2,"{Dinwiddie County, Dinwiddie Country}",Dinwiddie Country,town
3,"{Town of Petersbung, The Town Petersburg, Town...",The Town Petersburg,town
4,"{Halifox County, Halifax County}",Halifox County,town
5,"{Petersbung, Town of Petersburg}",Town of Petersburg,town
6,"{City of Richmond, Richmond, The City of Richm...",The City of Richmond,town
8,"{Rockbridge Country, Rockbridge County}",Rockbridge Country,town
9,"{Buckingham Country, Buckingham County}",Buckingham Country,town
10,"{Rocksbridge County, Rockbridge County}",Rocksbridge County,town


## Mapping Town/City to Counties - INCOMPLETE
1. Connecticut: Referencing <a href = "https://ctstatelibrary.org/cttowns/counties">https://ctstatelibrary.org/cttowns/counties</a> I found that Huntington is now called Shelton and Chatham is now called East Hampton. The other two cases below are not mappable because those are not valid town names.
2. Georgia: Investigate more, very few counties

In [22]:
# fuzzy string matching function
def fuzzyMatch(unmatched_towns, towns, crosswalk, primary_dict, dict_matchcol = 'primary_city', initial = True, score_threshold = 85):
    if initial:
        print("\nFuzzy City name - county matches\n")
    else:
        print("\nFuzzy City name - county matches with string changes\n")
    printedtowns = []
    for town in unmatched_towns:
        # extract best match
        match_tuple = process.extractOne(town, [x for x in crosswalk[dict_matchcol] if not pd.isnull(x)])
        score = match_tuple[1]
        match = match_tuple[0]
        # if match above threshold, change + print match so we can hand check
        if score >= score_threshold:
            if dict_matchcol == 'primary_city':
                county = primary_dict[match]
            if dict_matchcol == 'county':
                county = match
            # add match, print out match
            if initial:
                print("{} -> {} in {}".format(town, match, county))
                town_index = towns[towns['town'] == town].index
                towns.loc[town_index, 'county'] = county
            else:
                original_town = towns[towns['town2'] == town]['town'].tolist()
                if town not in printedtowns:
                    print("{} (new name: {}) -> {} in {}".format(original_town, town, match, county))
                    printedtowns.append(town)
                town_index = towns[towns['town'].apply(lambda x: x in original_town)].index
                towns.loc[town_index, 'county'] = [county] * len(town_index)
    return towns

In [23]:
def directTownMatch(state_cw, towns, col = 'primary_city', towncol = 'town'):
    print("Direct City name - county matches\n")
    # match towns directly based off crosswalk
    primary_dict = dict(zip(state_cw[col],state_cw['county']))
    if col == 'primary_city':
        towns['county'] = towns[towncol].apply(lambda x: primary_dict.get(x, np.nan))
    if col == 'acceptable_cities':
        for ind in towns.index:
            town = towns.loc[ind, 'town']
            county = state_cw[state_cw[col].apply(lambda x: town in x if not pd.isnull(x) else False)]['county'].tolist()
            if len(county)>0:
                towns.loc[ind, 'county'] = county[0]
    t = towns[towns['county'].apply(lambda x: not pd.isnull(x))]
    if towncol == 'town':
        for tn, cty in zip(t['town'], t['county']):
            print("{} was matched to {} directly using the crosswalk".format(tn, cty))
    if towncol == 'town2':
        for tn, tn_og, cty in zip(t['town2'], t['town'], t['county']):
            print("{} (original: {}) was matched to {} directly using the crosswalk".format(tn, tn_og, cty))
    return primary_dict, towns

In [24]:
def directCountyMatch(state_cw, towns, towncol = 'town'):
    print("\nSome city names are actually county names")
    if towncol == 'town':
        print("Direct City (county) name - county matches\n")
    if towncol == 'town2':
        print("Direct City (county) name with string changes - county matches\n")
    # some own names are actually counties
    # match towns based off whether town name is actually county name in crosswalk
    counties = state_cw['county'].unique()
    nanindex = towns[towns['county'].apply(lambda x: pd.isnull(x))].index
    towns.loc[nanindex, 'county'] = towns.loc[nanindex, towncol].apply(lambda x: x if x in counties.tolist() else np.nan)
    towns2 = towns.loc[nanindex]
    nanindex2 = towns2[towns2['county'].apply(lambda x: not pd.isnull(x))].index
    for t, c in zip(towns2.loc[nanindex2, towncol], towns2.loc[nanindex2, 'county']):
        print("{} was matched to {} using the crosswalk".format(t, c))
    return towns

In [25]:
# change column of town dataframe's type to either town or county
def addType(towns, type = 'town'):
    towns['name_type'] = [name_type if not pd.isnull(name_type) else type if not pd.isnull(county) else np.nan for name_type, county in
                          zip(towns['name_type'], towns['county'])]
    return towns

In [26]:
city_county_cw = pd.read_excel('../../Data/CensusData/zip_code_database.xls')[['primary_city', 'acceptable_cities',
                                                                               'unacceptable_cities', 'county', 'state']]

In [27]:
final_cw = pd.DataFrame(columns = ['town', 'county', 'state', 'name_type'])
list_of_states = ['CT', 'GA', 'MD', 'NC', 'NH', 'NJ', 'PA', 'RI', 'SC',
                  'MA', 'VA', 'DE']

for state in list_of_states:
    print("\n{} MATCHING \n".format(state))
    # create list of towns for each state
    towns = CD_all[CD_all['state'] == state][['town']].drop_duplicates()
    towns = towns[towns['town'].apply(lambda x: not pd.isnull(x))]
    # state crosswalk
    state_cw = city_county_cw[city_county_cw['state'] == state]
    if state == 'VA':
        state_cw = city_county_cw[city_county_cw['state'].apply(lambda x: x in ['VA', 'WV'])]
    state_cw = state_cw[state_cw['county'].apply(lambda x: 'county' in x.lower() if not pd.isnull(x) else False)]
    # try direct match: town name -> crosswalk town-county
    oldtowns = towns.copy()
    primary_dict, towns = directTownMatch(state_cw, towns, col = 'primary_city', towncol = 'town')
    # label name type
    towns['name_type'] = towns['county'].apply(lambda x: 'town' if not pd.isnull(x) else np.nan)

    if state == 'CT':
        # try fuzzy match: town name -> crosswalk town-county
        unmatched_towns1 = towns[towns['county'].apply(lambda x: pd.isnull(x))]['town']
        towns = fuzzyMatch(unmatched_towns1, towns, state_cw, primary_dict, dict_matchcol = 'primary_city', initial = True, score_threshold = 85)
        towns = addType(towns)

        # modify town names - towns changed names (see CT note)
        # retry fuzzy match: town name -> crosswalk town-county
        towns['town2'] = towns['town'].apply(lambda x: x.replace('Huntington', 'Shelton').replace('Chatham', 'East Hampton'))
        unmatched_towns2 = towns[towns['county'].apply(lambda x: pd.isnull(x))]['town2']
        towns = fuzzyMatch(unmatched_towns2, towns, state_cw, primary_dict, dict_matchcol ='primary_city', initial = False, score_threshold = 85)
        towns = addType(towns)

    if state == 'GA':
        # some "town" names are actually counties
        # try direct match: town (county) name -> crosswalk county
        towns = directCountyMatch(state_cw, towns, towncol = 'town')
        towns = addType(towns, 'county')
    if state == 'MD':
        # remove instances where Maryland is mentioned and unabbreviate county abbreviations
        # use modified town names
        # try direct match: town (county) name -> crosswalk county
        towns['town2'] = towns['town'].apply(lambda x: x.replace('Maryland', '').replace('Co ', 'County').strip())
        towns = directCountyMatch(state_cw, towns, towncol = 'town2')
        towns = addType(towns, 'county')

        # use modified town names
        # try fuzzy match: town (county) name -> crosswalk county
        unmatched_towns2 = towns[towns['county'].apply(lambda x: pd.isnull(x))]['town2']
        towns = fuzzyMatch(unmatched_towns2, towns, state_cw, primary_dict, dict_matchcol = 'county', initial = False, score_threshold = 86)
        towns = addType(towns, 'county')

        # use modified town names
        # try fuzzy match: town name -> crosswalk town-county
        unmatched_towns2 = towns[towns['county'].apply(lambda x: pd.isnull(x))]['town2']
        towns = fuzzyMatch(unmatched_towns2, towns, state_cw, primary_dict, dict_matchcol ='primary_city', initial = False, score_threshold = 85)
        towns = addType(towns)


        # correct a matching - Baltimore City to Baltimore County
        print("Baltimore City changed to Baltimore County")
        towns['county'] = towns['county'].apply(lambda x: x.replace('City', 'County') if not pd.isnull(x) else x)
        towns = addType(towns)
    if state == 'NC':
        # some "town" names are actually counties
        # try direct match: town (county) name -> crosswalk county
        towns = directCountyMatch(state_cw, towns, towncol = 'town')
        towns = addType(towns, 'county')
        # remove instances where North Carolina is mentioned and rename Tarborugh to enable matching
        towns['town2'] = towns['town'].apply(lambda x: x.replace('North Carolina', '').replace('Tarborugh', 'Tarboro').strip())
        # use modified town names
        # try fuzzy match: town name -> crosswalk town-county
        unmatched_towns2 = towns[towns['county'].apply(lambda x: pd.isnull(x))]['town2']
        towns = fuzzyMatch(unmatched_towns2, towns, state_cw, primary_dict, dict_matchcol ='primary_city', initial = False, score_threshold = 85)
        towns = addType(towns)
    if state == 'NH':
        # use acceptable_cities instead of primary_cities column to match in the crosswalk
        # try direct match: town name -> crosswalk town-county
        null_ind = towns[towns['county'].apply(lambda x: pd.isnull(x))].index
        pdict, tn = directTownMatch(state_cw, towns.loc[null_ind], col = 'acceptable_cities', towncol = 'town')
        towns.loc[null_ind] = tn
        towns = addType(towns)
        # remove instances where New Hampshire and other geo-jurisdictional terms are used
        # rename Rockingham to enable matching
        towns['town2'] = towns['town'].apply(lambda x: x.replace('State', '').replace('New Hampshire', '').replace('of ','').strip())
        towns['town2'] = towns['town2'].apply(lambda x: x.replace('Rockingham', 'Rockingham County').strip())
        # use modified town names
        # try fuzzy match: town name -> crosswalk town-county
        unmatched_towns2 = towns[towns['county'].apply(lambda x: pd.isnull(x))]['town2']
        towns = fuzzyMatch(unmatched_towns2, towns, state_cw, primary_dict, dict_matchcol ='primary_city', initial = False, score_threshold = 85)
        towns = addType(towns)
        # some "town" names are actually counties
        # use modified town names
        # try direct match: town (county) name -> crosswalk county
        towns = directCountyMatch(state_cw, towns, towncol = 'town2')
        towns = addType(towns, 'county')
        # manual fixes for matches
        # manually adjust incorrect matches
        print("\nManual Match\n")
        for town, county in zip(['Brintwood', 'Portsmouth New Hampshire'],
                                ['Rockingham County', 'Rockingham County']):
            print("{} was matched to {}".format(town, county))
            if town == 'Brintwood':
                towns.loc[towns[towns['town'] == town].index, ['county', 'name_type']] = [county,'town]']
            else:
                towns.loc[towns[towns['town'] == town].index, ['county', 'name_type']] = [county,'county']
    if state == 'NJ':
        # remove instances where New Jersey is used
        towns['town2'] = towns['town'].apply(lambda x: x.replace('New Jersey', '').strip())

        # use modified town names
        # try fuzzy match: town name -> crosswalk town-county
        unmatched_towns2 = towns[towns['county'].apply(lambda x: pd.isnull(x))]['town2']
        towns = fuzzyMatch(unmatched_towns2, towns, state_cw, primary_dict, dict_matchcol ='primary_city', initial = False, score_threshold = 85)
        towns = addType(towns)
        # some "town" names are actually counties
        # use modified town names
        # try direct match: town (county) name -> crosswalk county
        towns = directCountyMatch(state_cw, towns, towncol = 'town2')
        towns = addType(towns, 'county')
    if state == 'PA':
        # some "town" names are actually counties
        # try direct match: town (county) name -> crosswalk county
        towns = directCountyMatch(state_cw, towns, towncol = 'town')
        towns = addType(towns, 'county')

        # use acceptable_cities instead of primary_cities column to match in the crosswalk
        # try direct match: town (county) name -> crosswalk county
        null_ind = towns[towns['county'].apply(lambda x: pd.isnull(x))].index
        pdict, tn = directTownMatch(state_cw, towns.loc[null_ind], col = 'acceptable_cities', towncol = 'town')
        towns.loc[null_ind] = tn
        towns = addType(towns, 'county')

        # remove instances where New Jersey is used, fix some notational issues
        # correct Dauphincoy to Dauphin and categorize Tulpehocken as being in Berks County
        towns['town2'] = towns['town'].apply(lambda x: x.replace('Co ', 'County').replace('Delaware', 'Delaware County').strip())
        towns['town2'] = towns['town2'].apply(lambda x: x.replace('Pennsylvania', '').replace('County County','County').strip())
        towns['town2'] = towns['town2'].apply(lambda x: x.replace('Country', 'County').replace('Dauphincoy','Dauphin').strip())
        towns['town2'] = towns['town2'].apply(lambda x: x.replace('Tulpehocken', 'Berks County').strip())

        # categorize different Philadelphia neighborhoods as belonging in Philadelphia
        philreptowns = ['Blockley', 'Northan Liberties', 'Northern Liberties', \
                        'The Northern Libert', 'Passyunk', 'German Town', 'Southwark', 'Borden Town'] # not sure on this last one...
        for town in philreptowns:
            towns['town2'] = towns['town2'].apply(lambda x: x.replace(town, 'Philadelphia'))
        towns = addType(towns)
        # use modified town names
        # try direct match: town (county) name -> crosswalk county
        unmatched_towns2 = towns[towns['county'].apply(lambda x: pd.isnull(x))]['town2']
        towns = directCountyMatch(state_cw, towns, 'town2')
        towns = addType(towns, 'county')
        # some "town" names are actually counties
        # try direct match: town (county) name -> crosswalk county
        towns = directCountyMatch(state_cw, towns, towncol = 'town')
        towns = addType(towns, 'county')

        # use acceptable_cities instead of primary_cities column to match in the crosswalk
        # try direct match: town (county) name -> crosswalk county
        null_ind = towns[towns['county'].apply(lambda x: pd.isnull(x))].index
        pdict, tn = directTownMatch(state_cw, towns.loc[null_ind], col = 'acceptable_cities', towncol = 'town')
        towns.loc[null_ind] = tn
        towns = addType(towns, 'county')

        # remove instances where New Jersey is used, fix some notational issues
        # correct Dauphincoy to Dauphin and categorize Tulpehocken as being in Berks County
        towns['town2'] = towns['town'].apply(lambda x: x.replace('Co ', 'County').replace('Delaware', 'Delaware County').strip())
        towns['town2'] = towns['town2'].apply(lambda x: x.replace('Pennsylvania', '').replace('County County','County').strip())
        towns['town2'] = towns['town2'].apply(lambda x: x.replace('Country', 'County').replace('Dauphincoy','Dauphin').strip())
        towns['town2'] = towns['town2'].apply(lambda x: x.replace('Tulpehocken', 'Berks County').strip())

        # categorize different Philadelphia neighborhoods as belonging in Philadelphia
        philreptowns = ['Blockley', 'Northan Liberties', 'Northern Liberties',
                        'The Northern Libert', 'Passyunk', 'German Town', 'Southwark', 'Borden Town'] # not sure on this last one...
        for town in philreptowns:
            towns['town2'] = towns['town2'].apply(lambda x: x.replace(town, 'Philadelphia'))
        towns = addType(towns)
        # use modified town names
        # try direct match: town (county) name -> crosswalk county
        unmatched_towns2 = towns[towns['county'].apply(lambda x: pd.isnull(x))]['town2']
        towns = directCountyMatch(state_cw, towns, 'town2')
        towns = addType(towns, 'county')

        # use modified town names
        # try fuzzy match: town name -> crosswalk town-county
        unmatched_towns2 = towns[towns['county'].apply(lambda x: pd.isnull(x))]['town2']
        towns = fuzzyMatch(unmatched_towns2, towns, state_cw, primary_dict, dict_matchcol ='primary_city', initial = False, score_threshold = 85)
        towns = addType(towns)

        # use modified town names
        # try fuzzy match: town (county) name -> crosswalk county
        unmatched_towns2_1 = [x for x in towns[towns['county'].apply(lambda x: pd.isnull(x))]['town2'] if x != '']
        towns = fuzzyMatch(unmatched_towns2_1, towns, state_cw, primary_dict, dict_matchcol = 'county', initial = False, score_threshold = 85)
        towns = addType(towns, 'county')

        # manually adjust incorrect matches
        print("\nManual Match\n")
        for town, county, s in zip(['Charleston South Carolina', 'Burlington New Jersey', 'Northumberland County Virginia'],
                                       ['Charleston County', 'Burlington County', 'Northumberland County'],
                                       ['SC', 'NJ', 'VA']):
            print("{} was matched to {}".format(town, county))
            towns.loc[towns[towns['town'] == town].index, ['county', 'state']] = [county, s]
        towns = addType(towns, 'county')

        # use modified town names
        # try fuzzy match: town name -> crosswalk town-county
        unmatched_towns2 = towns[towns['county'].apply(lambda x: pd.isnull(x))]['town2']
        towns = fuzzyMatch(unmatched_towns2, towns, state_cw, primary_dict, dict_matchcol ='primary_city', initial = False, score_threshold = 85)
        towns = addType(towns)

        # use modified town names
        # try fuzzy match: town (county) name -> crosswalk county
        unmatched_towns2_1 = [x for x in towns[towns['county'].apply(lambda x: pd.isnull(x))]['town2'] if x != '']
        towns = fuzzyMatch(unmatched_towns2_1, towns, state_cw, primary_dict, dict_matchcol = 'county', initial = False, score_threshold = 85)
        towns = addType(towns, 'county')

        # manually adjust incorrect matches
        print("\nManual Match\n")
        for town, county, s in zip(['Charleston South Carolina', 'Burlington New Jersey', 'Northumberland County Virginia'],
                                       ['Charleston County', 'Burlington County', 'Northumberland County'],
                                       ['SC', 'NJ', 'VA']):
            print("{} was matched to {}".format(town, county))
            towns.loc[towns[towns['town'] == town].index, ['county', 'state']] = [county, s]
        towns = addType(towns, 'county')
    if state == 'RI':
        # remove instances where Rhode Island and other geo-jurisdictional terms are used
        towns['town2'] = towns['town'].apply(lambda x: x.replace('Rhode Island', '').replace('State ', '').replace('of', '').strip())
        # use modified town names
        # try fuzzy match: town name -> crosswalk town-county
        unmatched_towns2 = towns[towns['county'].apply(lambda x: pd.isnull(x))]['town2']
        towns = fuzzyMatch(unmatched_towns2, towns, state_cw, primary_dict, dict_matchcol ='primary_city', initial = False, score_threshold = 85)
        towns = addType(towns)

        # manually adjust incorrect matches
        print("\nManual Match\n")
        for town, county in zip(['Gloucester', 'Richmond'],
                                ['Providence County', 'Washington County']):
            print("{} was matched to {}".format(town, county))
            towns.loc[towns[towns['town'] == town].index, 'county'] = county
        towns = addType(towns)

    if state == 'SC':
        # remove instances where South Carolina is used, change number to character
        towns['town2'] = towns['town'].apply(lambda x: x.replace('South Carolina', '').replace('96', 'Ninety six').strip())

        # use modified town names
        # use acceptable_cities column
        # try fuzzy match: town (county) name -> crosswalk county
        null_ind = towns[towns['county'].apply(lambda x: pd.isnull(x))].index
        pdict, tn = directTownMatch(state_cw, towns.loc[null_ind], col = 'acceptable_cities', towncol = 'town2')
        towns.loc[null_ind] = tn
        towns = addType(towns, 'county')

        # use modified town names
        # try fuzzy match: town (county) name -> crosswalk county
        unmatched_towns2 = towns[towns['county'].apply(lambda x: pd.isnull(x))]['town2']
        towns = fuzzyMatch(unmatched_towns2, towns, state_cw, primary_dict, dict_matchcol ='primary_city', initial = False, score_threshold = 85)
        towns = addType(towns, 'county')

        # some town names are actually county names
        # use modified town names
        # try fuzzy match: town (county) name -> crosswalk county
        unmatched_towns2_1 = towns[towns['county'].apply(lambda x: pd.isnull(x))]['town2']
        towns = fuzzyMatch(unmatched_towns2_1, towns, state_cw, primary_dict, dict_matchcol ='county', initial = False, score_threshold = 85)
        towns = addType(towns, 'county')

        townlist = ['St. Paul\'s', 'Pee Dee', 'St. George', 'New River', 'Winyaw', 'Broad River', \
                    'Toogoodoo', 'St Pauls', 'Savannah', \
                    'James Island', 'St Andrews'] # last two are manual fixes
        countylist = ['Clarendon County', 'Marion County', 'Dorchester County', 'Beaufort County', 'Georgetown County', 'Beaufort County',
                      'Charleston County', 'Clarendon County', 'Chatham County', \
                      'Charleston County', 'Richland County']
        for town, county in zip(townlist, countylist):
            print("{} was matched to {}".format(town, county))
            towns.loc[towns[towns['town'] == town].index, 'county'] = county
        towns = addType(towns)

        towns.loc[towns[towns['town'] == 'Savannah'].index, 'state'] =  'GA'

    if state == 'MA':
        # remove instances where Massachusetts, MA or State is used
        towns['town2'] = towns['town'].apply(lambda x: x.replace('MA', '').replace('Massachusetts', '').replace('State','').strip())
        # use modified town names
        # try fuzzy match: town (county) name -> crosswalk county
        unmatched_towns2 = towns[towns['county'].apply(lambda x: pd.isnull(x))]['town2']
        towns = fuzzyMatch(unmatched_towns2, towns, state_cw, primary_dict, dict_matchcol ='primary_city', initial = False, score_threshold = 85)
        towns = addType(towns, 'county')

    if state == 'VA':
        # remove instances where Massachusetts, MA or State is used
        towns['town2'] = towns['town'].apply(lambda x: x.replace('VA', '').replace('Virginia', '').replace('Virgina','').strip())
        towns['town2'] = towns['town2'].apply(lambda x: x.replace('State', '').replace(' of ', '').strip())
        # use modified town names
        # try fuzzy match: town (county) name -> crosswalk county
        unmatched_towns2 = towns[towns['county'].apply(lambda x: pd.isnull(x))]['town2']
        towns = fuzzyMatch(unmatched_towns2, towns, state_cw, primary_dict, dict_matchcol ='primary_city', initial = False, score_threshold = 85)
        towns = addType(towns, 'county')

        # some town names are actually county names
        # use modified town names
        # try fuzzy match: town (county) name -> crosswalk county
        unmatched_towns2_1 = towns[towns['county'].apply(lambda x: pd.isnull(x))]['town2']
        towns = fuzzyMatch(unmatched_towns2_1, towns, state_cw, primary_dict, dict_matchcol ='county', initial = False, score_threshold = 85)
        towns = addType(towns, 'county')

        townlist = ['Portsmouth Virginia'] # last two are manual fixes
        countylist = ['Norfolk County']
        for town, county in zip(townlist, countylist):
            print("{} was matched to {}".format(town, county))
            towns.loc[towns[towns['town'] == town].index, 'county'] = county
        towns = addType(towns)
    if state == 'DE':
        # remove instances where Massachusetts, MA or State is used
        towns['town2'] = towns['town'].apply(lambda x: x.replace('Delaware', '').replace('State', '').replace(' of ', '').strip())
        towns['town2'] = towns['town2'].apply(lambda x: x.replace('Kent Company', 'Kent County').strip())
        # use modified town names
        # try fuzzy match: town (county) name -> crosswalk county
        unmatched_towns2 = towns[towns['county'].apply(lambda x: pd.isnull(x))]['town2']
        towns = fuzzyMatch(unmatched_towns2, towns, state_cw, primary_dict, dict_matchcol ='primary_city', initial = False, score_threshold = 85)
        towns = addType(towns, 'county')

        # some town names are actually county names
        # use modified town names
        # try fuzzy match: town (county) name -> crosswalk county
        unmatched_towns2_1 = towns[towns['county'].apply(lambda x: pd.isnull(x))]['town2']
        towns = fuzzyMatch(unmatched_towns2_1, towns, state_cw, primary_dict, dict_matchcol ='county', initial = False, score_threshold = 85)
        towns = addType(towns, 'county')

    # print out all unmatched names
    print("\nFinal Unmatched Names\n")
    t = towns[towns['county'].apply(lambda x: pd.isnull(x))]
    for tn in t['town']:
        print("{} was unable to be matched".format(tn))
    towns = towns[towns['county'].apply(lambda x: not pd.isnull(x))]
    towns['state'] = state

    # only Georgia doesn't have a town2 column
    if state not in ['GA']:
        towns.drop('town2', axis = 1, inplace = True)

    # correct states for certain counties/cities in PA and SC
    if state == 'PA':
        towns.loc[towns[towns['town'] == 'Charleston South Carolina'].index, 'state'] ='SC'
        towns.loc[towns[towns['town'] == 'Northumberland County Virginia'].index, 'state'] = 'VA'
    if state == 'SC':
        towns.loc[towns[towns['town'] == 'Savannah'].index, 'state'] ='GA'


    final_cw = pd.concat([final_cw, towns])


CT MATCHING 

Direct City name - county matches

Hartford was matched to Hartford County directly using the crosswalk
Bolton was matched to Tolland County directly using the crosswalk
Wethersfield was matched to Hartford County directly using the crosswalk
New Haven was matched to New Haven County directly using the crosswalk
Farmington was matched to Hartford County directly using the crosswalk
New London was matched to New London County directly using the crosswalk
Cornwall was matched to Litchfield County directly using the crosswalk
Stamford was matched to Fairfield County directly using the crosswalk
East Hartford was matched to Hartford County directly using the crosswalk
Bristol was matched to Hartford County directly using the crosswalk
Lebanon was matched to New London County directly using the crosswalk
Windsor was matched to Hartford County directly using the crosswalk
Suffield was matched to Hartford County directly using the crosswalk
Berlin was matched to Hartford County

In [28]:
final_cw.reset_index(inplace = True, drop = True)

In [29]:
# manually input county, state and name type labels
towns = ['Colchester', 'Charleston South Carolina', 'Philadelphia', 'Albany', 'Newark', 'Northumberland County Virginia', 'Savannah', 'City of New York', 'Long Island', 'Portsmouth Virginia']
counties = ['New London County', 'Berkeley County', 'Philadelphia County', 'Albany County', 'Essex County', 'Northumberland County', 'Chatham County', 'New York County', \
            np.nan, 'Rockingham County']
states = ['CT', 'SC', 'PA', 'NY', 'NJ', 'VA', 'GA', 'NY', 'NY', 'VA']
for town, county, state in zip(towns, counties, states):
    if town == 'Northumberland County Virginia':
        final_cw.loc[final_cw[final_cw['town'] == town].index, ['county', 'state', 'name_type']] = [county, state, 'county']
    elif pd.isnull(county):
        final_cw.loc[final_cw[final_cw['town'] == town].index, ['county', 'state', 'name_type']] = [county, state, 'other']
    else:
        final_cw.loc[final_cw[final_cw['town'] == town].index, ['county', 'state', 'name_type']] = [county, state, 'town']

In [30]:
# manually input county, state and name type labels
colonies = ['New Hampshire', 'Massachusetts', 'Rhode Island', 'Connecticut', 'Conecticutt', 'New York', 'New Jersey', 'Pennsylvania', 'Delaware', 'Maryland', \
            'Virginia', 'North Carolina', 'South Carolina', 'Georgia', 'Vermont', 'Delawere', 'Virgina']
abbrev = ['NH', 'MA', 'RI', 'CT', 'CT', 'NY', 'NJ', 'PA', 'DE', 'MD', 'VA', 'NC', 'SC', 'GA', 'VT', 'DE', 'VA']
for colony, abrv in zip(colonies, abbrev):
    ind = np.max(final_cw.index)+1
    final_cw.loc[ind, ['town','county', 'state', 'name_type']] = [colony, np.nan, abrv, 'state']

In [31]:
# manual replacement of counties that changed names or boders
towns = ['Gilmantown', np.nan, 'Berlington New Jersey', np.nan, np.nan, np.nan, np.nan, np.nan, '96 District', \
         'Ninety six District', np.nan, np.nan, np.nan, np.nan, np.nan, 'Trenton New Jersey', 'Princeton New Jersey',
         'Newbury', 'Pembroke', 'Hopkinton', 'Salisbury', 'Canterbury', 'Concord', np.nan, np.nan, np.nan]
oldcounties = ['Belknap', 'Berkeley', 'Camden', 'Carroll', 'Columbia', 'Elk', 'Dorchester', 'Grant', 'Greenwood', \
               'Greenwood', 'Hampden', 'Kershaw', 'King and Queen', 'Lycoming', 'Marion', 'Mercer', 'Mercer',
               'Merrimack', 'Merrimack', 'Merrimack', 'Merrimack', 'Merrimack', 'Merrimack', 'Norfolk', 'Perry', 'Sullivan']
newcounties = ['Strafford', 'Charleston', 'Burlington', 'Strafford', 'Richmond', 'Northumberland', 'Charleston', 'Hampshire', 'Abbeville', \
               'Laurens', 'Hampshire', 'Lancaster', np.nan, 'Northumberland', 'Prince George\'s', 'Hunterdon', 'Middlesex',
               'Hillsborough', 'Hillsborough', 'Hillsborough', 'Hillsborough', 'Rockingham', 'Grafton', 'Suffolk', 'Cumberland', 'Cheshire']
for town, oldcounty, newcounty in zip(towns, oldcounties, newcounties):
    if not pd.isnull(town):
        ind = final_cw[[t == town and oldcounty + ' County' == oc for t, oc in zip(final_cw['town'], final_cw['county'])]].index
    else:
        ind = final_cw[final_cw['county'] == oldcounty + ' County'].index
    if pd.isnull(newcounty):
        final_cw.loc[ind, 'county']  = newcounty
    else:
        final_cw.loc[ind, 'county']  = newcounty + ' County'
    if len(ind) == 0:
        print(town, oldcounty)
#make corrections
ind = final_cw[[c == 'Charleston County' and s == 'MD' for c, s in zip(final_cw['county'], final_cw['state'])]].index
final_cw.loc[ind, 'county'] = 'Dorchester County'

ind = final_cw[[c == 'Prince George\'s County' and s == 'SC' for c, s in zip(final_cw['county'], final_cw['state'])]].index
final_cw.loc[ind, 'county'] = 'Georgetown County'

Berlington New Jersey Camden


In [32]:
# manual fixes of assignments
final_cw.loc[final_cw[final_cw['county'] == 'Prince George\'s County'].index, 'county'] = 'Prince Georges County'
final_cw.loc[final_cw[final_cw['county'] == 'Queen Anne\'s County'].index, 'county'] = 'Queen Annes County'
final_cw.loc[final_cw[final_cw['county'] == 'St Mary\'s County'].index, 'county'] = 'St Marys County'

# more manual fixes
final_cw.loc[final_cw[final_cw['town'] == 'Doden Maryland'].index, ['county', 'name_type']] = ['Anne Arundel County', 'county']
final_cw.loc[final_cw[final_cw['town'] == 'Huntington New Jersey'].index, ['county', 'name_type']] = ['Hunterdon County', 'county']
# adding state labels
final_cw.loc[final_cw[[pd.isnull(t) and not pd.isnull(s) and s != 'FR' for t, s in zip(final_cw['town'], final_cw['state'])]].index, 'name_type'] = 'state'

final_cw.loc[final_cw[final_cw['town'] == 'Kittery'].index, ['county', 'name_type']] = ['York County', 'town']
final_cw.loc[final_cw[final_cw['town'] == 'Kensignton'].index, ['county', 'name_type']] = ['Philadelphia County', 'neighborhood']
final_cw.loc[final_cw[final_cw['town'] == 'York'].index, ['county', 'name_type']] = ['York County', 'town']
final_cw.loc[final_cw[final_cw['town'] == 'Wells'].index, ['county', 'name_type']] = ['York County', 'town']
final_cw.loc[final_cw[final_cw['town'] == 'James City County Virginia'].index, ['county']] = ['James City County']

final_cw.loc[final_cw[final_cw['town'] == 'Cumb County Pennsylvania'].index, ['county']] = ['Cumberland County']
final_cw.loc[final_cw[final_cw['town'] == 'Cumberland'].index, 'name_type'] = 'county'
final_cw.loc[final_cw[final_cw['town'] == 'york town pennsylvania'].index, ['county', 'name_type']] = ['Philadelphia County', 'town']

# Manual fixes of name_type
final_cw.loc[final_cw['town'] == 'Virginia and Philadelphia', 'name_type'] = 'state'
town_ind = final_cw[[nt == 'county' and ('County' not in c and 'Co ' not in c and 'Country' not in c and
                                       'Talbot' not in c and 'Rockingham' not in c and 'Delaware' not in c)
                       if not pd.isnull(c) else False for nt, c in zip(final_cw['name_type'], final_cw['town'])]].index
final_cw.loc[town_ind, 'name_type'] = 'town'

# some chester counties are mislabelled
chester_ind = final_cw[final_cw['town'].apply(lambda x: 'Chester' in x and 'Massachusetts' not in x if not pd.isnull(x) else False)].index
final_cw.loc[chester_ind, ['county', 'name_type']] = ['Chester County', 'county']

# correctly label some counties

county_ind = final_cw[[nt == 'town' and ('County of Philadelphia' in c or 'Paxton Tot Dauphin County' in c or 'Somerset County New Jersey' in c)
                     for nt, c in zip(final_cw['name_type'], final_cw['town'])]].index
final_cw.loc[county_ind, 'name_type'] = 'county'

In [34]:
# drop duplicates that occur for some reason
final_cw.drop_duplicates(subset = ['town', 'state'], inplace = True)
# add labels for county and name type to CD_all
CD_all = pd.merge(CD_all, final_cw, on = ['town', 'state'], how = 'left')

In [35]:
CD_all.loc[CD_all[CD_all['name_type'].apply(lambda x: pd.isnull(x))].index, 'name_type'] = 'other'

In [36]:
CD_all.loc[CD_all.query('town == "City of New York" and name_type == "other"').index, ['county', 'name_type']] = ['New York County', 'town']
CD_all.loc[CD_all[CD_all['town'].apply(lambda x: "State" in x if not pd.isnull(x) else False)].index, ['name_type']] = ['state']
CD_all.loc[CD_all.query('town == "Vermont"').index, ['state', 'name_type']] = ['VT', 'state']
# assigning unassinable towns as states
CD_all.loc[CD_all.query('town == "North Hampshire" and name_type == "other"').index, ['name_type']] = ['state_flag']
CD_all.loc[CD_all.query('town == "Long Island" and name_type == "other"').index, ['name_type']] = [ 'state']
CD_all.loc[CD_all.query('town == "Carolina" and name_type == "other"').index, ['name_type']] = [ 'state_flag']
CD_all.loc[CD_all.query('town == "Doden" and name_type == "other"').index, [ 'name_type']] = [ 'state_flag']
CD_all.loc[CD_all.query('town == "Isaac" and name_type == "other"').index, [ 'name_type']] = [ 'state_flag']
CD_all.loc[CD_all.query('town == "Kittery" and name_type == "other"').index, [ 'name_type']] = [ 'state_flag']
CD_all.loc[CD_all.query('town == "Glouster" and name_type == "other"').index, ['name_type']] = [ 'state_flag']
CD_all.loc[CD_all.query('town == "York" and name_type == "other"').index, [ 'name_type']] = [ 'state_flag']
CD_all.loc[CD_all.query('town == "Wells" and name_type == "other"').index, [ 'name_type']] = ['state_flag']
CD_all.loc[CD_all.query('town == "W Callisters" and name_type == "other"').index, ['name_type']] = ['state_flag']
CD_all.loc[CD_all.query('town == "Huntington" and name_type == "other"').index, [ 'county', 'name_type']] = [ 'Hunterdon County', 'county']
CD_all.loc[CD_all.query('town == "Connecticutt" and name_type == "other"').index, [ 'name_type']] = [  'county']
CD_all.loc[CD_all.query('town == "Charleston South Carolina" and name_type == "other"').index, [ 'county', 'name_type']] = ['Charleston County', 'town']
CD_all.loc[CD_all.query('town == "Albany" and name_type == "other"').index, ['county', 'name_type']] = ['Albany County', 'town']
CD_all.loc[CD_all.query('town == "Northumberland" and name_type == "other"').index, ['county', 'name_type']] = [ 'Northumberland County', 'town']
CD_all.loc[CD_all.query('town == "Springfield" and name_type == "other"').index, ['name_type']] = [ 'state_flag']
CD_all.loc[CD_all.query('town == "Long Cames" and name_type == "other"').index, [ 'name_type']] = [ 'state_flag']
CD_all.loc[CD_all.query('town == "Savannah" and name_type == "other"').index, [ 'name_type']] = [ 'state_flag']

In [37]:
CD_all.loc[CD_all.query('name_type == "other" and town.isna() and county.isna() and not state.isna() and state != "FR"').index, 'name_type'] = 'state'

In [38]:
# fix state
ind = CD_all.query('name_type == "other" and not state.isna() and state != "BVI" and state != "BM" and state != "GB" and state != "VI" and state != "FR" and state != "US"').index
CD_all.loc[ind, 'name_type'] = 'state'
CD_all.loc[CD_all[CD_all['name_type'] == 'town]'].index, 'name_type'] = 'town'

In [39]:
CD_all['6p_total'] = CD_all['6p_Dollar'] + CD_all['6p_Cents']/100

In [40]:
grouped_assets = CD_all.groupby(['county', 'state']).agg({'6p_total': ['sum', 'count', 'mean']})
grouped_assets.columns = grouped_assets.columns.map('_'.join).str.strip('_')
grouped_assets.columns = ['6p_total', 'debtholder_county_count', 'mean_6p_held']
grouped_assets = grouped_assets.reset_index()

In [41]:
countyPop = pd.read_csv('../../Data/CensusData/countyPopulation.csv').drop(0).reset_index(drop = True)
sub_cols =['Area Name', 'State/US Abbreviation', 'Total Population', 'White Male', 'White Male Age 16 Years and over']
county_subset = countyPop[sub_cols]
merged_geography = pd.merge(county_subset, grouped_assets, left_on = ['Area Name', 'State/US Abbreviation'], right_on=['county', 'state'], how = 'right')
final_asset_data = merged_geography.drop(['Area Name', 'State/US Abbreviation'], axis = 1)
final_asset_data['debt_per_capita'] = final_asset_data['6p_total']/final_asset_data['Total Population'].apply(lambda x: float(x))
final_asset_data['debt_per_white_male'] = final_asset_data['6p_total']/final_asset_data['White Male'].apply(lambda x: float(x))

In [44]:
CD_all

Unnamed: 0,6p_Dollar,6p_Cents,6p_def_Dollar,6p_def_Cents,3p_Dollar,3p_Cents,town,state,occupation,Name,state_data
0,1064.0,75.0,532.0,37.0,508.0,51.0,Hartford,CT,Merchant,[Samuel W Pomeroy],CT
1,449.0,96.0,224.0,97.0,232.0,10.0,Bolton,CT,Farmer,[Benjamin Trumbull],CT
2,154.0,20.0,77.0,10.0,192.0,,Rhode Island,RI,Farmer,[Richard Green],CT
3,196.0,75.0,98.0,37.0,172.0,24.0,Hartford,CT,Merchant,[Thomas Hopkins],CT
4,53.0,58.0,26.0,79.0,67.0,6.0,Hartford,CT,Merchant,[John Morgan],CT
...,...,...,...,...,...,...,...,...,...,...,...
423,120.0,25.0,60.0,12.0,97.0,70.0,New York,NY,Merchant,[Robert Ross Waddell],SC
424,33.0,33.0,16.0,67.0,31.0,67.0,Maryland,MD,Shopkeeper,[George Parker],SC
425,124.0,33.0,62.0,17.0,118.0,41.0,Maryland,MD,Merchant,[Edward Ireland],SC
426,242.0,67.0,121.0,33.0,79.0,52.0,Charleston,SC,,[Simeon Theus],SC


In [42]:
# for david to use when doing histograms of occupations vs no occupations and grouping occupations
CD_all.to_csv('../data_raw/post1790/Aggregated/raw/aggregated_CD.csv')
# for Jiacheng to use when making table
CD_all[['town', 'state', 'county', 'name_type']].drop_duplicates().to_csv('../../Data/AssetGeography/county_cw.csv')
# for Maria to use when making maps
final_asset_data.to_csv('../../Data/AssetGeography/county_debt_total.csv')

In [43]:
print(CD_all[CD_all['county'].apply(lambda x: not pd.isnull(x))]['6p_Dollar'].sum()/CD_all['6p_Dollar'].sum())
print(CD_all[CD_all['county'].apply(lambda x: not pd.isnull(x))].shape[0]/CD_all.shape[0])

0.6574815625923992
0.808598216327464
