# Outline
1. Import continental and assumed debt for each state
2. Map towns to county names using mapping file + fuzzy string matching
3. Use county shape files to create maps

## Documentation Notes
1. All non-exact matches will be printed out when doing merges (exact matches are not printed)

## Cleaning Questions
1. What do we do about an entry that has entries for some debt types, but not for others? Do we want to drop those entries, impute the values, or set NA values to 0?
2. What do we do about entries that have a state, but no town (ie: State of Connecticut)? What about no state?

In [1]:
import numpy as np
import pandas as pd
from rapidfuzz import process

In [2]:
def combineCols(df, num = 3):
    change_df_agg = pd.DataFrame(columns = ['old', 'new', 'type'])
    for col in ['town', 'state', 'occupation']:
        # " ".join(x.split()) removes all excess whitespace
        # creates set with all unique instances of column
        if num == 3:
            df[col] = [set([" ".join(x.split()) for x in [t1, t2, t3] if not pd.isnull(x)]) for t1, t2, t3 in zip(df[col+'1'],
                                                                                                                  df[col+'2'],
                                                                                                                  df[col+'3'])]
        else:
            df[col] = [set([" ".join(x.split()) for x in [t1, t2] if not pd.isnull(x)]) for t1, t2 in zip(df[col+'1'],
                                                                                                          df[col+'2'])]
        if not any(df[col].apply(lambda x: len(x) > 1).tolist()):
            # change set to string
            print("reformatting {}".format(col))
            df[col] = df[col].apply(lambda x: x.pop() if x != set() else np.nan)
        else:
            print("{} column has multiple unique entries".format(col))
            print("see table at end for new entries")
            # keep the value that has the most characters, otherwise change set to string
            old = df[df[col].apply(lambda x: len(x) > 1)][col]
            df[col] = df[col].apply(lambda x: x.pop() if len(x) == 1 else np.nan if x == set() else max(list(x), key=len))
            # new dataframe to keep track of changes
            # create copy of change_df that removes all duplicates by turning old column, which is
            # of type set to type string
            change_df = pd.DataFrame([old, df.loc[old.index][col]]).T
            change_df.columns = ['old', 'new']
            change_df['type'] = col
            change_df_str = change_df.copy()
            change_df_str['old'] = change_df_str['old'].astype(str)
            change_df_str = change_df_str.drop_duplicates()
            # add filtered database of changes to aggregate dataset
            change_df_agg = pd.concat([change_df_agg, change_df.loc[change_df_str.index]])

    # add functions to combine asset totals, handle missing debt values

    return df[['6p_Dollar', '6p_Cents', '6p_def_Dollar', '6p_def_Cents','3p_Dollar', '3p_Cents',
               'town', 'state', 'occupation']], change_df_agg

In [3]:
"""ind1, ind2, ind3 = CT_CD[['6p_Dollar', '6p_Cents']].dropna(thresh = 1).index,
                   CT_CD[['6p_def_Dollar', '6p_def_Cents']].dropna(thresh = 1).index,
                   CT_CD[['3p_Dollar', '3p_Cents']].dropna(thresh = 1).index
ind = set(ind1).intersection(ind2).intersection(ind3)"""

"ind1, ind2, ind3 = CT_CD[['6p_Dollar', '6p_Cents']].dropna(thresh = 1).index,\n                   CT_CD[['6p_def_Dollar', '6p_def_Cents']].dropna(thresh = 1).index,\n                   CT_CD[['3p_Dollar', '3p_Cents']].dropna(thresh = 1).index\nind = set(ind1).intersection(ind2).intersection(ind3)"

## Import Data
1. Import CD and ASD for each state, combine the multiple town/state/occupation columns (if they exist) into one
2. Concatenate all the separate datasets into two (one CD and one ASD)

### Connecticut

In [4]:
CD_all = pd.DataFrame(columns = ['6p_Dollar', '6p_Cents', '6p_def_Dollar', '6p_def_Cents','3p_Dollar', '3p_Cents',
                                  'town', 'state', 'occupation'])
ASD_all = pd.DataFrame(columns = ['6p_Dollar', '6p_Cents', '6p_def_Dollar', '6p_def_Cents','3p_Dollar', '3p_Cents',
                                  'town', 'state', 'occupation'])

In [5]:
# importing desired columns and rename
CT_CD_raw = pd.read_excel("../../Data/Post1790/CT/CT_post1790_CD_ledger.xlsx",
                      header = 13, usecols = 'H, I, J, K, L, N, O, X, Y, Z, AA, AB, AD, AE, AN, AO, AP, AQ, AR, AT, AU')
CT_CD_raw.columns = ['First Name', 'Last Name', 'town1', 'state1', 'occupation1', '6p_Dollar', '6p_Cents',
                     'First Name.1', 'Last Name.1', 'town2', 'state2', 'occupation2', '6p_def_Dollar', '6p_def_Cents',
                     'First Name.2', 'Last Name.2', 'town3', 'state3', 'occupation3', '3p_Dollar', '3p_Cents', ]
# unify occupation, town and state columns
CT_CD, change_df = combineCols(CT_CD_raw)
CD_all = pd.concat([CD_all, CT_CD])
change_df

town column has multiple unique entries
see table at end for new entries
reformatting state
reformatting occupation


Unnamed: 0,old,new,type
526,"{Miford, Milford}",Milford,town
799,"{Norwich, Nowich}",Norwich,town


In [6]:
# importing desired columns and rename
CT_ASD_raw = pd.read_excel("../../Data/Post1790/CT/CT_post1790_ASD_ledger.xlsx",
                       header = 13, usecols = 'H, I, J, K, L, N, O, X, Y, Z, AA, AB, AD, AE, AN, AO, AP, AQ, AR, AT, AU')
CT_ASD_raw.columns = ['First Name', 'Last Name', 'town1', 'state1', 'occupation1', '6p_Dollar', '6p_Cents',
                     'First Name.1', 'Last Name.1', 'town2', 'state2', 'occupation2', '6p_def_Dollar', '6p_def_Cents',
                     'First Name.2', 'Last Name.2', 'town3', 'state3', 'occupation3', '3p_Dollar', '3p_Cents', ]
# unify occupation, town and state columns
CT_ASD, change_df = combineCols(CT_ASD_raw)
ASD_all = pd.concat([ASD_all, CT_ASD])
change_df

town column has multiple unique entries
see table at end for new entries
reformatting state
occupation column has multiple unique entries
see table at end for new entries


Unnamed: 0,old,new,type
125,"{Norwall, Norwalk}",Norwall,town
811,"{1st Society in Lyme, Treasuer 1st Society in ...",Treasuer 1st Society in Lyme,occupation


### Georgia
No need to do additional cleaning because there's only one state/city/occupation column

In [7]:
# importing desired columns and rename
# prepare loan dataset
GA_CD_raw = pd.read_excel("../../Data/Post1790/GA/T694_GA_Loan_Office_CD.xlsx",
                      header = 10, usecols = 'Q, R, S, T, U, Z, AA, AB, AC, AD, AE')
GA_CD_raw.columns = ['First Name', 'Last Name', 'town', 'state', 'occupation', '6p_Dollar', '6p_Cents',
                 '6p_def_Dollar', '6p_def_Cents', '3p_Dollar', '3p_Cents']
GA_CD = GA_CD_raw[['6p_Dollar', '6p_Cents', '6p_def_Dollar', '6p_def_Cents','3p_Dollar', '3p_Cents',
                   'town', 'state', 'occupation']]
CD_all = pd.concat([CD_all, GA_CD])

### Maryland

In [8]:
#prepare loan dataset
MD_CD_raw = pd.read_excel("../../Data/Post1790/MD/MD_post1790_CD.xlsx",
                      header = 11, usecols = 'G, H, I, J, K, L, M, U, V, W, X, Y, Z, AA, AI, AJ, AK, AL, AM, AN, AO')
MD_CD_raw.columns = ['First Name', 'Last Name', 'town1', 'state1', 'occupation1', '6p_Dollar', '6p_Cents',
                 'First Name.1', 'Last Name.1', 'town2', 'state2', 'occupation2', '6p_def_Dollar', '6p_def_Cents',
                 'First Name.2', 'Last Name.2', 'town3', 'state3', 'occupation3', '3p_Dollar', '3p_Cents', ]
# unify occupation, town and state columns
MD_CD , change_df = combineCols(MD_CD_raw)
CD_all = pd.concat([CD_all, MD_CD])
change_df

reformatting town
reformatting state
reformatting occupation


Unnamed: 0,old,new,type


In [9]:
#prepare loan dataset
MD_ASD_raw = pd.read_excel("../../Data/Post1790/MD/MD_post1790_ASD.xlsx",
                      header = 11, usecols = 'G, H, I, J, K, L, M, U, V, W, X, Y, Z, AA, AI, AJ, AK, AL, AM, AN, AO')
MD_ASD_raw.columns = ['First Name', 'Last Name', 'town1', 'state1', 'occupation1', '6p_Dollar', '6p_Cents',
                 'First Name.1', 'Last Name.1', 'town2', 'state2', 'occupation2', '6p_def_Dollar', '6p_def_Cents',
                 'First Name.2', 'Last Name.2', 'town3', 'state3', 'occupation3', '3p_Dollar', '3p_Cents', ]
# unify occupation, town and state columns
MD_ASD , change_df = combineCols(MD_ASD_raw)
ASD_all = pd.concat([ASD_all, MD_ASD])
change_df

reformatting town
reformatting state
reformatting occupation


Unnamed: 0,old,new,type


### North Carolina

In [10]:
NC_CD_raw = pd.read_excel("../../Data/Post1790/NC/T695_R4_NC_CD.xlsx",
                      header = 11, usecols = 'J, K, L, M, N, W, X, Z, AA, AC, AD ')
NC_CD_raw.columns = ['First Name', 'Last Name', 'town', 'state', 'occupation', '6p_Dollar', '6p_Cents',
                 '6p_def_Dollar', '6p_def_Cents', '3p_Dollar', '3p_Cents']
NC_CD = NC_CD_raw[['6p_Dollar', '6p_Cents', '6p_def_Dollar', '6p_def_Cents','3p_Dollar', '3p_Cents',
                   'town', 'state', 'occupation']]
CD_all = pd.concat([CD_all, NC_CD])

In [11]:
NC_ASD_raw = pd.read_excel("../../Data/Post1790/NC/T695_R3_NC_ASD.xlsx", header = 10, usecols = 'H, I, J, K, L, P, Q, R, S, T, U')
NC_ASD_raw.columns = ['First Name', 'Last Name', 'town', 'state', 'occupation',  '6p_Dollar', '6p_Cents',
                      '6p_def_Dollar', '6p_def_Cents', '3p_Dollar', '3p_Cents']
NC_ASD = NC_ASD_raw[['6p_Dollar', '6p_Cents', '6p_def_Dollar', '6p_def_Cents','3p_Dollar', '3p_Cents',
                         'town', 'state', 'occupation']]
ASD_all = pd.concat([ASD_all, NC_ASD])

### New Hampshire

In [12]:
#prepare loan dataset
NH_CD_raw = pd.read_excel("../../Data/Post1790/NH/T652_R6_New_Hampshire_CD.xlsx",
                      header = 10, usecols = 'I, J, K, L, M, N, O, P, Q, R, S').drop([219, 220])
NH_CD_raw.columns = ['First Name', 'Last Name', 'town', 'state', 'occupation', '6p_Dollar', '6p_Cents',
                     '6p_def_Dollar', '6p_def_Cents', '3p_Dollar', '3p_Cents']
NH_CD = NH_CD_raw[['6p_Dollar', '6p_Cents', '6p_def_Dollar', '6p_def_Cents','3p_Dollar', '3p_Cents',
                   'town', 'state', 'occupation']]
CD_all = pd.concat([CD_all, NH_CD])

In [13]:
NH_ASD_raw = pd.read_excel("../../Data/Post1790/NH/T652_New_Hampshire_ASD.xlsx", header = 12,
                       usecols = 'G, H, I, J, K, M, N, V, W, X, Y, Z, AA, AB, AK, AL, AM, AN')
NH_ASD_raw.columns = ['First Name', 'Last Name', 'town1', 'state1', 'occupation1', '6p_Dollar', '6p_Cents',
                      'First Name.1', 'Last Name.1', 'town2', 'state2', 'occupation2', '6p_def_Dollar', '6p_def_Cents',
                      'First Name.2', 'Last Name.2', '3p_Dollar', '3p_Cents']
# unify occupation, town and state columns
NH_ASD, change_df = combineCols(NH_ASD_raw, 2)
ASD_all = pd.concat([ASD_all, NH_ASD])
change_df

reformatting town
reformatting state
reformatting occupation


Unnamed: 0,old,new,type


### New York
Doesn't have town/occupation/state

In [14]:
"""#prepare loan dataset
NY_CD_raw = pd.read_excel("../../Data/Post1790/NY/NY_1790_CD.xlsx",
                      header = 11, usecols = 'H, I, M, N, X, Y, AC, AD, AM, AN, AR, AS')
NY_CD_raw.columns = ['First Name', 'Last Name', '6p_Dollar', '6p_Cents',
                 'First Name.1', 'Last Name.1', '6p_def_Dollar', '6p_def_Cents',
                 'First Name.2', 'Last Name.2', '3p_Dollar', '3p_Cents']
NY_CD_raw['state'] = np.nan
NY_CD_raw['town'] = np.nan
NY_CD_raw['occupation'] = np.nan
NY_CD = NY_CD_raw[['6p_Dollar', '6p_Cents', '6p_def_Dollar', '6p_def_Cents','3p_Dollar', '3p_Cents',
                   'town', 'state', 'occupation']]
CD_all = pd.concat([CD_all, NY_CD])
"""

'#prepare loan dataset\nNY_CD_raw = pd.read_excel("../../Data/Post1790/NY/NY_1790_CD.xlsx",\n                      header = 11, usecols = \'H, I, M, N, X, Y, AC, AD, AM, AN, AR, AS\')\nNY_CD_raw.columns = [\'First Name\', \'Last Name\', \'6p_Dollar\', \'6p_Cents\',\n                 \'First Name.1\', \'Last Name.1\', \'6p_def_Dollar\', \'6p_def_Cents\',\n                 \'First Name.2\', \'Last Name.2\', \'3p_Dollar\', \'3p_Cents\']\nNY_CD_raw[\'state\'] = np.nan\nNY_CD_raw[\'town\'] = np.nan\nNY_CD_raw[\'occupation\'] = np.nan\nNY_CD = NY_CD_raw[[\'6p_Dollar\', \'6p_Cents\', \'6p_def_Dollar\', \'6p_def_Cents\',\'3p_Dollar\', \'3p_Cents\',\n                   \'town\', \'state\', \'occupation\']]\nCD_all = pd.concat([CD_all, NY_CD])\n'

In [15]:
"""#prepare loan dataset
NY_ASD_raw = pd.read_excel("../../Data/Post1790/NY/NY_1790_CD.xlsx",
                          header = 11, usecols = 'H, I, M, N, X, Y, AC, AD, AM, AN, AR, AS')
NY_ASD_raw.columns = ['First Name', 'Last Name', '6p_Dollar', '6p_Cents',
                     'First Name.1', 'Last Name.1', '6p_def_Dollar', '6p_def_Cents',
                     'First Name.2', 'Last Name.2', '3p_Dollar', '3p_Cents']
NY_ASD_raw['state'] = np.nan
NY_ASD_raw['town'] = np.nan
NY_ASD_raw['occupation'] = np.nan
NY_ASD = NY_ASD_raw[['6p_Dollar', '6p_Cents', '6p_def_Dollar', '6p_def_Cents','3p_Dollar', '3p_Cents',
                   'town', 'state', 'occupation']]
ASD_all = pd.concat([ASD_all, NY_ASD])
"""

'#prepare loan dataset\nNY_ASD_raw = pd.read_excel("../../Data/Post1790/NY/NY_1790_CD.xlsx",\n                          header = 11, usecols = \'H, I, M, N, X, Y, AC, AD, AM, AN, AR, AS\')\nNY_ASD_raw.columns = [\'First Name\', \'Last Name\', \'6p_Dollar\', \'6p_Cents\',\n                     \'First Name.1\', \'Last Name.1\', \'6p_def_Dollar\', \'6p_def_Cents\',\n                     \'First Name.2\', \'Last Name.2\', \'3p_Dollar\', \'3p_Cents\']\nNY_ASD_raw[\'state\'] = np.nan\nNY_ASD_raw[\'town\'] = np.nan\nNY_ASD_raw[\'occupation\'] = np.nan\nNY_ASD = NY_ASD_raw[[\'6p_Dollar\', \'6p_Cents\', \'6p_def_Dollar\', \'6p_def_Cents\',\'3p_Dollar\', \'3p_Cents\',\n                   \'town\', \'state\', \'occupation\']]\nASD_all = pd.concat([ASD_all, NY_ASD])\n'

### Pennsylvania

In [16]:
#prepare loan dataset
PA_CD_raw = pd.read_excel("../../Data/Post1790/PA/PA_post1790_CD.xlsx",
                      header = 11, usecols = 'G, H, I, J, K, L, M, U, V, W, X, Y, Z, AA, AJ, AK, AL, AM, AN, AO, AP')
PA_CD_raw.columns = ['First Name', 'Last Name', 'town1', 'state1', 'occupation1', '6p_Dollar', '6p_Cents',
                 'First Name.1', 'Last Name.1', 'town2', 'state2', 'occupation2', '6p_def_Dollar', '6p_def_Cents',
                 'First Name.2', 'Last Name.2', 'town3', 'state3', 'occupation3', '3p_Dollar', '3p_Cents', ]
# unify occupation, town and state columns
PA_CD, change_df = combineCols(PA_CD_raw)
CD_all = pd.concat([CD_all, PA_CD])
change_df

town column has multiple unique entries
see table at end for new entries
reformatting state
occupation column has multiple unique entries
see table at end for new entries


Unnamed: 0,old,new,type
14,"{Carlisle Pennsylvania, Carlisle}",Carlisle Pennsylvania,town
39,"{Conecticutt, Connecticut}",Conecticutt,town
40,"{New Castle Delaware, Newcastle Delaware}",New Castle Delaware,town
113,"{Philadelphia County, Philadelphia}",Philadelphia County,town
131,"{State of Delawere, State of Delaware}",State of Delawere,town
...,...,...,...
1143,"{Esquire, Merchant}",Merchant,occupation
1181,"{Shop Keeper, Shopkeeper}",Shop Keeper,occupation
1200,"{in trust for the Estate of Robert Dill, Trust...",in trust for the Estate of Robert Dill,occupation
1224,"{Adm to the Est of William Barrell Decesed, Ad...",Administrator to the Estate of William Barrell...,occupation


### Rhode Island

In [17]:
#prepare loan dataset
RI_CD_raw = pd.read_excel("../../Data/Post1790/RI/T653_Rhode_Island_CD.xlsx",
                      header = 11, usecols = 'G, H, I, J, K, L, M, U, V, W, X, Y, Z, AA, AI, AJ, AK, AL, AM, AN, AO')
RI_CD_raw.columns = ['First Name', 'Last Name', 'town1', 'state1', 'occupation1', '6p_Dollar', '6p_Cents',
                     'First Name.1', 'Last Name.1', 'town2', 'state2', 'occupation2', '6p_def_Dollar', '6p_def_Cents',
                     'First Name.2', 'Last Name.2', 'town3', 'state3', 'occupation3', '3p_Dollar', '3p_Cents', ]
RI_CD, change_df = combineCols(RI_CD_raw)
CD_all = pd.concat([CD_all, RI_CD])
change_df

town column has multiple unique entries
see table at end for new entries
reformatting state
reformatting occupation


Unnamed: 0,old,new,type
26,"{Gloucester, Glocester}",Gloucester,town
371,"{Chalestown, Charlestown}",Charlestown,town
435,"{North Kingstone, North Kingston}",North Kingstone,town
462,"{North Kingston, North Kingstown}",North Kingstown,town
499,"{Smithfeild, Smithfield}",Smithfeild,town


In [18]:
#prepare loan dataset
RI_ASD_raw = pd.read_excel("../../Data/Post1790/RI/T653_Rhode_Island_ASD.xlsx",
                          header = 11, usecols = 'H, I, J, K, L, N, O, X, Y, Z, AA, AB, AD, AE, AN, AO, AP, AQ, AR, AT, AU')
RI_ASD_raw.columns = ['First Name', 'Last Name', 'town1', 'state1', 'occupation1', '6p_Dollar', '6p_Cents',
                     'First Name.1', 'Last Name.1', 'town2', 'state2', 'occupation2', '6p_def_Dollar', '6p_def_Cents',
                     'First Name.2', 'Last Name.2', 'town3', 'state3', 'occupation3', '3p_Dollar', '3p_Cents', ]
RI_ASD, change_df = combineCols(RI_ASD_raw)
ASD_all = pd.concat([CD_all, RI_ASD])
change_df

reformatting town
reformatting state
reformatting occupation


Unnamed: 0,old,new,type


### South Carolina

In [19]:
#prepare loan dataset
SC_CD_raw = pd.read_excel("../../Data/Post1790/SC/Post_1790_South_Carolina_CD.xlsx",
                      header = 11, usecols = 'D, E, F, G, H, M, N, S, T, U, V, W, AB, AC, AH, AI, AJ, AK, AL, AQ, AR')
SC_CD_raw.columns = ['First Name', 'Last Name', 'town1', 'state1', 'occupation1', '6p_Dollar', '6p_Cents',
                 'First Name.1', 'Last Name.1', 'town2', 'state2', 'occupation2', '6p_def_Dollar', '6p_def_Cents',
                 'First Name.2', 'Last Name.2', 'town3', 'state3', 'occupation3', '3p_Dollar', '3p_Cents', ]
SC_CD, change_df = combineCols(SC_CD_raw)
CD_all = pd.concat([CD_all, SC_CD])
change_df

town column has multiple unique entries
see table at end for new entries
reformatting state
occupation column has multiple unique entries
see table at end for new entries


Unnamed: 0,old,new,type
50,"{Camden Planter, Camden}",Camden Planter,town
94,"{Peedee, Pee Dee}",Pee Dee,town
270,"{Long Cane, Long Cames}",Long Cames,town
292,"{Charleston, Charlestom}",Charleston,town
45,"{as Executor to Henry Coram, as Executor to Jo...",as Executor to John Couturier,occupation
50,"{Planter as Executor Ely Kershaw, as Executor ...",Planter as Executor Ely Kershaw,occupation
182,"{Executor Philip Hawkins, Philip Hawkins}",Executor Philip Hawkins,occupation
256,"{Guardian to Mary Deborah L. Gowdey, as Guardi...",as Guardian to Mary Deborah L. Gowdey,occupation
305,"{Charleston Merchants, Merchants}",Charleston Merchants,occupation
383,"{Assignee of James Simons, Physician}",Assignee of James Simons,occupation


In [20]:
#prepare loan dataset
SC_ASD_raw = pd.read_excel("../../Data/Post1790/SC/Post_1790_South_Carolina_ASD_transfers_removed.xlsx", header = 11,
                       usecols = 'D, E, F, G, H, M, N, O')
SC_ASD_raw.columns = ['First Name', 'Last Name', 'town', 'state', 'occupation', '6p_Dollar', '6p_def_Dollar','3p_Dollar']
for col in ['6p_', '6p_def_', '3p_']:
    SC_ASD_raw[col+'Cents'] = SC_ASD_raw[col+'Dollar'] - np.round(SC_ASD_raw[col+'Dollar'], 0)
    SC_ASD_raw[col+'Dollar'] = np.round(SC_ASD_raw['6p_Dollar'], 0)
SC_ASD = SC_ASD_raw[['6p_Dollar', '6p_Cents', '6p_def_Dollar', '6p_def_Cents', '3p_Dollar', '3p_Cents',
                     'town', 'state', 'occupation']]
ASD_all = pd.concat([ASD_all, SC_ASD])

### Virginia

In [21]:
"""
#prepare loan dataset
VA_CD_raw = pd.read_excel("../../Data/Post1790/VA/VA_CD.xlsx",
                      header = 11, usecols = 'H, I, K, L, U, V, X, Y, AH, AI, AK, AL')
VA_CD_raw.columns = ['First Name', 'Last Name', '6p_Dollar', '6p_Cents',
                 'First Name.1', 'Last Name.1', '6p_def_Dollar', '6p_def_Cents',
                 'First Name.2', 'Last Name.2', '3p_Dollar', '3p_Cents']
VA_CD_raw['state'] = np.nan
VA_CD_raw['town'] = np.nan
VA_CD_raw['occupation'] = np.nan
VA_CD = VA_CD_raw[['6p_Dollar', '6p_Cents', '6p_def_Dollar', '6p_def_Cents', '3p_Dollar', '3p_Cents',
                   'town', 'state', 'occupation']]
CD_all = pd.concat([CD_all, VA_CD])
"""

'\n#prepare loan dataset\nVA_CD_raw = pd.read_excel("../../Data/Post1790/VA/VA_CD.xlsx",\n                      header = 11, usecols = \'H, I, K, L, U, V, X, Y, AH, AI, AK, AL\')\nVA_CD_raw.columns = [\'First Name\', \'Last Name\', \'6p_Dollar\', \'6p_Cents\',\n                 \'First Name.1\', \'Last Name.1\', \'6p_def_Dollar\', \'6p_def_Cents\',\n                 \'First Name.2\', \'Last Name.2\', \'3p_Dollar\', \'3p_Cents\']\nVA_CD_raw[\'state\'] = np.nan\nVA_CD_raw[\'town\'] = np.nan\nVA_CD_raw[\'occupation\'] = np.nan\nVA_CD = VA_CD_raw[[\'6p_Dollar\', \'6p_Cents\', \'6p_def_Dollar\', \'6p_def_Cents\', \'3p_Dollar\', \'3p_Cents\',\n                   \'town\', \'state\', \'occupation\']]\nCD_all = pd.concat([CD_all, VA_CD])\n'

In [22]:
#prepare loan dataset
VA_ASD_raw = pd.read_excel("../../Data/Post1790/VA/VA_ASD.xlsx", header = 11,
                       usecols = 'D, E, F, G, N, O, U, V, W, X, AE, AF, AL, AM, AN, AO, AW, AX')
VA_ASD_raw.columns = ['First Name', 'Last Name', 'town1', 'occupation1', '6p_Dollar', '6p_Cents',
                  'First Name.1', 'Last Name.1', 'town2', 'occupation2', '6p_def_Dollar', '6p_def_Cents',
                  'First Name.2', 'Last Name.2', 'town3', 'occupation3', '3p_Dollar', '3p_Cents']
VA_ASD_raw['state1'] = np.nan
VA_ASD_raw['state2'] = np.nan
VA_ASD_raw['state3'] = np.nan
VA_ASD, change_df = combineCols(VA_ASD_raw)
ASD_all = pd.concat([ASD_all, VA_ASD])
change_df

town column has multiple unique entries
see table at end for new entries
reformatting state
occupation column has multiple unique entries
see table at end for new entries


Unnamed: 0,old,new,type
0,"{The Town Petersburg, Town of Petersburg}",The Town Petersburg,town
1,"{Richmond, City of Richmond}",City of Richmond,town
2,"{Dinwiddie Country, Dinwiddie County}",Dinwiddie Country,town
3,"{The Town Petersburg, Town of Petersburg, Town...",The Town Petersburg,town
4,"{Halifox County, Halifax County}",Halifox County,town
...,...,...,...
743,"{In trust for J Ball, In trust & co I Ball}",In trust & co I Ball,occupation
777,"{Trust for Geo Morrison, In trust for George M...",In trust for George Morrison London,occupation
831,"{In trust for Jn Robinson RB, In trust for Jn ...",In trust for Jn Robinson RB,occupation
875,"{Exec James Minor Louisa Co, Executors of Jame...",Exec James Minor Louisa Co,occupation


In [23]:
ASD_all

Unnamed: 0,6p_Dollar,6p_Cents,6p_def_Dollar,6p_def_Cents,3p_Dollar,3p_Cents,town,state,occupation
0,1064.0,75.0,532.0,37.0,508.0,51.0,Hartford,CT,Merchant
1,449.0,96.0,224.0,97.0,232.0,10.0,Bolton,CT,Farmer
2,154.0,20.0,77.0,10.0,192.0,,Rhode Island,RI,Farmer
3,196.0,75.0,98.0,37.0,172.0,24.0,Hartford,CT,Merchant
4,53.0,58.0,26.0,79.0,67.0,6.0,Hartford,CT,Merchant
...,...,...,...,...,...,...,...,...,...
903,,,,,28.0,29.0,Richmond,,
904,,,,,2030.0,35.0,New York,,
905,954.0,43.0,477.0,21.0,715.0,82.0,North Carolina,,Administrator of McKennie Sumner
906,,,,,,,,,


## Mapping Town/City to Counties - INCOMPLETE
1. Connecticut: Referencing <a href = "https://ctstatelibrary.org/cttowns/counties">https://ctstatelibrary.org/cttowns/counties</a> I found that Huntington is now called Shelton and Chatham is now called East Hampton. The other two cases below are not mappable because those are not valid town names.
2. Georgia: Investigate more, very few counties

In [24]:
# fuzzy string matching function
def fuzzyMatch(unmatched_towns, towns, crosswalk, primary_dict, dict_matchcol = 'primary_city', initial = True, score_threshold = 85):
    if initial:
        print("\nFuzzy City name - county matches\n")
    else:
        print("\nFuzzy City name - county matches with string changes\n")
    printedtowns = []
    for town in unmatched_towns:
        # extract best match
        match_tuple = process.extractOne(town, [x for x in crosswalk[dict_matchcol] if not pd.isnull(x)])
        score = match_tuple[1]
        match = match_tuple[0]
        # if match above threshold, change + print match so we can hand check
        if score >= score_threshold:
            if dict_matchcol == 'primary_city':
                county = primary_dict[match]
            if dict_matchcol == 'county':
                county = match
            # add match, print out match
            if initial:
                print("{} -> {} in {}".format(town, match, county))
                town_index = towns[towns['town'] == town].index
                towns.loc[town_index, 'county'] = county
            else:
                original_town = towns[towns['town2'] == town]['town'].tolist()
                if town not in printedtowns:
                    print("{} (new name: {}) -> {} in {}".format(original_town, town, match, county))
                    printedtowns.append(town)
                town_index = towns[towns['town'].apply(lambda x: x in original_town)].index
                towns.loc[town_index, 'county'] = [county] * len(town_index)
    return towns

In [25]:
def directTownMatch(state_cw, towns, col = 'primary_city', towncol = 'town'):
    print("Direct City name - county matches\n")
    # match towns directly based off crosswalk
    primary_dict = dict(zip(state_cw[col],state_cw['county']))
    if col == 'primary_city':
        towns['county'] = towns[towncol].apply(lambda x: primary_dict.get(x, np.nan))
    if col == 'acceptable_cities':
        for ind in towns.index:
            town = towns.loc[ind, 'town']
            county = state_cw[state_cw[col].apply(lambda x: town in x if not pd.isnull(x) else False)]['county'].tolist()
            if len(county)>0:
                towns.loc[ind, 'county'] = county[0]
    t = towns[towns['county'].apply(lambda x: not pd.isnull(x))]
    if towncol == 'town':
        for tn, cty in zip(t['town'], t['county']):
            print("{} was matched to {} directly using the crosswalk".format(tn, cty))
    if towncol == 'town2':
        for tn, tn_og, cty in zip(t['town2'], t['town'], t['county']):
            print("{} (original: {}) was matched to {} directly using the crosswalk".format(tn, tn_og, cty))
    return primary_dict, towns

In [26]:
def directCountyMatch(state_cw, towns, towncol = 'town'):
    print("\nSome city names are actually county names")
    if towncol == 'town':
        print("Direct City (county) name - county matches\n")
    if towncol == 'town2':
        print("Direct City (county) name with string changes - county matches\n")
    # some own names are actually counties
    # match towns based off whether town name is actually county name in crosswalk
    counties = state_cw['county'].unique()
    nanindex = towns[towns['county'].apply(lambda x: pd.isnull(x))].index
    towns.loc[nanindex, 'county'] = towns.loc[nanindex, towncol].apply(lambda x: x if x in counties.tolist() else np.nan)
    towns2 = towns.loc[nanindex]
    nanindex2 = towns2[towns2['county'].apply(lambda x: not pd.isnull(x))].index
    for t, c in zip(towns2.loc[nanindex2, towncol], towns2.loc[nanindex2, 'county']):
        print("{} was matched to {} using the crosswalk".format(t, c))
    return towns

In [27]:
# change column of town dataframe's type to either town or county
def addType(towns, type = 'town'):
    towns['name_type'] = [name_type if not pd.isnull(name_type) else type if not pd.isnull(county) else np.nan for name_type, county in
                          zip(towns['name_type'], towns['county'])]
    return towns

In [28]:
city_county_cw = pd.read_excel('../../Data/zip_code_database.xls')[['primary_city', 'acceptable_cities',
                                                                    'unacceptable_cities', 'county', 'state']]

In [29]:
final_cw = pd.DataFrame(columns = ['town', 'county', 'state', 'name_type'])
list_of_states = ['CT', 'GA', 'MD', 'NC', 'NH', 'NJ', 'PA', 'RI', 'SC',
                  'MA', 'VA', 'DE']

for state in list_of_states:
    print("\n{} MATCHING \n".format(state))
    # create list of towns for each state
    towns = CD_all[CD_all['state'] == state][['town']].drop_duplicates()
    towns = towns[towns['town'].apply(lambda x: not pd.isnull(x))]
    # state crosswalk
    state_cw = city_county_cw[city_county_cw['state'] == state]
    if state == 'VA':
        state_cw = city_county_cw[city_county_cw['state'].apply(lambda x: x in ['VA', 'WV'])]
    state_cw = state_cw[state_cw['county'].apply(lambda x: 'county' in x.lower() if not pd.isnull(x) else False)]
    # try direct match: town name -> crosswalk town-county
    oldtowns = towns.copy()
    primary_dict, towns = directTownMatch(state_cw, towns, col = 'primary_city', towncol = 'town')
    # label name type
    towns['name_type'] = towns['county'].apply(lambda x: 'town' if not pd.isnull(x) else np.nan)

    if state == 'CT':
        # try fuzzy match: town name -> crosswalk town-county
        unmatched_towns1 = towns[towns['county'].apply(lambda x: pd.isnull(x))]['town']
        towns = fuzzyMatch(unmatched_towns1, towns, state_cw, primary_dict, dict_matchcol = 'primary_city', initial = True, score_threshold = 85)
        towns = addType(towns)

        # modify town names - towns changed names (see CT note)
        # retry fuzzy match: town name -> crosswalk town-county
        towns['town2'] = towns['town'].apply(lambda x: x.replace('Huntington', 'Shelton').replace('Chatham', 'East Hampton'))
        unmatched_towns2 = towns[towns['county'].apply(lambda x: pd.isnull(x))]['town2']
        towns = fuzzyMatch(unmatched_towns2, towns, state_cw, primary_dict, dict_matchcol ='primary_city', initial = False, score_threshold = 85)
        towns = addType(towns)

    if state == 'GA':
        # some "town" names are actually counties
        # try direct match: town (county) name -> crosswalk county
        towns = directCountyMatch(state_cw, towns, towncol = 'town')
        towns = addType(towns, 'county')
    if state == 'MD':
        # remove instances where Maryland is mentioned and unabbreviate county abbreviations
        # use modified town names
        # try direct match: town (county) name -> crosswalk county
        towns['town2'] = towns['town'].apply(lambda x: x.replace('Maryland', '').replace('Co ', 'County').strip())
        towns = directCountyMatch(state_cw, towns, towncol = 'town2')
        towns = addType(towns, 'county')

        # use modified town names
        # try fuzzy match: town (county) name -> crosswalk county
        unmatched_towns2 = towns[towns['county'].apply(lambda x: pd.isnull(x))]['town2']
        towns = fuzzyMatch(unmatched_towns2, towns, state_cw, primary_dict, dict_matchcol = 'county', initial = False, score_threshold = 86)
        towns = addType(towns, 'county')

        # use modified town names
        # try fuzzy match: town name -> crosswalk town-county
        unmatched_towns2 = towns[towns['county'].apply(lambda x: pd.isnull(x))]['town2']
        towns = fuzzyMatch(unmatched_towns2, towns, state_cw, primary_dict, dict_matchcol ='primary_city', initial = False, score_threshold = 85)
        towns = addType(towns)


        # correct a matching - Baltimore City to Baltimore County
        print("Baltimore City changed to Baltimore County")
        towns['county'] = towns['county'].apply(lambda x: x.replace('City', 'County') if not pd.isnull(x) else x)
        towns = addType(towns)
    if state == 'NC':
        # some "town" names are actually counties
        # try direct match: town (county) name -> crosswalk county
        towns = directCountyMatch(state_cw, towns, towncol = 'town')
        towns = addType(towns, 'county')
        # remove instances where North Carolina is mentioned and rename Tarborugh to enable matching
        towns['town2'] = towns['town'].apply(lambda x: x.replace('North Carolina', '').replace('Tarborugh', 'Tarboro').strip())
        # use modified town names
        # try fuzzy match: town name -> crosswalk town-county
        unmatched_towns2 = towns[towns['county'].apply(lambda x: pd.isnull(x))]['town2']
        towns = fuzzyMatch(unmatched_towns2, towns, state_cw, primary_dict, dict_matchcol ='primary_city', initial = False, score_threshold = 85)
        towns = addType(towns)
    if state == 'NH':
        # use acceptable_cities instead of primary_cities column to match in the crosswalk
        # try direct match: town name -> crosswalk town-county
        null_ind = towns[towns['county'].apply(lambda x: pd.isnull(x))].index
        pdict, tn = directTownMatch(state_cw, towns.loc[null_ind], col = 'acceptable_cities', towncol = 'town')
        towns.loc[null_ind] = tn
        towns = addType(towns)
        # remove instances where New Hampshire and other geo-jurisdictional terms are used
        # rename Rockingham to enable matching
        towns['town2'] = towns['town'].apply(lambda x: x.replace('State', '').replace('New Hampshire', '').replace('of ','').strip())
        towns['town2'] = towns['town2'].apply(lambda x: x.replace('Rockingham', 'Rockingham County').strip())
        # use modified town names
        # try fuzzy match: town name -> crosswalk town-county
        unmatched_towns2 = towns[towns['county'].apply(lambda x: pd.isnull(x))]['town2']
        towns = fuzzyMatch(unmatched_towns2, towns, state_cw, primary_dict, dict_matchcol ='primary_city', initial = False, score_threshold = 85)
        towns = addType(towns)
        # some "town" names are actually counties
        # use modified town names
        # try direct match: town (county) name -> crosswalk county
        towns = directCountyMatch(state_cw, towns, towncol = 'town2')
        towns = addType(towns, 'county')
        # manual fixes for matches
        # manually adjust incorrect matches
        print("\nManual Match\n")
        for town, county in zip(['Brintwood', 'Portsmouth New Hampshire'],
                                ['Rockingham County', 'Rockingham County']):
            print("{} was matched to {}".format(town, county))
            if town == 'Brintwood':
                towns.loc[towns[towns['town'] == town].index, ['county', 'name_type']] = [county,'town]']
            else:
                towns.loc[towns[towns['town'] == town].index, ['county', 'name_type']] = [county,'county']
    if state == 'NJ':
        # remove instances where New Jersey is used
        towns['town2'] = towns['town'].apply(lambda x: x.replace('New Jersey', '').strip())

        # use modified town names
        # try fuzzy match: town name -> crosswalk town-county
        unmatched_towns2 = towns[towns['county'].apply(lambda x: pd.isnull(x))]['town2']
        towns = fuzzyMatch(unmatched_towns2, towns, state_cw, primary_dict, dict_matchcol ='primary_city', initial = False, score_threshold = 85)
        towns = addType(towns)
        # some "town" names are actually counties
        # use modified town names
        # try direct match: town (county) name -> crosswalk county
        towns = directCountyMatch(state_cw, towns, towncol = 'town2')
        towns = addType(towns, 'county')
    if state == 'PA':
        # some "town" names are actually counties
        # try direct match: town (county) name -> crosswalk county
        towns = directCountyMatch(state_cw, towns, towncol = 'town')
        towns = addType(towns, 'county')

        # use acceptable_cities instead of primary_cities column to match in the crosswalk
        # try direct match: town (county) name -> crosswalk county
        null_ind = towns[towns['county'].apply(lambda x: pd.isnull(x))].index
        pdict, tn = directTownMatch(state_cw, towns.loc[null_ind], col = 'acceptable_cities', towncol = 'town')
        towns.loc[null_ind] = tn
        towns = addType(towns, 'county')

        # remove instances where New Jersey is used, fix some notational issues
        # correct Dauphincoy to Dauphin and categorize Tulpehocken as being in Berks County
        towns['town2'] = towns['town'].apply(lambda x: x.replace('Co ', 'County').replace('Delaware', 'Delaware County').strip())
        towns['town2'] = towns['town2'].apply(lambda x: x.replace('Pennsylvania', '').replace('County County','County').strip())
        towns['town2'] = towns['town2'].apply(lambda x: x.replace('Country', 'County').replace('Dauphincoy','Dauphin').strip())
        towns['town2'] = towns['town2'].apply(lambda x: x.replace('Tulpehocken', 'Berks County').strip())

        # categorize different Philadelphia neighborhoods as belonging in Philadelphia
        philreptowns = ['Blockley', 'Northan Liberties', 'Northern Liberties', \
                        'The Northern Libert', 'Passyunk', 'German Town', 'Southwark', 'Borden Town'] # not sure on this last one...
        for town in philreptowns:
            towns['town2'] = towns['town2'].apply(lambda x: x.replace(town, 'Philadelphia'))
        towns = addType(towns)
        # use modified town names
        # try direct match: town (county) name -> crosswalk county
        unmatched_towns2 = towns[towns['county'].apply(lambda x: pd.isnull(x))]['town2']
        towns = directCountyMatch(state_cw, towns, 'town2')
        towns = addType(towns, 'county')
        # some "town" names are actually counties
        # try direct match: town (county) name -> crosswalk county
        towns = directCountyMatch(state_cw, towns, towncol = 'town')
        towns = addType(towns, 'county')

        # use acceptable_cities instead of primary_cities column to match in the crosswalk
        # try direct match: town (county) name -> crosswalk county
        null_ind = towns[towns['county'].apply(lambda x: pd.isnull(x))].index
        pdict, tn = directTownMatch(state_cw, towns.loc[null_ind], col = 'acceptable_cities', towncol = 'town')
        towns.loc[null_ind] = tn
        towns = addType(towns, 'county')

        # remove instances where New Jersey is used, fix some notational issues
        # correct Dauphincoy to Dauphin and categorize Tulpehocken as being in Berks County
        towns['town2'] = towns['town'].apply(lambda x: x.replace('Co ', 'County').replace('Delaware', 'Delaware County').strip())
        towns['town2'] = towns['town2'].apply(lambda x: x.replace('Pennsylvania', '').replace('County County','County').strip())
        towns['town2'] = towns['town2'].apply(lambda x: x.replace('Country', 'County').replace('Dauphincoy','Dauphin').strip())
        towns['town2'] = towns['town2'].apply(lambda x: x.replace('Tulpehocken', 'Berks County').strip())

        # categorize different Philadelphia neighborhoods as belonging in Philadelphia
        philreptowns = ['Blockley', 'Northan Liberties', 'Northern Liberties',
                        'The Northern Libert', 'Passyunk', 'German Town', 'Southwark', 'Borden Town'] # not sure on this last one...
        for town in philreptowns:
            towns['town2'] = towns['town2'].apply(lambda x: x.replace(town, 'Philadelphia'))
        towns = addType(towns)
        # use modified town names
        # try direct match: town (county) name -> crosswalk county
        unmatched_towns2 = towns[towns['county'].apply(lambda x: pd.isnull(x))]['town2']
        towns = directCountyMatch(state_cw, towns, 'town2')
        towns = addType(towns, 'county')

        # use modified town names
        # try fuzzy match: town name -> crosswalk town-county
        unmatched_towns2 = towns[towns['county'].apply(lambda x: pd.isnull(x))]['town2']
        towns = fuzzyMatch(unmatched_towns2, towns, state_cw, primary_dict, dict_matchcol ='primary_city', initial = False, score_threshold = 85)
        towns = addType(towns)

        # use modified town names
        # try fuzzy match: town (county) name -> crosswalk county
        unmatched_towns2_1 = [x for x in towns[towns['county'].apply(lambda x: pd.isnull(x))]['town2'] if x != '']
        towns = fuzzyMatch(unmatched_towns2_1, towns, state_cw, primary_dict, dict_matchcol = 'county', initial = False, score_threshold = 85)
        towns = addType(towns, 'county')

        # manually adjust incorrect matches
        print("\nManual Match\n")
        for town, county, state in zip(['Charleston South Carolina', 'Burlington New Jersey', 'Northumberland County Virginia'],
                                       ['Charleston County', 'Burlington County', 'Northumberland County'],
                                       ['SC', 'NJ', 'VA']):
            print("{} was matched to {}".format(town, county))
            towns.loc[towns[towns['town'] == town].index, ['county', 'state']] = [county, state]
        towns = addType(towns, 'county')

        # use modified town names
        # try fuzzy match: town name -> crosswalk town-county
        unmatched_towns2 = towns[towns['county'].apply(lambda x: pd.isnull(x))]['town2']
        towns = fuzzyMatch(unmatched_towns2, towns, state_cw, primary_dict, dict_matchcol ='primary_city', initial = False, score_threshold = 85)
        towns = addType(towns)

        # use modified town names
        # try fuzzy match: town (county) name -> crosswalk county
        unmatched_towns2_1 = [x for x in towns[towns['county'].apply(lambda x: pd.isnull(x))]['town2'] if x != '']
        towns = fuzzyMatch(unmatched_towns2_1, towns, state_cw, primary_dict, dict_matchcol = 'county', initial = False, score_threshold = 85)
        towns = addType(towns, 'county')

        # manually adjust incorrect matches
        print("\nManual Match\n")
        for town, county, state in zip(['Charleston South Carolina', 'Burlington New Jersey', 'Northumberland County Virginia'],
                                       ['Charleston County', 'Burlington County', 'Northumberland County'],
                                       ['SC', 'NJ', 'VA']):
            print("{} was matched to {}".format(town, county))
            towns.loc[towns[towns['town'] == town].index, ['county', 'state']] = [county, state]
        towns = addType(towns, 'county')

    if state == 'RI':
        # remove instances where Rhode Island and other geo-jurisdictional terms are used
        towns['town2'] = towns['town'].apply(lambda x: x.replace('Rhode Island', '').replace('State ', '').replace('of', '').strip())
        # use modified town names
        # try fuzzy match: town name -> crosswalk town-county
        unmatched_towns2 = towns[towns['county'].apply(lambda x: pd.isnull(x))]['town2']
        towns = fuzzyMatch(unmatched_towns2, towns, state_cw, primary_dict, dict_matchcol ='primary_city', initial = False, score_threshold = 85)
        towns = addType(towns)

        # manually adjust incorrect matches
        print("\nManual Match\n")
        for town, county in zip(['Gloucester', 'Richmond'],
                                ['Providence County', 'Washington County']):
            print("{} was matched to {}".format(town, county))
            towns.loc[towns[towns['town'] == town].index, 'county'] = county
        towns = addType(towns)

    if state == 'SC':
        # remove instances where South Carolina is used, change number to character
        towns['town2'] = towns['town'].apply(lambda x: x.replace('South Carolina', '').replace('96', 'Ninety six').strip())

        # use modified town names
        # use acceptable_cities column
        # try fuzzy match: town (county) name -> crosswalk county
        null_ind = towns[towns['county'].apply(lambda x: pd.isnull(x))].index
        pdict, tn = directTownMatch(state_cw, towns.loc[null_ind], col = 'acceptable_cities', towncol = 'town2')
        towns.loc[null_ind] = tn
        towns = addType(towns, 'county')

        # use modified town names
        # try fuzzy match: town (county) name -> crosswalk county
        unmatched_towns2 = towns[towns['county'].apply(lambda x: pd.isnull(x))]['town2']
        towns = fuzzyMatch(unmatched_towns2, towns, state_cw, primary_dict, dict_matchcol ='primary_city', initial = False, score_threshold = 85)
        towns = addType(towns, 'county')

        # some town names are actually county names
        # use modified town names
        # try fuzzy match: town (county) name -> crosswalk county
        unmatched_towns2_1 = towns[towns['county'].apply(lambda x: pd.isnull(x))]['town2']
        towns = fuzzyMatch(unmatched_towns2_1, towns, state_cw, primary_dict, dict_matchcol ='county', initial = False, score_threshold = 85)
        towns = addType(towns, 'county')

        townlist = ['St. Paul\'s', 'Pee Dee', 'St. George', 'New River', 'Winyaw', 'Broad River', \
                    'Toogoodoo', 'St Pauls', 'Savannah', \
                    'James Island', 'St Andrews'] # last two are manual fixes
        countylist = ['Clarendon County', 'Marion County', 'Dorchester County', 'Beaufort County', 'Georgetown County', 'Beaufort County',
                      'Charleston County', 'Clarendon County', 'Chatham County', \
                      'Charleston County', 'Richland County']
        for town, county in zip(townlist, countylist):
            print("{} was matched to {}".format(town, county))
            towns.loc[towns[towns['town'] == town].index, 'county'] = county
        towns = addType(towns)

        towns.loc[towns[towns['town'] == 'Savannah'].index, 'state'] =  'GA'

    if state == 'MA':
        # remove instances where Massachusetts, MA or State is used
        towns['town2'] = towns['town'].apply(lambda x: x.replace('MA', '').replace('Massachusetts', '').replace('State','').strip())
        # use modified town names
        # try fuzzy match: town (county) name -> crosswalk county
        unmatched_towns2 = towns[towns['county'].apply(lambda x: pd.isnull(x))]['town2']
        towns = fuzzyMatch(unmatched_towns2, towns, state_cw, primary_dict, dict_matchcol ='primary_city', initial = False, score_threshold = 85)
        towns = addType(towns, 'county')

    if state == 'VA':
        # remove instances where Massachusetts, MA or State is used
        towns['town2'] = towns['town'].apply(lambda x: x.replace('VA', '').replace('Virginia', '').replace('Virgina','').strip())
        towns['town2'] = towns['town2'].apply(lambda x: x.replace('State', '').replace(' of ', '').strip())
        # use modified town names
        # try fuzzy match: town (county) name -> crosswalk county
        unmatched_towns2 = towns[towns['county'].apply(lambda x: pd.isnull(x))]['town2']
        towns = fuzzyMatch(unmatched_towns2, towns, state_cw, primary_dict, dict_matchcol ='primary_city', initial = False, score_threshold = 85)
        towns = addType(towns, 'county')

        # some town names are actually county names
        # use modified town names
        # try fuzzy match: town (county) name -> crosswalk county
        unmatched_towns2_1 = towns[towns['county'].apply(lambda x: pd.isnull(x))]['town2']
        towns = fuzzyMatch(unmatched_towns2_1, towns, state_cw, primary_dict, dict_matchcol ='county', initial = False, score_threshold = 85)
        towns = addType(towns, 'county')

        townlist = ['Portsmouth Virginia'] # last two are manual fixes
        countylist = ['Norfolk County']
        for town, county in zip(townlist, countylist):
            print("{} was matched to {}".format(town, county))
            towns.loc[towns[towns['town'] == town].index, 'county'] = county
        towns = addType(towns)
    if state == 'DE':
        # remove instances where Massachusetts, MA or State is used
        towns['town2'] = towns['town'].apply(lambda x: x.replace('Delaware', '').replace('State', '').replace(' of ', '').strip())
        towns['town2'] = towns['town2'].apply(lambda x: x.replace('Kent Company', 'Kent County').strip())
        # use modified town names
        # try fuzzy match: town (county) name -> crosswalk county
        unmatched_towns2 = towns[towns['county'].apply(lambda x: pd.isnull(x))]['town2']
        towns = fuzzyMatch(unmatched_towns2, towns, state_cw, primary_dict, dict_matchcol ='primary_city', initial = False, score_threshold = 85)
        towns = addType(towns, 'county')

        # some town names are actually county names
        # use modified town names
        # try fuzzy match: town (county) name -> crosswalk county
        unmatched_towns2_1 = towns[towns['county'].apply(lambda x: pd.isnull(x))]['town2']
        towns = fuzzyMatch(unmatched_towns2_1, towns, state_cw, primary_dict, dict_matchcol ='county', initial = False, score_threshold = 85)
        towns = addType(towns, 'county')

    # print out all unmatched names
    print("\nFinal Unmatched Names\n")
    t = towns[towns['county'].apply(lambda x: pd.isnull(x))]
    for tn in t['town']:
        print("{} was unable to be matched".format(tn))

    towns = towns[towns['county'].apply(lambda x: not pd.isnull(x))]
    towns['state'] = state
    # only Georgia doesn't have a town2 column
    if state not in ['GA']:
        towns.drop('town2', axis = 1, inplace = True)

    # correct states for certain counties/cities in PA and SC
    if state == 'PA':
        towns.loc[towns[towns['town'] == 'Charleston South Carolina'].index, 'state'] ='SC'
        towns.loc[towns[towns['town'] == 'Northumberland County Virginia'].index, 'state'] = 'VA'
    if state == 'SC':
        towns.loc[towns[towns['town'] == 'Savannah'].index, 'state'] ='GA'


    final_cw = pd.concat([final_cw, towns])


CT MATCHING 

Direct City name - county matches

Hartford was matched to Hartford County directly using the crosswalk
Bolton was matched to Tolland County directly using the crosswalk
Wethersfield was matched to Hartford County directly using the crosswalk
New Haven was matched to New Haven County directly using the crosswalk
Farmington was matched to Hartford County directly using the crosswalk
New London was matched to New London County directly using the crosswalk
Cornwall was matched to Litchfield County directly using the crosswalk
Stamford was matched to Fairfield County directly using the crosswalk
East Hartford was matched to Hartford County directly using the crosswalk
Bristol was matched to Hartford County directly using the crosswalk
Lebanon was matched to New London County directly using the crosswalk
Windsor was matched to Hartford County directly using the crosswalk
Suffield was matched to Hartford County directly using the crosswalk
Berlin was matched to Hartford County

In [30]:
# drop duplicates that occur for some reason
final_cw.drop_duplicates(subset = ['town', 'state'], inplace = True)

In [31]:
# add labels for county and name type to CD_all
CD_all = pd.merge(CD_all, final_cw, on = ['town', 'state'], how = 'left')

In [32]:
# manually input county, state and name type labels
towns = ['Colchester', 'Charleston South Carolina', 'Philadelphia', 'Albany', 'Newark', 'Northumberland County Virginia', 'Savannah', 'City of New York', 'Long Island', 'Portsmouth Virginia']
counties = ['New London County', 'Berkeley County', 'Philadelphia County', 'Albany County', 'Essex County', 'Northumberland County', 'Chatham County', 'New York County', \
            np.nan, 'Rockingham County']
states = ['CT', 'SC', 'PA', 'NY', 'NJ', 'VA', 'GA', 'NY', 'NY', 'VA']
for town, county, state in zip(towns, counties, states):
    if town == 'Northumberland County Virginia':
        CD_all.loc[CD_all[CD_all['town'] == town].index, ['county', 'state', 'name_type']] = [county, state, 'county']
    elif pd.isnull(county):
        CD_all.loc[CD_all[CD_all['town'] == town].index, ['county', 'state', 'name_type']] = [county, state, 'other']
    else:
        CD_all.loc[CD_all[CD_all['town'] == town].index, ['county', 'state', 'name_type']] = [county, state, 'town']

In [33]:
# manually input county, state and name type labels
colonies = ['New Hampshire', 'Massachusetts', 'Rhode Island', 'Connecticut', 'Conecticutt', 'New York', 'New Jersey', 'Pennsylvania', 'Delaware', 'Maryland', \
            'Virginia', 'North Carolina', 'South Carolina', 'Georgia', 'Vermont', 'Delawere', 'Virgina']
abbrev = ['NH', 'MA', 'RI', 'CT', 'CT', 'NY', 'NJ', 'PA', 'DE', 'MD', 'VA', 'NC', 'SC', 'GA', 'VT', 'DE', 'VA']
subsetted_CD_all = CD_all[CD_all['county'].apply(lambda x: pd.isnull(x))]
for colony, abrv in zip(colonies, abbrev):
    selind = subsetted_CD_all[subsetted_CD_all['town'].apply(lambda x: colony in x if not pd.isnull(x) else False)].index
    CD_all.loc[selind, ['county', 'state', 'name_type']] = [np.nan, abrv, 'state']

In [34]:
# manual replacement of counties that changed names or boders
towns = ['Gilmantown', np.nan, 'Berlington New Jersey', np.nan, np.nan, np.nan, np.nan, np.nan, '96 District', \
         'Ninety six District', np.nan, np.nan, np.nan, np.nan, np.nan, 'Trenton New Jersey', 'Princeton New Jersey',
         'Newbury', 'Pembroke', 'Hopkinton', 'Salisbury', 'Canterbury', 'Concord', np.nan, np.nan, np.nan]
oldcounties = ['Belknap', 'Berkeley', 'Camden', 'Carroll', 'Columbia', 'Elk', 'Dorchester', 'Grant', 'Greenwood', \
               'Greenwood', 'Hampden', 'Kershaw', 'King and Queen', 'Lycoming', 'Marion', 'Mercer', 'Mercer',
               'Merrimack', 'Merrimack', 'Merrimack', 'Merrimack', 'Merrimack', 'Merrimack', 'Norfolk', 'Perry', 'Sullivan']
newcounties = ['Strafford', 'Charleston', 'Burlington', 'Strafford', 'Richmond', 'Northumberland', 'Charleston', 'Hampshire', 'Abbeville', \
               'Laurens', 'Hampshire', 'Lancaster', np.nan, 'Northumberland', 'Prince George\'s', 'Hunterdon', 'Middlesex',
               'Hillsborough', 'Hillsborough', 'Hillsborough', 'Hillsborough', 'Rockingham', 'Grafton', 'Suffolk', 'Cumberland', 'Cheshire']
for town, oldcounty, newcounty in zip(towns, oldcounties, newcounties):
    if not pd.isnull(town):
        ind = CD_all[[t == town and oldcounty + ' County' == oc for t, oc in zip(CD_all['town'], CD_all['county'])]].index
    else:
        ind = CD_all[CD_all['county'] == oldcounty + ' County'].index
    if pd.isnull(newcounty):
        CD_all.loc[ind, 'county']  = newcounty
    else:
        CD_all.loc[ind, 'county']  = newcounty + ' County'
    if len(ind) == 0:
        print(town, oldcounty)
#make corrections
ind = CD_all[[c == 'Charleston County' and s == 'MD' for c, s in zip(CD_all['county'], CD_all['state'])]].index
CD_all.loc[ind, 'county'] = 'Dorchester County'

ind = CD_all[[c == 'Prince George\'s County' and s == 'SC' for c, s in zip(CD_all['county'], CD_all['state'])]].index
CD_all.loc[ind, 'county'] = 'Georgetown County'

nan Elk
nan Lycoming
nan Perry


In [35]:
# manual fixes of assignments
CD_all.loc[CD_all[CD_all['county'] == 'Prince George\'s County'].index, 'county'] = 'Prince Georges County'
CD_all.loc[CD_all[CD_all['county'] == 'Queen Anne\'s County'].index, 'county'] = 'Queen Annes County'
CD_all.loc[CD_all[CD_all['county'] == 'St Mary\'s County'].index, 'county'] = 'St Marys County'

# more manual fixes
CD_all.loc[CD_all[CD_all['town'] == 'Doden Maryland'].index, ['county', 'name_type']] = ['Anne Arundel County', 'county']
CD_all.loc[CD_all[CD_all['town'] == 'Huntington New Jersey'].index, ['county', 'name_type']] = ['Hunterdon County', 'county']
# adding state labels
CD_all.loc[CD_all[[pd.isnull(t) and not pd.isnull(s) and s != 'FR' for t, s in zip(CD_all['town'], CD_all['state'])]].index, 'name_type'] = 'state'

CD_all.loc[CD_all[CD_all['town'] == 'Kittery'].index, ['county', 'name_type']] = ['York County', 'town']
CD_all.loc[CD_all[CD_all['town'] == 'Kensignton'].index, ['county', 'name_type']] = ['Philadelphia County', 'neighborhood']
CD_all.loc[CD_all[CD_all['town'] == 'York'].index, ['county', 'name_type']] = ['York County', 'town']
CD_all.loc[CD_all[CD_all['town'] == 'Wells'].index, ['county', 'name_type']] = ['York County', 'town']
CD_all.loc[CD_all[CD_all['town'] == 'James City County Virginia'].index, ['county']] = ['James City County']

CD_all.loc[CD_all[CD_all['town'] == 'Cumb County Pennsylvania'].index, ['county']] = ['Cumberland County']
CD_all.loc[CD_all[CD_all['town'] == 'Cumberland'].index, 'name_type'] = 'county'
CD_all.loc[CD_all[CD_all['town'] == 'york town pennsylvania'].index, ['county', 'name_type']] = ['Philadelphia County', 'town']

# Manual fixes of name_type
CD_all.loc[CD_all['town'] == 'Virginia and Philadelphia', 'name_type'] = 'state'
town_ind = CD_all[[nt == 'county' and ('County' not in c and 'Co ' not in c and 'Country' not in c and
                                       'Talbot' not in c and 'Rockingham' not in c and 'Delaware' not in c)
                       if not pd.isnull(c) else False for nt, c in zip(CD_all['name_type'], CD_all['town'])]].index
CD_all.loc[town_ind, 'name_type'] = 'town'

# some chester counties are mislabelled
chester_ind = CD_all[CD_all['town'].apply(lambda x: 'Chester' in x and 'Massachusetts' not in x if not pd.isnull(x) else False)].index
CD_all.loc[chester_ind, ['county', 'name_type']] = ['Chester County', 'county']

# correctly label some counties

county_ind = CD_all[[nt == 'town' and ('County of Philadelphia' in c or 'Paxton Tot Dauphin County' in c or 'Somerset County New Jersey' in c)
                     for nt, c in zip(CD_all['name_type'], CD_all['town'])]].index
CD_all.loc[county_ind, 'name_type'] = 'county'

In [36]:
CD_all['6p_total'] = CD_all['6p_Dollar'] + CD_all['6p_Cents']/100

In [37]:
CD_all.loc[CD_all[CD_all['name_type'].apply(lambda x: pd.isnull(x))].index, 'name_type'] = 'other'

In [38]:
grouped_assets = CD_all.groupby(['county', 'state']).agg({'6p_total': ['sum', 'count', 'mean']})
grouped_assets.columns = grouped_assets.columns.map('_'.join).str.strip('_')
grouped_assets.columns = ['6p_total', 'debtholder_county_count', 'mean_6p_held']
grouped_assets = grouped_assets.reset_index()

In [39]:
countyPop = pd.read_csv('../../Data/CensusData/countyPopulation.csv').drop(0).reset_index(drop = True)
sub_cols =['Area Name', 'State/US Abbreviation', 'Total Population', 'White Male', 'White Male Age 16 Years and over']
county_subset = countyPop[sub_cols]
merged_geography = pd.merge(county_subset, grouped_assets, left_on = ['Area Name', 'State/US Abbreviation'], right_on=['county', 'state'], how = 'right')
final_asset_data = merged_geography.drop(['Area Name', 'State/US Abbreviation'], axis = 1)
final_asset_data['debt_per_capita'] = final_asset_data['6p_total']/final_asset_data['Total Population'].apply(lambda x: float(x))
final_asset_data['debt_per_white_male'] = final_asset_data['6p_total']/final_asset_data['White Male'].apply(lambda x: float(x))

In [40]:
# for david to use when doing histograms of occupations vs no occupations and grouping occupations
CD_all.to_csv('../../Data/Post1790/aggregated_CD_noname.csv')
# for Jiacheng to use when making table
CD_all[['town', 'state', 'county', 'name_type']].drop_duplicates().to_csv('../../Data/AssetGeography/countymapping.csv')
# for Maria to use when making maps
final_asset_data.to_csv('../../Data/AssetGeography/county_debt_total.csv')

In [41]:
print(CD_all[CD_all['county'].apply(lambda x: not pd.isnull(x))]['6p_Dollar'].sum()/CD_all['6p_Dollar'].sum())
print(CD_all[CD_all['county'].apply(lambda x: not pd.isnull(x))].shape[0]/CD_all.shape[0])

0.6135230744005248
0.7534841215444368
