# Outline
1. Import continental and assumed debt for each state
2. Map towns to county names using mapping file + fuzzy string matching
3. Use county shape files to create maps

## Documentation Notes
1. All non-exact matches will be printed out when doing merges (exact matches are not printed)

## Cleaning Questions
1. What do we do about an entry that has entries for some debt types, but not for others? Do we want to drop those entries, impute the values, or set NA values to 0?
2. What do we do about entries that have a state, but no town (ie: State of Connecticut)? What about no state?

In [68]:
import numpy as np
import pandas as pd
from rapidfuzz import process

In [69]:
def combineCols(df, num = 3):
    change_df_agg = pd.DataFrame(columns = ['old', 'new', 'type'])
    for col in ['town', 'state', 'occupation']:
        # " ".join(x.split()) removes all excess whitespace
        # creates set with all unique instances of column
        if num == 3:
            df[col] = [set([" ".join(x.split()) for x in [t1, t2, t3] if not pd.isnull(x)]) for t1, t2, t3 in zip(df[col+'1'],
                                                                                                                  df[col+'2'],
                                                                                                                  df[col+'3'])]
        else:
            df[col] = [set([" ".join(x.split()) for x in [t1, t2] if not pd.isnull(x)]) for t1, t2 in zip(df[col+'1'],
                                                                                                          df[col+'2'])]
        if not any(df[col].apply(lambda x: len(x) > 1).tolist()):
            # change set to string
            print("reformatting {}".format(col))
            df[col] = df[col].apply(lambda x: x.pop() if x != set() else np.nan)
        else:
            print("{} column has multiple unique entries".format(col))
            print("see table at end for new entries")
            # keep the value that has the most characters, otherwise change set to string
            old = df[df[col].apply(lambda x: len(x) > 1)][col]
            df[col] = df[col].apply(lambda x: x.pop() if len(x) == 1 else np.nan if x == set() else max(list(x), key=len))
            # new dataframe to keep track of changes
            # create copy of change_df that removes all duplicates by turning old column, which is
            # of type set to type string
            change_df = pd.DataFrame([old, df.loc[old.index][col]]).T
            change_df.columns = ['old', 'new']
            change_df['type'] = col
            change_df_str = change_df.copy()
            change_df_str['old'] = change_df_str['old'].astype(str)
            change_df_str = change_df_str.drop_duplicates()
            # add filtered database of changes to aggregate dataset
            change_df_agg = pd.concat([change_df_agg, change_df.loc[change_df_str.index]])

    # add functions to combine asset totals, handle missing debt values

    return df[['6p_Dollar', '6p_Cents', '6p_def_Dollar', '6p_def_Cents','3p_Dollar', '3p_Cents',
               'town', 'state', 'occupation']], change_df_agg

In [70]:
"""ind1, ind2, ind3 = CT_CD[['6p_Dollar', '6p_Cents']].dropna(thresh = 1).index,
                   CT_CD[['6p_def_Dollar', '6p_def_Cents']].dropna(thresh = 1).index,
                   CT_CD[['3p_Dollar', '3p_Cents']].dropna(thresh = 1).index
ind = set(ind1).intersection(ind2).intersection(ind3)"""

"ind1, ind2, ind3 = CT_CD[['6p_Dollar', '6p_Cents']].dropna(thresh = 1).index,\n                   CT_CD[['6p_def_Dollar', '6p_def_Cents']].dropna(thresh = 1).index,\n                   CT_CD[['3p_Dollar', '3p_Cents']].dropna(thresh = 1).index\nind = set(ind1).intersection(ind2).intersection(ind3)"

## Import Data
1. Import CD and ASD for each state, combine the multiple town/state/occupation columns (if they exist) into one
2. Concatenate all the separate datasets into two (one CD and one ASD)

### Connecticut

In [71]:
CD_all = pd.DataFrame(columns = ['6p_Dollar', '6p_Cents', '6p_def_Dollar', '6p_def_Cents','3p_Dollar', '3p_Cents',
                                  'town', 'state', 'occupation'])
ASD_all = pd.DataFrame(columns = ['6p_Dollar', '6p_Cents', '6p_def_Dollar', '6p_def_Cents','3p_Dollar', '3p_Cents',
                                  'town', 'state', 'occupation'])

In [72]:
# importing desired columns and rename
CT_CD_raw = pd.read_excel("../../Data/Post1790/CT/CT_post1790_CD_ledger.xlsx",
                      header = 13, usecols = 'H, I, J, K, L, N, O, X, Y, Z, AA, AB, AD, AE, AN, AO, AP, AQ, AR, AT, AU')
CT_CD_raw.columns = ['First Name', 'Last Name', 'town1', 'state1', 'occupation1', '6p_Dollar', '6p_Cents',
                     'First Name.1', 'Last Name.1', 'town2', 'state2', 'occupation2', '6p_def_Dollar', '6p_def_Cents',
                     'First Name.2', 'Last Name.2', 'town3', 'state3', 'occupation3', '3p_Dollar', '3p_Cents', ]
# unify occupation, town and state columns
CT_CD, change_df = combineCols(CT_CD_raw)
CD_all = pd.concat([CD_all, CT_CD])
change_df

town column has multiple unique entries
see table at end for new entries
reformatting state
reformatting occupation


Unnamed: 0,old,new,type
526,"{Milford, Miford}",Milford,town
799,"{Nowich, Norwich}",Norwich,town


In [73]:
# importing desired columns and rename
CT_ASD_raw = pd.read_excel("../../Data/Post1790/CT/CT_post1790_ASD_ledger.xlsx",
                       header = 13, usecols = 'H, I, J, K, L, N, O, X, Y, Z, AA, AB, AD, AE, AN, AO, AP, AQ, AR, AT, AU')
CT_ASD_raw.columns = ['First Name', 'Last Name', 'town1', 'state1', 'occupation1', '6p_Dollar', '6p_Cents',
                     'First Name.1', 'Last Name.1', 'town2', 'state2', 'occupation2', '6p_def_Dollar', '6p_def_Cents',
                     'First Name.2', 'Last Name.2', 'town3', 'state3', 'occupation3', '3p_Dollar', '3p_Cents', ]
# unify occupation, town and state columns
CT_ASD, change_df = combineCols(CT_ASD_raw)
ASD_all = pd.concat([ASD_all, CT_ASD])
change_df

town column has multiple unique entries
see table at end for new entries
reformatting state
occupation column has multiple unique entries
see table at end for new entries


Unnamed: 0,old,new,type
125,"{Norwall, Norwalk}",Norwall,town
811,"{1st Society in Lyme, Treasuer 1st Society in ...",Treasuer 1st Society in Lyme,occupation


### Georgia
No need to do additional cleaning because there's only one state/city/occupation column

In [74]:
# importing desired columns and rename
# prepare loan dataset
GA_CD_raw = pd.read_excel("../../Data/Post1790/GA/T694_GA_Loan_Office_CD.xlsx",
                      header = 10, usecols = 'Q, R, S, T, U, Z, AA, AB, AC, AD, AE')
GA_CD_raw.columns = ['First Name', 'Last Name', 'town', 'state', 'occupation', '6p_Dollar', '6p_Cents',
                 '6p_def_Dollar', '6p_def_Cents', '3p_Dollar', '3p_Cents']
GA_CD = GA_CD_raw[['6p_Dollar', '6p_Cents', '6p_def_Dollar', '6p_def_Cents','3p_Dollar', '3p_Cents',
                   'town', 'state', 'occupation']]
CD_all = pd.concat([CD_all, GA_CD])

### Maryland

In [75]:
#prepare loan dataset
MD_CD_raw = pd.read_excel("../../Data/Post1790/MD/MD_post1790_CD.xlsx",
                      header = 11, usecols = 'G, H, I, J, K, L, M, U, V, W, X, Y, Z, AA, AI, AJ, AK, AL, AM, AN, AO')
MD_CD_raw.columns = ['First Name', 'Last Name', 'town1', 'state1', 'occupation1', '6p_Dollar', '6p_Cents',
                 'First Name.1', 'Last Name.1', 'town2', 'state2', 'occupation2', '6p_def_Dollar', '6p_def_Cents',
                 'First Name.2', 'Last Name.2', 'town3', 'state3', 'occupation3', '3p_Dollar', '3p_Cents', ]
# unify occupation, town and state columns
MD_CD , change_df = combineCols(MD_CD_raw)
CD_all = pd.concat([CD_all, MD_CD])
change_df

reformatting town
reformatting state
reformatting occupation


Unnamed: 0,old,new,type


In [76]:
#prepare loan dataset
MD_ASD_raw = pd.read_excel("../../Data/Post1790/MD/MD_post1790_ASD.xlsx",
                      header = 11, usecols = 'G, H, I, J, K, L, M, U, V, W, X, Y, Z, AA, AI, AJ, AK, AL, AM, AN, AO')
MD_ASD_raw.columns = ['First Name', 'Last Name', 'town1', 'state1', 'occupation1', '6p_Dollar', '6p_Cents',
                 'First Name.1', 'Last Name.1', 'town2', 'state2', 'occupation2', '6p_def_Dollar', '6p_def_Cents',
                 'First Name.2', 'Last Name.2', 'town3', 'state3', 'occupation3', '3p_Dollar', '3p_Cents', ]
# unify occupation, town and state columns
MD_ASD , change_df = combineCols(MD_ASD_raw)
ASD_all = pd.concat([ASD_all, MD_ASD])
change_df

reformatting town
reformatting state
reformatting occupation


Unnamed: 0,old,new,type


### North Carolina

In [77]:
NC_CD_raw = pd.read_excel("../../Data/Post1790/NC/T695_R4_NC_CD.xlsx",
                      header = 11, usecols = 'J, K, L, M, N, W, X, Z, AA, AC, AD ')
NC_CD_raw.columns = ['First Name', 'Last Name', 'town', 'state', 'occupation', '6p_Dollar', '6p_Cents',
                 '6p_def_Dollar', '6p_def_Cents', '3p_Dollar', '3p_Cents']
NC_CD = NC_CD_raw[['6p_Dollar', '6p_Cents', '6p_def_Dollar', '6p_def_Cents','3p_Dollar', '3p_Cents',
                   'town', 'state', 'occupation']]
CD_all = pd.concat([CD_all, NC_CD])

In [78]:
NC_ASD_raw = pd.read_excel("../../Data/Post1790/NC/T695_R3_NC_ASD.xlsx", header = 10, usecols = 'H, I, J, K, L, P, Q, R, S, T, U')
NC_ASD_raw.columns = ['First Name', 'Last Name', 'town', 'state', 'occupation',  '6p_Dollar', '6p_Cents',
                      '6p_def_Dollar', '6p_def_Cents', '3p_Dollar', '3p_Cents']
NC_ASD = NC_ASD_raw[['6p_Dollar', '6p_Cents', '6p_def_Dollar', '6p_def_Cents','3p_Dollar', '3p_Cents',
                         'town', 'state', 'occupation']]
ASD_all = pd.concat([ASD_all, NC_ASD])

### New Hampshire

In [79]:
#prepare loan dataset
NH_CD_raw = pd.read_excel("../../Data/Post1790/NH/T652_R6_New_Hampshire_CD.xlsx",
                      header = 10, usecols = 'I, J, K, L, M, N, O, P, Q, R, S')
NH_CD_raw.columns = ['First Name', 'Last Name', 'town', 'state', 'occupation', '6p_Dollar', '6p_Cents',
                     '6p_def_Dollar', '6p_def_Cents', '3p_Dollar', '3p_Cents']
NH_CD = NH_CD_raw[['6p_Dollar', '6p_Cents', '6p_def_Dollar', '6p_def_Cents','3p_Dollar', '3p_Cents',
                   'town', 'state', 'occupation']]
CD_all = pd.concat([CD_all, NH_CD])

In [80]:
NH_ASD_raw = pd.read_excel("../../Data/Post1790/NH/T652_New_Hampshire_ASD.xlsx", header = 12,
                       usecols = 'G, H, I, J, K, M, N, V, W, X, Y, Z, AA, AB, AK, AL, AM, AN')
NH_ASD_raw.columns = ['First Name', 'Last Name', 'town1', 'state1', 'occupation1', '6p_Dollar', '6p_Cents',
                      'First Name.1', 'Last Name.1', 'town2', 'state2', 'occupation2', '6p_def_Dollar', '6p_def_Cents',
                      'First Name.2', 'Last Name.2', '3p_Dollar', '3p_Cents']
# unify occupation, town and state columns
NH_ASD, change_df = combineCols(NH_ASD_raw, 2)
ASD_all = pd.concat([ASD_all, NH_ASD])
change_df

reformatting town
reformatting state
reformatting occupation


Unnamed: 0,old,new,type


### New York
Doesn't have town/occupation/state

In [81]:
"""#prepare loan dataset
NY_CD_raw = pd.read_excel("../../Data/Post1790/NY/NY_1790_CD.xlsx",
                      header = 11, usecols = 'H, I, M, N, X, Y, AC, AD, AM, AN, AR, AS')
NY_CD_raw.columns = ['First Name', 'Last Name', '6p_Dollar', '6p_Cents',
                 'First Name.1', 'Last Name.1', '6p_def_Dollar', '6p_def_Cents',
                 'First Name.2', 'Last Name.2', '3p_Dollar', '3p_Cents']
NY_CD_raw['state'] = np.nan
NY_CD_raw['town'] = np.nan
NY_CD_raw['occupation'] = np.nan
NY_CD = NY_CD_raw[['6p_Dollar', '6p_Cents', '6p_def_Dollar', '6p_def_Cents','3p_Dollar', '3p_Cents',
                   'town', 'state', 'occupation']]
CD_all = pd.concat([CD_all, NY_CD])
"""

'#prepare loan dataset\nNY_CD_raw = pd.read_excel("../../Data/Post1790/NY/NY_1790_CD.xlsx",\n                      header = 11, usecols = \'H, I, M, N, X, Y, AC, AD, AM, AN, AR, AS\')\nNY_CD_raw.columns = [\'First Name\', \'Last Name\', \'6p_Dollar\', \'6p_Cents\',\n                 \'First Name.1\', \'Last Name.1\', \'6p_def_Dollar\', \'6p_def_Cents\',\n                 \'First Name.2\', \'Last Name.2\', \'3p_Dollar\', \'3p_Cents\']\nNY_CD_raw[\'state\'] = np.nan\nNY_CD_raw[\'town\'] = np.nan\nNY_CD_raw[\'occupation\'] = np.nan\nNY_CD = NY_CD_raw[[\'6p_Dollar\', \'6p_Cents\', \'6p_def_Dollar\', \'6p_def_Cents\',\'3p_Dollar\', \'3p_Cents\',\n                   \'town\', \'state\', \'occupation\']]\nCD_all = pd.concat([CD_all, NY_CD])\n'

In [82]:
"""#prepare loan dataset
NY_ASD_raw = pd.read_excel("../../Data/Post1790/NY/NY_1790_CD.xlsx",
                          header = 11, usecols = 'H, I, M, N, X, Y, AC, AD, AM, AN, AR, AS')
NY_ASD_raw.columns = ['First Name', 'Last Name', '6p_Dollar', '6p_Cents',
                     'First Name.1', 'Last Name.1', '6p_def_Dollar', '6p_def_Cents',
                     'First Name.2', 'Last Name.2', '3p_Dollar', '3p_Cents']
NY_ASD_raw['state'] = np.nan
NY_ASD_raw['town'] = np.nan
NY_ASD_raw['occupation'] = np.nan
NY_ASD = NY_ASD_raw[['6p_Dollar', '6p_Cents', '6p_def_Dollar', '6p_def_Cents','3p_Dollar', '3p_Cents',
                   'town', 'state', 'occupation']]
ASD_all = pd.concat([ASD_all, NY_ASD])
"""

'#prepare loan dataset\nNY_ASD_raw = pd.read_excel("../../Data/Post1790/NY/NY_1790_CD.xlsx",\n                          header = 11, usecols = \'H, I, M, N, X, Y, AC, AD, AM, AN, AR, AS\')\nNY_ASD_raw.columns = [\'First Name\', \'Last Name\', \'6p_Dollar\', \'6p_Cents\',\n                     \'First Name.1\', \'Last Name.1\', \'6p_def_Dollar\', \'6p_def_Cents\',\n                     \'First Name.2\', \'Last Name.2\', \'3p_Dollar\', \'3p_Cents\']\nNY_ASD_raw[\'state\'] = np.nan\nNY_ASD_raw[\'town\'] = np.nan\nNY_ASD_raw[\'occupation\'] = np.nan\nNY_ASD = NY_ASD_raw[[\'6p_Dollar\', \'6p_Cents\', \'6p_def_Dollar\', \'6p_def_Cents\',\'3p_Dollar\', \'3p_Cents\',\n                   \'town\', \'state\', \'occupation\']]\nASD_all = pd.concat([ASD_all, NY_ASD])\n'

### Pennsylvania

In [83]:
#prepare loan dataset
PA_CD_raw = pd.read_excel("../../Data/Post1790/PA/PA_post1790_CD.xlsx",
                      header = 11, usecols = 'G, H, I, J, K, L, M, U, V, W, X, Y, Z, AA, AJ, AK, AL, AM, AN, AO, AP')
PA_CD_raw.columns = ['First Name', 'Last Name', 'town1', 'state1', 'occupation1', '6p_Dollar', '6p_Cents',
                 'First Name.1', 'Last Name.1', 'town2', 'state2', 'occupation2', '6p_def_Dollar', '6p_def_Cents',
                 'First Name.2', 'Last Name.2', 'town3', 'state3', 'occupation3', '3p_Dollar', '3p_Cents', ]
# unify occupation, town and state columns
PA_CD, change_df = combineCols(PA_CD_raw)
CD_all = pd.concat([CD_all, PA_CD])
change_df

town column has multiple unique entries
see table at end for new entries
reformatting state
occupation column has multiple unique entries
see table at end for new entries


Unnamed: 0,old,new,type
14,"{Carlisle, Carlisle Pennsylvania}",Carlisle Pennsylvania,town
39,"{Conecticutt, Connecticut}",Conecticutt,town
40,"{New Castle Delaware, Newcastle Delaware}",New Castle Delaware,town
113,"{Philadelphia County, Philadelphia}",Philadelphia County,town
131,"{State of Delawere, State of Delaware}",State of Delawere,town
...,...,...,...
1143,"{Merchant, Esquire}",Merchant,occupation
1181,"{Shop Keeper, Shopkeeper}",Shop Keeper,occupation
1200,"{Trust for the Estate of Robert Dill, in trust...",in trust for the Estate of Robert Dill,occupation
1224,"{Adm to the Est of William Barrell Decesed, Ad...",Administrator to the Estate of William Barrell...,occupation


### Rhode Island

In [84]:
#prepare loan dataset
RI_CD_raw = pd.read_excel("../../Data/Post1790/RI/T653_Rhode_Island_CD.xlsx",
                      header = 11, usecols = 'G, H, I, J, K, L, M, U, V, W, X, Y, Z, AA, AI, AJ, AK, AL, AM, AN, AO')
RI_CD_raw.columns = ['First Name', 'Last Name', 'town1', 'state1', 'occupation1', '6p_Dollar', '6p_Cents',
                     'First Name.1', 'Last Name.1', 'town2', 'state2', 'occupation2', '6p_def_Dollar', '6p_def_Cents',
                     'First Name.2', 'Last Name.2', 'town3', 'state3', 'occupation3', '3p_Dollar', '3p_Cents', ]
RI_CD, change_df = combineCols(RI_CD_raw)
CD_all = pd.concat([CD_all, RI_CD])
change_df

town column has multiple unique entries
see table at end for new entries
reformatting state
reformatting occupation


Unnamed: 0,old,new,type
26,"{Glocester, Gloucester}",Gloucester,town
371,"{Chalestown, Charlestown}",Charlestown,town
435,"{North Kingstone, North Kingston}",North Kingstone,town
462,"{North Kingstown, North Kingston}",North Kingstown,town
499,"{Smithfield, Smithfeild}",Smithfield,town


In [85]:
#prepare loan dataset
RI_ASD_raw = pd.read_excel("../../Data/Post1790/RI/T653_Rhode_Island_ASD.xlsx",
                          header = 11, usecols = 'H, I, J, K, L, N, O, X, Y, Z, AA, AB, AD, AE, AN, AO, AP, AQ, AR, AT, AU')
RI_ASD_raw.columns = ['First Name', 'Last Name', 'town1', 'state1', 'occupation1', '6p_Dollar', '6p_Cents',
                     'First Name.1', 'Last Name.1', 'town2', 'state2', 'occupation2', '6p_def_Dollar', '6p_def_Cents',
                     'First Name.2', 'Last Name.2', 'town3', 'state3', 'occupation3', '3p_Dollar', '3p_Cents', ]
RI_ASD, change_df = combineCols(RI_ASD_raw)
ASD_all = pd.concat([CD_all, RI_ASD])
change_df

reformatting town
reformatting state
reformatting occupation


Unnamed: 0,old,new,type


### South Carolina

In [86]:
#prepare loan dataset
SC_CD_raw = pd.read_excel("../../Data/Post1790/SC/Post_1790_South_Carolina_CD.xlsx",
                      header = 11, usecols = 'D, E, F, G, H, M, N, S, T, U, V, W, AB, AC, AH, AI, AJ, AK, AL, AQ, AR')
SC_CD_raw.columns = ['First Name', 'Last Name', 'town1', 'state1', 'occupation1', '6p_Dollar', '6p_Cents',
                 'First Name.1', 'Last Name.1', 'town2', 'state2', 'occupation2', '6p_def_Dollar', '6p_def_Cents',
                 'First Name.2', 'Last Name.2', 'town3', 'state3', 'occupation3', '3p_Dollar', '3p_Cents', ]
SC_CD, change_df = combineCols(SC_CD_raw)
CD_all = pd.concat([CD_all, SC_CD])
change_df

town column has multiple unique entries
see table at end for new entries
reformatting state
occupation column has multiple unique entries
see table at end for new entries


Unnamed: 0,old,new,type
50,"{Camden, Camden Planter}",Camden Planter,town
94,"{Peedee, Pee Dee}",Pee Dee,town
270,"{Long Cames, Long Cane}",Long Cames,town
292,"{Charleston, Charlestom}",Charleston,town
45,"{as Executor to Henry Coram, as Executor to Jo...",as Executor to John Couturier,occupation
50,"{as Executor Ely Kershaw, Planter as Executor ...",Planter as Executor Ely Kershaw,occupation
182,"{Executor Philip Hawkins, Philip Hawkins}",Executor Philip Hawkins,occupation
256,"{as Guardian to Mary Deborah L. Gowdey, Guardi...",as Guardian to Mary Deborah L. Gowdey,occupation
305,"{Merchants, Charleston Merchants}",Charleston Merchants,occupation
383,"{Physician, Assignee of James Simons}",Assignee of James Simons,occupation


In [87]:
#prepare loan dataset
SC_ASD_raw = pd.read_excel("../../Data/Post1790/SC/Post_1790_South_Carolina_ASD_transfers_removed.xlsx", header = 11,
                       usecols = 'D, E, F, G, H, M, N, O')
SC_ASD_raw.columns = ['First Name', 'Last Name', 'town', 'state', 'occupation', '6p_Dollar', '6p_def_Dollar','3p_Dollar']
for col in ['6p_', '6p_def_', '3p_']:
    SC_ASD_raw[col+'Cents'] = SC_ASD_raw[col+'Dollar'] - np.round(SC_ASD_raw[col+'Dollar'], 0)
    SC_ASD_raw[col+'Dollar'] = np.round(SC_ASD_raw['6p_Dollar'], 0)
SC_ASD = SC_ASD_raw[['6p_Dollar', '6p_Cents', '6p_def_Dollar', '6p_def_Cents', '3p_Dollar', '3p_Cents',
                     'town', 'state', 'occupation']]
ASD_all = pd.concat([ASD_all, SC_ASD])

### Virginia

In [88]:
"""
#prepare loan dataset
VA_CD_raw = pd.read_excel("../../Data/Post1790/VA/VA_CD.xlsx",
                      header = 11, usecols = 'H, I, K, L, U, V, X, Y, AH, AI, AK, AL')
VA_CD_raw.columns = ['First Name', 'Last Name', '6p_Dollar', '6p_Cents',
                 'First Name.1', 'Last Name.1', '6p_def_Dollar', '6p_def_Cents',
                 'First Name.2', 'Last Name.2', '3p_Dollar', '3p_Cents']
VA_CD_raw['state'] = np.nan
VA_CD_raw['town'] = np.nan
VA_CD_raw['occupation'] = np.nan
VA_CD = VA_CD_raw[['6p_Dollar', '6p_Cents', '6p_def_Dollar', '6p_def_Cents', '3p_Dollar', '3p_Cents',
                   'town', 'state', 'occupation']]
CD_all = pd.concat([CD_all, VA_CD])
"""

'\n#prepare loan dataset\nVA_CD_raw = pd.read_excel("../../Data/Post1790/VA/VA_CD.xlsx",\n                      header = 11, usecols = \'H, I, K, L, U, V, X, Y, AH, AI, AK, AL\')\nVA_CD_raw.columns = [\'First Name\', \'Last Name\', \'6p_Dollar\', \'6p_Cents\',\n                 \'First Name.1\', \'Last Name.1\', \'6p_def_Dollar\', \'6p_def_Cents\',\n                 \'First Name.2\', \'Last Name.2\', \'3p_Dollar\', \'3p_Cents\']\nVA_CD_raw[\'state\'] = np.nan\nVA_CD_raw[\'town\'] = np.nan\nVA_CD_raw[\'occupation\'] = np.nan\nVA_CD = VA_CD_raw[[\'6p_Dollar\', \'6p_Cents\', \'6p_def_Dollar\', \'6p_def_Cents\', \'3p_Dollar\', \'3p_Cents\',\n                   \'town\', \'state\', \'occupation\']]\nCD_all = pd.concat([CD_all, VA_CD])\n'

In [89]:
#prepare loan dataset
VA_ASD_raw = pd.read_excel("../../Data/Post1790/VA/VA_ASD.xlsx", header = 11,
                       usecols = 'D, E, F, G, N, O, U, V, W, X, AE, AF, AL, AM, AN, AO, AW, AX')
VA_ASD_raw.columns = ['First Name', 'Last Name', 'town1', 'occupation1', '6p_Dollar', '6p_Cents',
                  'First Name.1', 'Last Name.1', 'town2', 'occupation2', '6p_def_Dollar', '6p_def_Cents',
                  'First Name.2', 'Last Name.2', 'town3', 'occupation3', '3p_Dollar', '3p_Cents']
VA_ASD_raw['state1'] = np.nan
VA_ASD_raw['state2'] = np.nan
VA_ASD_raw['state3'] = np.nan
VA_ASD, change_df = combineCols(VA_ASD_raw)
ASD_all = pd.concat([ASD_all, VA_ASD])
change_df

town column has multiple unique entries
see table at end for new entries
reformatting state
occupation column has multiple unique entries
see table at end for new entries


Unnamed: 0,old,new,type
0,"{Town of Petersburg, The Town Petersburg}",The Town Petersburg,town
1,"{Richmond, City of Richmond}",City of Richmond,town
2,"{Dinwiddie Country, Dinwiddie County}",Dinwiddie Country,town
3,"{Town of Petersbung, Town of Petersburg, The T...",The Town Petersburg,town
4,"{Halifax County, Halifox County}",Halifax County,town
...,...,...,...
743,"{In trust & co I Ball, In trust for J Ball}",In trust & co I Ball,occupation
777,"{Trust for Geo Morrison, In trust for George M...",In trust for George Morrison London,occupation
831,"{In trust for Jn Robinson, In trust for Jn Rob...",In trust for Jn Robinson RB,occupation
875,"{Executors James Minor, Exec James Minor Louis...",Exec James Minor Louisa Co,occupation


## Mapping Town/City to Counties - INCOMPLETE
1. Connecticut: Referencing <a href = "https://ctstatelibrary.org/cttowns/counties">https://ctstatelibrary.org/cttowns/counties</a> I found that Huntington is now called Shelton and Chatham is now called East Hampton. The other two cases below are not mappable because those are not valid town names.

In [250]:
final_cw = pd.DataFrame(columns = ['town', 'county', 'state'])
for state in list_of_states:
    # create list of towns for each state
    towns = CT_CD[CT_CD['state'] == state][['town']].drop_duplicates()
    towns = towns[towns['town'].apply(lambda x: not pd.isnull(x))]
    # state crosswalk
    state_cw = city_county_cw[city_county_cw['state'] == state]

    # try initial merge, add counties
    primary_dict = dict(zip(state_cw['primary_city'],state_cw['county']))
    towns['county'] = towns['town'].apply(lambda x: primary_dict.get(x, np.nan))

    # try fuzzy string match merge
    towns = fuzzyTownMatch(towns, state_cw)

    if state == 'CT':
        # modify towns that changed names - see note at very top
        towns['town2'] = towns['town'].apply(lambda x: x.replace('Huntington', 'Shelton').replace('Chatham', 'East Hampton'))
        towns[towns['county'].apply(lambda x: pd.isnull(x))]

    towns = fuzzyTownMatch(towns, state_cw, False)
    towns = towns[towns['county'].apply(lambda x: not pd.isnull(x))]
    towns['state'] = state
    final_cw = pd.concat([final_cw, towns.drop('town2', axis = 1)])

NameError: name 'list_of_states' is not defined

In [266]:
CT_CD['debt_total'] = CT_CD['6p_Dollar'].fillna(0) + CT_CD['6p_Cents'].fillna(0)
CT_CD.groupby('county')['debt_total'].sum().to_csv('../../Data/CT_debt_temp.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  CT_CD['debt_total'] = CT_CD['6p_Dollar'].fillna(0) + CT_CD['6p_Cents'].fillna(0)
