<h3> Data Cleaning </h3>

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [5]:
import numpy as np
import pandas as pd
from rapidfuzz import process
from os.path import exists
import itertools

In [6]:
def stringConvert(x):
    return x.replace("  ", " ") if type(x) == str else ""

In [7]:
def combineLists(lst):
    returnlst = []
    for sublist in lst:
        if type(sublist) == list:
            returnlst.extend([item for item in sublist])
        else:
            returnlst.append(sublist)
    return returnlst

In [17]:
#function that makes dictionary that combines 3 full name columns into 1
def genFullNameList(namelst): 
    #remove duplicates and nulls
    namelst = list(set([name for name in namelst if not pd.isnull(name)]))
    namelst = sorted(namelst, key=len)
    #remove names that are really similar
    namelstnew = namelst
    if len(namelst) > 1:
        namelstnew = []
        name1 = namelst[0]
        namelstnew.append(name1)
        for name in namelst[1:]:
            score1  = process.extract(name1, [name])[0][1]
            #only add names if they are dissimilar - fuzzy score 70 or less
            if score1 <= 70:
                namelstnew.append(name)
    return namelstnew

In [9]:
def genFuzzyDict(df):
    namelst = list(set([name for namelist in df['full name prelim'] for name in namelist]))
    #create dictionary that matches similar names together
    fn_fuzzy_pre = dict()
    for name in namelst:
        marker = False
        if not pd.isnull(name):
            #find matches for name
            match = process.extract(name, [x for x in namelst if x != name and not 
                                           pd.isnull(x)], limit = 1, score_cutoff = 90)
            if len(match)> 0:
                match = match[0]
                if match[1]>95:
                    #add suitable matches to dictionary
                    for nm in [match[0], name]:
                        if nm in fn_fuzzy_pre.keys() and not marker:
                            fn_fuzzy_pre[nm].extend([n for n in [match[0], name] if 
                                                     n != nm and n not in fn_fuzzy_pre[nm]])
                            marker = True
                    if not marker:
                        if len(name) < len(match[0]):
                            fn_fuzzy_pre[name] = [match[0]]
                        else:
                            fn_fuzzy_pre[match[0]] = [name]
    #invert dictionary
    fn_fuzzy = dict()
    for key in fn_fuzzy_pre.keys():
        vals = fn_fuzzy_pre[key]
        for val in vals:
            fn_fuzzy[val] = key
    
    return fn_fuzzy

In [10]:
#separate a string that contains two names into a list of two names
def parseNames(x):
    #replace words that don't have meaning
    x = x.replace("and Co", "").replace("and co", "").replace("and Others" ,"")
    x = x.replace("and others", "").replace("and Son", "").replace("and Sons", "")
    x = x.replace("and Brothers", "").strip()
    #string preprocessing
    namelst = x.split(" and ")
    namelst = [x.strip() for x in namelst if x.strip() != ""]
    if len(namelst) > 1:
        wd1len = len(namelst[0].split(" "))
        wd2len = len(namelst[1].split(" "))
        #add last name
        if wd1len == 1 and wd2len != 1:
            namelst[0] = namelst[0] + " " + namelst[1].split(" ")[-1]
    return namelst

In [11]:
def transformdf(df, state):
    #add full name columns
    df['full name 1'] = (df['First Name'].apply(lambda x: stringConvert(x))
                         + " " + 
                         df['Last Name'].apply(lambda x: stringConvert(x)))
    df['full name 1'] = df['full name 1'].apply(lambda x: x if len(x.strip().split(" ")) > 1 else np.nan)
    df['full name 2'] = (df['First Name.1'].apply(lambda x: stringConvert(x))
                         + " " + 
                         df['Last Name.1'].apply(lambda x: stringConvert(x)))
    df['full name 2'] = df['full name 2'].apply(lambda x: x if len(x.strip().split(" ")) > 1 else np.nan)
    df['full name 3'] = (df['First Name.2'].apply(lambda x: stringConvert(x))
                         + " " + 
                         df['Last Name.2'].apply(lambda x: stringConvert(x)))
    df['full name 3'] = df['full name 3'].apply(lambda x: x if len(x.strip().split(" ")) > 1 else np.nan)
    df['debt state'] = state
    #add dicionary to merge different full name columns into one
    df['full name prelim'] = [ ([fname1, fname2, fname3])
                              for fname1, fname2, fname3 in zip(df['full name 1'],
                                                                df['full name 2'],
                                                                df['full name 3'])]
    df['full name'] = df['full name prelim']
    #do some additional preprocessing
    df = df[df['full name'].apply(lambda x: x != [])]
    #separate names that are combined with "and", or otherwise treatde as one when they should be two
    df['full name'] = df['full name'].apply(lambda lst: [parseNames(x) if len(x.strip().split(" ")) > 2 and " and " in x 
                                                                 else x for x in lst])
    df['full name'] = df['full name'].apply(lambda namelist: combineLists(namelist))
    
    #fuzzy matching for different names in the full name column
    fn_fuzzy = genFuzzyDict(df)
    df['full name'] = df['full name'].apply(lambda lst: [x if x not in fn_fuzzy.keys() 
                                                         else fn_fuzzy[x] for x in lst])
    #simplify state into just one column
    if 'state1' in list(df.columns):
        state_list = [[s for s in list(set([s1, s2, s3])) if not pd.isnull(s)]
                      for s1, s2, s3 in zip(df['state1'], df['state2'], df['state3'])]
        df['state']  = [state[0] if state != [] else np.nan for state in state_list]
   
    #fill in potentially missing states
    missing_fullname = list(df[df['state'].apply(lambda x: pd.isnull(x))]['full name'])
    missing_ind = list(df[df['state'].apply(lambda x: pd.isnull(x))].index)
    replacement_states = []
    for name in missing_fullname:
        df_filt = df[df['full name'].apply(lambda x: x == name)] 
        if df_filt.shape[1] != 0:
            state = list(df_filt['state'])[0]
            replacement_states.append(state)
        else:
            replacement_states.append(np.nan)
    df.loc[missing_ind, 'state'] = replacement_states
    
    return df

In [12]:
def transformonecoldf(df, state):
    #do transformdf but for when you only have one full name oclumn
    df['full name prelim'] =  (df['First Name'].apply(lambda x: stringConvert(x)) + " " + 
                               df['Last Name'].apply(lambda x: stringConvert(x))).apply(lambda x: x.strip())
    
    df['full name prelim'] = df['full name prelim'].apply(lambda x: [x] if x != "" else [])
    df['debt state'] = state
    
    df['full name'] = df['full name prelim']
    #some preprocessing
    df = df[df['full name'].apply(lambda x: x != [])]
    #separate names that are combined with "and", or otherwise treated as one when they should be two
    df['full name'] = df['full name'].apply(lambda lst: [parseNames(x) if len(x.strip().split(" ")) > 2 and " and " in x 
                                                                 else x for x in lst])
    df['full name'] = df['full name'].apply(lambda namelist: combineLists(namelist))
    
    #fuzzy matching for different names in the full name column
    fn_fuzzy = genFuzzyDict(df)
    df['full name'] = df['full name'].apply(lambda lst: [x if x not in fn_fuzzy.keys() 
                                                         else fn_fuzzy[x] for x in lst]) 
    
    #fill in potentially missing states
    if list(set(df['state'])) != [np.nan]:
        missing_fullname = list(df[df['state'].apply(lambda x: pd.isnull(x))]['full name'])
        missing_ind = list(df[df['state'].apply(lambda x: pd.isnull(x))].index)
        replacement_states = []
        for name in missing_fullname:
            df_filt = df[df['full name'].apply(lambda x: x == name)] 
            if df_filt.shape[1] != 0:
                state = list(df_filt['state'])[0]
                replacement_states.append(state)
            else:
                replacement_states.append(np.nan)
        df.loc[missing_ind, 'state'] = replacement_states
        
    return df

# Connecticut Continental Debt Dataset Matching

In [18]:
#prepare loan dataset
CT_CD = pd.read_excel("../Data/Post1790/CT/CT_post1790_CD_ledger.xlsx", 
                      header = 13, usecols = 'H, I, K, N, O, X, Y, AA, AD, AE, AN, AO, AQ, AT, AU')
CT_CD.columns = ['First Name', 'Last Name', 'state1', '6p_Dollar', '6p_Cents', 
                 'First Name.1', 'Last Name.1', 'state2', '6p_def_Dollar', '6p_def_Cents',
                  'First Name.2', 'Last Name.2', 'state3', '3p_Dollar', '3p_Cents', ]
CT_CD_agg_pre = transformdf(CT_CD, 'CT')
CT_CD_agg = CT_CD_agg_pre[['full name', 'state', 'debt state', '6p_Dollar', '6p_Cents', 
                           '6p_def_Dollar', '6p_def_Cents', '3p_Dollar', '3p_Cents']]

In [19]:
#prepare loan dataset
CT_ASD = pd.read_excel("../Data/Post1790/CT/CT_post1790_ASD_ledger.xlsx", 
                      header = 13, usecols = 'H, I, K, N, O, X, Y, AA, AD, AE, AN, AO, AQ, AT, AU')
CT_ASD.columns = ['First Name', 'Last Name', 'state1', '6p_Dollar', '6p_Cents', 
                 'First Name.1', 'Last Name.1', 'state2', '6p_def_Dollar', '6p_def_Cents',
                  'First Name.2', 'Last Name.2', 'state3', '3p_Dollar', '3p_Cents', ]
CT_ASD_agg_pre = transformdf(CT_ASD, 'CT')
CT_ASD_agg = CT_ASD_agg_pre[['full name', 'state', 'debt state', '6p_Dollar', '6p_Cents', 
                           '6p_def_Dollar', '6p_def_Cents', '3p_Dollar', '3p_Cents']]

# Maryland Continental Debt Dataset Matching

In [20]:
#prepare loan dataset
MD_CD = pd.read_excel("../Data/Post1790/MD/MD_post1790_CD.xlsx", 
                      header = 11, usecols = 'G, H, J, L, M, U, V, X, Z, AA, AI, AJ, AL, AN, AO')
MD_CD.columns = ['First Name', 'Last Name', 'state1', '6p_Dollar', '6p_Cents', 
                 'First Name.1', 'Last Name.1', 'state2', '6p_def_Dollar', '6p_def_Cents',
                 'First Name.2', 'Last Name.2', 'state3', '3p_Dollar', '3p_Cents']
MD_CD_agg_pre = transformdf(MD_CD, 'MD')
MD_CD_agg = MD_CD_agg_pre[['full name', 'state', 'debt state', '6p_Dollar', '6p_Cents', 
                           '6p_def_Dollar', '6p_def_Cents', '3p_Dollar', '3p_Cents']]

In [21]:
#prepare loan dataset
MD_ASD = pd.read_excel("../Data/Post1790/MD/MD_post1790_ASD.xlsx", 
                      header = 11, usecols = 'G, H, J, L, M, U, V, X, Z, AA, AI, AJ, AL, AN, AO')
MD_ASD.columns = ['First Name', 'Last Name', 'state1', '6p_Dollar', '6p_Cents', 
                 'First Name.1', 'Last Name.1', 'state2', '6p_def_Dollar', '6p_def_Cents',
                 'First Name.2', 'Last Name.2', 'state3', '3p_Dollar', '3p_Cents']
MD_ASD_agg_pre = transformdf(MD_ASD, 'MD')
MD_ASD_agg = MD_ASD_agg_pre[['full name', 'state', 'debt state', '6p_Dollar', '6p_Cents', 
                           '6p_def_Dollar', '6p_def_Cents', '3p_Dollar', '3p_Cents']]

In [22]:
cumulative_CD = pd.concat([CT_CD_agg, MD_CD_agg])

In [23]:
cumulative_ASD = pd.concat([CT_ASD_agg, MD_ASD_agg])

# North Carolina Continental Debt Dataset Matching

In [24]:
#prepare loan dataset
NC_CD = pd.read_excel("../Data/Post1790/NC/T695_R4_NC_CD.xlsx", 
                      header = 11, usecols = 'J, K, M, W, X, Z, AA, AC, AD ')
NC_CD.columns = ['First Name', 'Last Name', 'state', '6p_Dollar', '6p_Cents', 
                 '6p_def_Dollar', '6p_def_Cents', '3p_Dollar', '3p_Cents']
NC_CD_agg_pre = transformonecoldf(NC_CD, 'NC')
NC_CD_agg = NC_CD_agg_pre[['full name', 'state', 'debt state', '6p_Dollar', '6p_Cents', 
                           '6p_def_Dollar', '6p_def_Cents', '3p_Dollar', '3p_Cents']]

In [25]:
#prepare loan dataset
NC_ASD = pd.read_excel("../Data/Post1790/NC/T695_R3_NC_ASD.xlsx", 
                      header = 11, usecols = 'H, I, K, P, Q, R, S, T, U ')
NC_ASD.columns = ['First Name', 'Last Name', 'state', '6p_Dollar', '6p_Cents', 
                 '6p_def_Dollar', '6p_def_Cents', '3p_Dollar', '3p_Cents']
NC_ASD_agg_pre = transformonecoldf(NC_ASD, 'NC')
NC_ASD_agg = NC_ASD_agg_pre[['full name', 'state', 'debt state', '6p_Dollar', '6p_Cents', 
                           '6p_def_Dollar', '6p_def_Cents', '3p_Dollar', '3p_Cents']]

In [26]:
cumulative_CD = pd.concat([NC_CD_agg, cumulative_CD])

In [27]:
cumulative_ASD = pd.concat([NC_ASD_agg, cumulative_ASD])

# New Hampshire Continental Debt Dataset Matching

In [28]:
#prepare loan dataset
NH_CD = pd.read_excel("../Data/Post1790/NH/T652_R6_New_Hampshire_CD.xlsx", 
                      header = 10, usecols = 'I, J, L, N, O, P, Q, R, S')
NH_CD.columns = ['First Name', 'Last Name', 'state', '6p_Dollar', '6p_Cents',  
                 '6p_def_Dollar', '6p_def_Cents', '3p_Dollar', '3p_Cents']
NH_CD_agg_pre = transformonecoldf(NH_CD, 'NH')
NH_CD_agg = NH_CD_agg_pre[['full name', 'state', 'debt state',  '6p_Dollar', '6p_Cents', 
                           '6p_def_Dollar', '6p_def_Cents', '3p_Dollar', '3p_Cents']]

In [29]:
NH_ASD = pd.read_excel("../Data/Post1790/NH/T652_New_Hampshire_ASD.xlsx", 
                      header = 11, usecols = 'G, H, J, M, N, V, W, AB, AC, AK, AL, AM, AN')
NH_ASD.columns = ['First Name', 'Last Name', 'state', '6p_Dollar', '6p_Cents', 
                 'First Name.1', 'Last Name.1', '6p_def_Dollar', '6p_def_Cents',
                 'First Name.2', 'Last Name.2', '3p_Dollar', '3p_Cents']
NH_ASD_agg_pre = transformdf(NH_ASD, 'NH')
NH_ASD_agg = NH_ASD_agg_pre[['full name', 'state', 'debt state', '6p_Dollar', '6p_Cents', 
                             '6p_def_Dollar', '6p_def_Cents', '3p_Dollar', '3p_Cents']]

In [30]:
cumulative_CD = pd.concat([NH_CD_agg, cumulative_CD])

In [31]:
cumulative_ASD = pd.concat([NC_ASD_agg, cumulative_ASD])

# New York Continental Debt Dataset Matching

In [32]:
#new york data doesn't tell us what state people are from

In [33]:
#prepare loan dataset
NY_CD = pd.read_excel("../Data/Post1790/NY/NY_1790_CD.xlsx", 
                      header = 11, usecols = 'H, I, M, N, X, Y, AC, AD, AM, AN, AR, AS')
NY_CD.columns = ['First Name', 'Last Name', '6p_Dollar', '6p_Cents', 
                 'First Name.1', 'Last Name.1', '6p_def_Dollar', '6p_def_Cents',
                 'First Name.2', 'Last Name.2', '3p_Dollar', '3p_Cents']
NY_CD['state'] = 'NY loan office'
NY_CD_agg_pre = transformdf(NY_CD, 'NY')
NY_CD_agg = NY_CD_agg_pre[['full name', 'state', 'debt state', '6p_Dollar', '6p_Cents', 
                           '6p_def_Dollar', '6p_def_Cents', '3p_Dollar', '3p_Cents']]

In [34]:
#prepare loan dataset
NY_ASD = pd.read_excel("../Data/Post1790/NY/NY_1790_ASD.xlsx", 
                      header = 11, usecols = 'H, I, M, N, X, Y, AC, AD, AM, AN, AR, AS')
NY_ASD.columns = ['First Name', 'Last Name', '6p_Dollar', '6p_Cents', 
                 'First Name.1', 'Last Name.1', '6p_def_Dollar', '6p_def_Cents',
                 'First Name.2', 'Last Name.2', '3p_Dollar', '3p_Cents']
NY_ASD['state'] = 'NY loan office'
NY_ASD_agg_pre = transformdf(NY_ASD, 'NY')
NY_ASD_agg = NY_ASD_agg_pre[['full name', 'state', 'debt state', '6p_Dollar', '6p_Cents', 
                           '6p_def_Dollar', '6p_def_Cents', '3p_Dollar', '3p_Cents']]

In [35]:
cumulative_CD = pd.concat([NY_CD_agg, cumulative_CD])

In [36]:
cumulative_ASD = pd.concat([NY_ASD_agg, cumulative_ASD])

# South Carolina Continental Debt Dataset Matching

In [37]:
#prepare loan dataset
SC_CD = pd.read_excel("../Data/Post1790/SC/Post_1790_South_Carolina_CD.xlsx", 
                      header = 11, usecols = 'D, E, G, M, N, S, T, V, AB, AC, AH, AI, AK, AQ, AR')
SC_CD.columns = ['First Name', 'Last Name', 'state1', '6p_Dollar', '6p_Cents', 
                 'First Name.1', 'Last Name.1', 'state2',  '6p_def_Dollar', '6p_def_Cents',
                 'First Name.2', 'Last Name.2', 'state3', '3p_Dollar', '3p_Cents']
SC_CD_agg_pre = transformdf(SC_CD, 'SC')
SC_CD_agg = SC_CD_agg_pre[['full name', 'state', 'debt state', '6p_Dollar', '6p_Cents', 
                           '6p_def_Dollar', '6p_def_Cents', '3p_Dollar', '3p_Cents']]

In [38]:
#prepare loan dataset
SC_ASD = pd.read_excel("../Data/Post1790/SC/Post_1790_South_Carolina_ASD_transfers_removed.xlsx", 
                      header = 11, usecols = 'D, E, G, M, N, O')
SC_ASD.columns = ['First Name', 'Last Name', 'state', '6p_Dollar', '6p_def_Dollar', '3p_Dollar']
SC_ASD_agg_pre = transformonecoldf(SC_ASD, 'SC')
SC_ASD_agg = SC_ASD_agg_pre[['full name', 'state', 'debt state', '6p_Dollar', '6p_def_Dollar', '3p_Dollar', ]]

In [39]:
cumulative_CD = pd.concat([SC_CD_agg, cumulative_CD])

In [40]:
cumulative_ASD = pd.concat([SC_ASD_agg, cumulative_ASD])

# Pennsylvania Continental Debt Dataset Matching

In [41]:
#prepare loan dataset
PA_CD = pd.read_excel("../Data/Post1790/PA/PA_post1790_CD.xlsx", 
                      header = 11, usecols = 'G, H, J, L, M, U, V, X, Z, AA, AI, AJ, AM, AO, AP')
PA_CD.columns = ['First Name', 'Last Name', 'state1', '6p_Dollar', '6p_Cents', 
                 'First Name.1', 'Last Name.1', 'state2', '6p_def_Dollar', '6p_def_Cents',
                 'First Name.2', 'Last Name.2', 'state3', '3p_Dollar', '3p_Cents']
PA_CD_agg_pre = transformdf(PA_CD, 'PA')
PA_CD_agg = PA_CD_agg_pre[['full name', 'state', 'debt state', '6p_Dollar', '6p_Cents', 
                           '6p_def_Dollar', '6p_def_Cents', '3p_Dollar', '3p_Cents']]

In [42]:
cumulative_CD = pd.concat([PA_CD_agg, cumulative_CD])

# Rhode Island Continental Debt Dataset Matching

In [43]:
#prepare loan dataset
RI_CD = pd.read_excel("../Data/Post1790/RI/T653_Rhode_Island_CD.xlsx", 
                      header = 11, usecols = 'G, H, J, L, M, U, V, X, Z, AA, AI, AJ, AL, AN, AO')
RI_CD.columns = ['First Name', 'Last Name', 'state1', '6p_Dollar', '6p_Cents', 
                 'First Name.1', 'Last Name.1', 'state2', '6p_def_Dollar', '6p_def_Cents',
                'First Name.2', 'Last Name.2', 'state3', '3p_Dollar', '3p_Cents']
RI_CD_agg_pre = transformdf(RI_CD, 'RI')
RI_CD_agg = RI_CD_agg_pre[['full name', 'state', 'debt state', '6p_Dollar', '6p_Cents', 
                           '6p_def_Dollar', '6p_def_Cents', '3p_Dollar', '3p_Cents']]

In [44]:
#prepare loan dataset
RI_ASD = pd.read_excel("../Data/Post1790/RI/T653_Rhode_Island_ASD.xlsx", 
                      header = 11, usecols = 'H, I, K, N, O, X, Y, AA, AD, AE, AO, AP, AQ, AT, AU')
RI_ASD.columns = ['First Name', 'Last Name', 'state1', '6p_Dollar', '6p_Cents', 
                  'First Name.1', 'Last Name.1', 'state2', '6p_def_Dollar', '6p_def_Cents',
                  'First Name.2', 'Last Name.2', 'state3', '3p_Dollar', '3p_Cents']
RI_ASD_agg_pre = transformdf(RI_ASD, 'RI')
RI_ASD_agg = RI_ASD_agg_pre[['full name', 'state', 'debt state', '6p_Dollar', '6p_Cents', 
                           '6p_def_Dollar', '6p_def_Cents', '3p_Dollar', '3p_Cents']]

In [45]:
cumulative_CD = pd.concat([RI_CD_agg, cumulative_CD])

In [46]:
cumulative_ASD = pd.concat([RI_ASD_agg, cumulative_ASD])

# Virginia Continental Debt Dataset Matching

In [47]:
#virginia data doesn't tell us what state people are from

In [48]:
#prepare loan dataset
VA_CD = pd.read_excel("../Data/Post1790/VA/VA_CD.xlsx", 
                      header = 11, usecols = 'H, I, K, L, U, V, X, Y, AH, AI, AK, AL')
VA_CD.columns = ['First Name', 'Last Name', '6p_Dollar', '6p_Cents', 
                 'First Name.1', 'Last Name.1', '6p_def_Dollar', '6p_def_Cents',
                'First Name.2', 'Last Name.2', '3p_Dollar', '3p_Cents']
VA_CD['state'] = np.nan
VA_CD_agg_pre = transformdf(VA_CD, 'VA')
VA_CD_agg = VA_CD_agg_pre[['full name', 'state', 'debt state', '6p_Dollar', '6p_Cents', 
                           '6p_def_Dollar', '6p_def_Cents', '3p_Dollar', '3p_Cents']]

In [49]:
#prepare loan dataset
VA_ASD = pd.read_excel("../Data/Post1790/VA/VA_ASD.xlsx", 
                      header = 11, usecols = 'D, E, N, O, U, V, AE, AF, AL, AM, AW, AX')
VA_ASD.columns = ['First Name', 'Last Name', '6p_Dollar', '6p_Cents', 
                  'First Name.1', 'Last Name.1', '6p_def_Dollar', '6p_def_Cents',
                  'First Name.2', 'Last Name.2', '3p_Dollar', '3p_Cents']
VA_ASD['state'] = np.nan
VA_ASD_agg_pre = transformdf(VA_ASD, 'VA')
VA_ASD_agg = VA_ASD_agg_pre[['full name', 'state', 'debt state', '6p_Dollar', '6p_Cents', 
                             '6p_def_Dollar', '6p_def_Cents', '3p_Dollar', '3p_Cents']]

In [50]:
cumulative_CD = pd.concat([VA_CD_agg, cumulative_CD])

In [51]:
cumulative_ASD = pd.concat([VA_ASD_agg, cumulative_ASD])

# Georgia Continental Debt Dataset Matching

In [52]:
#prepare loan dataset
GA_CD = pd.read_excel("../Data/Post1790/GA/T694_GA_Loan_Office_CD.xlsx", 
                      header = 10, usecols = 'Q, R, T, Z, AA, AB, AC, AD, AE')
GA_CD.columns = ['First Name', 'Last Name', 'state', '6p_Dollar', '6p_Cents',  
                 '6p_def_Dollar', '6p_def_Cents', '3p_Dollar', '3p_Cents']
GA_CD_agg_pre = transformonecoldf(GA_CD, 'GA')
GA_CD_agg = GA_CD_agg_pre[['full name', 'state', 'debt state', '6p_Dollar', '6p_Cents', 
                           '6p_def_Dollar', '6p_def_Cents', '3p_Dollar', '3p_Cents']]

In [53]:
cumulative_CD = pd.concat([GA_CD_agg, cumulative_CD])

# Summary Analysis

In [54]:
#no new jersey because it only has 3% stock

In [55]:
cumulative_ASD.reset_index(drop = True, inplace = True)
cumulative_CD.reset_index(drop = True, inplace = True)

In [56]:
cumulative_ASD['Total'] = cumulative_ASD[['6p_Dollar','6p_def_Dollar']].fillna(0).sum(axis = 1) + cumulative_ASD[['6p_Cents','6p_def_Cents']].fillna(0).sum(axis = 1)/100
cumulative_ASD.loc[cumulative_ASD[cumulative_ASD['state'].apply(lambda x: pd.isnull(x))].index, 'state'] = 'unspecified'

cumulative_CD['Total'] = cumulative_CD[['6p_Dollar','6p_def_Dollar']].fillna(0).sum(axis = 1) + cumulative_CD[['6p_Cents','6p_def_Cents']].fillna(0).sum(axis = 1)/100
cumulative_CD.loc[cumulative_CD[cumulative_CD['state'].apply(lambda x: pd.isnull(x))].index, 'state'] = 'unspecified'

In [57]:
#ASD Statistics
print(cumulative_ASD[cumulative_ASD['state'].apply(lambda x: x in ['unspecified', 'NY loan office'])]['Total'].sum())
print(cumulative_ASD['Total'].sum())
cumulative_ASD[cumulative_ASD['state'].apply(lambda x: x in ['unspecified', 'NY loan office'])]['Total'].sum()/cumulative_ASD['Total'].sum()

2168273.1746222223
3611474.3704666668


0.6003844835099973

In [58]:
#CD Statistics
print(cumulative_CD[cumulative_CD['state'].apply(lambda x: x in ['unspecified', 'NY loan office'])]['Total'].sum())
print(cumulative_CD['Total'].sum())
cumulative_CD[cumulative_CD['state'].apply(lambda x: x in ['unspecified', 'NY loan office'])]['Total'].sum()/cumulative_CD['Total'].sum()

3552155.41583
11274965.092275001


0.315048018930342

In [59]:
#using state
agg_table_ASD = cumulative_ASD.fillna(0)[['state', 'Total']].groupby('state').sum().reset_index()
agg_table_ASD = agg_table_ASD.round(2)
agg_table_CD = cumulative_CD.fillna(0)[['state', 'Total']].groupby('state').sum().reset_index()
agg_table_CD = agg_table_CD.round(2)
agg_table = pd.merge(agg_table_ASD,agg_table_CD, on = 'state', how = 'outer')
agg_table.columns = ['state', 'ASD Total', 'CD Total']

In [60]:
agg_table.to_csv('prepost_data_aggregations/statewise_debt_aggregation_Post1790.csv')

In [61]:
#using state - replace unspecified with debt state, replace NY loan office with nY
cumulative_ASD_rep = cumulative_ASD.copy()
cumulative_ASD_rep.loc[cumulative_ASD_rep[cumulative_ASD_rep['state'] == 'unspecified'].index, 'state'] = cumulative_ASD_rep[cumulative_ASD_rep['state'] == 'unspecified']['debt state']
cumulative_CD_rep = cumulative_CD.copy()
cumulative_CD_rep.loc[cumulative_CD_rep[cumulative_CD_rep['state'] == 'unspecified'].index, 'state'] = cumulative_CD_rep[cumulative_CD_rep['state'] == 'unspecified']['debt state']

In [62]:
agg_table_ASD_rep = cumulative_ASD_rep.fillna(0)[['state', 'Total']].groupby('state').sum().reset_index()
agg_table_ASD_rep = agg_table_ASD_rep.round(2)
agg_table_CD_rep = cumulative_CD_rep.fillna(0)[['state', 'Total']].groupby('state').sum().reset_index()
agg_table_CD_rep = agg_table_CD_rep.round(2)
agg_table_rep = pd.merge(agg_table_ASD_rep,agg_table_CD_rep, on = 'state', how = 'outer')
agg_table_rep.columns = ['state', 'ASD Total', 'CD Total']

In [63]:
agg_table_rep.to_csv('prepost_data_aggregations/statewise_debt_aggregation_Post1790_rep.csv')

In [64]:
#using debt state instead of state
agg_table_ASD_debtstate = cumulative_ASD.fillna(0)[['debt state', 'Total']].groupby('debt state').sum().reset_index()
agg_table_ASD_debtstate = agg_table_ASD_debtstate.round(2)
agg_table_CD_debtstate = cumulative_CD.fillna(0)[['debt state', 'Total']].groupby('debt state').sum().reset_index()
agg_table_CD_debtstate = agg_table_CD_debtstate.round(2)
agg_table_debtstate = pd.merge(agg_table_ASD_debtstate,agg_table_CD_debtstate, on = 'debt state', how = 'outer')
agg_table_debtstate.columns = ['debt state', 'ASD Total', 'CD Total']

In [65]:
agg_table_debtstate.to_csv('prepost_data_aggregations/statewise_debt_aggregation_Post1790_debtstate.csv')