<h3> Data Cleaning </h3>

In [17]:
import warnings
warnings.filterwarnings('ignore')

In [18]:
import numpy as np
import pandas as pd
from rapidfuzz import process
from os.path import exists
import itertools
from whoswho import who

In [19]:
def stringConvert(x):
    return x.replace("  ", " ") if type(x) == str else ""

In [20]:
def combineLists(lst):
    returnlst = []
    for sublist in lst:
        if type(sublist) == list:
            returnlst.extend([item for item in sublist])
        else:
            returnlst.append(sublist)
    return returnlst

In [21]:
#function that makes dictionary that combines 3 full name columns into 1
def genFullNameList(namelst): 
    #remove duplicates and nulls
    namelst = list(set([name for name in namelst if not pd.isnull(name)]))
    namelst = sorted(namelst, key=len)
    #remove names that are really similar
    namelstnew = namelst
    if len(namelst) > 1:
        namelstnew = []
        name1 = namelst[0]
        namelstnew.append(name1)
        for name in namelst[1:]:
            score1  = process.extract(name1, [name])[0][1]
            score2 = who.match(name1, name)
            #only add names if they are dissimilar - fuzzy score 70 or less
            if score1 <= 70:
                namelstnew.append(name)
    return namelstnew

In [22]:
def genFuzzyDict(df):
    namelst = list(set([name for namelist in df['full name prelim'] for name in namelist]))
    #create dictionary that matches similar names together
    fn_fuzzy_pre = dict()
    for name in namelst:
        marker = False
        if not pd.isnull(name):
            #find matches for name
            match = process.extract(name, [x for x in namelst if x != name and not 
                                           pd.isnull(x)], limit = 1, score_cutoff = 90)
            if len(match)> 0:
                match = match[0]
                if match[1]>95:
                    #add suitable matches to dictionary
                    for nm in [match[0], name]:
                        if nm in fn_fuzzy_pre.keys() and not marker:
                            fn_fuzzy_pre[nm].extend([n for n in [match[0], name] if 
                                                     n != nm and n not in fn_fuzzy_pre[nm]])
                            marker = True
                    if not marker:
                        if len(name) < len(match[0]):
                            fn_fuzzy_pre[name] = [match[0]]
                        else:
                            fn_fuzzy_pre[match[0]] = [name]
    #invert dictionary
    fn_fuzzy = dict()
    for key in fn_fuzzy_pre.keys():
        vals = fn_fuzzy_pre[key]
        for val in vals:
            fn_fuzzy[val] = key
    
    return fn_fuzzy

In [23]:
#separate a string that contains two names into a list of two names
def parseNames(x):
    #replace words that don't have meaning
    x = x.replace("and Co", "").replace("and co", "").replace("and Others" ,"")
    x = x.replace("and others", "").replace("and Son", "").replace("and Sons", "")
    x = x.replace("and Brothers", "").strip()
    #string preprocessing
    namelst = x.split(" and ")
    namelst = [x.strip() for x in namelst if x.strip() != ""]
    if len(namelst) > 1:
        wd1len = len(namelst[0].split(" "))
        wd2len = len(namelst[1].split(" "))
        #add last name
        if wd1len == 1 and wd2len != 1:
            namelst[0] = namelst[0] + " " + namelst[1].split(" ")[-1]
    return namelst

In [24]:
def transformdf(df, state):
    #add full name columns
    df['full name 1'] = (df['First Name'].apply(lambda x: stringConvert(x))
                         + " " + 
                         df['Last Name'].apply(lambda x: stringConvert(x)))
    df['full name 1'] = df['full name 1'].apply(lambda x: x if len(x.strip().split(" ")) > 1 else np.nan)
    df['full name 2'] = (df['First Name.1'].apply(lambda x: stringConvert(x))
                         + " " + 
                         df['Last Name.1'].apply(lambda x: stringConvert(x)))
    df['full name 2'] = df['full name 2'].apply(lambda x: x if len(x.strip().split(" ")) > 1 else np.nan)
    df['full name 3'] = (df['First Name.2'].apply(lambda x: stringConvert(x))
                         + " " + 
                         df['Last Name.2'].apply(lambda x: stringConvert(x)))
    df['full name 3'] = df['full name 3'].apply(lambda x: x if len(x.strip().split(" ")) > 1 else np.nan)
    df['debt state'] = state
    #add dicionary to merge different full name columns into one
    df['full name prelim'] = [genFullNameList([fname1, fname2, fname3]) 
                              for fname1, fname2, fname3 in zip(df['full name 1'],
                                                                df['full name 2'],
                                                                df['full name 3'])]
    df['full name'] = df['full name prelim']
    #do some additional preprocessing
    df = df[df['full name'].apply(lambda x: x != [])]
    #separate names that are combined with "and", or otherwise treatde as one when they should be two
    df['full name'] = df['full name'].apply(lambda lst: [parseNames(x) if len(x.strip().split(" ")) > 2 and " and " in x 
                                                                 else x for x in lst])
    df['full name'] = df['full name'].apply(lambda namelist: combineLists(namelist))
    
    #fuzzy matching for different names in the full name column
    fn_fuzzy = genFuzzyDict(df)
    df['full name'] = df['full name'].apply(lambda lst: [x if x not in fn_fuzzy.keys() 
                                                         else fn_fuzzy[x] for x in lst])
    #simplify state into just one column
    if 'state1' in list(df.columns):
        state_list = [[s for s in list(set([s1, s2, s3])) if not pd.isnull(s)]
                      for s1, s2, s3 in zip(df['state1'], df['state2'], df['state3'])]
        df['state']  = [state[0] if state != [] else np.nan for state in state_list]
   
    #fill in potentially missing states
    missing_fullname = list(df[df['state'].apply(lambda x: pd.isnull(x))]['full name'])
    missing_ind = list(df[df['state'].apply(lambda x: pd.isnull(x))].index)
    replacement_states = []
    for name in missing_fullname:
        df_filt = df[df['full name'].apply(lambda x: x == name)] 
        if df_filt.shape[1] != 0:
            state = list(df_filt['state'])[0]
            replacement_states.append(state)
        else:
            replacement_states.append(np.nan)
    df.loc[missing_ind, 'state'] = replacement_states
    
    return df

In [25]:
def transformonecoldf(df, state):
    #do transformdf but for when you only have one full name oclumn
    df['full name prelim'] =  (df['First Name'] + " " + df['Last Name']).apply(lambda x: [x] if not pd.isnull(x) else [])
    df['debt state'] = state
    
    df['full name'] = df['full name prelim']
    #some preprocessing
    df = df[df['full name'].apply(lambda x: x != [])]
    #separate names that are combined with "and", or otherwise treated as one when they should be two
    df['full name'] = df['full name'].apply(lambda lst: [parseNames(x) if len(x.strip().split(" ")) > 2 and " and " in x 
                                                                 else x for x in lst])
    df['full name'] = df['full name'].apply(lambda namelist: combineLists(namelist))
    
    #fuzzy matching for different names in the full name column
    fn_fuzzy = genFuzzyDict(df)
    df['full name'] = df['full name'].apply(lambda lst: [x if x not in fn_fuzzy.keys() 
                                                         else fn_fuzzy[x] for x in lst]) 
    
    #fill in potentially missing states
    if list(set(df['state'])) != [np.nan]:
        missing_fullname = list(df[df['state'].apply(lambda x: pd.isnull(x))]['full name'])
        missing_ind = list(df[df['state'].apply(lambda x: pd.isnull(x))].index)
        replacement_states = []
        for name in missing_fullname:
            df_filt = df[df['full name'].apply(lambda x: x == name)] 
            if df_filt.shape[1] != 0:
                state = list(df_filt['state'])[0]
                replacement_states.append(state)
            else:
                replacement_states.append(np.nan)
        df.loc[missing_ind, 'state'] = replacement_states
        
    return df

# Connecticut Continental Debt Dataset Matching

In [28]:
#prepare loan dataset
CT_CD = pd.read_excel("Data/Post1790/CT/CT_post1790_CD_ledger.xlsx", 
                      header = 13, usecols = 'H, I, K, N, O, X, Y, AA, AD, AE, AN, AO, AQ, AT, AU')
CT_CD.columns = ['First Name', 'Last Name', 'state1', '6p_Dollar', '6p_Cents', 
                 'First Name.1', 'Last Name.1', 'state2', '6p_def_Dollar', '6p_def_Cents',
                  'First Name.2', 'Last Name.2', 'state3', '3p_Dollar', '3p_Cents', ]
CT_CD_agg_pre = transformdf(CT_CD, 'CT')
CT_CD_agg = CT_CD_agg_pre[['full name', 'state', 'debt state', '6p_Dollar', '6p_Cents', 
                           '6p_def_Dollar', '6p_def_Cents', '3p_Dollar', '3p_Cents']]

# Maryland Continental Debt Dataset Matching

In [30]:
#prepare loan dataset
MD_CD = pd.read_excel("Data/Post1790/MD/MD_post1790_CD.xlsx", 
                      header = 11, usecols = 'G, H, J, L, M, U, V, Z, AA, AI, AJ, AN, AO')
MD_CD.columns = ['First Name', 'Last Name', 'state', '6p_Dollar', '6p_Cents', 
                 'First Name.1', 'Last Name.1', '6p_def_Dollar', '6p_def_Cents',
                 'First Name.2', 'Last Name.2', '3p_Dollar', '3p_Cents']
MD_CD_agg_pre = transformdf(MD_CD, 'MD')
MD_CD_agg = MD_CD_agg_pre[['full name', 'state', 'debt state', '6p_Dollar', '6p_Cents', 
                           '6p_def_Dollar', '6p_def_Cents', '3p_Dollar', '3p_Cents']]

In [31]:
cumulative_CD = pd.concat([CT_CD_agg, MD_CD_agg])

# North Carolina Continental Debt Dataset Matching

In [32]:
#prepare loan dataset
NC_CD = pd.read_excel("Data/Post1790/NC/T695_R4_NC_CD.xlsx", 
                      header = 11, usecols = 'J, K, M, W, X, Z, AA, AC, AD ')
NC_CD.columns = ['First Name', 'Last Name', 'state', '6p_Dollar', '6p_Cents', 
                 '6p_def_Dollar', '6p_def_Cents', '3p_Dollar', '3p_Cents']
NC_CD_agg_pre = transformonecoldf(NC_CD, 'NC')
NC_CD_agg = NC_CD_agg_pre[['full name', 'state', 'debt state', '6p_Dollar', '6p_Cents', 
                           '6p_def_Dollar', '6p_def_Cents', '3p_Dollar', '3p_Cents']]

In [33]:
cumulative_CD = pd.concat([NC_CD_agg, cumulative_CD])

# New Hampshire Continental Debt Dataset Matching

In [34]:
#prepare loan dataset
NH_CD = pd.read_excel("Data/Post1790/NH/T652_R6_New_Hampshire_CD.xlsx", 
                      header = 10, usecols = 'I, J, L, N, O, P, Q, R, S')
NH_CD.columns = ['First Name', 'Last Name', 'state', '6p_Dollar', '6p_Cents',  
                 '6p_def_Dollar', '6p_def_Cents', '3p_Dollar', '3p_Cents']
NH_CD_agg_pre = transformonecoldf(NH_CD, 'NH')
NH_CD_agg = NH_CD_agg_pre[['full name', 'state', 'debt state',  '6p_Dollar', '6p_Cents', 
                           '6p_def_Dollar', '6p_def_Cents', '3p_Dollar', '3p_Cents']]

In [35]:
cumulative_CD = pd.concat([NH_CD_agg, cumulative_CD])

# New York Continental Debt Dataset Matching

In [36]:
#new york data doesn't tell us what state people are from

In [37]:
#prepare loan dataset
NY_CD = pd.read_excel("Data/Post1790/NY/NY_1790_CD.xlsx", 
                      header = 11, usecols = 'H, I, M, N, X, Y, AC, AD, AM, AN, AR, AS')
NY_CD.columns = ['First Name', 'Last Name', '6p_Dollar', '6p_Cents', 
                 'First Name.1', 'Last Name.1', '6p_def_Dollar', '6p_def_Cents',
                 'First Name.2', 'Last Name.2', '3p_Dollar', '3p_Cents']
NY_CD['state'] = np.nan
NY_CD_agg_pre = transformdf(NY_CD, 'NY')
NY_CD_agg = NY_CD_agg_pre[['full name', 'state', 'debt state', '6p_Dollar', '6p_Cents', 
                           '6p_def_Dollar', '6p_def_Cents', '3p_Dollar', '3p_Cents']]

In [38]:
cumulative_CD = pd.concat([NY_CD_agg, cumulative_CD])

# South Carolina Continental Debt Dataset Matching

In [39]:
#prepare loan dataset
SC_CD = pd.read_excel("Data/Post1790/SC/Post_1790_South_Carolina_CD.xlsx", 
                      header = 11, usecols = 'D, E, G, M, N, S, T, V, AB, AC, AH, AI, AK, AQ, AR')
SC_CD.columns = ['First Name', 'Last Name', 'state1', '6p_Dollar', '6p_Cents', 
                 'First Name.1', 'Last Name.1', 'state2',  '6p_def_Dollar', '6p_def_Cents',
                 'First Name.2', 'Last Name.2', 'state3', '3p_Dollar', '3p_Cents']
SC_CD_agg_pre = transformdf(SC_CD, 'SC')
SC_CD_agg = SC_CD_agg_pre[['full name', 'state', 'debt state', '6p_Dollar', '6p_Cents', 
                           '6p_def_Dollar', '6p_def_Cents', '3p_Dollar', '3p_Cents']]

In [40]:
cumulative_CD = pd.concat([SC_CD_agg, cumulative_CD])

# Pennsylvania Continental Debt Dataset Matching

In [41]:
#prepare loan dataset
PA_CD = pd.read_excel("Data/Post1790/PA/PA_post1790_CD.xlsx", 
                      header = 11, usecols = 'G, H, J, L, M, U, V, X, Z, AA, AI, AJ, AM, AO, AP')
PA_CD.columns = ['First Name', 'Last Name', 'state1', '6p_Dollar', '6p_Cents', 
                 'First Name.1', 'Last Name.1', 'state2', '6p_def_Dollar', '6p_def_Cents',
                 'First Name.2', 'Last Name.2', 'state3', '3p_Dollar', '3p_Cents']
PA_CD_agg_pre = transformdf(PA_CD, 'PA')
PA_CD_agg = PA_CD_agg_pre[['full name', 'state', 'debt state', '6p_Dollar', '6p_Cents', 
                           '6p_def_Dollar', '6p_def_Cents', '3p_Dollar', '3p_Cents']]

In [42]:
cumulative_CD = pd.concat([PA_CD_agg, cumulative_CD])

# Rhode Island Continental Debt Dataset Matching

In [43]:
#prepare loan dataset
RI_CD = pd.read_excel("Data/Post1790/RI/T653_Rhode_Island_CD.xlsx", 
                      header = 11, usecols = 'G, H, J, L, M, U, V, X, Z, AA, AI, AJ, AL, AN, AO')
RI_CD.columns = ['First Name', 'Last Name', 'state1', '6p_Dollar', '6p_Cents', 
                 'First Name.1', 'Last Name.1', 'state2', '6p_def_Dollar', '6p_def_Cents',
                'First Name.2', 'Last Name.2', 'state3', '3p_Dollar', '3p_Cents']
RI_CD_agg_pre = transformdf(RI_CD, 'RI')
RI_CD_agg = RI_CD_agg_pre[['full name', 'state', 'debt state', '6p_Dollar', '6p_Cents', 
                           '6p_def_Dollar', '6p_def_Cents', '3p_Dollar', '3p_Cents']]

In [44]:
cumulative_CD = pd.concat([RI_CD_agg, cumulative_CD])

# Virginia Continental Debt Dataset Matching

In [45]:
#virginia data doesn't tell us what state people are from

In [46]:
#prepare loan dataset
VA_CD = pd.read_excel("Data/Post1790/VA/VA_CD.xlsx", 
                      header = 11, usecols = 'H, I, K, L, U, V, X, Y, AH, AI, AK, AL')
VA_CD.columns = ['First Name', 'Last Name', '6p_Dollar', '6p_Cents', 
                 'First Name.1', 'Last Name.1', '6p_def_Dollar', '6p_def_Cents',
                'First Name.2', 'Last Name.2', '3p_Dollar', '3p_Cents']
VA_CD['state'] = np.nan
VA_CD_agg_pre = transformdf(VA_CD, 'VA')
VA_CD_agg = VA_CD_agg_pre[['full name', 'state', 'debt state', '6p_Dollar', '6p_Cents', 
                           '6p_def_Dollar', '6p_def_Cents', '3p_Dollar', '3p_Cents']]

In [47]:
cumulative_CD = pd.concat([VA_CD_agg, cumulative_CD])

# Georgia Continental Debt Dataset Matching

In [48]:
#prepare loan dataset
GA_CD = pd.read_excel("Data/Post1790/GA/T694_GA_Loan_Office_CD.xlsx", 
                      header = 10, usecols = 'Q, R, T, Z, AA, AB, AC, AD, AE')
GA_CD.columns = ['First Name', 'Last Name', 'state', '6p_Dollar', '6p_Cents',  
                 '6p_def_Dollar', '6p_def_Cents', '3p_Dollar', '3p_Cents']
GA_CD_agg_pre = transformonecoldf(GA_CD, 'GA')
GA_CD_agg = GA_CD_agg_pre[['full name', 'state', 'debt state', '6p_Dollar', '6p_Cents', 
                           '6p_def_Dollar', '6p_def_Cents', '3p_Dollar', '3p_Cents']]

In [49]:
cumulative_CD = pd.concat([GA_CD_agg, cumulative_CD])

# Summary Analysis

In [50]:
#no new jersey because it only has 3% stock

In [51]:
cumulative_CD.reset_index(drop = True, inplace = True)

In [76]:
agg_table = cumulative_CD[['state', '6p_Dollar', '6p_Cents', '6p_def_Dollar', '6p_def_Cents']].groupby('state').sum().reset_index()
agg_table['Total'] = agg_table[['6p_Dollar', '6p_def_Dollar']].sum(axis = 1) + agg_table[['6p_Cents', '6p_def_Cents']].sum(axis = 1)/100
agg_table = agg_table[['state', 'Total']]
agg_table.round(2)

Unnamed: 0,state,Total
0,BM,1132.4
1,BVI,1868.3
2,CT,1346300.72
3,DE,12781.59
4,FR,6820.77
5,GA,7705.76
6,GB,401.77
7,MA,132421.46
8,MD,1354642.06
9,NC,54022.71
