<h3> Data Cleaning </h3>

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import numpy as np
import pandas as pd
from rapidfuzz import process
from os.path import exists
import itertools
from whoswho import who

In [4]:
def stringConvert(x):
    return x.replace("  ", " ") if type(x) == str else ""

In [5]:
def combineLists(lst):
    returnlst = []
    for sublist in lst:
        if type(sublist) == list:
            returnlst.extend([item for item in sublist])
        else:
            returnlst.append(sublist)
    return returnlst

In [6]:
#function that makes dictionary that combines 3 full name columns into 1
def genFullNameList(namelst): 
    #remove duplicates and nulls
    namelst = list(set([name for name in namelst if not pd.isnull(name)]))
    namelst = sorted(namelst, key=len)
    #remove names that are really similar
    namelstnew = namelst
    if len(namelst) > 1:
        namelstnew = []
        name1 = namelst[0]
        namelstnew.append(name1)
        for name in namelst[1:]:
            score1  = process.extract(name1, [name])[0][1]
            score2 = who.match(name1, name)
            #only add names if they are dissimilar - fuzzy score 70 or less
            if score1 <= 70:
                namelstnew.append(name)
    return namelstnew

In [7]:
def genFuzzyDict(df):
    namelst = list(set([name for namelist in df['full name prelim'] for name in namelist]))
    #create dictionary that matches similar names together
    fn_fuzzy_pre = dict()
    for name in namelst:
        marker = False
        if not pd.isnull(name):
            #find matches for name
            match = process.extract(name, [x for x in namelst if x != name and not 
                                           pd.isnull(x)], limit = 1, score_cutoff = 90)
            if len(match)> 0:
                match = match[0]
                if match[1]>95:
                    #add suitable matches to dictionary
                    for nm in [match[0], name]:
                        if nm in fn_fuzzy_pre.keys() and not marker:
                            fn_fuzzy_pre[nm].extend([n for n in [match[0], name] if 
                                                     n != nm and n not in fn_fuzzy_pre[nm]])
                            marker = True
                    if not marker:
                        if len(name) < len(match[0]):
                            fn_fuzzy_pre[name] = [match[0]]
                        else:
                            fn_fuzzy_pre[match[0]] = [name]
    #invert dictionary
    fn_fuzzy = dict()
    for key in fn_fuzzy_pre.keys():
        vals = fn_fuzzy_pre[key]
        for val in vals:
            fn_fuzzy[val] = key
    
    return fn_fuzzy

In [8]:
#separate a string that contains two names into a list of two names
def parseNames(x):
    #replace words that don't have meaning
    x = x.replace("and Co", "").replace("and co", "").replace("and Others" ,"")
    x = x.replace("and others", "").replace("and Son", "").replace("and Sons", "")
    x = x.replace("and Brothers", "").strip()
    #string preprocessing
    namelst = x.split(" and ")
    namelst = [x.strip() for x in namelst if x.strip() != ""]
    if len(namelst) > 1:
        wd1len = len(namelst[0].split(" "))
        wd2len = len(namelst[1].split(" "))
        #add last name
        if wd1len == 1 and wd2len != 1:
            namelst[0] = namelst[0] + " " + namelst[1].split(" ")[-1]
    return namelst

In [9]:
def transformdf(df, state):
    #add full name columns
    df['full name 1'] = (df['First Name'].apply(lambda x: stringConvert(x))
                         + " " + 
                         df['Last Name'].apply(lambda x: stringConvert(x)))
    df['full name 1'] = df['full name 1'].apply(lambda x: x if len(x.strip().split(" ")) > 1 else np.nan)
    df['full name 2'] = (df['First Name.1'].apply(lambda x: stringConvert(x))
                         + " " + 
                         df['Last Name.1'].apply(lambda x: stringConvert(x)))
    df['full name 2'] = df['full name 2'].apply(lambda x: x if len(x.strip().split(" ")) > 1 else np.nan)
    df['full name 3'] = (df['First Name.2'].apply(lambda x: stringConvert(x))
                         + " " + 
                         df['Last Name.2'].apply(lambda x: stringConvert(x)))
    df['full name 3'] = df['full name 3'].apply(lambda x: x if len(x.strip().split(" ")) > 1 else np.nan)
    df['debt state'] = state
    #add dicionary to merge different full name columns into one
    df['full name prelim'] = [genFullNameList([fname1, fname2, fname3]) 
                              for fname1, fname2, fname3 in zip(df['full name 1'],
                                                                df['full name 2'],
                                                                df['full name 3'])]
    df['full name'] = df['full name prelim']
    #do some additional preprocessing
    df = df[df['full name'].apply(lambda x: x != [])]
    #separate names that are combined with "and", or otherwise treatde as one when they should be two
    df['full name'] = df['full name'].apply(lambda lst: [parseNames(x) if len(x.strip().split(" ")) > 2 and " and " in x 
                                                                 else x for x in lst])
    df['full name'] = df['full name'].apply(lambda namelist: combineLists(namelist))
    
    #fuzzy matching for different names in the full name column
    fn_fuzzy = genFuzzyDict(df)
    df['full name'] = df['full name'].apply(lambda lst: [x if x not in fn_fuzzy.keys() 
                                                         else fn_fuzzy[x] for x in lst])
    #simplify state into just one column
    if 'state1' in list(df.columns):
        state_list = [[s for s in list(set([s1, s2, s3])) if not pd.isnull(s)]
                      for s1, s2, s3 in zip(df['state1'], df['state2'], df['state3'])]
        df['state']  = [state[0] if state != [] else np.nan for state in state_list]
   
    #fill in potentially missing states
    missing_fullname = list(df[df['state'].apply(lambda x: pd.isnull(x))]['full name'])
    missing_ind = list(df[df['state'].apply(lambda x: pd.isnull(x))].index)
    replacement_states = []
    for name in missing_fullname:
        df_filt = df[df['full name'].apply(lambda x: x == name)] 
        if df_filt.shape[1] != 0:
            state = list(df_filt['state'])[0]
            replacement_states.append(state)
        else:
            replacement_states.append(np.nan)
    df.loc[missing_ind, 'state'] = replacement_states
    
    return df

In [10]:
def transformonecoldf(df, state):
    #do transformdf but for when you only have one full name oclumn
    df['full name prelim'] =  (df['First Name'].apply(lambda x: stringConvert(x)) + " " + 
                               df['Last Name'].apply(lambda x: stringConvert(x))).apply(lambda x: x.strip())
    df['full name prelim'] = df['full name prelim'].apply(lambda x: [x] if x != "" else [])
    
    df['debt state'] = state
    
    df['full name'] = df['full name prelim']
    #some preprocessing
    df = df[df['full name'].apply(lambda x: x != [])]
    #separate names that are combined with "and", or otherwise treated as one when they should be two
    df['full name'] = df['full name'].apply(lambda lst: [parseNames(x) if len(x.strip().split(" ")) > 2 and " and " in x 
                                                                 else x for x in lst])
    df['full name'] = df['full name'].apply(lambda namelist: combineLists(namelist))
    
    #fuzzy matching for different names in the full name column
    fn_fuzzy = genFuzzyDict(df)
    df['full name'] = df['full name'].apply(lambda lst: [x if x not in fn_fuzzy.keys() 
                                                         else fn_fuzzy[x] for x in lst]) 
    
    #fill in potentially missing states
    if list(set(df['state'])) != [np.nan]:
        missing_fullname = list(df[df['state'].apply(lambda x: pd.isnull(x))]['full name'])
        missing_ind = list(df[df['state'].apply(lambda x: pd.isnull(x))].index)
        replacement_states = []
        for name in missing_fullname:
            df_filt = df[df['full name'].apply(lambda x: x == name)] 
            if df_filt.shape[1] != 0:
                state = list(df_filt['state'])[0]
                replacement_states.append(state)
            else:
                replacement_states.append(np.nan)
        df.loc[missing_ind, 'state'] = replacement_states
        
    return df

# Connecticut Continental Debt Dataset Matching

In [11]:
#prepare loan dataset
CT_CD = pd.read_excel("../Data/Post1790/CT/CT_post1790_CD_ledger.xlsx", 
                      header = 13, usecols = 'H, I, K, N, O, X, Y, AA, AD, AE, AN, AO, AQ, AT, AU')
CT_CD.columns = ['First Name', 'Last Name', 'state1', '6p_Dollar', '6p_Cents', 
                 'First Name.1', 'Last Name.1', 'state2', '6p_def_Dollar', '6p_def_Cents',
                  'First Name.2', 'Last Name.2', 'state3', '3p_Dollar', '3p_Cents', ]
CT_CD_agg_pre = transformdf(CT_CD, 'CT')
CT_CD_agg = CT_CD_agg_pre[['full name', 'state', 'debt state', '6p_Dollar', '6p_Cents', 
                           '6p_def_Dollar', '6p_def_Cents', '3p_Dollar', '3p_Cents']]

In [12]:
CT_CD_agg

Unnamed: 0,full name,state,debt state,6p_Dollar,6p_Cents,6p_def_Dollar,6p_def_Cents,3p_Dollar,3p_Cents
0,[Samuel W Pomeroy],CT,CT,1064.0,75.0,532.0,37.0,508.0,51.0
1,[Benjamin Trumbull],CT,CT,449.0,96.0,224.0,97.0,232.0,10.0
2,[Richard Green],RI,CT,154.0,20.0,77.0,10.0,192.0,
3,[Thomas Hopkins],CT,CT,196.0,75.0,98.0,37.0,172.0,24.0
4,[John Morgan],CT,CT,53.0,58.0,26.0,79.0,67.0,6.0
...,...,...,...,...,...,...,...,...,...
1002,[John Morgan],CT,CT,,,,,46.0,30.0
1003,[Samuel W Pomeroy],CT,CT,,,,,,2.0
1004,[William H Imlay],CT,CT,237.0,87.0,277.0,64.0,166.0,54.0
1005,[Jonathan Palmer],CT,CT,196.0,63.0,98.0,31.0,223.0,92.0


# Maryland Continental Debt Dataset Matching

In [11]:
#prepare loan dataset
MD_CD = pd.read_excel("../Data/Post1790/MD/MD_post1790_CD.xlsx", 
                      header = 11, usecols = 'G, H, J, L, M, U, V, Z, AA, AI, AJ, AN, AO')
MD_CD.columns = ['First Name', 'Last Name', 'state', '6p_Dollar', '6p_Cents', 
                 'First Name.1', 'Last Name.1', '6p_def_Dollar', '6p_def_Cents',
                 'First Name.2', 'Last Name.2', '3p_Dollar', '3p_Cents']
MD_CD_agg_pre = transformdf(MD_CD, 'MD')
MD_CD_agg = MD_CD_agg_pre[['full name', 'state', 'debt state', '6p_Dollar', '6p_Cents', 
                           '6p_def_Dollar', '6p_def_Cents', '3p_Dollar', '3p_Cents']]

In [12]:
cumulative_CD = pd.concat([CT_CD_agg, MD_CD_agg])

# North Carolina Continental Debt Dataset Matching

In [13]:
#prepare loan dataset
NC_CD = pd.read_excel("../Data/Post1790/NC/T695_R4_NC_CD.xlsx", 
                      header = 11, usecols = 'J, K, M, W, X, Z, AA, AC, AD ')
NC_CD.columns = ['First Name', 'Last Name', 'state', '6p_Dollar', '6p_Cents', 
                 '6p_def_Dollar', '6p_def_Cents', '3p_Dollar', '3p_Cents']
NC_CD_agg_pre = transformonecoldf(NC_CD, 'NC')
NC_CD_agg = NC_CD_agg_pre[['full name', 'state', 'debt state', '6p_Dollar', '6p_Cents', 
                           '6p_def_Dollar', '6p_def_Cents', '3p_Dollar', '3p_Cents']]

In [14]:
cumulative_CD = pd.concat([NC_CD_agg, cumulative_CD])

# New Hampshire Continental Debt Dataset Matching

In [15]:
#prepare loan dataset
NH_CD = pd.read_excel("../Data/Post1790/NH/T652_R6_New_Hampshire_CD.xlsx", 
                      header = 10, usecols = 'I, J, L, N, O, P, Q, R, S')
NH_CD.columns = ['First Name', 'Last Name', 'state', '6p_Dollar', '6p_Cents',  
                 '6p_def_Dollar', '6p_def_Cents', '3p_Dollar', '3p_Cents']
NH_CD_agg_pre = transformonecoldf(NH_CD, 'NH')
NH_CD_agg = NH_CD_agg_pre[['full name', 'state', 'debt state',  '6p_Dollar', '6p_Cents', 
                           '6p_def_Dollar', '6p_def_Cents', '3p_Dollar', '3p_Cents']]

In [16]:
cumulative_CD = pd.concat([NH_CD_agg, cumulative_CD])

# New York Continental Debt Dataset Matching

In [17]:
#new york data doesn't tell us what state people are from

In [18]:
#prepare loan dataset
NY_CD = pd.read_excel("../Data/Post1790/NY/NY_1790_CD.xlsx", 
                      header = 11, usecols = 'H, I, M, N, X, Y, AC, AD, AM, AN, AR, AS')
NY_CD.columns = ['First Name', 'Last Name', '6p_Dollar', '6p_Cents', 
                 'First Name.1', 'Last Name.1', '6p_def_Dollar', '6p_def_Cents',
                 'First Name.2', 'Last Name.2', '3p_Dollar', '3p_Cents']
NY_CD['state'] = np.nan
NY_CD_agg_pre = transformdf(NY_CD, 'NY')
NY_CD_agg = NY_CD_agg_pre[['full name', 'state', 'debt state', '6p_Dollar', '6p_Cents', 
                           '6p_def_Dollar', '6p_def_Cents', '3p_Dollar', '3p_Cents']]

In [19]:
cumulative_CD = pd.concat([NY_CD_agg, cumulative_CD])

# South Carolina Continental Debt Dataset Matching

In [20]:
#prepare loan dataset
SC_CD = pd.read_excel("../Data/Post1790/SC/Post_1790_South_Carolina_CD.xlsx", 
                      header = 11, usecols = 'D, E, G, M, N, S, T, V, AB, AC, AH, AI, AK, AQ, AR')
SC_CD.columns = ['First Name', 'Last Name', 'state1', '6p_Dollar', '6p_Cents', 
                 'First Name.1', 'Last Name.1', 'state2',  '6p_def_Dollar', '6p_def_Cents',
                 'First Name.2', 'Last Name.2', 'state3', '3p_Dollar', '3p_Cents']
SC_CD_agg_pre = transformdf(SC_CD, 'SC')
SC_CD_agg = SC_CD_agg_pre[['full name', 'state', 'debt state', '6p_Dollar', '6p_Cents', 
                           '6p_def_Dollar', '6p_def_Cents', '3p_Dollar', '3p_Cents']]

In [21]:
cumulative_CD = pd.concat([SC_CD_agg, cumulative_CD])

# Pennsylvania Continental Debt Dataset Matching

In [22]:
#prepare loan dataset
PA_CD = pd.read_excel("../Data/Post1790/PA/PA_post1790_CD.xlsx", 
                      header = 11, usecols = 'G, H, J, L, M, U, V, X, Z, AA, AI, AJ, AM, AO, AP')
PA_CD.columns = ['First Name', 'Last Name', 'state1', '6p_Dollar', '6p_Cents', 
                 'First Name.1', 'Last Name.1', 'state2', '6p_def_Dollar', '6p_def_Cents',
                 'First Name.2', 'Last Name.2', 'state3', '3p_Dollar', '3p_Cents']
PA_CD_agg_pre = transformdf(PA_CD, 'PA')
PA_CD_agg = PA_CD_agg_pre[['full name', 'state', 'debt state', '6p_Dollar', '6p_Cents', 
                           '6p_def_Dollar', '6p_def_Cents', '3p_Dollar', '3p_Cents']]

In [23]:
cumulative_CD = pd.concat([PA_CD_agg, cumulative_CD])

# Rhode Island Continental Debt Dataset Matching

In [24]:
#prepare loan dataset
RI_CD = pd.read_excel("../Data/Post1790/RI/T653_Rhode_Island_CD.xlsx", 
                      header = 11, usecols = 'G, H, J, L, M, U, V, X, Z, AA, AI, AJ, AL, AN, AO')
RI_CD.columns = ['First Name', 'Last Name', 'state1', '6p_Dollar', '6p_Cents', 
                 'First Name.1', 'Last Name.1', 'state2', '6p_def_Dollar', '6p_def_Cents',
                'First Name.2', 'Last Name.2', 'state3', '3p_Dollar', '3p_Cents']
RI_CD_agg_pre = transformdf(RI_CD, 'RI')
RI_CD_agg = RI_CD_agg_pre[['full name', 'state', 'debt state', '6p_Dollar', '6p_Cents', 
                           '6p_def_Dollar', '6p_def_Cents', '3p_Dollar', '3p_Cents']]

In [25]:
cumulative_CD = pd.concat([RI_CD_agg, cumulative_CD])

# Virginia Continental Debt Dataset Matching

In [26]:
#virginia data doesn't tell us what state people are from

In [27]:
#prepare loan dataset
VA_CD = pd.read_excel("../Data/Post1790/VA/VA_CD.xlsx", 
                      header = 11, usecols = 'H, I, K, L, U, V, X, Y, AH, AI, AK, AL')
VA_CD.columns = ['First Name', 'Last Name', '6p_Dollar', '6p_Cents', 
                 'First Name.1', 'Last Name.1', '6p_def_Dollar', '6p_def_Cents',
                'First Name.2', 'Last Name.2', '3p_Dollar', '3p_Cents']
VA_CD['state'] = np.nan
VA_CD_agg_pre = transformdf(VA_CD, 'VA')
VA_CD_agg = VA_CD_agg_pre[['full name', 'state', 'debt state', '6p_Dollar', '6p_Cents', 
                           '6p_def_Dollar', '6p_def_Cents', '3p_Dollar', '3p_Cents']]

In [28]:
cumulative_CD = pd.concat([VA_CD_agg, cumulative_CD])

# Georgia Continental Debt Dataset Matching

In [29]:
#prepare loan dataset
GA_CD = pd.read_excel("../Data/Post1790/GA/T694_GA_Loan_Office_CD.xlsx", 
                      header = 10, usecols = 'Q, R, T, Z, AA, AB, AC, AD, AE')
GA_CD.columns = ['First Name', 'Last Name', 'state', '6p_Dollar', '6p_Cents',  
                 '6p_def_Dollar', '6p_def_Cents', '3p_Dollar', '3p_Cents']
GA_CD_agg_pre = transformonecoldf(GA_CD, 'GA')
GA_CD_agg = GA_CD_agg_pre[['full name', 'state', 'debt state', '6p_Dollar', '6p_Cents', 
                           '6p_def_Dollar', '6p_def_Cents', '3p_Dollar', '3p_Cents']]

In [30]:
cumulative_CD = pd.concat([GA_CD_agg, cumulative_CD])

# Summary Analysis

In [31]:
#no new jersey because it only has 3% stock

In [32]:
cumulative_CD.reset_index(drop = True, inplace = True)

In [33]:
cumulative_CD['full name'] = cumulative_CD['full name'].apply(lambda lst: [x for x in lst if len(x.split(" ")) > 1])
cumulative_CD = cumulative_CD[cumulative_CD['full name'].apply(lambda x: x != [])]

In [34]:
#turn full name column from list into strings
fname = cumulative_CD['full name'].apply(pd.Series)
nnames = len(fname.columns)
colnames = ['full name ' + str(i) for i in np.arange(1, nnames+1, 1)]
fname.columns = colnames
cumulative_CD = pd.concat([cumulative_CD, fname], axis = 1)

## How many unique individuals were issued 6 percent stocks or deferred 6 percent stocks in 1790 and after?

In [35]:
stocks_6 = cumulative_CD[['6p_Dollar', '6p_Cents', 
                          '6p_def_Dollar', '6p_def_Cents']].dropna(thresh = 1).index
print('table of number of unique individuals issued 6% stocks (normal or deferred) by state')
cumulative_CD.groupby('state')['full name'].agg(sum).apply(lambda x: len(set(x)))

table of number of unique individuals issued 6% stocks (normal or deferred) by state


state
BM       2
BVI      4
CT     689
DE       8
FR       2
GA      14
GB       2
MA     108
MD     278
NC      41
NH     132
NJ      32
NY      40
PA     618
RI     344
SC     208
US       1
VA      31
VI       4
VT       7
Name: full name, dtype: int64

# How many of these individuals
- were original purchasers of loan office certicates of the same state as the 6 percent stock?
- were original purchasers of loan office certicates issued from another state?
- were original recipients of liquidated debtcerti cates issued by the same-state loan office? other state loan offices?
- were original recipients of the Pierce Certicates?

In [36]:
def fuzzy_merge(lst1, lst2, threshold=85, limit = 100):
    delegates = pd.Series([x for x in lst1.unique() if not pd.isnull(x)])
    possible =  [x for x in lst2.unique().tolist() if type(x) == str]
 
    #get matches
    #process.extract uses a combination of all four fuzzywuzzy scores
    matches = delegates.apply(lambda x: 
                              process.extract(x, possible, limit=limit, score_cutoff = threshold))
    
    match_df = pd.DataFrame(columns = ['Delegates', 'Loan Matches'])
    #make each match a row in the dataframe
    for delegate, matchset in zip(delegates, matches):
        matchset_thres = [name for name in matchset if name[1] >= threshold]
        if len(matchset_thres) == 0:
            add_df = pd.DataFrame(data = {'Delegates': [delegate], 'Loan Matches': [""], 'Scores': [0]})
            match_df = pd.concat([match_df, add_df])
        else:
            delegate_lst = [delegate] * len(matchset_thres)
            add_df = pd.DataFrame(data = {'Delegates': delegate_lst, 
                                          'Loan Matches': [x[0] for x in matchset_thres],
                                          'Scores': [x[1] for x in matchset_thres]})
            match_df = pd.concat([match_df, add_df])

    return match_df

In [37]:
#function for performing the second step of the match
def matchFunction(lst1, lst2, score = 90):
    #check if our matches are actually min 2 words each
    #make sure our match is because the individual are similar, not because the phrase or one word in the phrase is similar
    #lst1 = list(pd.Series(lst1).unique())
    #lst2 = list(pd.Series(lst2).unique())
    threshold = min(len(lst1), len(lst2))
    matches = 0
    nomatch = []
    i = 0
    for wd1 in lst1:
        #modifying which words we compare - dont want to compare first in lst1 with last in lst2
        for wd2 in lst2:
            if wd1 not in nomatch and process.extract(wd1, [wd2])[0][1] > score:
                matches+=1
                nomatch.append(wd1)
        i+=1
    return matches >= threshold

In [38]:
def produceMatches(delegates, debt, delegate_names, debt_names, threshold = 85):
    initial = True
    join_df = pd.DataFrame()
    #run firs step of matching function
    for del_name in delegate_names:
        for debt_name in debt_names:
            if initial:
                join_df = fuzzy_merge(delegates[del_name], debt[debt_name], threshold)
                initial = False
            else:
                add_df = fuzzy_merge(delegates[del_name], debt[debt_name], threshold)
                join_df = pd.concat([join_df, add_df])
    join_df = join_df.drop_duplicates().reset_index(drop = True)
    join_df = join_df[join_df['Scores'].apply(lambda x: x != 0)]
    join_df = join_df[join_df['Loan Matches'].apply(lambda x: not pd.isnull(x))]    
    #cleaning
    join_df_p2 = join_df[join_df['Loan Matches'].apply(lambda x: len(list(set(x.replace("??", "").strip().split(" "))))>=2)]
    #run second step of matching function
    join_df_p2_final = join_df_p2[[matchFunction(x.split(" "), y.split(" ")) for x, y in zip(join_df_p2['Delegates'], join_df_p2['Loan Matches'])]]
    #select only the highest scoring loan match name pairing 
    join_df_p2_final.sort_values(by = 'Scores', ascending = False, inplace = True)
    join_df_p2_final_ind = join_df_p2_final[['Delegates','Loan Matches']].drop_duplicates().index
    join_df_p2_final = join_df_p2_final.loc[join_df_p2_final_ind]
    
    return join_df_p2_final

In [39]:
#import and preprocess loan office data
loan_office = pd.read_csv('../Data/Pre1790/cleaned/loan_office_certificates_9_states_cleaned.csv', index_col = 0)
states = ['NH', 'MA', 'CT', 'NY', 'NJ', 'PA', 'DE', 'MD', 'VA']
num_names = [1, 2, 2, 3, 2, None, 2, None, None]
state_names = dict(zip(np.arange(1, 10, 1), states))
loan_office['State Name'] = loan_office['State'].apply(lambda x: state_names[x])
loan_office['Full Name 1'] = (loan_office['First Name 1 '].apply(lambda x: stringConvert(x)) + " " + loan_office['Last Name 1 '].apply(lambda x: stringConvert(x)))
loan_office['Full Name 2'] = (loan_office['First Name 2'].apply(lambda x: stringConvert(x)) + " " + loan_office['Last Name 2'].apply(lambda x: stringConvert(x)))
loan_office['Full Name 3'] = (loan_office['First Name 3'].apply(lambda x: stringConvert(x)) + " " + loan_office['Last Name 3'].apply(lambda x: stringConvert(x)))

### How many individuals were original purchasers of loan office certicates of the same state as the 6 percent stock?

In [40]:
state_select = colnames
state_select.extend(['state'])
state_cols = ['cd name ' + str(i) for i in np.arange(1, len(state_select))]
state_cols.extend(['cd state'])

In [41]:
#match cd debt data with loan office data from the same state
def loanOfficeSameState(state):
    #filter for 6% stock
    state_ind = cumulative_CD[cumulative_CD['debt state'].apply(lambda x: x == state)][['6p_Dollar', '6p_Cents', '6p_def_Dollar', '6p_def_Cents']].dropna(thresh = 1).index
    #skip empty dataframes
    if len(state_ind) != 0:
        #prepare state and loan office data
        state_cd = cumulative_CD.loc[state_ind][state_select].drop_duplicates()
        state_cd.columns = state_cols
        loan_office_state = loan_office[loan_office['State Name'] == state][['Full Name 1', 'Full Name 2', 'Full Name 3', 'State Name']].drop_duplicates()
        loan_office_state.columns = ['loan office name 1', 'loan office name 2', 'loan office name 3', 'loan office state']
        #match data
        matches = produceMatches(state_cd, loan_office_state, 
                                 delegate_names = [x for x in state_cols if 'state' not in x],
                                 debt_names = ['loan office name 1', 'loan office name 2', 'loan office name 3'], threshold = 85)
        matches['state'] = state
        return matches

In [42]:
df_loanoffice_samestate = pd.DataFrame({}, columns = ['Delegates', 'Loan Matches', 'Scores', 'state'])

In [43]:
#combine matches from all the states
for state in states:
    df_loanoffice_samestate = pd.concat([df_loanoffice_samestate, loanOfficeSameState(state)])
df_loanoffice_samestate.columns = ['CD name', 'Loan Office name', 'Scores', 'state']

In [44]:
df_loanoffice_samestate = df_loanoffice_samestate[df_loanoffice_samestate['CD name'].apply(lambda x: len(x.split(" ")) > 1)]

In [45]:
df_loanoffice_samestate.reset_index(drop = True, inplace = True)

In [46]:
#summarize results
print("Number of individuals who were original purchasers of loan office certicates of the same state as the 6 percent stock")
df_loanoffice_samestate.groupby('state')['CD name'].apply(lambda x: len(x.unique()))

Number of individuals who were original purchasers of loan office certicates of the same state as the 6 percent stock


state
CT    344
DE     10
MA     84
MD    123
NH     70
NJ     43
NY     45
PA    394
VA     71
Name: CD name, dtype: int64

### How many individuals were original purchasers of loan office certicates issued from another state?

In [47]:
#higher match threshold for non-same state loan office certificates

In [48]:
#match cd debt data with loan office data from a different state
def loanOfficeDifState(state):
    #filter for 6% stock
    state_ind = cumulative_CD[cumulative_CD['debt state'].apply(lambda x: x == state)][['6p_Dollar', '6p_Cents', '6p_def_Dollar', '6p_def_Cents']].dropna(thresh = 1).index
    if len(state_ind) != 0:
        state_cd = cumulative_CD.loc[state_ind][state_select].drop_duplicates()
        state_cd.columns = state_cols
        loan_office_nostate = loan_office[loan_office['State Name'] != state][['Full Name 1', 'Full Name 2', 'Full Name 3', 'State Name']].drop_duplicates()
        loan_office_nostate.columns = ['loan office name 1', 'loan office name 2', 'loan office name 3', 'loan office state']
        #match data
        matches = produceMatches(state_cd, loan_office_nostate, 
                                 delegate_names = [x for x in state_cols if 'state' not in x],
                                 debt_names = ['loan office name 1', 'loan office name 2', 'loan office name 3'], threshold = 95)
        matches['state'] = state
        return matches

In [49]:
df_loanoffice_difstate = pd.DataFrame({}, columns = ['Delegates', 'Loan Matches', 'Scores', 'state'])

In [50]:
#combine matches from all the states
for state in states:
    df_loanoffice_difstate = pd.concat([df_loanoffice_difstate, loanOfficeDifState(state)])
df_loanoffice_difstate.columns = ['CD name', 'Loan Office name', 'Scores', 'state']

In [51]:
df_loanoffice_difstate = df_loanoffice_difstate[df_loanoffice_difstate['CD name'].apply(lambda x: len(x.split(" ")) > 1)]

In [52]:
df_loanoffice_difstate.reset_index(drop = True, inplace = True)

In [53]:
df_loanoffice_difstate.groupby('state')['CD name'].apply(lambda x: len(x.unique()))

state
CT    372
DE    302
MA    313
MD    326
NH    317
NJ    300
NY    288
PA    264
VA    261
Name: CD name, dtype: int64

### How many individuals were original recipients of liquidated debt certificates issued by the same-state loan office? other state loan offices?

#### Same State

In [54]:
#match CD state data with liquidated debt from the same state
def liquidatedSameStateDebt(state, file, num_names):
    #filter for 6% stock   
    state_ind = cumulative_CD[cumulative_CD['debt state'].apply(lambda x: x == state)][['6p_Dollar', '6p_Cents']].dropna(thresh = 1).index
    if len(state_ind) != 0:
        state_cd = cumulative_CD.loc[state_ind][state_select].drop_duplicates()
        state_cd.columns = state_cols
        #import liquidated state debt files
        datafile = '../Data/Pre1790/cleaned/'+file
        if exists(datafile):
            state_cert = pd.read_csv(datafile, index_col = 0)
            namelst = []
            #figure out how many full name columns there are in the state liquidated debt file
            state_cert['Full Name'] = state_cert['First name'] + " " + state_cert['Last name'] 
            namelst.append('Full Name')
            if num_names > 1:
                for i in np.arange(2, num_names+1, 1):
                    fullname_str = 'Full Name ' + str(i)
                    state_cert[fullname_str] = state_cert['First name ' + str(i)] + " " + state_cert['Last name ' + str(i)] 
                    namelst.append(fullname_str)
            state_cert_names = state_cert[namelst].drop_duplicates()
            #produce matches
            matches = produceMatches(state_cd, state_cert_names, 
                                     delegate_names = state_cols, debt_names = namelst, 
                                     threshold = 85)
            matches['state'] = state
            return matches

In [55]:
df_samestateliquid = pd.DataFrame({}, columns = ['Delegates', 'Loan Matches', 'Scores', 'state'])

In [56]:
state_name = dict(zip(states, num_names))

In [57]:
#combine matches from all the states
for state, num_name in state_name.items():
    if state != "PA":
        file = 'liquidated_debt_certificates_'+state+'_cleaned.csv'
        df_samestateliquid = pd.concat([df_samestateliquid, liquidatedSameStateDebt(state, file, num_name)])
    else:
        file1 = 'liquidated_debt_certificates_PA_story_cleaned.csv'
        df1 = liquidatedSameStateDebt('PA', file1, 1)
        file2 = 'liquidated_debt_certificates_PA_stelle_cleaned.csv'
        df2 = liquidatedSameStateDebt('PA', file2, 2)
        df = pd.concat([df1, df2]).drop_duplicates()
        df_samestateliquid = pd.concat([df_samestateliquid, df])
df_samestateliquid.columns = ['CD name', 'Loan Office name', 'Scores', 'state']

In [58]:
df_samestateliquid = df_samestateliquid[df_samestateliquid['CD name'].apply(lambda x: len(x.split(" ")) > 1)]

In [59]:
df_samestateliquid.reset_index(drop = True, inplace = True)

In [60]:
#summarize results
df_samestateliquid.groupby('state')['CD name'].apply(lambda x: len(x.unique()))

state
CT    121
DE     12
MA     51
NH     32
NJ     74
NY    106
PA    302
Name: CD name, dtype: int64

#### Different State

In [61]:
def produceLiquidatedMatches(datafile, num_names, state_cd):
    if exists(datafile):
        state_cert = pd.read_csv(datafile, index_col = 0)
        namelst = []
        #figure out how many full name columns there are in the state liquidated debt file
        state_cert['Full Name'] = state_cert['First name'] + " " + state_cert['Last name'] 
        namelst.append('Full Name')
        if num_names > 1:
            for i in np.arange(2, num_names+1, 1):
                fullname_str = 'Full Name ' + str(i)
                state_cert[fullname_str] = state_cert['First name ' + str(i)] + " " + state_cert['Last name ' + str(i)] 
                namelst.append(fullname_str)
        state_cert_names = state_cert[namelst].drop_duplicates()
        state_names = [x for x in state_cd.columns if x != 'cd state']
        #produce matches
        matches = produceMatches(state_cd, state_cert_names, 
                                 delegate_names = state_names, debt_names = namelst, 
                                 threshold = 95)
        matches['state'] = state
        return matches

In [62]:
#match CD state data with liquidated debt from a dif state
def liquidatedDifStateDebt(state):
    #filter for 6% stock   
    state_ind = cumulative_CD[cumulative_CD['debt state'].apply(lambda x: x == state)][['6p_Dollar', '6p_Cents']].dropna(thresh = 1).index
    if len(state_ind) != 0:
        state_cd = cumulative_CD.loc[state_ind][state_select].drop_duplicates()
        state_cd.columns = state_cols
        #import liquidated state debt files
        match_df = pd.DataFrame({}, columns = ['Delegates', 'Loan Matches', 'Scores'])
        for statename, num_names in state_name.items():
            if not pd.isnull(num_names):
                datafile = '../Data/Pre1790/cleaned/liquidated_debt_certificates_'+statename+'_cleaned.csv'
                matches = produceLiquidatedMatches(datafile, num_names, state_cd)
                match_df = pd.concat([match_df, matches])
            elif state == 'PA':    
                datafile1 = '../Data/Pre1790/cleaned/liquidated_debt_certificates_PA_story_cleaned.csv'
                matches1 = produceLiquidatedMatches(datafile, 1, state_cd)
                datafile2 = '../Data/Pre1790/cleaned/liquidated_debt_certificates_PA_stelle_cleaned.csv'
                matches2 = produceLiquidatedMatches(datafile, 2, state_cd)
                match_df = pd.concat([match_df, matches2])
        match_df['state'] = state
        return match_df

In [63]:
df_difstateliquid = pd.DataFrame({}, columns = ['Delegates', 'Loan Matches', 'Scores', 'state'])

In [64]:
#combine matches from all the states
for state in states:
    df_difstateliquid = pd.concat([df_difstateliquid, liquidatedDifStateDebt(state)])
df_difstateliquid.columns = ['CD name', 'Loan Office name', 'Scores', 'state']

In [65]:
df_difstateliquid.reset_index(drop = True, inplace = True)

In [66]:
df_difstateliquid.groupby('state')['CD name'].apply(lambda x: len(x.unique()))

state
CT    307
DE    144
MA    175
MD    174
NH    186
NJ    152
NY    148
PA    223
VA    143
Name: CD name, dtype: int64

### How many individuals were original recipients of the Pierce Certicates?

In [67]:
pierce = pd.read_csv('../Data/Pre1790/cleaned/'+"Pierce_Certs_cleaned_2021.csv", index_col = 0)
pierce['Full Name'] = pierce['First'].apply(lambda x: stringConvert(x)) + " " + pierce['Last'].apply(lambda x: stringConvert(x))
pierce['Full Name 2'] = pierce['First 2'].apply(lambda x: stringConvert(x)) + " " + pierce['Last 2'].apply(lambda x: stringConvert(x))

In [68]:
#match cd debt data with loan office data from the Pierce certificate data
def pierceCertificates(state):
    #filter for 6% stock
    state_ind = cumulative_CD[cumulative_CD['debt state'].apply(lambda x: x == state)][['6p_Dollar', '6p_Cents', '6p_def_Dollar', '6p_def_Cents']].dropna(thresh = 1).index
    if len(state_ind) != 0:
        state_cd = cumulative_CD.loc[state_ind][state_select].drop_duplicates()
        state_cd.columns = state_cols
        #match data
        pierce_names = pierce[pierce['State'].apply(lambda x: pd.isnull(x) or x == state)][['Full Name', 'Full Name 2']].drop_duplicates()
        matches = produceMatches(state_cd, pierce_names, 
                                 delegate_names = ['cd name 1'], 
                                 debt_names = ['Full Name', 'Full Name 2'], 
                                 threshold = 95)
        matches['state'] = state
        return matches

In [69]:
df_pierce = pd.DataFrame({}, columns = ['Delegates', 'Loan Matches', 'Scores', 'state'])

In [70]:
for state in states:
    df_pierce = pd.concat([df_pierce, pierceCertificates(state)])
df_pierce.columns = ['CD name', 'Loan Office name', 'Scores', 'state']

In [71]:
df_pierce = df_pierce[df_pierce['CD name'].apply(lambda x: len(x.split(" ")) > 1)]

In [72]:
df_pierce.groupby('state')['CD name'].apply(lambda x: len(x.unique()))

state
CT    331
DE    168
MA    224
MD    242
NH    197
NJ    171
NY    202
PA    308
VA    204
Name: CD name, dtype: int64

In [73]:
df_pierce.reset_index(drop = True, inplace = True)

## Organizing all our name matchg pairs into one table

In [74]:
#column to add the matching names into a total table containing cd loan names
#and corresopnding match names for each pre1790 loan times
def mergeNames(colname, df):
    colnames = [colname + ' 1', colname + ' 2', colname + ' 3', colname + ' 4', colname + ' 5']
    loss_dict = dict(df.groupby('CD name')['Loan Office name'].apply(lambda x: list(x)))
    cumulative_CD[colname + ' 1'] = cumulative_CD['full name 1'].apply(lambda name: 
                                                                       loss_dict.get(name, np.nan))
    cumulative_CD[colname + ' 2'] = cumulative_CD['full name 2'].apply(lambda name: 
                                                                       loss_dict.get(name, np.nan))
    cumulative_CD[colname + ' 3'] = cumulative_CD['full name 3'].apply(lambda name: 
                                                                       loss_dict.get(name, np.nan))
    cumulative_CD[colname + ' 4'] = cumulative_CD['full name 4'].apply(lambda name: 
                                                                       loss_dict.get(name, np.nan))
    cumulative_CD[colname + ' 5'] = cumulative_CD['full name 5'].apply(lambda name: 
                                                                       loss_dict.get(name, np.nan))
    cumulative_CD[colname] = cumulative_CD[colnames].values.tolist()
    cumulative_CD[colname] = cumulative_CD[colname].apply(lambda lst: list(set(combineLists([x for x in lst if type(x) != float]))))
    cumulative_CD[colname] = cumulative_CD[colname].apply(lambda x: x if x != [] else np.nan)
    cumulative_CD.drop(colnames, inplace = True, axis = 1)

In [75]:
#run process on all four pre1790 loan types
mergeNames('Same State Loan Office', df_loanoffice_samestate)
mergeNames('Different State Loan Office', df_loanoffice_difstate)
mergeNames('Same State Liquidated Debt', df_samestateliquid)
mergeNames('Different State Liquidated Debt', df_difstateliquid)
mergeNames('Pierce Certificates', df_pierce)

In [76]:
#number of loan types one person had
cumulative_CD['tot_count'] = 5 - cumulative_CD[['Same State Loan Office',
                                                'Different State Loan Office',
                                                'Same State Liquidated Debt',
                                                'Different State Liquidated Debt',
                                                'Pierce Certificates']].isna().sum(axis = 1)
cumulative_CD.drop(['3p_Cents','3p_Dollar', 'full name 1', 
                    'full name 2','full name 3', 'full name 4',
                    'full name 5'], axis = 1, inplace = True)

In [77]:
#some preprocesing - creating dictionaries to add back in data, after I turned the list of names into a string to remove duplicates
#the original data we imported above was lost so we readd it by creating dictionaries
cumulative_CD['str name'] = cumulative_CD['full name'].apply(lambda x: str(x)) + "____" + cumulative_CD['state']
fullnamedict = dict(zip(cumulative_CD['str name'], cumulative_CD['full name']))
statenamedict = dict(zip(cumulative_CD['str name'], cumulative_CD['state']))
statedebtnamedict = dict(zip(cumulative_CD['str name'], cumulative_CD['debt state']))
sslodict = dict(zip(cumulative_CD['str name'], cumulative_CD['Same State Loan Office']))
dslodict = dict(zip(cumulative_CD['str name'], cumulative_CD['Different State Loan Office']))
sslddict = dict(zip(cumulative_CD['str name'], cumulative_CD['Same State Liquidated Debt']))
dslddict = dict(zip(cumulative_CD['str name'], cumulative_CD['Different State Liquidated Debt']))
pcdict = dict(zip(cumulative_CD['str name'], cumulative_CD['Pierce Certificates']))

In [78]:
#use dictionaries to link data to matched values for each loan type
cumulative_CD_assets = cumulative_CD.groupby('str name')['6p_Cents','6p_Dollar',
                                                         '6p_def_Cents','6p_def_Dollar'].sum()
cumulative_CD_assets.reset_index(inplace = True)
cumulative_CD_assets['full name'] = cumulative_CD_assets['str name'].apply(lambda x: 
                                                                           fullnamedict[x])
cumulative_CD_assets['state'] = cumulative_CD_assets['str name'].apply(lambda x: 
                                                                       statenamedict[x])
cumulative_CD_assets['state debt'] = cumulative_CD_assets['str name'].apply(lambda x: 
                                                                       statedebtnamedict[x])
cumulative_CD_assets['Same State Loan Office'] = cumulative_CD_assets['str name'].apply(lambda x: 
                                                                                        sslodict[x])
cumulative_CD_assets['Different State Loan Office'] = cumulative_CD_assets['str name'].apply(lambda x: 
                                                                                             dslodict[x])
cumulative_CD_assets['Same State Liquidated Debt'] = cumulative_CD_assets['str name'].apply(lambda x: 
                                                                                            sslddict[x])
cumulative_CD_assets['Different State Liquidated Debt'] = cumulative_CD_assets['str name'].apply(lambda x: 
                                                                                                 dslddict[x])
cumulative_CD_assets['Pierce Certificates'] = cumulative_CD_assets['str name'].apply(lambda x: 
                                                                                     pcdict[x])
cumulative_CD_assets['Total'] = (cumulative_CD_assets['6p_def_Cents'] + 
                                 cumulative_CD_assets['6p_Cents'])/100 + (cumulative_CD_assets['6p_Dollar'] + 
                                                                          cumulative_CD_assets['6p_def_Dollar'])
cumulative_CD_assets.drop(['6p_def_Cents', '6p_Cents', 
                           '6p_Dollar', '6p_def_Dollar', 'str name'], axis = 1, inplace = True)

In [79]:
cumulative_CD_assets

Unnamed: 0,full name,state,state debt,Same State Loan Office,Different State Loan Office,Same State Liquidated Debt,Different State Liquidated Debt,Pierce Certificates,Total
0,[Bernard O'Neill],MD,MD,[Bernard O'Neill],,,,,230.31
1,[Francis O'Neill],MD,MD,[Francis O'Neill],[Francis O Neill],,,,37.96
2,[Henry O'Neale],MD,MD,,,,,[Henry O'Neal],99.77
3,[James O'Hara],PA,PA,,,,,,33.44
4,[ Agness ann],PA,PA,,,,,,883.63
...,...,...,...,...,...,...,...,...,...
2532,[Zebulon Waterman],CT,CT,,,,,,306.61
2533,[Zephaniah Andrews],RI,RI,,,,,,1728.74
2534,[Zephaniah Brown],RI,RI,,,,,,2415.08
2535,[Zephaniah Davis],CT,CT,,,,,,77.20


# Adding the different asset totals from each loan type

### Same State Loan Office

In [80]:
#add asset counts for each individual on each row to the state loan office table for the same state merging method
def ssloTotal(ind):
    name_options = cumulative_CD_assets.loc[ind, 'Same State Loan Office']
    state = cumulative_CD_assets.loc[ind, 'state debt']
    state_office = loan_office[loan_office['State Name'] == state]
    ind1 = state_office[state_office['Full Name 1'].apply(lambda x: x in name_options)].index.tolist()
    ind2 = state_office[state_office['Full Name 2'].apply(lambda x: x in name_options)].index.tolist()
    ind3 = state_office[state_office['Full Name 3'].apply(lambda x: x in name_options)].index.tolist()
    ind1.extend(ind2)
    ind1.extend(ind3)
    total_val = state_office.loc[ind1]['Specie Value '].sum()
    return total_val

In [81]:
cumulative_CD_assets['SSLO Total'] = np.nan
ssloIndex = cumulative_CD_assets[cumulative_CD_assets['Same State Loan Office'].apply(lambda x: type(x) == list)].index
cumulative_CD_assets.loc[ssloIndex, 'SSLO Total'] = [ssloTotal(x) for x in ssloIndex]
cumulative_CD_assets['SSLO Total'] = cumulative_CD_assets['SSLO Total'].fillna(0)

### Different State Loan Office

In [82]:
#add asset counts for each individual on each row to the state loan office table for the dif state merging method
def dsloTotal(ind):
    name_options = cumulative_CD_assets.loc[ind, 'Different State Loan Office']
    state = cumulative_CD_assets.loc[ind, 'state debt']    
    state_office = loan_office[loan_office['State Name'] != state]
    ind1 = state_office[state_office['Full Name 1'].apply(lambda x: x in name_options)].index.tolist()
    ind2 = state_office[state_office['Full Name 2'].apply(lambda x: x in name_options)].index.tolist()
    ind3 = state_office[state_office['Full Name 3'].apply(lambda x: x in name_options)].index.tolist()
    ind1.extend(ind2)
    ind1.extend(ind3)
    total_val = state_office.loc[ind1]['Specie Value '].sum()
    return total_val

In [83]:
cumulative_CD_assets['DSLO Total'] = np.nan
dsloIndex = cumulative_CD_assets[cumulative_CD_assets['Different State Loan Office'].apply(lambda x: 
                                                                                           type(x) == list)].index
cumulative_CD_assets.loc[dsloIndex, 'DSLO Total'] = [dsloTotal(x) for x in dsloIndex]
cumulative_CD_assets['DSLO Total'] = cumulative_CD_assets['DSLO Total'].fillna(0)

### Same State Liquidated Debt Certificates

In [84]:
#function to add values of liquidated debt certificates
def ssldTotal(ind):
    name_options = cumulative_CD_assets.loc[ind, 'Same State Liquidated Debt']
    state = cumulative_CD_assets.loc[ind, 'state debt']
    
    if state != 'PA' and state in state_name.keys():
        num_names = state_name[state]
        state_certs_file = '../Data/Pre1790/cleaned/liquidated_debt_certificates_'+state+'_cleaned.csv'
        if exists(state_certs_file):
            total_val = calculateTotalValue(state_certs_file, num_names, name_options)
            return total_val
    elif state == 'PA':
        state_certs_file1 = '../Data/Pre1790/cleaned/liquidated_debt_certificates_PA_story_cleaned.csv'
        total_val1 = calculateTotalValue(state_certs_file1, 1, name_options)
        state_certs_file2 = '../Data/Pre1790/cleaned/liquidated_debt_certificates_PA_stelle_cleaned.csv'
        total_val2 = calculateTotalValue(state_certs_file2, 2, name_options)
        return total_val1 + total_val2
    return 0

In [85]:
#calculate total value held by one person in debt certificates from a particular state
def calculateTotalValue(file, num_names, name_options):
    #this part is pretty similar to the merging part for liquidated debt certificates
    state_cert = pd.read_csv(file, index_col = 0)
    state_cert['Full Name'] = state_cert['First name'] + " " + state_cert['Last name'] 
    namelst = []
    namelst.append('Full Name')
    if num_names > 1:
        for i in np.arange(2, num_names+1, 1):
            fullname_str = 'Full Name ' + str(i)
            state_cert[fullname_str] = state_cert['First name ' + str(i)] + " " + state_cert['Last name ' + str(i)] 
            namelst.append(fullname_str)
    ind = []
    for name in namelst:
        ind.extend(state_cert[state_cert[name].apply(lambda x: x in name_options)].index.tolist())
    #create subtable for the data we want, make it into a numeric value and sum it
    subtbl = state_cert.loc[ind]
    subtbl['Dollars'] = subtbl['Dollars'].apply(lambda x: float(x))
    subtbl['90th'] = subtbl['90th'].apply(lambda x: float(x) if x != '22/8' else 22/8)
    total_val = subtbl['Dollars'].sum() + subtbl['90th'].sum()/90
    return total_val

In [86]:
cumulative_CD_assets['SSLD Total'] = np.nan
ssldIndex = cumulative_CD_assets[cumulative_CD_assets['Same State Liquidated Debt'].apply(lambda x: 
                                                                                          type(x) == list)].index
cumulative_CD_assets.loc[ssldIndex, 'SSLD Total'] = [ssldTotal(x) for x in ssldIndex]
cumulative_CD_assets['SSLD Total'] = cumulative_CD_assets['SSLD Total'].fillna(0)

### Different State Liquidated Debt Certificates

In [87]:
#function to add values of liquidated debt certificates
def dsldTotal(ind):
    name_options = cumulative_CD_assets.loc[ind, 'Different State Liquidated Debt']
    state = cumulative_CD_assets.loc[ind, 'state debt']
    sumval = 0
    for statename in states:
        if statename != 'PA' and not pd.isnull(state_name[statename]):
            num_names = state_name[statename]
            state_certs_file = '../Data/Pre1790/cleaned/liquidated_debt_certificates_'+statename+'_cleaned.csv'
            if exists(state_certs_file):
                total_val = calculateTotalValue(state_certs_file, 
                                                num_names, name_options)
            sumval = sumval + total_val
        elif statename == 'PA':
            state_certs_file1 = '../Data/Pre1790/cleaned/liquidated_debt_certificates_PA_story_cleaned.csv'
            total_val1 = calculateTotalValue(state_certs_file1, 
                                             1, name_options)
            state_certs_file2 = '../Data/Pre1790/cleaned/liquidated_debt_certificates_PA_stelle_cleaned.csv'
            total_val2 = calculateTotalValue(state_certs_file2, 
                                             2, name_options)
            sumval = sumval + total_val1 + total_val2
    return sumval

In [88]:
#calculate total value held by one person in debt certificates from a particular state
def calculateTotalValue(file, num_names, name_options):
    #this part is pretty similar to the merging part for liquidated debt certificates
    state_cert = pd.read_csv(file, index_col = 0)
    state_cert['Full Name'] = state_cert['First name'] + " " + state_cert['Last name'] 
    namelst = []
    namelst.append('Full Name')
    if num_names > 1:
        for i in np.arange(2, num_names+1, 1):
            fullname_str = 'Full Name ' + str(i)
            state_cert[fullname_str] = state_cert['First name ' + str(i)] + " " + state_cert['Last name ' + str(i)] 
            namelst.append(fullname_str)
    ind = []
    for name in namelst:
        ind.extend(state_cert[state_cert[name].apply(lambda x: x in name_options)].index.tolist())
    #create subtable for the data we want, make it into a numeric value and sum it
    subtbl = state_cert.loc[ind]
    subtbl['Dollars'] = subtbl['Dollars'].apply(lambda x: float(x))
    subtbl['90th'] = subtbl['90th'].apply(lambda x: float(x) if x != '22/8' else 22/8)
    total_val = subtbl['Dollars'].sum() + subtbl['90th'].sum()/90
    return total_val

In [89]:
cumulative_CD_assets['DSLD Total'] = np.nan
dsldIndex = cumulative_CD_assets[cumulative_CD_assets['Different State Liquidated Debt'].apply(lambda x:
                                                                                               type(x) == list)].index
cumulative_CD_assets.loc[dsldIndex, 'DSLD Total'] = [dsldTotal(x) for x in dsldIndex]
cumulative_CD_assets['DSLD Total'] = cumulative_CD_assets['DSLD Total'].fillna(0)

### Same State Pierce Certificates

In [90]:
#calculate sum for pierce certificates
def pcTotal(ind):
    name_options = cumulative_CD_assets.loc[ind, 'Pierce Certificates']
    state = cumulative_CD_assets.loc[ind, 'state debt']
    
    pierce_state = pierce[pierce['State'].apply(lambda x: 
                                                pd.isnull(x) or x == state)]
    ind1 = pierce_state[pierce_state['Full Name'].apply(lambda x: 
                                                        x in name_options)].index.tolist()
    ind2 = pierce_state[pierce_state['Full Name 2'].apply(lambda x: 
                                                          x in name_options)].index.tolist()
    ind1.extend(ind2)
    total_val = pierce_state.loc[ind1]['Value'].sum()
    return total_val

In [91]:
cumulative_CD_assets['PC Total'] = np.nan
pcIndex = cumulative_CD_assets[cumulative_CD_assets['Pierce Certificates'].apply(lambda x: 
                                                                                 type(x) == list)].index
cumulative_CD_assets.loc[pcIndex, 'PC Total'] = [pcTotal(x) for x in pcIndex]
cumulative_CD_assets['PC Total'] = cumulative_CD_assets['PC Total'].fillna(0)

In [92]:
cumulative_CD_assets['Debt Total'] = cumulative_CD_assets[['SSLO Total','DSLO Total',
                                                           'SSLD Total', 'DSLD Total',
                                                           'PC Total']].sum(axis = 1)

In [93]:
pre1790_certs = ['Same State Loan Office','Different State Loan Office', 
                 'Same State Liquidated Debt','Different State Liquidated Debt','Pierce Certificates']
cumulative_CD_assets['tot_pre1790_certs'] = 5 - cumulative_CD_assets[pre1790_certs].isna().sum(axis = 1)

In [94]:
cumulative_CD_assets.to_csv("prepost_matched_debt_files.csv")

In [98]:
matched_names = pd.concat([df_loanoffice_samestate, df_loanoffice_difstate, df_samestateliquid,
                           df_difstateliquid, df_pierce])
matched_names = matched_names[matched_names['Scores'] != 100][['CD name','Loan Office name', 'state']].drop_duplicates()
matched_names.to_csv("../Data/total_matching_post1790.csv")