<h3> Data Cleaning </h3>

In [1]:
import numpy as np
import pandas as pd
import rapidfuzz
from rapidfuzz import process

In [102]:
#importing cleaned delegate names
delegates = pd.read_csv("../Data/Delegates/cleaned/constitutional_convention_delegates_cleaned.csv").drop('Unnamed: 0', axis = 1)
state_delegates = pd.read_csv("../Data/Delegates/cleaned/State_Delegates_cleaned.csv").drop('Unnamed: 0', axis = 1)

# Algorithm for Name Matching

1. Run fuzzy match with score threshold of 80
2. Get unique set of names for each pair, get rid of ???
    1. If one word for either, remove match
    2. Else run word comparison - if there are min(n, m) matches (above 90) then keep match else discard
3. Go through manually

In [103]:
def fuzzy_merge(lst1, lst2, threshold=85, limit = 100):
    """
    :param df_1: the left list to join
    :param df_2: the right list to join
    :param threshold: how close the matches should be to return a match, based on Levenshtein distance
    :param limit: the amount of matches that will get returned, these are sorted high to low
    :return: dataframe with boths keys and matches
    """
    
    delegates = pd.Series([x for x in lst1.unique() if not pd.isnull(x)])
    possible = [x for x in lst2.unique().tolist() if not pd.isnull(x)]
    
    #get matches
    #process.extract uses a combination of all four fuzzywuzzy scores
    matches = delegates.apply(lambda x: process.extract(x, possible, limit=limit))
    
    match_df = pd.DataFrame(columns = ['Delegates', 'Loan Matches'])
    
    for delegate, matchset in zip(delegates, matches):
        matchset_thres = [name for name in matchset if name[1] >= threshold]
        if len(matchset_thres) == 0:
            add_df = pd.DataFrame(data = {'Delegates': [delegate], 'Loan Matches': [""], 'Scores': [0]})
            match_df = pd.concat([match_df, add_df])
        else:
            delegate_lst = [delegate] * len(matchset_thres)
            add_df = pd.DataFrame(data = {'Delegates': delegate_lst, 
                                          'Loan Matches': [x[0] for x in matchset_thres],
                                          'Scores': [x[1] for x in matchset_thres]})
            match_df = pd.concat([match_df, add_df])

    return match_df

In [104]:
#function for performing the second step of the match
def matchFunction(lst1, lst2, score = 90):
    #check if our matches are actually min 2 words each
    #make sure our match is because the individual are similar, not because the phrase or one word in the phrase is similar
    #lst1 = list(pd.Series(lst1).unique())
    #lst2 = list(pd.Series(lst2).unique())
    threshold = min(len(lst1), len(lst2))
    matches = 0
    nomatch = []
    i = 0
    for wd1 in lst1:
        #modifying which words we compare - dont want to compare first in lst1 with last in lst2
        for wd2 in lst2:
            if wd1 not in nomatch and process.extract(wd1, [wd2])[0][1] > score:
                matches+=1
                nomatch.append(wd1)
        i+=1
    return matches >= threshold

## Matching CT Data

### Matching on Constitutional Convention

In [105]:
def produceMatches(delegates, debt, delegate_names, debt_names):
    initial = True
    join_df = pd.DataFrame()
    for del_name in delegate_names:
        for debt_name in debt_names:
            if initial:
                join_df = fuzzy_merge(delegates[del_name], debt[debt_name])
                initial = False
            else:
                add_df = fuzzy_merge(delegates[del_name], debt[debt_name])
                join_df = pd.concat([join_df, add_df])
    join_df = join_df.drop_duplicates().reset_index(drop = True)
    join_df = join_df[join_df['Scores'].apply(lambda x: x != 0)]
    join_df = join_df[join_df['Loan Matches'].apply(lambda x: not pd.isnull(x))]    
    
    join_df_p2 = join_df[join_df['Loan Matches'].apply(lambda x: len(list(set(x.replace("??", "").strip().split(" "))))>=2)]
    join_df_p2_final = join_df_p2[[matchFunction(x.split(" "), y.split(" ")) for x, y in zip(join_df_p2['Delegates'], join_df_p2['Loan Matches'])]]
    #select only the highest scoring loan match name pairing 
    join_df_p2_final.sort_values(by = 'Scores', ascending = False, inplace = True)
    join_df_p2_final_ind = join_df_p2_final[['Delegates','Loan Matches']].drop_duplicates().index
    join_df_p2_final = join_df_p2_final.loc[join_df_p2_final_ind]
    
    return join_df_p2_final

In [106]:
#separate a string that contains two names into a list of two names
def parseNames(x):
    #replace words that don't have meaning
    x = x.replace("and Co", "").replace("and co", "").replace("and Others" ,"")
    x = x.replace("and others", "").replace("and Son", "").replace("and Sons", "")
    x = x.replace("and Brothers", "").strip()
    #string preprocessing
    namelst = x.split(" and ")
    namelst = [x.strip() for x in namelst if x.strip() != ""]
    if len(namelst) > 1:
        wd1len = len(namelst[0].split(" "))
        wd2len = len(namelst[1].split(" "))
        #add last name
        if wd1len == 1 and wd2len != 1:
            namelst[0] = namelst[0] + " " + namelst[1].split(" ")[-1]
    return namelst

In [107]:
#prepare loan dataset
CT_ASD = pd.read_excel("../Data/Post1790/CT/CT_post1790_ASD_ledger.xlsx", header = 13, usecols = 'H, I, X, Y, AN, AO')
CT_ASD['full name 1'] = CT_ASD['First Name'] + " " + CT_ASD['Last Name']
CT_ASD['full name 2'] = CT_ASD['First Name.1'] + " " + CT_ASD['Last Name.1']
CT_ASD['full name 3'] = CT_ASD['First Name.2'] + " " + CT_ASD['Last Name.2']
CT_ASD['full name'] = [y if pd.isnull(x) else x for x, y in zip(CT_ASD['full name 1'], CT_ASD['full name 2'])]
CT_ASD['full name'] = [y if pd.isnull(x) else x for x, y in zip(CT_ASD['full name'], CT_ASD['full name 3'])]

In [108]:
#name matching for constitutional convention and state convention to debt list
CT_ASD_const = produceMatches(delegates, CT_ASD, delegate_names = ['full name 1', 'full name 2'], debt_names = ['full name'])
CT_ASD_state = produceMatches(state_delegates, CT_ASD, delegate_names = ['full name 1'], debt_names = ['full name'])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [109]:
CT_ASD_const['state'] = 'CT'
CT_ASD_state['state'] = 'CT'

In [110]:
#prepare loan dataset
CT_CD = pd.read_excel("../Data/Post1790/CT/CT_post1790_CD_ledger.xlsx", header = 13, usecols = 'H, I, X, Y, AN, AO')
CT_CD['full name 1'] = CT_CD['First Name'] + " " + CT_CD['Last Name']
CT_CD['full name 2'] = CT_CD['First Name.1'] + " " + CT_CD['Last Name.1']
CT_CD['full name 3'] = CT_CD['First Name.2'] + " " + CT_CD['Last Name.2']
CT_CD['full name'] = [y if pd.isnull(x) else x for x, y in zip(CT_CD['full name 1'], CT_CD['full name 2'])]
CT_CD['full name'] = [y if pd.isnull(x) else x for x, y in zip(CT_CD['full name'], CT_CD['full name 3'])]

In [111]:
#name matching for constitutional convention and state convention to debt list
CT_CD_const = produceMatches(delegates, CT_CD, delegate_names = ['full name 1', 'full name 2'], debt_names = ['full name'])
CT_CD_state = produceMatches(state_delegates, CT_CD, delegate_names = ['full name 1'], debt_names = ['full name'])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [112]:
CT_CD_const['state'] = 'CT'
CT_CD_state['state'] = 'CT'

In [113]:
#master dataset
cumulative_matching_const = pd.concat([CT_ASD_const, CT_CD_const]).drop_duplicates()
cumulative_matching_state = pd.concat([CT_ASD_state, CT_CD_state]).drop_duplicates()

# Georgia Loan Office Certificates

In [114]:
#prepare pierce certificates
GA = pd.read_excel("../Data/Post1790/GA/T694_GA_Loan_Office_CD.xlsx", header = 9, usecols = 'Q, R')

#create full name
GA['full name 1'] = GA['First Name'] + " " + GA['Last Name']

In [115]:
#name matching for constitutional convention and state convention to debt list
GA_const = produceMatches(delegates, GA, delegate_names = ['full name 1', 'full name 2'], debt_names = ['full name 1'])
GA_state = produceMatches(state_delegates, GA, delegate_names = ['full name 1'], debt_names = ['full name 1'])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [116]:
GA_const['state'] = 'GA'
GA_const['state'] = 'GA'

In [117]:
cumulative_matching_const = pd.concat([cumulative_matching_const, GA_const]).drop_duplicates()

In [118]:
cumulative_matching_state = pd.concat([cumulative_matching_state, GA_state]).drop_duplicates()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


# Maryland Debt Certificates

In [119]:
#prepare loan dataset
MD_ASD = pd.read_excel("../Data/Post1790/MD/MD_post1790_ASD.xlsx", header = 10, usecols = 'G, H, U, V, AI, AJ')
MD_ASD = MD_ASD.rename(columns={'Unnamed: 34': 'First Name.2', 'Unnamed: 35':'Last Name.2'})
MD_ASD.drop(0, inplace = True)
MD_ASD['full name 1'] = MD_ASD['First Name'] + " " + MD_ASD['Last Name']
MD_ASD['full name 2'] = MD_ASD['First Name.1'] + " " + MD_ASD['Last Name.1']
MD_ASD['full name 3'] = MD_ASD['First Name.2'] + " " + MD_ASD['Last Name.2']
MD_ASD['full name'] = [y if pd.isnull(x) else x for x, y in zip(MD_ASD['full name 1'], MD_ASD['full name 2'])]
MD_ASD['full name'] = [y if pd.isnull(x) else x for x, y in zip(MD_ASD['full name'], MD_ASD['full name 3'])]

In [120]:
#name matching for constitutional convention and state convention to debt list
MD_ASD_const = produceMatches(delegates, MD_ASD, delegate_names = ['full name 1', 'full name 2'], debt_names = ['full name'])
MD_ASD_state = produceMatches(state_delegates, MD_ASD, delegate_names = ['full name 1'], debt_names = ['full name'])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [121]:
MD_ASD_const['state'] = 'MD'
MD_ASD_state['state'] = 'MD'

In [122]:
#prepare loan dataset
MD_CD = pd.read_excel("../Data/Post1790/MD/MD_post1790_CD.xlsx", header = 10, usecols = 'G, H, U, V, AI, AJ')
MD_CD = MD_CD.rename(columns={'Unnamed: 34': 'First Name.2', 'Unnamed: 35':'Last Name.2'})
MD_CD.drop(0, inplace = True)
MD_CD['full name 1'] = MD_CD['First Name'] + " " + MD_CD['Last Name']
MD_CD['full name 2'] = MD_CD['First Name.1'] + " " + MD_CD['Last Name.1']
MD_CD['full name 3'] = MD_CD['First Name.2'].apply(lambda x: str(x)) + " " + MD_CD['Last Name.2'].apply(lambda x: str(x))
MD_CD['full name'] = [y if pd.isnull(x) else x for x, y in zip(MD_CD['full name 1'], MD_CD['full name 2'])]
MD_CD['full name'] = [y if pd.isnull(x) else x for x, y in zip(MD_CD['full name'], MD_CD['full name 3'])]

In [123]:
#name matching for constitutional convention and state convention to debt list
MD_CD_const = produceMatches(delegates, MD_CD, delegate_names = ['full name 1', 'full name 2'], debt_names = ['full name'])
MD_CD_state = produceMatches(state_delegates, MD_CD, delegate_names = ['full name 1'], debt_names = ['full name'])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [124]:
MD_CD_const['state'] = 'MD'
MD_CD_state['state'] = 'MD'

In [125]:
cumulative_matching_const = pd.concat([cumulative_matching_const,MD_ASD_const, MD_CD_const]).drop_duplicates()

In [126]:
cumulative_matching_state = pd.concat([cumulative_matching_state, MD_ASD_state,MD_CD_state]).drop_duplicates()

# North Carolina

In [127]:
#Read in file
NC_ASD = pd.read_excel("../Data/Post1790/NC/T695_R3_NC_ASD.xlsx", header = 9, usecols = 'H, I')

#create full name
NC_ASD['full name 1'] = NC_ASD['First Name'] + " " + NC_ASD['Last Name']

In [128]:
#name matching for constitutional convention and state convention to debt list
NC_ASD_const = produceMatches(delegates, NC_ASD, delegate_names = ['full name 1', 'full name 2'], debt_names = ['full name 1'])
NC_ASD_state = produceMatches(state_delegates, NC_ASD, delegate_names = ['full name 1'], debt_names = ['full name 1'])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [129]:
#name matching for constitutional convention and state convention to debt list
NC_ASD_const['state'] = 'NC'
NC_ASD_state['state'] = 'NC'

In [130]:
#Read in file
NC_CD = pd.read_excel("../Data/Post1790/NC/T695_R4_NC_CD.xlsx", header = 10, usecols = 'J, K')

#create full name
NC_CD['full name 1'] = NC_CD['First Name'] + " " + NC_CD['Last Name']

In [131]:
#name matching for constitutional convention and state convention to debt list
NC_CD_const = produceMatches(delegates, NC_CD, delegate_names = ['full name 1', 'full name 2'], debt_names = ['full name 1'])
NC_CD_state = produceMatches(state_delegates, NC_CD, delegate_names = ['full name 1'], debt_names = ['full name 1'])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [132]:
#name matching for constitutional convention and state convention to debt list
NC_CD_const['state'] = 'NC'
NC_CD_state['state'] = 'NC'

In [133]:
cumulative_matching_const = pd.concat([cumulative_matching_const, NC_ASD_const, NC_CD_const]).drop_duplicates()

In [134]:
cumulative_matching_state = pd.concat([cumulative_matching_state, NC_ASD_state, NC_CD_state]).drop_duplicates()

# New Hampshire

In [135]:
#Read in file
NH_ASD = pd.read_excel("../Data/Post1790/NH/T652_New_Hampshire_ASD.xlsx", header = 10, usecols = 'G, H, V, W, AK, AL')

#create full name
NH_ASD['full name 1'] = NH_ASD['First Name'] + " " + NH_ASD['Last Name']
NH_ASD['full name 2'] = NH_ASD['First Name.1'] + " " + NH_ASD['Last Name.1']
NH_ASD['full name 3'] = NH_ASD['First Name.2'] + " " + NH_ASD['Last Name.2']
NH_ASD['full name'] = [y if pd.isnull(x) else x for x, y in zip(NH_ASD['full name 1'], NH_ASD['full name 2'])]
NH_ASD['full name'] = [y if pd.isnull(x) else x for x, y in zip(NH_ASD['full name'], NH_ASD['full name 3'])]

In [136]:
#name matching for constitutional convention and state convention to debt list
NH_ASD_const = produceMatches(delegates, NH_ASD, delegate_names = ['full name 1', 'full name 2'], debt_names = ['full name'])
NH_ASD_state = produceMatches(state_delegates, NH_ASD, delegate_names = ['full name 1'], debt_names = ['full name'])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [137]:
NH_ASD_const['state'] = 'NH'
NH_ASD_state['state'] = 'NH'

In [138]:
#Read in file
NH_CD = pd.read_excel("../Data/Post1790/NH/T652_R6_New_Hampshire_CD.xlsx", header = 10, usecols = 'I, J')

#create full name
NH_CD['full name 1'] = NH_CD['First Name'] + " " + NH_CD['Last Name']

In [139]:
#name matching for constitutional convention and state convention to debt list
NH_CD_const = produceMatches(delegates, NH_CD, delegate_names = ['full name 1', 'full name 2'], debt_names = ['full name 1'])
NH_CD_state = produceMatches(state_delegates, NH_CD, delegate_names = ['full name 1'], debt_names = ['full name 1'])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [140]:
NH_CD_const['state'] = 'NH'
NH_CD_state['state'] = 'NH'

In [141]:
cumulative_matching_const = pd.concat([cumulative_matching_const, NH_ASD_const, NH_CD_const]).drop_duplicates()

In [142]:
cumulative_matching_state = pd.concat([cumulative_matching_state, NH_ASD_state, NH_CD_state]).drop_duplicates()

# New York

In [143]:
#Read in file
NY_ASD = pd.read_excel("../Data/Post1790/NY/NY_1790_ASD.xlsx", header = 10, usecols = 'H, I, X, Y, AM, AN')

#create full name
NY_ASD['full name 1'] = NY_ASD['First Name'] + " " + NY_ASD['Last Name']
NY_ASD['full name 2'] = NY_ASD['First Name.1'] + " " + NY_ASD['Last Name.1']
NY_ASD['full name 3'] = NY_ASD['First Name.2'] + " " + NY_ASD['Last Name.2']
NY_ASD['full name'] = [y if pd.isnull(x) else x for x, y in zip(NY_ASD['full name 1'], NY_ASD['full name 2'])]
NY_ASD['full name'] = [y if pd.isnull(x) else x for x, y in zip(NY_ASD['full name'], NY_ASD['full name 3'])]

In [144]:
#name matching for constitutional convention and state convention to debt list
NY_ASD_const = produceMatches(delegates, NY_ASD, delegate_names = ['full name 1', 'full name 2'], debt_names = ['full name'])
NY_ASD_state = produceMatches(state_delegates, NY_ASD, delegate_names= ['full name 1'], debt_names = ['full name'])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [145]:
NY_ASD_const['state'] = 'NY'
NY_ASD_state['state'] = 'NY'

In [146]:
#Read in file
NY_CD = pd.read_excel("../Data/Post1790/NY/NY_1790_CD.xlsx", header = 10, usecols = 'H, I, X, Y, AM, AN')

#create full name
NY_CD['full name 1'] = NY_CD['First Name'] + " " + NY_CD['Last Name']
NY_CD['full name 2'] = NY_CD['First Name.1'] + " " + NY_CD['Last Name.1']
NY_CD['full name 3'] = NY_CD['First Name.2'] + " " + NY_CD['Last Name.2']
NY_CD['full name'] = [y if pd.isnull(x) else x for x, y in zip(NY_CD['full name 1'], NY_CD['full name 2'])]
NY_CD['full name'] = [y if pd.isnull(x) else x for x, y in zip(NY_CD['full name'], NY_CD['full name 3'])]

In [147]:
#name matching for constitutional convention and state convention to debt list
NY_CD_const = produceMatches(delegates, NY_CD, delegate_names = ['full name 1', 'full name 2'], debt_names = ['full name'])
NY_CD_state = produceMatches(state_delegates, NY_CD, delegate_names= ['full name 1'], debt_names = ['full name'])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [148]:
NY_CD_const['state'] = 'NY'
NY_CD_state['state'] = 'NY'

In [149]:
cumulative_matching_const = pd.concat([cumulative_matching_const, NY_ASD_const, NY_CD_const]).drop_duplicates()

In [150]:
cumulative_matching_state = pd.concat([cumulative_matching_state, NY_ASD_state, NY_CD_state]).drop_duplicates()

# Pennsylvania

In [151]:
#Read in file
PA = pd.read_excel("../Data/Post1790/PA/PA_post1790_CD.xlsx", header = 11, usecols = 'G, H, U, V, AJ, AK')

#create full name
PA['full name 1'] = PA['First Name'] + " " + PA['Last Name']
PA['full name 2'] = PA['First Name.1'] + " " + PA['Last Name.1']
PA['full name 3'] = PA['First Name.2'] + " " + PA['Last Name.2']
PA['full name'] = [y if pd.isnull(x) else x for x, y in zip(PA['full name 1'], PA['full name 2'])]
PA['full name'] = [y if pd.isnull(x) else x for x, y in zip(PA['full name'], PA['full name 3'])]

In [152]:
#name matching for constitutional convention and state convention to debt list
PA_const = produceMatches(delegates, PA, delegate_names = ['full name 1', 'full name 2'], debt_names = ['full name'])
PA_state = produceMatches(state_delegates, PA, delegate_names = ['full name 1'], debt_names = ['full name'])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [153]:
PA_const['state'] = 'PA'
PA_state['state'] = 'PA'

In [154]:
cumulative_matching_const = pd.concat([cumulative_matching_const, PA_const]).drop_duplicates()

In [155]:
cumulative_matching_state = pd.concat([cumulative_matching_state, PA_state]).drop_duplicates()

# Rhode Island

In [156]:
#Read in file
RI_ASD = pd.read_excel("../Data/Post1790/RI/T653_Rhode_Island_ASD.xlsx", header = 10, usecols = 'H, I, X, Y, AN, AO')

#create full name
RI_ASD['full name 1'] = RI_ASD['First Name'] + " " + RI_ASD['Last Name']
RI_ASD['full name 2'] = RI_ASD['First Name.1'] + " " + RI_ASD['Last Name.1']
RI_ASD['full name 3'] = RI_ASD['First Name.2'] + " " + RI_ASD['Last Name.2']
RI_ASD['full name'] = [y if pd.isnull(x) else x for x, y in zip(RI_ASD['full name 1'], RI_ASD['full name 2'])]
RI_ASD['full name'] = [y if pd.isnull(x) else x for x, y in zip(RI_ASD['full name'], RI_ASD['full name 3'])]

In [157]:
#name matching for constitutional convention and state convention to debt list
RI_ASD_const = produceMatches(delegates, RI_ASD, delegate_names = ['full name 1', 'full name 2'], debt_names = ['full name'])
RI_ASD_state = produceMatches(state_delegates, RI_ASD, delegate_names = ['full name 1'], debt_names = ['full name'])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [158]:
RI_ASD_const['state'] = 'RI'
RI_ASD_state['state'] = 'RI'

In [159]:
#Read in file
RI_CD = pd.read_excel("../Data/Post1790/RI/T653_Rhode_Island_CD.xlsx", header = 10, usecols = 'G, H, U, V, AI, AJ')

#create full name
RI_CD['full name 1'] = RI_CD['First Name'] + " " + RI_CD['Last Name']
RI_CD['full name 2'] = RI_CD['First Name.1'] + " " + RI_CD['Last Name.1']
RI_CD['full name 3'] = RI_CD['First Name.2'] + " " + RI_CD['Last Name.2']
RI_CD['full name'] = [y if pd.isnull(x) else x for x, y in zip(RI_CD['full name 1'], RI_CD['full name 2'])]
RI_CD['full name'] = [y if pd.isnull(x) else x for x, y in zip(RI_CD['full name'], RI_CD['full name 3'])]

In [160]:
#name matching for constitutional convention and state convention to debt list
RI_CD_const = produceMatches(delegates, RI_CD, delegate_names = ['full name 1', 'full name 2'], debt_names = ['full name'])
RI_CD_state = produceMatches(state_delegates, RI_CD, delegate_names = ['full name 1'], debt_names = ['full name'])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [161]:
RI_CD_const['state'] = 'RI'
RI_CD_state['state'] = 'RI'

In [162]:
cumulative_matching_const = pd.concat([cumulative_matching_const, RI_ASD_const, RI_CD_const]).drop_duplicates()

In [163]:
cumulative_matching_state = pd.concat([cumulative_matching_state, RI_ASD_state, RI_CD_state]).drop_duplicates()

# South Carolina

In [164]:
#Read in file
SC_ASD = pd.read_excel("../Data/Post1790/SC/Post_1790_South_Carolina_ASD_transfers_removed.xlsx", header = 11, usecols = 'D, E')
#create full name
SC_ASD['full name 1'] = SC_ASD['First Name'] + " " + SC_ASD['Last Name']

In [165]:
#name matching for constitutional convention and state convention to debt list
SC_ASD_const = produceMatches(delegates, SC_ASD, delegate_names = ['full name 1', 'full name 2'], debt_names = ['full name 1'])
SC_ASD_state = produceMatches(state_delegates, SC_ASD, delegate_names = ['full name 1'], debt_names = ['full name 1'])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [166]:
SC_ASD_const['state'] = 'SC'
SC_ASD_state['state'] = 'SC'

In [167]:
#Read in file
SC_CD = pd.read_excel("../Data/Post1790/SC/Post_1790_South_Carolina_CD.xlsx", header = 11, usecols = 'D, E, S, T, AH, AI')

#create full name
SC_CD['full name 1'] = SC_CD['First Name'] + " " + SC_CD['Last Name']
SC_CD['full name 2'] = SC_CD['First Name.1'] + " " + SC_CD['Last Name.1']
SC_CD['full name 3'] = SC_CD['First Name.2'] + " " + SC_CD['Last Name.2']
SC_CD['full name'] = [y if pd.isnull(x) else x for x, y in zip(SC_CD['full name 1'], SC_CD['full name 2'])]
SC_CD['full name'] = [y if pd.isnull(x) else x for x, y in zip(SC_CD['full name'], SC_CD['full name 3'])]

In [168]:
#name matching for constitutional convention and state convention to debt list
SC_CD_const = produceMatches(delegates, SC_CD, delegate_names = ['full name 1', 'full name 2'], debt_names = ['full name'])
SC_CD_state = produceMatches(state_delegates, SC_CD, delegate_names = ['full name 1'], debt_names = ['full name'])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [169]:
SC_CD_const['state'] = 'SC'
SC_CD_state['state'] = 'SC'

In [170]:
cumulative_matching_const = pd.concat([cumulative_matching_const, SC_CD_const, SC_ASD_const]).drop_duplicates()

In [171]:
cumulative_matching_state = pd.concat([cumulative_matching_state, SC_CD_state, SC_ASD_state]).drop_duplicates()

# Virginia

In [172]:
#Read in file
VA_ASD = pd.read_excel("../Data/Post1790/VA/VA_ASD.xlsx", header = 10, usecols = 'D, E, U, V, AL, AM')
VA_ASD.rename(columns = {'Unnamed: 3':'First Name.0', 'Unnamed: 4':'Last Name.0'}, inplace = True)
VA_ASD.drop(0, inplace = True)
#create full name
VA_ASD['full name 1'] = VA_ASD['First Name.0'] + " " + VA_ASD['Last Name.0']
VA_ASD['full name 2'] = VA_ASD['First Name'] + " " + VA_ASD['Last Name']
VA_ASD['full name 3'] = VA_ASD['First Name.1'] + " " + VA_ASD['Last Name.1']
VA_ASD['full name'] = [y if pd.isnull(x) else x for x, y in zip(VA_ASD['full name 1'], VA_ASD['full name 2'])]
VA_ASD['full name'] = [y if pd.isnull(x) else x for x, y in zip(VA_ASD['full name'], VA_ASD['full name 3'])]

In [173]:
#name matching for constitutional convention and state convention to debt list
VA_ASD_const = produceMatches(delegates, VA_ASD, delegate_names = ['full name 1', 'full name 2'], debt_names = ['full name'])
VA_ASD_state = produceMatches(state_delegates, VA_ASD, delegate_names = ['full name 1'], debt_names = ['full name'])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [174]:
VA_ASD_const['state'] = 'VA'
VA_ASD_state['state'] = 'VA'

In [175]:
#Read in file
VA_CD = pd.read_excel("../Data/Post1790/VA/VA_CD.xlsx", header = 11, usecols = 'H, I, U, V, AH, AI')
VA_CD.rename(columns = {'Unnamed: 7':'First Name.0', 'Unnamed: 8':'Last Name.0'}, inplace = True)
VA_CD.drop(0, inplace = True)
#create full name
VA_CD['full name 1'] = VA_CD['First Name.0'] + " " + VA_CD['Last Name.0']
VA_CD['full name 2'] = VA_CD['First Name'] + " " + VA_CD['Last Name']
VA_CD['full name 3'] = VA_CD['First Name.1'] + " " + VA_CD['Last Name.1']
VA_CD['full name'] = [y if pd.isnull(x) else x for x, y in zip(VA_CD['full name 1'], VA_CD['full name 2'])]
VA_CD['full name'] = [y if pd.isnull(x) else x for x, y in zip(VA_CD['full name'], VA_CD['full name 3'])]

In [176]:
#name matching for constitutional convention and state convention to debt list
VA_CD_const = produceMatches(delegates, VA_CD, delegate_names = ['full name 1', 'full name 2'], debt_names = ['full name'])
VA_CD_state = produceMatches(state_delegates, VA_CD, delegate_names = ['full name 1'], debt_names = ['full name'])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [177]:
VA_CD_const['state'] = 'VA'
VA_CD_state['state'] = 'VA'

In [178]:
cumulative_matching_const = pd.concat([cumulative_matching_const, VA_ASD_const, VA_CD_const]).drop_duplicates()

In [179]:
cumulative_matching_state = pd.concat([cumulative_matching_state, VA_ASD_state, VA_CD_state]).drop_duplicates()

# Final Steps

In [180]:
cum_matching = pd.concat([cumulative_matching_const, cumulative_matching_state]).drop_duplicates()

In [181]:
cum_matching = cum_matching.sort_values(['Delegates', 'Loan Matches', 'Scores'])

In [182]:
cum_matching.to_csv('../Data/final_matching_post1790.csv')