# Outline
1. Import continental and assumed debt for each state
2. Map towns to county names using mapping file + fuzzy string matching
3. Use county shape files to create maps

## Documentation Notes
1. All non-exact matches will be printed out when doing merges (exact matches are not printed)

## Cleaning Questions
1. What do we do about an entry that has entries for some debt types, but not for others? Do we want to drop those entries, impute the values, or set NA values to 0?
2. What do we do about entries that have a state, but no town (ie: State of Connecticut)? What about no state?

In [155]:
import numpy as np
import pandas as pd
from rapidfuzz import process

In [100]:
def combineCols(df):
    for col in ['town', 'state', 'occupation']:
        # " ".join(x.split()) removes all excess whitespace
        # creates set with all unique instances of column
        df[col] = [set([" ".join(x.split()) for x in [t1, t2, t3] if not pd.isnull(x)]) for t1, t2, t3 in zip(df[col+'1'],
                                                                                                                 df[col+'2'],
                                                                                                                 df[col+'3'])]
        if not any(df[col].apply(lambda x: len(x) > 1).tolist()):
            # change set to string
            print("reformatting {}".format(col))
            df[col] = df[col].apply(lambda x: x.pop() if x != set() else np.nan)
        else:
            print("{} column has multiple unique entries".format(col))
            print("see table for new entries")
            # keep the value that has the most characters, otherwise change set to string
            old = df[df[col].apply(lambda x: len(x) > 1)][col]
            df[col] = df[col].apply(lambda x: x.pop() if len(x) == 1 else np.nan if x == set() else max(list(x), key=len))
            # new dataframe to keep track of changes
            change_df = pd.DataFrame([old, df.loc[old.index][col]]).T
            change_df.columns = ['old', 'new']
            print(change_df)

    # add functions to combine asset totals, handle missing debt values

    return df[['6p_Dollar', '6p_Cents', '6p_def_Dollar', '6p_def_Cents','3p_Dollar', '3p_Cents',
               'town', 'state', 'occupation']]

In [92]:
"""ind1, ind2, ind3 = CT_CD[['6p_Dollar', '6p_Cents']].dropna(thresh = 1).index,
                   CT_CD[['6p_def_Dollar', '6p_def_Cents']].dropna(thresh = 1).index,
                   CT_CD[['3p_Dollar', '3p_Cents']].dropna(thresh = 1).index
ind = set(ind1).intersection(ind2).intersection(ind3)"""

"ind1, ind2, ind3 = CT_CD[['6p_Dollar', '6p_Cents']].dropna(thresh = 1).index,\n                   CT_CD[['6p_def_Dollar', '6p_def_Cents']].dropna(thresh = 1).index,\n                   CT_CD[['3p_Dollar', '3p_Cents']].dropna(thresh = 1).index\nind = set(ind1).intersection(ind2).intersection(ind3)"

## Import Data - INCOMPLETE
1. Import CD and ASD for each state, combine the multiple town/state/occupation columns (if they exist) into one
2. Concatenate all the separate datasets into two (one CD and one ASD)

In [254]:
CD_all = pd.DataFrame(columns = ['6p_Dollar', '6p_Cents', '6p_def_Dollar', '6p_def_Cents','3p_Dollar', '3p_Cents',
                                  'town', 'state', 'occupation'])
ASD_all = pd.DataFrame(columns = ['6p_Dollar', '6p_Cents', '6p_def_Dollar', '6p_def_Cents','3p_Dollar', '3p_Cents',
                                  'town', 'state', 'occupation'])

In [240]:
# importing desired columns and rename
CT_CD_raw = pd.read_excel("../../Data/Post1790/CT/CT_post1790_CD_ledger.xlsx",
                      header = 13, usecols = 'H, I, J, K, L, N, O, X, Y, Z, AA, AB, AD, AE, AN, AO, AP, AQ, AR, AT, AU')
CT_CD_raw.columns = ['First Name', 'Last Name', 'town1', 'state1', 'occupation1', '6p_Dollar', '6p_Cents',
                     'First Name.1', 'Last Name.1', 'town2', 'state2', 'occupation2', '6p_def_Dollar', '6p_def_Cents',
                     'First Name.2', 'Last Name.2', 'town3', 'state3', 'occupation3', '3p_Dollar', '3p_Cents', ]
# unify occupation, town and state columns
CT_CD = combineCols(CT_CD_raw)
CD_all = pd.concat([CD_all, CT_CD])

town column has multiple unique entries
see table for new entries
                   old      new
526  {Milford, Miford}  Milford
799  {Nowich, Norwich}  Norwich
reformatting state
reformatting occupation


In [256]:
CT_CD['county'].unique()

array(['Hartford County', 'Tolland County', nan, 'New London County',
       'New Haven County', 'Litchfield County', 'Fairfield County',
       'Windham County', 'Middlesex County'], dtype=object)

In [241]:
# importing desired columns and rename
CT_ASD_raw = pd.read_excel("../../Data/Post1790/CT/CT_post1790_ASD_ledger.xlsx",
                       header = 13, usecols = 'H, I, J, K, L, N, O, X, Y, Z, AA, AB, AD, AE, AN, AO, AP, AQ, AR, AT, AU')
CT_ASD_raw.columns = ['First Name', 'Last Name', 'town1', 'state1', 'occupation1', '6p_Dollar', '6p_Cents',
                     'First Name.1', 'Last Name.1', 'town2', 'state2', 'occupation2', '6p_def_Dollar', '6p_def_Cents',
                     'First Name.2', 'Last Name.2', 'town3', 'state3', 'occupation3', '3p_Dollar', '3p_Cents', ]
# unify occupation, town and state columns
CT_ASD = combineCols(CT_ASD_raw)
ASD_all = pd.concat([ASD_all, CT_ASD])

town column has multiple unique entries
see table for new entries
                    old      new
125  {Norwall, Norwalk}  Norwall
reformatting state
occupation column has multiple unique entries
see table for new entries
                                                   old  \
811  {1st Society in Lyme, Treasuer 1st Society in ...   

                              new  
811  Treasuer 1st Society in Lyme  


## Mapping Town/City to Counties - INCOMPLETE
1. Connecticut: Referencing <a href = "https://ctstatelibrary.org/cttowns/counties">https://ctstatelibrary.org/cttowns/counties</a> I found that Huntington is now called Shelton and Chatham is now called East Hampton. The other two cases below are not mappable because those are not valid town names.

In [242]:
# fuzzy string matching function
def fuzzyTownMatch(towns, crosswalk, initial = True):
    # which column contains list of unmatched names
    if initial:
        unmatched_towns = towns[towns['county'].apply(lambda x: pd.isnull(x))]['town']
    else:
        unmatched_towns = towns[towns['county'].apply(lambda x: pd.isnull(x))]['town2']

    for town in unmatched_towns:
        # extract best match
        match_tuple = process.extractOne(town, crosswalk['primary_city'])
        score = match_tuple[1]
        match = match_tuple[0]
        # if match above threshold, change + print match so we can hand check
        if score >= 85:
            county = primary_dict[match]
            if initial:
                print("{} -> {} in {}".format(town, match, county))
            else:
                original_town = towns[towns['town2'] == town]['town'].tolist()[0]
                print("{} (new name: {}) -> {} in {}".format(original_town, town, match, county))
            town_index = towns[towns['town'] == town].index
            towns.loc[town_index, 'county'] = county
        else:
            print("Cannot find a match for the town {}".format(town))
    return towns

In [None]:
city_county_cw = pd.read_excel("../../Data/zip_code_database.xls")[['primary_city', 'acceptable_cities',
                                                                    'unacceptable_cities', 'county', 'state']]
# list_of_states is not functional rn
list_of_states = CD_all['state'].unique().tolist()
print(list_of_states)
assert(len(list_of_states) == 13)

In [250]:
final_cw = pd.DataFrame(columns = ['town', 'county', 'state'])
for state in list_of_states:
    # create list of towns for each state
    towns = CT_CD[CT_CD['state'] == state][['town']].drop_duplicates()
    towns = towns[towns['town'].apply(lambda x: not pd.isnull(x))]
    # state crosswalk
    state_cw = city_county_cw[city_county_cw['state'] == state]

    # try initial merge, add counties
    primary_dict = dict(zip(state_cw['primary_city'],state_cw['county']))
    towns['county'] = towns['town'].apply(lambda x: primary_dict.get(x, np.nan))

    # try fuzzy string match merge
    towns = fuzzyTownMatch(towns, state_cw)

    if state == 'CT':
        # modify towns that changed names - see note at very top
        towns['town2'] = towns['town'].apply(lambda x: x.replace('Huntington', 'Shelton').replace('Chatham', 'East Hampton'))
        towns[towns['county'].apply(lambda x: pd.isnull(x))]

    towns = fuzzyTownMatch(towns, state_cw, False)
    towns = towns[towns['county'].apply(lambda x: not pd.isnull(x))]
    towns['state'] = state
    final_cw = pd.concat([final_cw, towns.drop('town2', axis = 1)])

NameError: name 'list_of_states' is not defined

In [266]:
CT_CD['debt_total'] = CT_CD['6p_Dollar'].fillna(0) + CT_CD['6p_Cents'].fillna(0)
CT_CD.groupby('county')['debt_total'].sum().to_csv('../../Data/CT_debt_temp.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  CT_CD['debt_total'] = CT_CD['6p_Dollar'].fillna(0) + CT_CD['6p_Cents'].fillna(0)
