In [190]:
import pandas as pd
import numpy as np
from rapidfuzz import process, fuzz
import os



## Cleaning town names

Goals:
- Clean town names that are different but refer to the same town in `ASD_all.xlsx` and `CD_all.xlsx`.
- check if all town names are in the "town-county" matching list given by `final_cw.xlsx` - if so, perform the matching.


Step 1: flag by 1 records that have NA "state" and "town" values.

In [191]:
# import and cleaning
ASD_all = pd.read_excel(
    'ASD_all.xlsx', index_col=0).reset_index(drop=True).dropna(how='all').drop_duplicates().reset_index(drop=True)

# CD_all = pd.read_excel(
#     'CD_all.xlsx', index_col=0).reset_index(drop=True).dropna(how='all').drop_duplicates().reset_index(drop=True)

CD_all = pd.read_csv('../../Data/Post1790/aggregated_CD_noname.csv', index_col=0).reset_index(drop=True)

# strip white space
for col in ['town', 'state', 'occupation']:
    ASD_all[col] = ASD_all[col].str.strip()
    CD_all[col] = CD_all[col].str.strip()

ASD_all['orig_town'] = ASD_all['town']
CD_all['orig_town'] = CD_all['town']
    
print(ASD_all.shape, CD_all.shape)

CD_all.drop(['name_type', 'county'], axis=1, inplace=True)

(5674, 10) (4377, 13)


In [192]:
ASD_all['FLAG'], CD_all['FLAG'] = 0, 0

# identify NA rows
ASD_all.loc[(pd.isna(ASD_all['town'])) | (pd.isna(ASD_all['state'])), 'FLAG'] = 1
CD_all.loc[(pd.isna(CD_all['town'])) | (pd.isna(CD_all['state'])), 'FLAG'] = 1

# select non-NA subdataframe
ASD_no_NA = ASD_all[ASD_all['FLAG']==0]
CD_no_NA = CD_all[CD_all['FLAG']==0]

In [193]:
# check all states
print(
    set(list(ASD_no_NA.state.unique()) + list(CD_no_NA.state.unique()))
)

{'NJ', 'US', 'RI', 'NH', 'MA', 'DE', 'NC', 'BVI', 'MD', 'PA', 'BM', 'CT', 'SC', 'FR', 'FC', 'VA', 'NY', 'VT', 'GB', 'VI', 'GA'}


Step 2: flag by 2 the records with states outside the following list of states.

In [194]:
state_list = ['RI', 'CT', 'GA', 'MD', 'NC', 'NH', 'NJ', 'NY', 'VA', 'PA', 'RI', 'SC', 'DE', 'MA', 'VT']
state_name_list = [
            'Rhode Island', 'Connecticut', 'Georgia', 'Maryland', 'North Carolina', 
            'New Hampshire', 'New Jersey', 'New York', 'Virginia',
            'Pennsylvania', 'Rhode Island', 'South Carolina', 'Delaware', 
            'Massachusetts', 'Vermont'
            ]
state_list_dict = dict(zip(state_list, state_name_list))

ASD_no_NA.loc[ASD_no_NA.apply(lambda row: row.state not in state_list, axis=1), 'FLAG'] = 2
CD_no_NA.loc[CD_no_NA.apply(lambda row: row.state not in state_list, axis=1), 'FLAG'] = 2

# update to the original df
ASD_all.update(ASD_no_NA)
CD_all.update(CD_no_NA)


CD_all[CD_all['FLAG']==2].head()

Unnamed: 0,6p_Dollar,6p_Cents,6p_def_Dollar,6p_def_Cents,3p_Dollar,3p_Cents,town,state,occupation,6p_total,orig_town,FLAG
1753,19.0,,9.0,4.0,,,Eustatia,BVI,,,Eustatia,2.0
1754,125.0,45.0,62.0,73.0,134.0,87.0,Eustatia,BVI,,125.45,Eustatia,2.0
1755,90.0,,45.0,,93.0,84.0,Eustatia,BVI,,,Eustatia,2.0
3974,640.0,4.0,320.0,2.0,406.0,72.0,Bermuda,BM,,640.04,Bermuda,2.0
4053,61.0,50.0,30.0,75.0,57.0,16.0,Register of the Treasury,US,,61.5,Register of the Treasury,2.0


Step 3: flag by 3 the records
- with town name 'State of XX' or 'State XX' or 'XX State' where XX is the state where the town is located, and 
- with town name exactly or almost the same as the state name (due to typos).

These are records for which "township" is not available.

In [195]:
ASD_rest = ASD_all[ASD_all['FLAG']==0]
CD_rest = CD_all[CD_all['FLAG']==0]

# state of XX/state XX - checked
ASD_rest.loc[
    ASD_rest.apply(lambda row: row.town.lower().startswith('state '), axis=1), 'FLAG'] = 3
CD_rest.loc[
    CD_rest.apply(lambda row: row.town.lower().startswith('state '), axis=1), 'FLAG'] = 3
ASD_rest.loc[ASD_rest.town=='Delaware State', 'FLAG'] = 3
CD_rest.loc[CD_rest.town=='Delaware State', 'FLAG'] = 3

# town name == state name - checked
ASD_rest.loc[ASD_rest.apply(lambda row: row.town==state_list_dict[row.state], axis=1), 'FLAG'] = 3
CD_rest.loc[CD_rest.apply(lambda row: row.town==state_list_dict[row.state], axis=1), 'FLAG'] = 3

# town name ~= state name - checked
ASD_rest.loc[
    ASD_rest.apply(lambda row: 
    process.cdist([row.town], [state_list_dict[row.state]])[0][0] >= 80, axis=1), 'FLAG'
    ] = 3
CD_rest.loc[
    CD_rest.apply(lambda row: 
    process.cdist([row.town], [state_list_dict[row.state]])[0][0] >= 80, axis=1), 'FLAG'
    ] = 3

# Carolina in South Carolina (no county named Carolina)
ASD_rest.loc[
    ASD_rest.apply(lambda row: row.town in state_list_dict[row.state] and
    row.town != state_list_dict[row.state], axis=1), 'FLAG'] = 3
CD_rest.loc[
    CD_rest.apply(lambda row: row.town in state_list_dict[row.state] and
    row.town != state_list_dict[row.state], axis=1), 'FLAG'] = 3

# update
ASD_all.update(ASD_rest)
CD_all.update(CD_rest)
CD_all[CD_all['FLAG']==3].head()

Unnamed: 0,6p_Dollar,6p_Cents,6p_def_Dollar,6p_def_Cents,3p_Dollar,3p_Cents,town,state,occupation,6p_total,orig_town,FLAG
2,154.0,20.0,77.0,10.0,192.0,,Rhode Island,RI,Farmer,154.2,Rhode Island,3.0
36,60.0,37.0,30.0,18.0,63.0,83.0,Rhode Island,RI,,60.37,Rhode Island,3.0
103,248.0,46.0,124.0,23.0,144.0,,State of New York,NY,,248.46,State of New York,3.0
123,738.0,43.0,369.0,22.0,708.0,13.0,State of New York,NY,,738.43,State of New York,3.0
168,30.0,48.0,15.0,24.0,8.0,22.0,State of Vermont,VT,,30.48,State of Vermont,3.0


Step 4: unify town names
- remove `"state of" + state_name` and `"of" + state_name` that appear in the town name, except "City of New York",
- remove `"state" + state_name` that appears in the town name,
- remove `state_name` from `XX + state_name` or `state_name + XX`,
- take into account the three special cases.

Then, 
- create a new identifier and remove "Town"/"County" from town names.

Flag this change by 4.

In [196]:
ASD_rest = ASD_all[ASD_all.FLAG==0]
CD_rest = CD_all[CD_all.FLAG==0]

def remove_state_from_town(row):
    state_name = state_list_dict[row.state]
    # state of/of
    row.town = row.town.replace(' State of ' + state_name, '')
    # state + state_name
    row.town = row.town.replace(' State ' + state_name, '')
    # special cases
    if row.town == 'Boston state Massachusetts':
        row.town = 'Boston'

    if row.town == 'New Castle County Delaware State':
        row.town = 'New Castle County'

    if row.town == 'Virginia and Philadelphia':
        row.town = 'Philadelphia'

    if row.town != 'City of New York':
        row.town = row.town.replace(' of ' + state_name, '')
        row.town = row.town.replace(state_name, '')
    
    # flag changes
    row.FLAG = 4
    return row

ASD_rest = ASD_rest.apply(lambda row: remove_state_from_town(row), axis=1)
CD_rest = CD_rest.apply(lambda row: remove_state_from_town(row), axis=1)

# town_level = 'T' if specified 'Town', 'C' if specified 'County', otherwise 'U'
ASD_rest['town_level'] = 'U'
ASD_rest.loc[ASD_rest['town'].str.contains('County', na=False), 'town_level'] = 'C'
ASD_rest.loc[ASD_rest['town'].str.contains('Town', na=False), 'town_level'] = 'T'

def remove_CountyTown(row):
    row.town = row.town.replace('County', '')
    row.town = row.town.replace('Town', '')
    row.FLAG = 4
    return row
    
ASD_rest = ASD_rest.apply(lambda row: remove_CountyTown(row), axis=1)
CD_rest = CD_rest.apply(lambda row: remove_CountyTown(row), axis=1)


ASD_all.update(ASD_rest)
CD_all.update(CD_rest)

Step 5: match town names that are likely to be the same one  
1. for each state, create a list `A` of town names with # of occurences >= 4; create a list `B` for all the rest towns.
2. for each town in list `B`, find the best three matches with towns in list `A` - if above a threshold, report all three and match to the best one. - to be checked afterwards
3. all unmatched towns in list `B` become a new list `C` - we WANT to compare one another and if the similarity is above a threshold, group them; otherwise keep it untouched. One simple procedure is for each town in list `C`, group it with all other towns whose distance to it is smaller than a threshold. Then proceed to the next one if it's not in some group already and skip otherwise. This is legitimate because we expect "typos" to cause small differences among all mistyped names. Report all the matched and unmatched cases in this round.
4. For all above, allow user's input to manually confirm the matches.

Flag this change by 5.

In [197]:
def match_towns(df):
    if len(df.index)==0:    # Vermont
        return df
    print(f"State: {df.state.iloc[0]}")

    val_counts = df.town.value_counts(sort=True)
    list_A, list_B = list(val_counts[val_counts>=min(val_counts.iloc[0], 4)].index), list(val_counts[val_counts<min(val_counts.iloc[0], 6)].index)
    list_C = list_B.copy()
    for town in list_B:
        best3 = process.extract(town, list_A)[0:3]
        if best3[0][1] >= 85:
            print(f"{town}. Candidates: {[x[0] for x in best3]}. Matched: {best3[0][0]}.\n")

            # # user input
            # ACCEPT = input("1 for ACCEPT. 0 for REJECT")
            # # records

            # if ACCEPT==1:
            # matched to list_A
            list_C.remove(town)
            df.loc[df.town==town, 'FLAG'] = 5
            df.loc[df.town==town, 'town'] = best3[0][0]

    list_C_flag = [-1 for x in list_C]
    for id, town in enumerate(list_C):
        # only do matching if not already matched
        if list_C_flag[id] == -1:
            bests = [x[0] for x in process.extract(town, list_C, score_cutoff=85)]
            if len(bests) > 1:
                # if not just oneself
                indexes = [list_C.index(x) for x in bests]
                # make sure same group has the same id
                print(f"Candidate group: {bests} -> {min(bests, key=len)}")

                # pick the shortest one as the name we want to keep
                # this deals with the cases like North Hampton -> Hampton
                index_selected = list_C.index(min(bests, key=len))
                
                for k in indexes: 
                    list_C_flag[k] = index_selected 

                # one special case
                if bests==['Cumberland', 'Cumb  ']:
                    list_C_flag[k] = list_C.index('Cumberland')
    
    for id, flag in enumerate(list_C_flag):
        if flag != -1:
            df.loc[df.town==list_C[id], 'FLAG'] = 5
            df.loc[df.town==list_C[id], 'town'] = list_C[flag]

    return df

In [198]:
# first stripping trailing space
ASD_rest['town'] = ASD_rest['town'].apply(lambda x: x.strip())
CD_rest['town'] = CD_rest['town'].apply(lambda x: x.strip())

# first match to show list (but no update)
for state_code in state_list:
    
    ASD_rest[ASD_rest.state==state_code] = match_towns(ASD_rest[ASD_rest.state==state_code])
    CD_rest[CD_rest.state==state_code] = match_towns(CD_rest[CD_rest.state==state_code])

State: RI
Johnson. Candidates: ['Johnson', 'Johnston', 'South Kingston']. Matched: Johnson.

Portsmouth. Candidates: ['Portsmouth', 'North Providence', 'South Kingston']. Matched: Portsmouth.

Richmond. Candidates: ['Richmond', 'East Greenwich', 'West Greenwich']. Matched: Richmond.

Conventry. Candidates: ['Coventry', 'Exeter', 'North Providence']. Matched: Coventry.

Glocester. Candidates: ['Gloucester', 'Exeter', 'Coventry']. Matched: Gloucester.

North Kingstone. Candidates: ['North Kingston', 'South Kingston', 'Newport']. Matched: North Kingston.

North Kingstown. Candidates: ['North Kingston', 'South Kingston', 'Newport']. Matched: North Kingston.

Stoughton now of Providence. Candidates: ['Providence', 'North Providence', 'Johnston']. Matched: Providence.

Smithfeild. Candidates: ['Smithfield', 'Scituate', 'Cumberland']. Matched: Smithfield.

South Kingstone. Candidates: ['South Kingston', 'North Kingston', 'Johnston']. Matched: South Kingston.

Warnick. Candidates: ['Warwick', 

In [199]:
# keep misclassified cases untouched
ASD_rest['mskip'], CD_rest['mskip'] = 0, 0

ASD_rest.loc[
    ASD_rest.apply(lambda row: row.town in ['Stafford', 'New Hartford', 'New Brunswick', 'St Lukes', "St George's Parish"], axis=1), 'mskip'
] = 1
CD_rest.loc[
    CD_rest.apply(lambda row: row.town in ['Stafford', 'New Hartford', 'New Brunswick', 'St Lukes', "St George's Parish"], axis=1), 'mskip'
] = 1

ASD_rest.loc[ASD_rest.town=='George town', 'town'] = 'Georgetown'
CD_rest.loc[CD_rest.town=='George town', 'town'] = 'Georgetown'

# keep misgrouped cases untouched
ASD_rest.loc[ASD_rest.town=="St Johnn's Parish", 'town'] = "St John's Parish"
CD_rest.loc[CD_rest.town=="St Johnn's Parish", 'town'] = "St John's Parish"

ASD_rest.loc[ASD_rest.town=="St Bartholomew's", 'town'] = "St Bartholomew's Parish"
CD_rest.loc[CD_rest.town=="St Bartholomew's", 'town'] = "St Bartholomew's Parish"
ASD_rest.loc[ASD_rest.town=="Bartholomew's Parish", 'town'] = "St Bartholomew's Parish"
CD_rest.loc[CD_rest.town=="Bartholomew's Parish", 'town'] = "St Bartholomew's Parish"

ASD_rest.loc[
    ASD_rest.apply(
        lambda row: row.town in ["St Luke's", "St John's Parish", "St Bartholomew's",  'St Helena', 'St Gustavus', "John's Island"], axis=1
        ), 'mskip'
] = 1
CD_rest.loc[
    CD_rest.apply(
        lambda row: row.town in ["St Luke's", "St John's Parish", "St Bartholomew's",  'St Helena', 'St Gustavus', "John's Island"], axis=1
        ), 'mskip'
] = 1

ASD_rest.loc[ASD_rest.apply(lambda row: row.town in ['Richard Sennings', 'Richard'], axis=1), 'mskip'] = 1
CD_rest.loc[CD_rest.apply(lambda row: row.town in ['Richard Sennings', 'Richard'], axis=1), 'mskip'] = 1

In [200]:
# match again
ASD_rest = ASD_rest[ASD_rest.mskip==0]
CD_rest = CD_rest[CD_rest.mskip==0]

for state_code in state_list:
    
    ASD_rest[ASD_rest.state==state_code] = match_towns(ASD_rest[ASD_rest.state==state_code])
    CD_rest[CD_rest.state==state_code] = match_towns(CD_rest[CD_rest.state==state_code])

# deal with some other special cases
ASD_rest.loc[ASD_rest.town=='East Haddam', 'town'] = 'Haddam'
CD_rest.loc[CD_rest.town=='East Haddam', 'town'] = 'Haddam'

ASD_all.update(ASD_rest)
CD_all.update(CD_rest)

State: RI
Johnson. Candidates: ['Johnson', 'Johnston', 'North Kingston']. Matched: Johnson.

Portsmouth. Candidates: ['Portsmouth', 'North Providence', 'South Kingston']. Matched: Portsmouth.

Richmond. Candidates: ['Richmond', 'East Greenwich', 'West Greenwich']. Matched: Richmond.

State: RI
Coventry. Candidates: ['Coventry', 'North Providence', 'Providence']. Matched: Coventry.

Johnston. Candidates: ['Johnston', 'Johnson', 'South Kingston']. Matched: Johnston.

Johnson. Candidates: ['Johnson', 'Johnston', 'South Kingston']. Matched: Johnson.

State: CT
Huntington. Candidates: ['Huntington', 'Groton', 'Stonington']. Matched: Huntington.

Montville. Candidates: ['Montville', 'Lyme', 'Mansfield']. Matched: Montville.

Thompson. Candidates: ['Thompson', 'Preston', 'Stonington']. Matched: Thompson.

Torrington. Candidates: ['Torrington', 'Stonington', 'Groton']. Matched: Torrington.

Berlin. Candidates: ['Berlin', 'Bolton', 'Hebron']. Matched: Berlin.

Canterbury. Candidates: ['Canterbu

### Now maps to the County-town list

There are a few cases where the `orig_town` names include `'County'`, but gets mapped to nothing on the list. We assign `county` to `name_type` in this case, and let `county` be county name.

In [201]:
ASD_no_NA = ASD_all[ASD_all['FLAG']!=1]
CD_no_NA = CD_all[CD_all['FLAG']!=1]

matchlist = pd.read_csv('../../Data/AssetGeography/county_cw.csv', index_col=0)
matchlist = matchlist.rename({'town': 'orig_town'}, axis=1)
matchlist.head()

Unnamed: 0,orig_town,state,county,name_type
0,Hartford,CT,Hartford County,town
1,Bolton,CT,Tolland County,town
2,Rhode Island,RI,,state
5,Wethersfield,CT,Hartford County,town
6,Lyme,CT,New London County,town


In [202]:
ASD_merged = pd.merge(left=ASD_no_NA, right=matchlist, how='left', on=['state', 'orig_town'])
CD_merged = pd.merge(left=CD_no_NA, right=matchlist, how='left', on=['state', 'orig_town'])

# select
filter_ASD = (ASD_merged.orig_town.str.contains('County')) & (ASD_merged.name_type.isna())
filter_CD = (CD_merged.orig_town.str.contains('County')) & (CD_merged.name_type.isna())
ASD_merged.loc[filter_ASD]

Unnamed: 0,6p_Dollar,6p_Cents,6p_def_Dollar,6p_def_Cents,3p_Dollar,3p_Cents,town,state,occupation,orig_town,FLAG,county,name_type
4037,1634.0,-0.14,1634.0,-0.08,1634.0,0.38,Pendleton,SC,Planter,Pendleton County,4.0,,


In [203]:
ASD_merged.loc[filter_ASD, 'name_type'] = 'county'
ASD_merged.loc[filter_ASD, 'county'] = ASD_merged.loc[filter_ASD, 'town']

CD_merged.loc[filter_CD, 'name_type'] = 'county'
CD_merged.loc[filter_CD, 'county'] = CD_merged.loc[filter_CD, 'town']

CD_merged

Unnamed: 0,6p_Dollar,6p_Cents,6p_def_Dollar,6p_def_Cents,3p_Dollar,3p_Cents,town,state,occupation,6p_total,orig_town,FLAG,county,name_type
0,1064.0,75.0,532.0,37.0,508.0,51.0,Hartford,CT,Merchant,1064.75,Hartford,4.0,Hartford County,town
1,449.0,96.0,224.0,97.0,232.0,10.0,Bolton,CT,Farmer,449.96,Bolton,4.0,Tolland County,town
2,154.0,20.0,77.0,10.0,192.0,,Rhode Island,RI,Farmer,154.20,Rhode Island,3.0,,state
3,196.0,75.0,98.0,37.0,172.0,24.0,Hartford,CT,Merchant,196.75,Hartford,4.0,Hartford County,town
4,53.0,58.0,26.0,79.0,67.0,6.0,Hartford,CT,Merchant,53.58,Hartford,4.0,Hartford County,town
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3716,120.0,25.0,60.0,12.0,97.0,70.0,New York,NY,Merchant,120.25,New York,3.0,,state
3717,33.0,33.0,16.0,67.0,31.0,67.0,Maryland,MD,Shopkeeper,33.33,Maryland,3.0,,state
3718,124.0,33.0,62.0,17.0,118.0,41.0,Maryland,MD,Merchant,124.33,Maryland,3.0,,state
3719,242.0,67.0,121.0,33.0,79.0,52.0,Charleston,SC,,242.67,Charleston,4.0,Charleston County,town


In [204]:
# address Chris's comments
ASD_merged.loc[ASD_merged.town=='Brunswick', 'town'] = 'New Brunswick'
CD_merged.loc[CD_merged.town=='Brunswick', 'town'] = 'New Brunswick'

ASD_merged.loc[(ASD_merged.town=='Johnson') & (ASD_merged.state=='RI'), 'town'] = 'Johnston'
CD_merged.loc[(CD_merged.town=='Johnson') & (CD_merged.state=='RI'), 'town'] = 'Johnston'
ASD_merged.loc[(ASD_merged.town=='Georges') & (ASD_merged.state=='RI'), 'town'] = 'Georgetown'
CD_merged.loc[(CD_merged.town=='Georges') & (CD_merged.state=='RI'), 'town'] = 'Georgetown'

# Chester
ASD_merged.loc[(ASD_merged.orig_town.str.contains('Chester and County')) |
            (ASD_merged.orig_town.str.contains('Chester County'))  |
            (ASD_merged.orig_town.str.contains('Chester Co')), 'name_type'] = 'county'

CD_merged.loc[(CD_merged.orig_town.str.contains('Chester and County')) |
            (CD_merged.orig_town.str.contains('Chester County'))  |
            (CD_merged.orig_town.str.contains('Chester Co')), 'name_type'] = 'county'
    
# Northern Liberties - but already marked as county
ASD_merged.loc[ASD_merged.town=='Northern Liberties', 'name_type'] = 'county'
CD_merged.loc[CD_merged.town=='Northern Liberties', 'name_type'] = 'county'

ASD_merged.loc[(ASD_merged.town=='Cumb') | (ASD_merged.town=='Cumberland'), 'name_type'] = 'county'
CD_merged.loc[(CD_merged.town=='Cumb') | (CD_merged.town=='Cumberland'), 'name_type'] = 'county'

In [205]:
# assign nan to town if name_type == county/state
ASD_merged.loc[(ASD_merged.name_type=='county') | (ASD_merged.name_type=='state'), 'town'] = np.nan
CD_merged.loc[(CD_merged.name_type=='county') | (CD_merged.name_type=='state'), 'town'] = np.nan

ASD_df= ASD_merged.drop(['orig_town'], axis=1)
CD_df = CD_merged.drop(['orig_town'], axis=1)

ASD_df_strNaN = ASD_df.fillna('NaN')
CD_df_strNaN = CD_df.fillna('NaN')

In [206]:
states = {"Connecticut": "CT","Delaware": "DE","Georgia": "GA", "Maryland": "MD", "Massachusetts": "MA",
          "New Hampshire": "NH", "New Jersey": "NJ", "New York": "NY","North Carolina": "NC",
          "Pennsylvania": "PA", "Rhode Island": "RI", "South Carolina": "SC", "Virginia": "VA", }

In [207]:
# aggregate data from colonies & noncolonies
CD_df_str_names = CD_df.copy()
CD_df_str_names[['county', 'town']] = CD_df_str_names[['county', 'town']].fillna('nan')
colonies = CD_df_str_names.loc[(~CD_merged.name_type.isna()) & (CD_merged.name_type!='other')]
noncolonies = CD_df_str_names.loc[(CD_merged.name_type.isna()) | (CD_merged.name_type=='other')]
colonies = pd.concat([colonies, noncolonies[noncolonies['state'].apply(lambda x: x in states.values())]])
noncolonies = noncolonies[noncolonies['state'].apply(lambda x: x not in states.values())]

# find total amount of assets held in each town-county-state
aggregated_data = colonies.groupby(['state', 'county', 'town'], dropna=False).agg({'6p_total':['sum', 'size']}).reset_index()
aggregated_data.columns = ['state', 'county', 'town', '6p_total_sum', '6p_total_count']
aggregated_data['town/county pct'] = np.round(aggregated_data['6p_total_sum'] / \
                                     aggregated_data.groupby(['state', 'county'])['6p_total_sum'].transform('sum')*100, 1)
aggregated_data['county/state pct'] = np.round(aggregated_data.groupby(['state', 'county'])['6p_total_sum'].transform('sum') / \
                                    aggregated_data.groupby('state')['6p_total_sum'].transform('sum')*100, 1)
aggregated_data['state/ovall (excluding non-colonies) pct'] = np.round(aggregated_data.groupby(['state'])['6p_total_sum'].transform('sum') / \
                                                              aggregated_data['6p_total_sum'].sum()*100, 1)

noncolonies_agg = noncolonies.groupby(['state', 'county', 'town'], dropna=False).agg({'6p_total':['sum', 'size']}).reset_index()
noncolonies_agg.columns = ['state', 'county', 'town', '6p_total_sum', '6p_total_count']
pd.concat([aggregated_data, noncolonies_agg]).to_csv('CD_geographical_table_summary.csv')

In [208]:
# export crosswalk for original town name, correct town name, county and state
mapping_df = CD_merged[['orig_town', 'town', 'county', 'state']]
mapping_df.drop_duplicates().to_csv('../../Data/AssetGeography/final_geographical_cw.csv')