## Import LEHD RAC files and generate table summaries for the various working geographies

In [18]:
import os

import numpy as np
import pandas as pd

#### Read in RAC or WAC files

In [19]:
# Load the three 2017 S000/JT00 RAC extracts (files prefixed ct, ny and nj),
# one DataFrame per file, in that order.
rac_files = (
    'data/ct_rac_S000_JT00_2017.csv',
    'data/ny_rac_S000_JT00_2017.csv',
    'data/nj_rac_S000_JT00_2017.csv',
)
df1, df2, df3 = (pd.read_csv(csv_path) for csv_path in rac_files)

In [20]:
# Stack the three state frames into a single table. ignore_index=True gives the
# result a fresh 0..n-1 index; without it each frame's row labels restart at 0
# and the combined index contains duplicates.
df = pd.concat([df1, df2, df3], ignore_index=True)

In [21]:
# Count columns carried from the RAC files into every geography table,
# laid out one column-prefix family per line (see the LODES data dictionary
# for what each prefix means).
data_cols = [
    'C000',
    'CA01', 'CA02', 'CA03',
    'CE01', 'CE02', 'CE03',
    'CNS01', 'CNS02', 'CNS03', 'CNS04', 'CNS05',
    'CNS06', 'CNS07', 'CNS08', 'CNS09', 'CNS10',
    'CNS11', 'CNS12', 'CNS13', 'CNS14', 'CNS15',
    'CNS16', 'CNS17', 'CNS18', 'CNS19', 'CNS20',
    'CR01', 'CR02', 'CR03', 'CR04', 'CR05', 'CR07',
    'CT01', 'CT02',
    'CD01', 'CD02', 'CD03', 'CD04',
    'CS01', 'CS02',
]

#### Read in geo crosswalk file, merge with data, and clean up data types

In [22]:
# Load the block-to-geography crosswalk. The four recode columns are declared
# as text up front: when pandas infers their types they come back as a mix of
# strings and ints. Reading them as str also keeps each code exactly as it is
# written in the file.
geo = pd.read_csv(
    'data/geoxwalk_all.csv',
    dtype={'subplbor': str, 'subpl': str, 'subplnta': str, 'subplpuma': str},
)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [23]:
# Attach the crosswalk geographies to every RAC row whose h_geocode equals a
# crosswalk tabblk2010; the inner join keeps matched rows only.
dff = df.merge(geo, how="inner", left_on="h_geocode", right_on="tabblk2010")

In [24]:
# Cast each recode column to str so its values no longer mix strings and ints.
geoid_cols = ['subplbor', 'subpl', 'subplnta', 'subplpuma']

for column_name in geoid_cols:
    # Recode columns that are absent from the merged frame are simply skipped.
    if column_name in dff.columns:
        dff[column_name] = dff[column_name].apply(str)

In [25]:
# Build one working frame per geography: the cty column, that geography's code
# and name columns, then every count column listed in data_cols.
df_subplbor = dff[['cty', 'subplbor', 'subplbor_n'] + data_cols]
df_subpl = dff[['cty', 'subpl', 'subpl_n'] + data_cols]
df_subplnta = dff[['cty', 'subplnta', 'subplnta_n'] + data_cols]
df_subplpuma = dff[['cty', 'subplpuma', 'subplpuma_n'] + data_cols]

### Group by various summary geographies

In [26]:
# Total C000 and the twenty CNS columns for every (cty, subplbor, subplbor_n) group.
# The aggregation is given as the string 'sum' rather than np.sum: pandas has
# deprecated passing NumPy callables to agg and recommends the string form.
sum_cols = ['C000'] + ['CNS{:02d}'.format(n) for n in range(1, 21)]
df_subplbor = (
    df_subplbor
    .groupby(['cty', 'subplbor', 'subplbor_n'])
    .agg({col: 'sum' for col in sum_cols})
    .reset_index()
)

In [27]:
# Total C000 and the twenty CNS columns for every (cty, subpl, subpl_n) group.
# The aggregation is given as the string 'sum' rather than np.sum: pandas has
# deprecated passing NumPy callables to agg and recommends the string form.
sum_cols = ['C000'] + ['CNS{:02d}'.format(n) for n in range(1, 21)]
df_subpl = (
    df_subpl
    .groupby(['cty', 'subpl', 'subpl_n'])
    .agg({col: 'sum' for col in sum_cols})
    .reset_index()
)

In [28]:
# Total C000 and the twenty CNS columns for every (cty, subplnta, subplnta_n) group.
# The aggregation is given as the string 'sum' rather than np.sum: pandas has
# deprecated passing NumPy callables to agg and recommends the string form.
sum_cols = ['C000'] + ['CNS{:02d}'.format(n) for n in range(1, 21)]
df_subplnta = (
    df_subplnta
    .groupby(['cty', 'subplnta', 'subplnta_n'])
    .agg({col: 'sum' for col in sum_cols})
    .reset_index()
)

In [29]:
# Total C000 and the twenty CNS columns for every (cty, subplpuma, subplpuma_n) group.
# The aggregation is given as the string 'sum' rather than np.sum: pandas has
# deprecated passing NumPy callables to agg and recommends the string form.
sum_cols = ['C000'] + ['CNS{:02d}'.format(n) for n in range(1, 21)]
df_subplpuma = (
    df_subplpuma
    .groupby(['cty', 'subplpuma', 'subplpuma_n'])
    .agg({col: 'sum' for col in sum_cols})
    .reset_index()
)

### Export tables to Excel

In [30]:
# Make sure the export folder exists so the writes below do not fail when the
# notebook is run in a checkout that has no output directory yet.
os.makedirs('output/LEHD', exist_ok=True)

# Write each geography summary to its own workbook.
df_subplbor.to_excel('output/LEHD/subplbor_JT002017_RAC.xlsx')
df_subpl.to_excel('output/LEHD/subpl_JT002017_RAC.xlsx')
df_subplnta.to_excel('output/LEHD/subplnta_JT002017_RAC.xlsx')
# df_subplpuma is aggregated alongside the other three tables but was never
# written out; export it with the same naming pattern.
df_subplpuma.to_excel('output/LEHD/subplpuma_JT002017_RAC.xlsx')

### Calculate sector-group totals for the NTA table

In [31]:
# Sector groupings: each list names the CNS columns that are summed into one
# category column further down.
office = [
    'CNS09', 'CNS10', 'CNS11',
    'CNS12', 'CNS13', 'CNS14',
]
institutional = [
    'CNS15', 'CNS16',
]
industrial = [
    'CNS01', 'CNS02', 'CNS03', 'CNS04',
    'CNS05', 'CNS06', 'CNS08',
]
localserv = [
    'CNS07', 'CNS17', 'CNS18', 'CNS19',
]
gov = [
    'CNS20',
]

In [32]:
# Add one total column per sector group by summing that group's CNS columns
# across each row. The pairs are processed in order, so the new columns are
# appended as Office, Institutional, Industrial, Local Services, Public Admin.
sector_groups = [
    ('Office', office),
    ('Institutional', institutional),
    ('Industrial', industrial),
    ('Local Services', localserv),
    ('Public Admin', gov),
]
for group_label, group_cols in sector_groups:
    df_subplnta[group_label] = df_subplnta[group_cols].sum(axis=1)

In [33]:
# Preview the first five rows to check the newly added sector-group columns.
df_subplnta.head(n=5)

Unnamed: 0,cty,subplnta,subplnta_n,C000,CNS01,CNS02,CNS03,CNS04,CNS05,CNS06,...,CNS16,CNS17,CNS18,CNS19,CNS20,Office,Institutional,Industrial,Local Services,Public Admin
0,9001,900104720,"Bethel town (Fairfield, CT)",9731,25,6,45,445,896,421,...,1415,240,701,388,215,2394,2534,2045,2543,215
1,9001,900108070,"Bridgeport town (Fairfield, CT)",57535,50,10,191,1690,4693,1723,...,13059,1207,5331,2768,1270,11654,17135,10627,16849,1270
2,9001,900108980,"Brookfield town (Fairfield, CT)",7730,8,0,35,345,611,352,...,1110,185,480,321,200,1942,2074,1538,1976,200
3,9001,900118500,"Danbury town (Fairfield, CT)",37135,62,8,136,1829,3507,1528,...,5976,717,3208,1574,717,8571,9262,7926,10659,717
4,9001,900118850,"Darien town (Fairfield, CT)",7529,7,0,18,227,246,295,...,765,298,482,343,114,3300,1446,922,1747,114


In [34]:
# Write df_subplnta again now that it carries the sector-group totals; this
# replaces the workbook of the same name exported earlier in the notebook.
df_subplnta.to_excel(excel_writer='output/LEHD/subplnta_JT002017_RAC.xlsx')