## Import LEHD WAC files and generate table summaries for the various working geographies

In [1]:
import pandas as pd
import numpy as np

#### Read in WAC files

In [2]:
# Load the 2017 WAC extract for each of the three states (CT, NY, NJ),
# one DataFrame per file, in that order.
df1, df2, df3 = map(
    pd.read_csv,
    (
        'data/ct_wac_S000_JT00_2017.csv',
        'data/ny_wac_S000_JT00_2017.csv',
        'data/nj_wac_S000_JT00_2017.csv',
    ),
)

In [3]:
# Stack the three state tables row-wise into one frame (original row labels are kept).
df = pd.concat((df1, df2, df3), axis=0)

In [4]:
# Data columns carried forward from the WAC files, grouped here by column prefix.
# Note that the CR group has no 'CR06' entry.
data_cols = (
    ['C000']
    + ['CA01', 'CA02', 'CA03']
    + ['CE01', 'CE02', 'CE03']
    + ['CNS01', 'CNS02', 'CNS03', 'CNS04', 'CNS05',
       'CNS06', 'CNS07', 'CNS08', 'CNS09', 'CNS10',
       'CNS11', 'CNS12', 'CNS13', 'CNS14', 'CNS15',
       'CNS16', 'CNS17', 'CNS18', 'CNS19', 'CNS20']
    + ['CR01', 'CR02', 'CR03', 'CR04', 'CR05', 'CR07']
    + ['CT01', 'CT02']
    + ['CD01', 'CD02', 'CD03', 'CD04']
    + ['CS01', 'CS02']
)

#### Read in geo crosswalk file, merge with data, and clean up data types

In [5]:
# Read the block-to-geography crosswalk.
# The four recode ID columns are read as text up front: left to type inference,
# pandas can produce columns that mix strings and ints (the problem the later
# str-casting cell works around), and emits a warning while doing so.
# NOTE(review): assumes the warning this cell produced was the mixed-dtype
# warning for these columns -- confirm against the crosswalk file.
geo = pd.read_csv(
    'data/geoxwalk_all.csv',
    dtype={'subplbor': str, 'subpl': str, 'subplnta': str, 'subplpuma': str},
)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [6]:
# Attach crosswalk attributes to every WAC row by matching w_geocode to
# tabblk2010. Inner join: rows without a match on either side are dropped.
dff = df.merge(
    geo,
    how="inner",
    left_on="w_geocode",
    right_on="tabblk2010",
)

In [7]:
# Fix the issue with mixed strings and ints within the recode columns:
# every value in each ID column that is present in dff is converted with str().
geoid_cols = ['subplbor', 'subpl', 'subplnta', 'subplpuma']

for geoid_col in [name for name in dff.columns if name in geoid_cols]:
    dff[geoid_col] = dff[geoid_col].map(str)

In [8]:
# One working frame per summary geography: the county column, that geography's
# ID and name columns, followed by all of the data columns.
df_subplbor = dff[['cty', 'subplbor', 'subplbor_n'] + data_cols].copy()
df_subpl = dff[['cty', 'subpl', 'subpl_n'] + data_cols].copy()
df_subplnta = dff[['cty', 'subplnta', 'subplnta_n'] + data_cols].copy()
df_subplpuma = dff[['cty', 'subplpuma', 'subplpuma_n'] + data_cols].copy()

### Group by various summary geographies

In [9]:
# Collapse to one row per (cty, subplbor, subplbor_n), summing C000 and
# CNS01..CNS20. The other data columns are not carried into the summary.
# The aggregation is given as the string 'sum' rather than np.sum: newer pandas
# warns when a NumPy callable is passed and recommends the string form, which
# keeps the current result.
# Rows whose grouping keys are NaN are excluded by groupby's default behaviour.
df_subplbor = (
    df_subplbor
    .groupby(['cty', 'subplbor', 'subplbor_n'])
    .agg({col: 'sum' for col in ['C000'] + ['CNS%02d' % i for i in range(1, 21)]})
    .reset_index()
)

In [10]:
# Collapse to one row per (cty, subpl, subpl_n), summing C000 and
# CNS01..CNS20. The other data columns are not carried into the summary.
# The aggregation is given as the string 'sum' rather than np.sum: newer pandas
# warns when a NumPy callable is passed and recommends the string form, which
# keeps the current result.
# Rows whose grouping keys are NaN are excluded by groupby's default behaviour.
df_subpl = (
    df_subpl
    .groupby(['cty', 'subpl', 'subpl_n'])
    .agg({col: 'sum' for col in ['C000'] + ['CNS%02d' % i for i in range(1, 21)]})
    .reset_index()
)

In [11]:
# Collapse to one row per (cty, subplnta, subplnta_n), summing C000 and
# CNS01..CNS20. The other data columns are not carried into the summary.
# The aggregation is given as the string 'sum' rather than np.sum: newer pandas
# warns when a NumPy callable is passed and recommends the string form, which
# keeps the current result.
# Rows whose grouping keys are NaN are excluded by groupby's default behaviour.
df_subplnta = (
    df_subplnta
    .groupby(['cty', 'subplnta', 'subplnta_n'])
    .agg({col: 'sum' for col in ['C000'] + ['CNS%02d' % i for i in range(1, 21)]})
    .reset_index()
)

In [12]:
# Collapse to one row per (cty, subplpuma, subplpuma_n), summing C000 and
# CNS01..CNS20. The other data columns are not carried into the summary.
# The aggregation is given as the string 'sum' rather than np.sum: newer pandas
# warns when a NumPy callable is passed and recommends the string form, which
# keeps the current result.
# Rows whose grouping keys are NaN are excluded by groupby's default behaviour.
df_subplpuma = (
    df_subplpuma
    .groupby(['cty', 'subplpuma', 'subplpuma_n'])
    .agg({col: 'sum' for col in ['C000'] + ['CNS%02d' % i for i in range(1, 21)]})
    .reset_index()
)

### Export tables to Excel

In [13]:
# Write the summary tables to Excel, one workbook per geography.
# The subplpuma workbook is written from df_subplpuma; it was previously written
# from df_subplnta, so the PUMA summary was never exported and the file carried
# the wrong geography. df_subplnta is exported to its own file in a later cell,
# after its category columns have been added.
df_subplbor.to_excel('output/LEHD/subplbor_JT002017_WAC.xlsx')
df_subpl.to_excel('output/LEHD/subpl_JT002017_WAC.xlsx')
df_subplpuma.to_excel('output/LEHD/subplpuma_JT002017_WAC.xlsx')

### Run calculations on the subplnta table: sum the CNS columns into broader categories

In [14]:
# Column groupings used to roll the CNS columns up into broader categories.
# Together the five lists cover CNS01..CNS20, each column exactly once.
office = [
    'CNS09', 'CNS10', 'CNS11',
    'CNS12', 'CNS13', 'CNS14',
]
institutional = [
    'CNS15', 'CNS16',
]
industrial = [
    'CNS01', 'CNS02', 'CNS03', 'CNS04',
    'CNS05', 'CNS06', 'CNS08',
]
localserv = [
    'CNS07', 'CNS17', 'CNS18', 'CNS19',
]
gov = [
    'CNS20',
]

In [15]:
# Add one row-wise total per category to the subplnta summary table.
# Each new column is the sum across that category's CNS columns.
for category_label, category_cols in (
    ('Office', office),
    ('Institutional', institutional),
    ('Industrial', industrial),
    ('Local Services', localserv),
    ('Public Admin', gov),
):
    df_subplnta[category_label] = df_subplnta[category_cols].sum(axis=1)

In [16]:
# Preview the first five rows, including the new category columns.
df_subplnta.head(5)

Unnamed: 0,cty,subplnta,subplnta_n,C000,CNS01,CNS02,CNS03,CNS04,CNS05,CNS06,...,CNS16,CNS17,CNS18,CNS19,CNS20,Office,Institutional,Industrial,Local Services,Public Admin
0,9001,900104720,"Bethel town (Fairfield, CT)",7442,4,0,8,475,1118,497,...,1411,105,592,265,82,1101,2135,2423,1701,82
1,9001,900108070,"Bridgeport town (Fairfield, CT)",43758,14,16,264,1431,3531,1522,...,13063,1028,2204,1659,2175,6941,18735,7796,8111,2175
2,9001,900108980,"Brookfield town (Fairfield, CT)",7124,0,0,0,485,930,320,...,732,194,571,571,153,832,1211,2008,2920,153
3,9001,900118500,"Danbury town (Fairfield, CT)",44309,3,0,47,1926,5446,1867,...,8602,525,3916,1426,657,8135,12377,9846,13294,657
4,9001,900118850,"Darien town (Fairfield, CT)",8837,1,0,3,168,47,361,...,685,666,1034,871,111,1973,1929,707,4117,111


In [17]:
# Export the subplnta summary, now including the category totals, to Excel.
df_subplnta.to_excel(excel_writer='output/LEHD/subplnta_JT002017_WAC.xlsx')