In [81]:
import pandas as pd
import csv
import os

# Parse Master
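Reads every census table from its intermediate parsed form (the per-year files under `census-data/<table>/parsed/`) and combines them into a single master file keyed by `cfips` and `year`. Also brings in the density data from `density.csv`, aggregated from monthly to yearly values.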

In [82]:
# Load `density.csv` and collapse its monthly rows into one row per county and year
group_keys = ['year', 'data-year', 'cfips', 'county', 'state']
df = (
    pd.read_csv('./density.csv')
    # observation year = first four characters of `first_day_of_month`
    .assign(year=lambda frame: frame['first_day_of_month'].apply(lambda stamp: int(stamp[:4])))
    # `data-year` = observation year minus two, kept as a string
    .assign(**{'data-year': lambda frame: frame['year'].apply(lambda yr: str(yr - 2))})
    .groupby(group_keys)
    .mean(numeric_only=True)
    .reset_index()
    # keep only the keys plus the averaged density
    .filter(items=group_keys + ['microbusiness_density'])
)
print(df.head())

# Grab all feature files and add to dataframe.
# Layout read below: ./census-data/<table>/parsed/<file>, where the first four
# characters of each file name are used as that file's `data-year`.
census_root = './census-data'

# os.listdir returns entries in arbitrary order; sorting makes the order in
# which feature columns are appended to `df` the same on every run/machine.
for table_name in sorted(os.listdir(census_root)):
    parsed_dir = os.path.join(census_root, table_name, 'parsed')

    # Skip the template folder and anything that is not a table directory
    # (stray files, hidden folders, folders without a `parsed` subfolder).
    if table_name == 'template' or not os.path.isdir(parsed_dir):
        continue

    # Collect the per-year frames in a list and concatenate once, instead of
    # re-copying a growing DataFrame on every iteration.
    yearly_frames = []
    for file_name in sorted(os.listdir(parsed_dir)):
        file_path = os.path.join(parsed_dir, file_name)
        # Ignore hidden files (e.g. editor/OS metadata) and nested folders.
        if file_name.startswith('.') or not os.path.isfile(file_path):
            continue

        yearly = pd.read_csv(file_path)
        # Stored as a string so it matches the string `data-year` key in `df`.
        yearly['data-year'] = file_name[:4]
        yearly_frames.append(yearly)

    # A table folder with no parsed files has nothing to merge; merging an
    # empty frame would fail on the missing `cfips` column.
    if not yearly_frames:
        continue

    table = pd.concat(yearly_frames)

    # `validate` makes pandas raise if a table has more than one row per
    # (cfips, data-year), which would otherwise silently duplicate rows of `df`.
    df = pd.merge(
        df,
        table,
        how="left",
        on=['cfips', 'data-year'],
        validate='many_to_one',
    )

   year data-year  cfips          county    state  microbusiness_density
0  2019      2017   1001  Autauga County  Alabama               2.986972
1  2019      2017   1003  Baldwin County  Alabama               7.370375
2  2019      2017   1005  Barbour County  Alabama               1.046068
3  2019      2017   1007     Bibb County  Alabama               1.278288
4  2019      2017   1009   Blount County  Alabama               1.559113


In [83]:
# Preview the first five rows of the merged table
df.head(n=5)

Unnamed: 0,year,data-year,cfips,county,state,microbusiness_density,pct_born_us_citizen,pct_naturalized_us_citizen,pct_not_us_citizen,pct_house_price_gt1mill,...,pct_in_public_admin_industry,pct_employed_at_self_employed,pct_employed_at_non_profit,pct_employed_at_government,pct_housing_single_detached,pct_insured,pct_broadband,pct_college,pct_in_it_industry,median_hh_inc
0,2019,2017,1001,Autauga County,Alabama,2.986972,97.886838,1.115633,0.997529,0.0,...,11.152123,8.0,7.8,20.2,72.9,91.2,76.6,14.5,1.3,55317.0
1,2019,2017,1003,Baldwin County,Alabama,7.370375,96.751574,1.559304,1.689123,3.076923,...,4.838764,11.4,6.7,12.9,73.9,89.2,74.5,20.4,1.4,52562.0
2,2019,2017,1005,Barbour County,Alabama,1.046068,97.339796,1.003778,1.656425,2.631579,...,7.929714,9.9,4.9,19.1,57.2,87.7,57.2,7.6,0.5,33368.0
3,2019,2017,1007,Bibb County,Alabama,1.278288,98.976971,0.181577,0.841453,0.0,...,5.005507,8.1,6.0,17.4,66.8,91.9,62.0,8.1,1.2,43404.0
4,2019,2017,1009,Blount County,Alabama,1.559113,95.45494,1.576291,2.968769,2.12766,...,5.163704,7.9,5.6,11.9,71.3,89.0,65.8,8.7,1.3,47412.0


In [84]:
# Confirm every feature made it in: the first 6 columns are skipped,
# everything after them is counted and listed as a feature
print(f'# Features: {len(df.columns) - 6}')
df.columns[6:].tolist()

# Features: 52


['pct_born_us_citizen',
 'pct_naturalized_us_citizen',
 'pct_not_us_citizen',
 'pct_house_price_gt1mill',
 'pct_house_price_500k_1mill',
 'pct_house_price_250k_500k',
 'pct_house_price_100k_250k',
 'pct_housing_vacant',
 'pct_housing_occupant_ratio_lt1',
 'pct_housing_grapi_gt35',
 'total_population',
 'old_age_dependency_ratio',
 'child_dependency_ratio',
 'median_age',
 'sex_ratio',
 'pct_moved_from_abroad',
 'pct_moved_outside_state',
 'pct_moved_within_state',
 'pct_moved_within_county',
 'pct_households_married',
 'avg_family_size',
 'pct_unmarried_same_sex_households',
 'pct_housing_owner_occupied',
 'pct_divorced',
 'pct_never_married',
 'birth_rate',
 'pct_new_young_mothers',
 'pct_k12_enrollment',
 'pct_k12_public_students',
 'pct_college_public_students',
 'pct_college_enrollment',
 'pct_with_scieng_degree',
 'pct_with_libarts_degree',
 'pct_with_business_degree',
 'pct_multilingual',
 'pct_below_poverty_level',
 'pct_disabled',
 'pct_snap_households',
 'pct_unemployed',
 'pc

In [85]:
# Show every row that has at least one missing value in any column
has_missing = df.isnull().any(axis='columns')
df.loc[has_missing]

Unnamed: 0,year,data-year,cfips,county,state,microbusiness_density,pct_born_us_citizen,pct_naturalized_us_citizen,pct_not_us_citizen,pct_house_price_gt1mill,...,pct_in_public_admin_industry,pct_employed_at_self_employed,pct_employed_at_non_profit,pct_employed_at_government,pct_housing_single_detached,pct_insured,pct_broadband,pct_college,pct_in_it_industry,median_hh_inc
546,2019,2017,15005,Kalawao County,Hawaii,9.285714,93.023256,6.976744,0.0,0.0,...,34.920635,3.2,1.6,61.9,84.9,92.2,60.4,13.9,0.0,61750.0
3681,2020,2018,15005,Kalawao County,Hawaii,10.915493,97.333333,2.666667,0.0,0.0,...,38.596491,3.5,1.8,59.6,85.1,97.1,59.6,18.8,0.0,61875.0
4948,2020,2018,35039,Rio Arriba County,New Mexico,2.526204,96.333986,1.389066,2.276948,0.0,...,,,,,55.1,89.4,52.1,12.5,,
6816,2021,2019,15005,Kalawao County,Hawaii,9.358974,95.454545,3.030303,1.515152,0.0,...,32.692308,1.9,7.7,55.8,87.2,100.0,66.7,19.4,0.0,69375.0
9951,2022,2020,15005,Kalawao County,Hawaii,1.195402,99.311927,0.458716,0.229358,0.0,...,65.647059,0.5,0.9,77.6,98.7,100.0,97.1,38.3,0.0,76465.0
12045,2022,2020,48243,Jeff Davis County,Texas,5.362546,89.933185,9.977728,0.089087,0.0,...,1.456954,27.5,1.2,26.8,79.0,75.9,63.2,14.3,0.0,


In [86]:
# Persist the master table as CSV, without the row index
df.to_csv(path_or_buf='../master.csv', index=False)