In [62]:
import pandas as pd
import csv
import os

# Parse Master
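Reads every census table from its intermediate parsed CSVs (`census-data/<table>/parsed/`) and combines them into a single master file keyed by `cfips` and `year`. Microbusiness density is brought in from `density.csv` and averaged from monthly to yearly values. Census features are joined through `data-year`, which is the density `year` minus two.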

In [63]:
# Parse `density.csv` first, aggregate monthly data to a yearly level
df = pd.read_csv('./density.csv')
# `first_day_of_month` is text whose first four characters are the year
df['year'] = df['first_day_of_month'].str[:4].astype(int)
# Feature files are matched on `data-year`, the density year minus two (kept as
# a string because it is compared with the year prefix of each file name below)
df['data-year'] = (df['year'] - 2).astype(str)
df = df.groupby(['year', 'data-year', 'cfips', 'county', 'state']).mean(numeric_only = True).reset_index()
df = df.filter(items=['year', 'data-year', 'cfips', 'county', 'state', 'microbusiness_density'])
print(df.head())

# Grab all feature files and add to dataframe.
# Directory listings are sorted so the column order of the merged frame is the
# same on every run and every machine (os.listdir order is arbitrary).
census_root = './census-data'
for path in sorted(os.listdir(census_root)):
    table_dir = os.path.join(census_root, path)
    # Skip the template folder, hidden entries (e.g. `.ipynb_checkpoints`)
    # and anything that is not a table directory
    if path == 'template' or path.startswith('.') or not os.path.isdir(table_dir):
        continue

    parsed_dir = os.path.join(table_dir, 'parsed')
    yearly_frames = []
    for file in sorted(os.listdir(parsed_dir)):
        file_path = os.path.join(parsed_dir, file)
        # Ignore hidden files (e.g. `.DS_Store`) and nested directories
        if file.startswith('.') or not os.path.isfile(file_path):
            continue
        temp = pd.read_csv(file_path)
        # The file name starts with the four-digit year the data belongs to
        temp['data-year'] = file[:4]
        yearly_frames.append(temp)

    if not yearly_frames:
        raise ValueError('No parsed files found in ' + parsed_dir)

    # Concatenate once per table instead of growing a frame inside the loop
    table = pd.concat(yearly_frames)
    # `validate` fails loudly if a table repeats a (cfips, data-year) key,
    # which would otherwise silently duplicate rows in the master frame
    df = pd.merge(df, table, how="left", on=['cfips', 'data-year'], validate='many_to_one')

   year data-year  cfips          county    state  microbusiness_density
0  2019      2017   1001  Autauga County  Alabama               2.986972
1  2019      2017   1003  Baldwin County  Alabama               7.370375
2  2019      2017   1005  Barbour County  Alabama               1.046068
3  2019      2017   1007     Bibb County  Alabama               1.278288
4  2019      2017   1009   Blount County  Alabama               1.559113


In [64]:
# Preview the first five rows of the table
df.head(n=5)

Unnamed: 0,year,data-year,cfips,county,state,microbusiness_density,pct_house_price_gt1mill,pct_house_price_500k_1mill,pct_house_price_250k_500k,pct_house_price_100k_250k,...,pct_in_arts_industry,pct_in_public_admin_industry,pct_employed_at_self_employed,pct_employed_at_non_profit,pct_employed_at_government,pct_housing_single_detached,pct_broadband,pct_college,pct_in_it_industry,median_hh_inc
0,2019,2017,1001,Autauga County,Alabama,2.986972,0.0,0.0,14.761905,44.761905,...,8.941606,11.152123,8.0,7.8,20.2,72.9,76.6,14.5,1.3,55317.0
1,2019,2017,1003,Baldwin County,Alabama,7.370375,3.076923,7.065527,32.877493,42.962963,...,10.622494,4.838764,11.4,6.7,12.9,73.9,74.5,20.4,1.4,52562.0
2,2019,2017,1005,Barbour County,Alabama,1.046068,2.631579,0.0,18.859649,41.666667,...,6.859653,7.929714,9.9,4.9,19.1,57.2,57.2,7.6,0.5,33368.0
3,2019,2017,1007,Bibb County,Alabama,1.278288,0.0,19.80198,0.0,44.554455,...,3.855097,5.005507,8.1,6.0,17.4,66.8,62.0,8.1,1.2,43404.0
4,2019,2017,1009,Blount County,Alabama,1.559113,2.12766,0.0,14.893617,20.744681,...,4.260992,5.163704,7.9,5.6,11.9,71.3,65.8,8.7,1.3,47412.0


In [65]:
# Count and list the feature columns: everything after the first six columns
feature_columns = df.columns[6:]
print(f'# Features: {len(df.columns) - 6}')
feature_columns.tolist()

# Features: 44


['pct_house_price_gt1mill',
 'pct_house_price_500k_1mill',
 'pct_house_price_250k_500k',
 'pct_house_price_100k_250k',
 'pct_housing_vacant',
 'pct_housing_occupant_ratio_lt1',
 'pct_housing_grapi_gt35',
 'total_population',
 'old_age_dependency_ratio',
 'child_dependency_ratio',
 'median_age',
 'sex_ratio',
 'pct_moved_from_abroad',
 'pct_moved_outside_state',
 'pct_moved_within_state',
 'pct_moved_within_county',
 'pct_households_married',
 'avg_family_size',
 'pct_unmarried_same_sex_households',
 'pct_housing_owner_occupied',
 'pct_divorced',
 'pct_never_married',
 'pct_k12_enrollment',
 'pct_k12_public_students',
 'pct_college_public_students',
 'pct_college_enrollment',
 'pct_with_scieng_degree',
 'pct_with_libarts_degree',
 'pct_with_business_degree',
 'pct_multilingual',
 'pct_below_poverty_level',
 'pct_unemployed',
 'pct_occupation_mbsa',
 'pct_in_finance_industry',
 'pct_in_arts_industry',
 'pct_in_public_admin_industry',
 'pct_employed_at_self_employed',
 'pct_employed_at_no

In [68]:
# Show every row that is missing at least one value
has_missing = df.isna().any(axis='columns')
df.loc[has_missing]

Unnamed: 0,year,data-year,cfips,county,state,microbusiness_density,pct_house_price_gt1mill,pct_house_price_500k_1mill,pct_house_price_250k_500k,pct_house_price_100k_250k,...,pct_in_arts_industry,pct_in_public_admin_industry,pct_employed_at_self_employed,pct_employed_at_non_profit,pct_employed_at_government,pct_housing_single_detached,pct_broadband,pct_college,pct_in_it_industry,median_hh_inc
546,2019,2017,15005,Kalawao County,Hawaii,9.285714,0.0,0.0,0.0,0.0,...,22.222222,34.920635,3.2,1.6,61.9,84.9,60.4,13.9,0.0,61750.0
3681,2020,2018,15005,Kalawao County,Hawaii,10.915493,0.0,0.0,0.0,0.0,...,19.298246,38.596491,3.5,1.8,59.6,85.1,59.6,18.8,0.0,61875.0
4948,2020,2018,35039,Rio Arriba County,New Mexico,2.526204,0.0,5.857741,35.983264,22.594142,...,,,,,,55.1,52.1,12.5,,
6816,2021,2019,15005,Kalawao County,Hawaii,9.358974,0.0,0.0,0.0,0.0,...,21.153846,32.692308,1.9,7.7,55.8,87.2,66.7,19.4,0.0,69375.0
9951,2022,2020,15005,Kalawao County,Hawaii,1.195402,0.0,0.0,0.0,0.0,...,12.705882,65.647059,0.5,0.9,77.6,98.7,97.1,38.3,0.0,76465.0
12045,2022,2020,48243,Jeff Davis County,Texas,5.362546,0.0,0.0,0.0,28.571429,...,18.278146,1.456954,27.5,1.2,26.8,79.0,63.2,14.3,0.0,


In [67]:
# Save the table as CSV one directory up, leaving out the row index
master_path = '../master.csv'
df.to_csv(master_path, index=False)