In [100]:
import pandas as pd
import csv
import os

# Parse Master
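Reads the intermediate (parsed) CSV files for every census table and combines them into a single master table keyed by `cfips` and `year`. The density data from `density.csv` is loaded first and serves as the base table that the census tables are joined onto; each density `year` is matched with census data from two years earlier (`data-year = year - 2`).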

In [101]:
# Parse `density.csv` first.
# Rows that share the same year/county are averaged so the base table holds one
# row per (`year`, `cfips`) with the mean `microbusiness_density`.
df = pd.read_csv('./density.csv')
# The first four characters of `first_day_of_month` are read as the year.
df['year'] = df['first_day_of_month'].str[:4].astype(int)
# Census figures are joined with a two-year lag; stored as a string so it
# compares equal to the year prefix taken from the parsed file names below.
df['data-year'] = (df['year'] - 2).astype(str)
df = df.groupby(['year', 'data-year', 'cfips', 'county', 'state']).mean(numeric_only = True).reset_index()
df = df.filter(items=['year', 'data-year', 'cfips', 'county', 'state', 'microbusiness_density'])
print(df.head())

# Grab all intermediate files and add in.
# Each sub-directory of `./census-data` (except `template`) is treated as one
# census table whose `parsed` folder holds one file per data year, with the
# year taken from the first four characters of the file name.
census_root = './census-data'
# `os.listdir` returns entries in arbitrary order; sorting keeps the column
# order of the master table reproducible across machines and runs.
for path in sorted(os.listdir(census_root)):
    parsed_dir = os.path.join(census_root, path, 'parsed')
    # Skip the template and anything that is not a table directory
    # (stray files, `.ipynb_checkpoints`, ...).
    if path == 'template' or not os.path.isdir(parsed_dir):
        continue

    # Collect the yearly frames and concatenate once instead of re-copying a
    # growing DataFrame on every iteration.
    frames = []
    for file in sorted(os.listdir(parsed_dir)):
        file_path = os.path.join(parsed_dir, file)
        # Ignore hidden files (e.g. `.DS_Store`) and nested directories.
        if file.startswith('.') or not os.path.isfile(file_path):
            continue
        year = file[:4]
        temp = pd.read_csv(file_path)

        temp['data-year'] = year
        frames.append(temp)

    # A table without parsed files has no key columns to merge on; skipping it
    # avoids a KeyError in `pd.merge`.
    if not frames:
        continue

    table = pd.concat(frames)
    # Left join: every density row is kept, census columns are NaN when no
    # matching (`cfips`, `data-year`) row exists.
    df = pd.merge(df, table, how="left", on=['cfips', 'data-year'])

   year data-year  cfips          county    state  microbusiness_density
0  2019      2017   1001  Autauga County  Alabama               2.986972
1  2019      2017   1003  Baldwin County  Alabama               7.370375
2  2019      2017   1005  Barbour County  Alabama               1.046068
3  2019      2017   1007     Bibb County  Alabama               1.278288
4  2019      2017   1009   Blount County  Alabama               1.559113


In [102]:
# Preview the first five rows of the merged master table
df.head(n=5)

Unnamed: 0,year,data-year,cfips,county,state,microbusiness_density,total_population,old_age_dependency_ratio,child_dependency_ratio,median_age,sex_ratio,pct_unemployed,pct_broadband,pct_college,pct_it_workers,median_hh_inc
0,2019,2017,1001,Autauga County,Alabama,2.986972,55036,23.3,40.1,37.8,95.6,5.2,76.6,14.5,1.3,55317.0
1,2019,2017,1003,Baldwin County,Alabama,7.370375,203360,32.3,37.6,42.6,95.9,5.5,74.5,20.4,1.4,52562.0
2,2019,2017,1005,Barbour County,Alabama,1.046068,26201,28.4,34.2,39.7,114.3,12.4,57.2,7.6,0.5,33368.0
3,2019,2017,1007,Bibb County,Alabama,1.278288,22580,23.5,32.7,39.8,118.6,8.2,62.0,8.1,1.2,43404.0
4,2019,2017,1009,Blount County,Alabama,1.559113,57667,29.4,39.8,40.9,97.6,4.9,65.8,8.7,1.3,47412.0


In [107]:
# List every column name to confirm that each feature made it into the merge
df.columns.tolist()

['year',
 'data-year',
 'cfips',
 'county',
 'state',
 'microbusiness_density',
 'total_population',
 'old_age_dependency_ratio',
 'child_dependency_ratio',
 'median_age',
 'sex_ratio',
 'pct_unemployed',
 'pct_broadband',
 'pct_college',
 'pct_it_workers',
 'median_hh_inc']

In [103]:
# Show only the rows in which at least one column is missing (NaN)
has_missing = df.isnull().any(axis='columns')
df.loc[has_missing]

Unnamed: 0,year,data-year,cfips,county,state,microbusiness_density,total_population,old_age_dependency_ratio,child_dependency_ratio,median_age,sex_ratio,pct_unemployed,pct_broadband,pct_college,pct_it_workers,median_hh_inc
4948,2020,2018,35039,Rio Arriba County,New Mexico,2.526204,39307,30.9,40.7,40.6,96.0,,52.1,12.5,,
12045,2022,2020,48243,Jeff Davis County,Texas,5.362546,2245,81.0,31.0,61.3,108.8,0.5,63.2,14.3,0.0,


In [104]:
# Persist the master table as CSV, leaving out the positional row index
df.to_csv(path_or_buf='master.csv', index=False)