In [1]:
import pandas as pd
import csv
import os

# Parsing Census Data
The goal of this notebook is to parse data from Census tables into an intermediate format so that it can be merged into a master table later.

## Steps:
1. Duplicate an existing table folder (e.g. `S2301`) within `census-data`, and rename the copy to the name of the Census table you are processing
2. Empty the `raw` folder
3. Download the census data and place it into `raw`. Each year of data should be in its own CSV file, renamed to `<year>.csv` (e.g. `2019.csv`)
4. Update the transformation in the code block below to obtain the desired columns, and rename them if necessary
    - For most tables, you will only need to look up the column code of each feature you want, and map it to a readable name in `features`
5. Verify the transformation is correct using the `df.head()` statement already placed there for you. If it looks good, the data is good!

In [2]:
def process_file(filename: str):
    """Parse one raw Census CSV from ``./raw`` into ``./parsed/<year>.csv``.

    The raw file is read, the last five characters of ``GEO_ID`` are kept as
    ``cfips``, the first data row is dropped, the column codes listed in
    ``features`` are renamed to readable names, and the two self-employment
    columns are summed into ``pct_employed_at_self_employed``.

    Args:
        filename: Name of a file inside ``./raw``. Everything before the first
            ``.`` is used as the output name (expected to be the year).

    Returns:
        None. The first rows of the parsed frame are printed and the frame is
        written to ``./parsed/<year>.csv``.

    Raises:
        KeyError: If ``GEO_ID`` or either self-employment column is missing.
    """
    # partition() keeps the whole name when there is no "."; slicing with
    # str.find() would silently chop the last character in that case (-1).
    year = filename.partition(".")[0]
    df = pd.read_csv('./raw/' + filename)
    # Vectorised equivalent of taking x[-5:] for every GEO_ID string.
    df['cfips'] = df['GEO_ID'].str[-5:]

    # Map column codes to human-readable names. 
    # ==============
    features = {
        'S2406_C03_001E': 'pct_self_employed_incorporated', 
        'S2406_C06_001E': 'pct_self_employed_not_incorporated', 
        'S2406_C04_001E': 'pct_employed_at_non_profit',
        'S2406_C05_001E': 'pct_employed_at_government'
    }

    # Drop the first row of the file (NOTE(review): presumably the Census
    # column-description row that follows the header - confirm for new tables).
    df = df.iloc[1:]
    df = df.filter(items=['cfips'] + list(features.keys()))
    df = df.rename(columns=features)

    # Convert every feature column to a number, not just the two that are
    # summed, so the parsed file has consistent numeric columns. Cells that are
    # not numbers (e.g. placeholder markers such as "-") become NaN instead of
    # aborting the whole file.
    for name in features.values():
        if name in df.columns:
            df[name] = pd.to_numeric(df[name], errors='coerce')

    df['pct_employed_at_self_employed'] = df['pct_self_employed_incorporated'] + df['pct_self_employed_not_incorporated']
    df = df.filter(items=['cfips', 'pct_employed_at_self_employed', 'pct_employed_at_non_profit', 'pct_employed_at_government'])
    # ==============

    print(df.head())
    # Make sure the output folder exists so a fresh checkout does not fail here.
    os.makedirs('./parsed', exist_ok=True)
    df.to_csv('./parsed/' + year + '.csv', index=False)

# Parse every yearly CSV found in ./raw.
# - sorted(): os.listdir returns entries in arbitrary order; sorting makes the
#   processing and printed output deterministic.
# - extension check: skips stray entries (e.g. .DS_Store, .ipynb_checkpoints)
#   that pd.read_csv cannot parse.
for filename in sorted(os.listdir("./raw")):
    if filename.lower().endswith(".csv"):
        process_file(filename)

   cfips  pct_employed_at_self_employed pct_employed_at_non_profit  \
1  01001                            8.0                        7.8   
2  01003                           11.4                        6.7   
3  01005                            9.9                        4.9   
4  01007                            8.1                        6.0   
5  01009                            7.9                        5.6   

  pct_employed_at_government  
1                       20.2  
2                       12.9  
3                       19.1  
4                       17.4  
5                       11.9  
   cfips  pct_employed_at_self_employed pct_employed_at_non_profit  \
1  01001                            8.0                        8.3   
2  01003                           12.4                        6.7   
3  01005                            9.6                        4.4   
4  01007                            7.8                        6.1   
5  01009                            8.2    