In [10]:
import pandas as pd
import csv
import os

# Parsing Census Data
This notebook parses data from Census tables into an intermediate format so that the results can be merged into a master table later.

## Steps:
1. Duplicate an existing table folder (e.g. `S2301`) within `census-data`, and rename the copy to the name of the Census table you are processing
2. Empty the `raw` folder
3. Download census data and place it into `raw`. Each year of data should be in its own csv file. Rename each file to `<year>.csv`
4. Update the transformation in the code block below to obtain the desired columns, and rename them if necessary
    - For most tables, you only need to look up the column code of each feature you want to obtain, and map it to a readable name in `features`
5. Verify the transformation is correct using the `df.head()` statement already placed there for you. If it looks good, the data is good!

In [11]:
def process_file(filename: str) -> None:
    """Parse one raw Census CSV into per-county citizenship percentages.

    Reads ``./raw/<filename>``, keeps the B05001 columns listed in ``features``,
    and writes ``cfips`` plus three percentage columns to ``./parsed/<year>.csv``,
    where ``<year>`` is the part of ``filename`` before its first ".".
    The first five parsed rows are printed as a quick visual check.

    Args:
        filename: Name of a file inside ``./raw`` (expected form ``<year>.csv``).

    Raises:
        KeyError: If ``GEO_ID`` or one of the columns in ``features`` is absent.
        ValueError: If a retained count cell is neither numeric nor ``'null'``.
    """
    # partition() keeps the whole name when there is no "."; slicing with
    # find() would use -1 there and silently drop the last character.
    year = filename.partition(".")[0]
    df = pd.read_csv('./raw/' + filename)
    # cfips is the last five characters of GEO_ID (presumably the county FIPS
    # code). The .str accessor yields NaN for a missing id, so that row is
    # removed by dropna() below instead of raising.
    df['cfips'] = df['GEO_ID'].str[-5:]

    # Map column codes to human-readable names.
    # ==============
    features = {
        'B05001_001E': 'total_population',
        'B05001_002E': 'us_born',
        'B05001_003E': 'us_pr_born',
        'B05001_004E': 'us_abroad_born',
        'B05001_005E': 'us_naturalized',
        'B05001_006E': 'us_not_citizen'
    }

    # Skip the first data row.
    # NOTE(review): presumably the descriptive label row that sits under the
    # column codes in Census downloads - confirm when adapting to a new table.
    df = df.iloc[1:]
    df = df.filter(items=['cfips'] + list(features.keys()))
    df = df.rename(columns=features)
    # read_csv's default NA markers include 'null', so such cells normally
    # arrive as NaN and their rows are removed here.
    df = df.dropna()
    for feature in features.values():
        # Guard against a literal 'null' that still got through, then convert
        # the whole column at once; to_numeric raises ValueError on junk.
        counts = df[feature].map(lambda value: 0 if value == 'null' else value)
        df[feature] = pd.to_numeric(counts)

    # A zero denominator could yield inf; blank it so those rows come out NaN.
    total = df['total_population'].where(df['total_population'] != 0)
    df['pct_born_us_citizen'] = 100*(df['us_born'] + df['us_pr_born'] + df['us_abroad_born'])/total
    df['pct_naturalized_us_citizen'] = 100*df['us_naturalized']/total
    df['pct_not_us_citizen'] = 100*df['us_not_citizen']/total
    df = df.filter(items=['cfips', 'pct_born_us_citizen', 'pct_naturalized_us_citizen', 'pct_not_us_citizen'])
    # ==============

    print(df.head())
    # Create the output folder on first use so a fresh checkout does not fail.
    os.makedirs('./parsed', exist_ok=True)
    df.to_csv('./parsed/' + year + '.csv', index=False)

# Parse every yearly CSV in ./raw. os.listdir returns entries in arbitrary
# order and includes hidden or unrelated entries (e.g. .DS_Store,
# .ipynb_checkpoints), so sort for reproducible output and skip non-CSV names.
for filename in sorted(os.listdir("./raw")):
    if filename.startswith(".") or not filename.lower().endswith(".csv"):
        continue
    process_file(filename)

   cfips  pct_born_us_citizen  pct_naturalized_us_citizen  pct_not_us_citizen
1  01001            97.886838                    1.115633            0.997529
2  01003            96.751574                    1.559304            1.689123
3  01005            97.339796                    1.003778            1.656425
4  01007            98.976971                    0.181577            0.841453
5  01009            95.454940                    1.576291            2.968769
   cfips  pct_born_us_citizen  pct_naturalized_us_citizen  pct_not_us_citizen
1  01001            97.981884                    1.048913            0.969203
2  01003            96.554657                    1.594853            1.850490
3  01005            97.494376                    1.086029            1.419595
4  01007            98.557287                    0.572646            0.870067
5  01009            95.640559                    1.519646            2.839795
   cfips  pct_born_us_citizen  pct_naturalized_us_citizen  pct_n