In [1]:
import pandas as pd
import csv
import os

# Parsing Census Data
The goal of this file is to parse data from census tables into an intermediate format so we can merge them into a master table later. 

## Steps:
1. Duplicate the folder (i.e. `S2301`) within `census-data`, and rename the folder to the name of the Census table you are processing
2. Empty the `raw` folder
3. Download census data and place into `raw`. Each year of data should be in its own csv file. Rename each file into `<year>.csv`
4. Update the transformation in the code block below to obtain the desired columns, and rename them if necessary
    - For most tables, you will only need to lookup the column code of the feature you are looking to obtain, and match that to a readable name in `features`
5. Verify the transformation is correct using the `df.head()` statement already placed there for you. If it looks good, the data is good!

In [2]:
def process_file(filename: str):
    year = filename[:filename.find(".")]
    df = pd.read_csv('./raw/' + filename)
    df['cfips'] = df['GEO_ID'].apply(lambda x: x[-5:])

    # Map column codes to human-readable names. 
    # ==============
    features = {
        'S1601_C02_003E': 'pct_multilingual'
    }

    df = df.iloc[1:]
    df = df.filter(items=['cfips'] + list(features.keys()))
    df = df.rename(columns=features)
    # ==============

    print(df.head())
    df.to_csv('./parsed/' + year + '.csv', index=False)

for filename in os.listdir("./raw"):
    process_file(filename)

  df = pd.read_csv('./raw/' + filename)


   cfips pct_multilingual
1  01001              3.8
2  01003              5.5
3  01005              5.7
4  01007              2.2
5  01009              7.7


  df = pd.read_csv('./raw/' + filename)


   cfips pct_multilingual
1  01001              3.8
2  01003              5.4
3  01005              6.1
4  01007              2.4
5  01009              8.0


  df = pd.read_csv('./raw/' + filename)


   cfips pct_multilingual
1  01001              4.3
2  01003              5.4
3  01005              7.1
4  01007              2.3
5  01009              8.2
   cfips pct_multilingual
1  01001              3.8
2  01003              5.0
3  01005              7.2
4  01007              2.4
5  01009              7.8


  df = pd.read_csv('./raw/' + filename)
