In [1]:
import pandas as pd
import csv
import os

# Parsing Census Data
The goal of this file is to parse data from census tables into an intermediate format so we can merge them into a master table later. 

## Steps:
1. Duplicate the folder (i.e. `S2301`) within `census-data`, and rename the folder to the name of the Census table you are processing
2. Empty the `raw` folder
3. Download census data and place into `raw`. Each year of data should be in its own csv file. Rename each file into `<year>.csv`
4. Update the transformation in the code block below to obtain the desired columns, and rename them if necessary
    - For most tables, you will only need to lookup the column code of the feature you are looking to obtain, and match that to a readable name in `features`
5. Verify the transformation is correct using the `df.head()` statement already placed there for you. If it looks good, the data is good!

In [6]:
def process_file(filename: str):
    year = filename[:filename.find(".")]
    df = pd.read_csv('./raw/' + filename)
    df['cfips'] = df['GEO_ID'].apply(lambda x: x[-5:])

    # Map column codes to human-readable names. 
    # ==============
    features = {
        'S1401_C04_003E': 'pct_k12_public_students', # proportion of students in grade school enrolled in a public school
        'S1401_C04_010E': 'pct_college_public_students', # proportion of students in college (undergrad & grad) enrolled in a public school
        'S1401_C02_030E': 'pct_college_enrollment',  # pct of young adults 18 - 24 enrolled in college
        'S1401_C01_015E': '5_to_9_population',
        'S1401_C01_016E': '5_to_9_enrollment',
        'S1401_C01_017E': '10_to_14_population',
        'S1401_C01_018E': '10_to_14_enrollment',
        'S1401_C01_019E': '15_to_17_population',
        'S1401_C01_020E': '15_to_17_enrollment',
    }

    df = df.iloc[1:]
    df = df.filter(items=['cfips'] + list(features.keys()))
    df = df.rename(columns=features)
    for feature in list(features.values())[3:]:
        df[feature] = df[feature].apply(lambda x: int(x))
    df['pct_k12_enrollment'] = 100*(df['5_to_9_enrollment'] + df['10_to_14_enrollment'] + df['15_to_17_enrollment'])/(df['5_to_9_population'] + df['10_to_14_population'] + df['15_to_17_population'])
    df = df.filter(items=['cfips', 'pct_k12_enrollment'] + list(features.values())[:3])
    # ==============

    print(df.head())
    df.to_csv('./parsed/' + year + '.csv', index=False)

for filename in os.listdir("./raw"):
    process_file(filename)

  df = pd.read_csv('./raw/' + filename)


   cfips  pct_k12_enrollment pct_k12_public_students  \
1  01001           96.152734                    86.0   
2  01003           95.476667                    85.7   
3  01005           96.321726                    90.9   
4  01007           97.158597                    90.6   
5  01009           93.693515                    93.5   

  pct_college_public_students pct_college_enrollment  
1                        76.1                   23.0  
2                        79.5                   27.0  
3                        93.3                   18.9  
4                        96.6                   33.5  
5                        86.7                   25.2  


  df = pd.read_csv('./raw/' + filename)


   cfips  pct_k12_enrollment pct_k12_public_students  \
1  01001           96.348704                    85.2   
2  01003           96.336738                    85.1   
3  01005           95.996045                    89.1   
4  01007           98.758865                    90.8   
5  01009           93.408795                    93.1   

  pct_college_public_students pct_college_enrollment  
1                        80.9                   26.2  
2                        81.6                   28.4  
3                        88.6                   18.5  
4                        97.4                   25.5  
5                        81.6                   23.4  


  df = pd.read_csv('./raw/' + filename)


   cfips  pct_k12_enrollment pct_k12_public_students  \
1  01001           95.494594                    87.1   
2  01003           96.193481                    83.5   
3  01005           95.603840                    88.8   
4  01007           99.246080                    90.5   
5  01009           92.246661                    91.7   

  pct_college_public_students pct_college_enrollment  
1                        81.1                   30.0  
2                        82.3                   27.8  
3                        86.2                   20.5  
4                        94.5                   17.1  
5                        85.6                   25.1  
   cfips  pct_k12_enrollment pct_k12_public_students  \
1  01001           94.721437                    85.3   
2  01003           94.593669                    82.4   
3  01005           97.660668                    89.2   
4  01007           98.949264                    85.0   
5  01009           93.807642                    91.2 

  df = pd.read_csv('./raw/' + filename)
