In [3]:
import pandas as pd
import csv
import os

# Parsing Census Data
The goal of this file is to parse data from census tables into an intermediate format so we can merge them into a master table later. 

## Steps:
1. Duplicate the folder (i.e. `S2301`) within `census-data`, and rename the folder to the name of the Census table you are processing
2. Empty the `raw` folder
3. Download census data and place into `raw`. Each year of data should be in its own csv file. Rename each file into `<year>.csv`
4. Update the transformation in the code block below to obtain the desired columns, and rename them if necessary
    - For most tables, you will only need to lookup the column code of the feature you are looking to obtain, and match that to a readable name in `features`
5. Verify the transformation is correct using the `df.head()` statement already placed there for you. If it looks good, the data is good!

In [4]:
def process_file(filename: str):
    year = filename[:filename.find(".")]
    df = pd.read_csv('./raw/' + filename)
    df['cfips'] = df['GEO_ID'].apply(lambda x: x[-5:])

    # Map column codes to human-readable names. 
    # ==============
    features = {
        'S1502_C02_002E': 'pct_with_scieng_degree', 
        'S1502_C02_006E': 'pct_with_libarts_degree', 
        'S1502_C02_004E': 'pct_with_business_degree',
    }

    df = df.iloc[1:]
    df = df.filter(items=['cfips'] + list(features.keys()))
    df = df.rename(columns=features)
    # ==============

    print(df.head())
    df.to_csv('./parsed/' + year + '.csv', index=False)

for filename in os.listdir("./raw"):
    process_file(filename)

  df = pd.read_csv('./raw/' + filename)


   cfips pct_with_scieng_degree pct_with_libarts_degree  \
1  01001                   29.4                    23.9   
2  01003                   27.3                    20.0   
3  01005                   22.9                    18.8   
4  01007                   22.6                    20.6   
5  01009                   22.9                    20.6   

  pct_with_business_degree  
1                     24.7  
2                     22.4  
3                     17.8  
4                     23.1  
5                     17.6  


  df = pd.read_csv('./raw/' + filename)


   cfips pct_with_scieng_degree pct_with_libarts_degree  \
1  01001                   28.6                    23.1   
2  01003                   27.6                    20.1   
3  01005                   20.9                    17.3   
4  01007                   22.4                    19.6   
5  01009                   22.3                    22.7   

  pct_with_business_degree  
1                     25.4  
2                     22.8  
3                     19.7  
4                     28.6  
5                     17.2  


  df = pd.read_csv('./raw/' + filename)


   cfips pct_with_scieng_degree pct_with_libarts_degree  \
1  01001                   27.4                    22.1   
2  01003                   26.4                    21.1   
3  01005                   19.9                    16.4   
4  01007                   23.3                    21.9   
5  01009                   21.3                    21.3   

  pct_with_business_degree  
1                     23.5  
2                     22.8  
3                     21.3  
4                     25.8  
5                     15.7  
   cfips pct_with_scieng_degree pct_with_libarts_degree  \
1  01001                   28.4                    21.3   
2  01003                   28.7                    21.4   
3  01005                   21.6                    18.9   
4  01007                   29.8                    27.6   
5  01009                   17.8                    17.6   

  pct_with_business_degree  
1                     23.4  
2                     23.1  
3                     18.3  


  df = pd.read_csv('./raw/' + filename)
