In [1]:
import csv
import os
import warnings

import pandas as pd

# Parsing Census Data
The goal of this file is to parse data from census tables into an intermediate format so we can merge them into a master table later. 

## Steps:
1. Duplicate an existing table folder (e.g. `S2301`) within `census-data`, and rename the copy to the name of the Census table you are processing
2. Empty the `raw` folder
3. Download the census data and place it into `raw`. Each year of data should be in its own CSV file, renamed to `<year>.csv`
4. Update the transformation in the code block below to obtain the desired columns, and rename them if necessary
    - For most tables, you will only need to look up the column code of each feature you want, and map it to a readable name in `features`
5. Verify the transformation is correct using the `df.head()` statement already placed there for you. If it looks good, the data is good!

In [2]:
def process_file(filename: str):
    """Parse one raw Census CSV into the intermediate per-year format.

    Reads ``./raw/<filename>``, keeps a ``cfips`` column (the last five
    characters of ``GEO_ID``) plus the columns listed in ``features``
    (renamed to their readable names), prints the first rows for a visual
    check, and writes the result to ``./parsed/<year>.csv``.

    Args:
        filename: Name of a file inside ``./raw``. The text before the first
            ``.`` is used as the output name (``2019.csv`` -> ``2019``).

    Returns:
        None. The result is written to disk and a preview is printed.
    """
    # Text before the first "." names the output file. partition() returns the
    # whole name when there is no dot, whereas slicing with find() (-1 when no
    # dot is found) would silently drop the last character.
    year = filename.partition(".")[0]

    # Read every column as text so pandas does not infer a type per chunk and
    # emit mixed-type columns; values are copied through to the output as-is.
    df = pd.read_csv('./raw/' + filename, dtype=str)
    # .str slicing leaves a missing GEO_ID as NaN instead of raising.
    df['cfips'] = df['GEO_ID'].str[-5:]

    # Map column codes to human-readable names. 
    # ==============
    features = {
        'S0701_C05_001E': 'pct_moved_from_abroad', 
        'S0701_C04_001E': 'pct_moved_outside_state', 
        'S0701_C03_001E': 'pct_moved_within_state', 
        'S0701_C02_001E': 'pct_moved_within_county',
    }

    # Drop the first data row -- presumably the descriptive label row that
    # sits under the column-code header in the raw export (TODO confirm).
    df = df.iloc[1:]

    # filter() ignores requested columns that are absent, so report them
    # rather than quietly writing a file with fewer features.
    missing = [code for code in features if code not in df.columns]
    if missing:
        warnings.warn(
            "'" + filename + "' is missing expected column(s): " + ", ".join(missing)
        )

    df = df.filter(items=['cfips'] + list(features.keys()))
    df = df.rename(columns=features)
    # ==============

    print(df.head())
    # Make sure the output folder exists before writing.
    os.makedirs('./parsed', exist_ok=True)
    df.to_csv('./parsed/' + year + '.csv', index=False)

# Parse every yearly CSV found in ./raw.
# os.listdir() returns entries in arbitrary order and includes everything in
# the folder, so sort for a reproducible run and skip anything that is not a
# .csv file (e.g. hidden files or checkpoint directories) instead of handing
# it to read_csv.
for filename in sorted(os.listdir("./raw")):
    if not filename.lower().endswith(".csv"):
        continue
    process_file(filename)

  df = pd.read_csv('./raw/' + filename)


   cfips pct_moved_from_abroad pct_moved_outside_state pct_moved_within_state  \
1  01001                   0.2                     3.2                    4.9   
2  01003                   0.3                     3.1                    2.1   
3  01005                   0.2                     1.3                    3.4   
4  01007                   0.4                     0.4                    7.3   
5  01009                   0.2                     0.8                    2.6   

  pct_moved_within_county  
1                     7.1  
2                     6.2  
3                     7.7  
4                     4.4  
5                     2.3  


  df = pd.read_csv('./raw/' + filename)


   cfips pct_moved_from_abroad pct_moved_outside_state pct_moved_within_state  \
1  01001                   0.2                     3.3                    4.5   
2  01003                   0.3                     2.9                    2.3   
3  01005                   0.3                     1.7                    3.3   
4  01007                   0.1                     0.5                    7.3   
5  01009                   0.2                     0.9                    2.2   

  pct_moved_within_county  
1                     6.0  
2                     6.0  
3                    10.1  
4                     3.4  
5                     2.9  


  df = pd.read_csv('./raw/' + filename)


   cfips pct_moved_from_abroad pct_moved_outside_state pct_moved_within_state  \
1  01001                   0.2                     2.8                    3.7   
2  01003                   0.4                     2.9                    2.3   
3  01005                   0.3                     1.9                    3.1   
4  01007                   0.1                     1.1                    5.6   
5  01009                   0.2                     0.8                    2.0   

  pct_moved_within_county  
1                     5.4  
2                     6.2  
3                     9.3  
4                     3.5  
5                     3.5  
   cfips pct_moved_from_abroad pct_moved_outside_state pct_moved_within_state  \
1  01001                   0.5                     2.5                    3.7   
2  01003                   0.3                     3.0                    2.0   
3  01005                   0.3                     1.7                    4.5   
4  01007             

  df = pd.read_csv('./raw/' + filename)
