In [6]:
import pandas as pd
import csv
import os

# Parsing Census Data
The goal of this file is to parse data from census tables into an intermediate format so we can merge them into a master table later. 

## Steps:
1. Duplicate the folder (i.e. `S2301`) within `census-data`, and rename the folder to the name of the Census table you are processing
2. Empty the `raw` folder
3. Download census data and place into `raw`. Each year of data should be in its own csv file. Rename each file into `<year>.csv`
4. Update the transformation in the code block below to obtain the desired columns, and rename them if necessary
    - For most tables, you will only need to lookup the column code of the feature you are looking to obtain, and match that to a readable name in `features`
5. Verify the transformation is correct using the `df.head()` statement already placed there for you. If it looks good, the data is good!

In [7]:
def process_file(filename: str):
    year = filename[:filename.find(".")]
    df = pd.read_csv('./raw/' + filename)
    df['cfips'] = df['GEO_ID'].apply(lambda x: x[-5:])

    # Apply transformations here. Make sure you include the `cfips` column!
    # ==============
    features = {
        'S0101_C01_001E': 'total_population', 
        'S0101_C01_035E': 'old_age_dependency_ratio', 
        'S0101_C01_036E': 'child_dependency_ratio', 
        'S0101_C01_032E': 'median_age',
        'S0101_C01_033E': 'sex_ratio'
    }

    df = df.iloc[1:]
    df = df.filter(items=['cfips'] + list(features.keys()))
    df = df.rename(columns=features)
    # ==============

    print(df.head())
    df.to_csv('./parsed/' + year + '.csv', index=False)

for filename in os.listdir("./raw"):
    process_file(filename)

  df = pd.read_csv('./raw/' + filename)


   cfips total_population old_age_dependency_ratio child_dependency_ratio  \
1  01001            55036                     23.3                   40.1   
2  01003           203360                     32.3                   37.6   
3  01005            26201                     28.4                   34.2   
4  01007            22580                     23.5                   32.7   
5  01009            57667                     29.4                   39.8   

  median_age sex_ratio  
1       37.8      95.6  
2       42.6      95.9  
3       39.7     114.3  
4       39.8     118.6  
5       40.9      97.6  


  df = pd.read_csv('./raw/' + filename)


   cfips total_population old_age_dependency_ratio child_dependency_ratio  \
1  01001            55200                     23.8                   39.6   
2  01003           208107                     33.4                   37.5   
3  01005            25782                     29.5                   34.6   
4  01007            22527                     25.8                   32.8   
5  01009            57645                     30.1                   39.7   

  median_age sex_ratio  
1       37.8      94.9  
2       42.8      94.6  
3       39.9     113.3  
4       39.9     117.1  
5       40.8      97.3  


  df = pd.read_csv('./raw/' + filename)


   cfips total_population old_age_dependency_ratio child_dependency_ratio  \
1  01001            55380                     24.4                   39.0   
2  01003           212830                     34.3                   37.3   
3  01005            25361                     30.7                   34.6   
4  01007            22493                     25.1                   32.4   
5  01009            57681                     30.4                   39.4   

  median_age sex_ratio  
1       38.2      94.7  
2       43.0      94.7  
3       40.4     112.4  
4       40.9     117.5  
5       40.7      97.6  
   cfips total_population old_age_dependency_ratio child_dependency_ratio  \
1  01001            55639                     25.0                   38.6   
2  01003           218289                     35.3                   37.1   
3  01005            25026                     31.8                   34.8   
4  01007            22374                     26.0                   32.5   
5 

  df = pd.read_csv('./raw/' + filename)
