In [1]:
import pandas as pd
import csv
import os

# Parsing Census Data
The goal of this file is to parse data from census tables into an intermediate format so we can merge them into a master table later. 

## Steps:
1. Duplicate an existing table folder (e.g. `S2301`) within `census-data`, and rename the copy to the name of the Census table you are processing
2. Empty the `raw` folder
3. Download the census data and place it into `raw`. Each year of data should be in its own CSV file, named `<year>.csv`
4. Update the transformation in the code block below to obtain the desired columns, and rename them if necessary
    - For most tables, you will only need to look up the column code of each feature you want and map it to a readable name in `features`
5. Verify the transformation is correct using the `df.head()` statement already placed there for you. If it looks good, the data is good!

In [2]:
def process_file(filename: str) -> None:
    """Parse one raw census CSV into the intermediate per-year format.

    Reads ``./raw/<filename>``, keeps the county FIPS code plus the columns
    listed in ``features``, converts the industry head-counts into a
    percentage of ``workforce_size``, prints a preview of the result and
    writes it to ``./parsed/<year>.csv`` (creating ``./parsed`` if needed).

    Args:
        filename: Name of a file inside ``./raw``. The text before the first
            "." is used as the year, e.g. ``"2019.csv"`` -> ``"2019"``.

    Raises:
        KeyError: If ``GEO_ID`` or any column code in ``features`` is absent
            from the raw file.
    """
    # partition() keeps the whole name when it contains no "."; slicing with
    # find() would silently drop the last character in that case.
    year = filename.partition(".")[0]

    # Map column codes to human-readable names. 
    # ==============
    features = {
        'S2405_C01_001E': 'workforce_size',
        'S2405_C02_001E': 'pct_occupation_mbsa', # Percentage of workforce in a management, business, science, or arts related role (occupation)
        'S2405_C01_009E': 'pct_in_finance_industry', # Percentage of workforce in the finance, real estate, or insurance industry 
        'S2405_C01_012E': 'pct_in_arts_industry', # Percentage of workforce in the arts, entertainment, recreation, accommodation, or food services industry 
        'S2405_C01_014E': 'pct_in_public_admin_industry' # Percentage of workforce in public administration industry
    }

    # Read every column as text so pandas does not infer types per column;
    # numeric conversion is done explicitly below.
    # NOTE(review): this is presumably what triggered the warning previously
    # echoed for this line in the cell output -- confirm on the next run.
    df = pd.read_csv('./raw/' + filename, dtype=str)

    # Fail early with a readable message; filter() below would otherwise drop
    # unknown codes silently and surface later as an opaque KeyError.
    missing = [code for code in ['GEO_ID'] + list(features) if code not in df.columns]
    if missing:
        raise KeyError(f"{filename} is missing expected column(s): {missing}")

    # The county FIPS code is taken from the last five characters of GEO_ID.
    df['cfips'] = df['GEO_ID'].str[-5:]

    # Drop the first data row (presumably the descriptive label row that
    # Census exports place under the column codes -- verify for new tables).
    df = df.iloc[1:]
    df = df.filter(items=['cfips'] + list(features.keys()))
    df = df.rename(columns=features)

    # Convert every feature to a number. errors='coerce' turns non-numeric
    # placeholder cells into NaN instead of aborting the whole file.
    for feature in features.values():
        df[feature] = pd.to_numeric(df[feature], errors='coerce')

    # The first two features are the workforce total and a value that is kept
    # as-is; the remaining ones are counts rescaled to a percentage of the total.
    for feature in list(features.values())[2:]:
        df[feature] = df[feature] / df['workforce_size'] * 100

    # workforce_size was only needed as the denominator; leave it out.
    df = df.filter(items=['cfips'] + list(features.values())[1:])
    # ==============

    print(df.head())
    os.makedirs('./parsed', exist_ok=True)
    df.to_csv('./parsed/' + year + '.csv', index=False)

# Process every yearly CSV export found in ./raw.
# os.listdir returns entries in arbitrary order and includes anything in the
# folder, so sort for a reproducible run order and skip entries that are not
# .csv files (e.g. .DS_Store or .ipynb_checkpoints).
for filename in sorted(os.listdir("./raw")):
    if filename.lower().endswith(".csv"):
        process_file(filename)

  df = pd.read_csv('./raw/' + filename)


   cfips pct_occupation_mbsa  pct_in_finance_industry  pct_in_arts_industry  \
1  01001                35.3                 5.573988              8.941606   
2  01003                35.7                 7.611112             10.622494   
3  01005                25.0                 3.671998              6.859653   
4  01007                24.4                 4.405825              3.855097   
5  01009                28.5                 5.102900              4.260992   

   pct_in_public_admin_industry  
1                     11.152123  
2                      4.838764  
3                      7.929714  
4                      5.005507  
5                      5.163704  


  df = pd.read_csv('./raw/' + filename)


   cfips pct_occupation_mbsa  pct_in_finance_industry  pct_in_arts_industry  \
1  01001                38.7                 6.259327              8.124689   
2  01003                36.3                 7.398880             10.083638   
3  01005                25.9                 3.967890              5.917431   
4  01007                21.9                 4.580812              4.395604   
5  01009                29.9                 5.837159              4.623817   

   pct_in_public_admin_industry  
1                     10.251202  
2                      5.066450  
3                      7.098624  
4                      5.272256  
5                      4.586339  


  df = pd.read_csv('./raw/' + filename)


   cfips pct_occupation_mbsa  pct_in_finance_industry  pct_in_arts_industry  \
1  01001                37.8                 5.978305              8.783949   
2  01003                37.1                 7.520165              9.971501   
3  01005                26.9                 3.720433              6.085820   
4  01007                20.4                 5.341600              4.673900   
5  01009                30.2                 6.063786              4.562668   

   pct_in_public_admin_industry  
1                      9.436424  
2                      5.224469  
3                      7.012956  
4                      5.138905  
5                      4.539855  
   cfips pct_occupation_mbsa  pct_in_finance_industry  pct_in_arts_industry  \
1  01001                39.0                 5.488202              8.441823   
2  01003                36.8                 7.597602              9.915155   
3  01005                27.4                 3.916389              6.557942   
4  01

  df = pd.read_csv('./raw/' + filename)
