In [1]:
import os
import sys
from typing import List, Optional, Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Directory holding the census income-by-county CSV files.
# Keep the trailing "/": buildFileName() and processCensusIncomeInequalityData()
# prepend this string to file names by plain string concatenation.
MYDIR = "./../../ResearchProposal/income_inequality/census_income_by_county/"
print("Ready")

Ready


In [9]:
def buildFileName(MYDIR: str, YEAR: int) -> str:
    """
    Build the path of the census income CSV for a given year.

    MYDIR is used as a raw string prefix (no path separator is inserted),
    and YEAR is rendered with str() between the fixed stem and suffix of
    the file name.
    """
    stem = "ACSST5Y"
    suffix = ".S1901_data_with_overlays_2020-07-11T163216.csv"
    return "".join([MYDIR, stem, str(YEAR), suffix])

def identifyMyFiles(MYDIR: str) -> List[str]:
    """
    List the entries of MYDIR whose names contain "data_with_overlays".

    The returned names are bare entry names (no directory prefix), in the
    order produced by os.listdir.
    """
    marker = "data_with_overlays"
    return list(filter(lambda entry: marker in entry, os.listdir(MYDIR)))

def unifyColNames(hhDF: pd.DataFrame) -> pd.DataFrame:
    """
    Unify column names across years.

    Each column name goes through an ordered list of substring
    substitutions. The first three move the "Estimate" token in front of
    the group name ("Households__Estimate" -> "Estimate__Households", and
    likewise for "Families" and "Married_couple_families"). The last three
    shorten that normalised prefix: "Estimate__Households" is removed
    entirely, while the other two keep only the group name.

    The columns of the passed frame are reassigned in place and the same
    frame object is returned.
    """
    # Order matters: the reordering rules must run before the shortening rules.
    substitutions = (
        ("Households__Estimate", "Estimate__Households"),
        ("Families__Estimate", "Estimate__Families"),
        ("Married_couple_families__Estimate", "Estimate__Married_couple_families"),
        ("Estimate__Households", ""),
        ("Estimate__Families", "Families"),
        ("Estimate__Married_couple_families", "Married_couple_families"),
    )

    renamed = []
    for name in hhDF.columns:
        for old, new in substitutions:
            if old in name:
                name = name.replace(old, new)
        renamed.append(name)

    hhDF.columns = renamed
    return hhDF

def processCensusIncomeInequalityData(MYDIR: str, YEAR: int) -> Tuple[Optional[str], Optional[pd.DataFrame]]:
    """
    Read one year's census income CSV, reduce it to household estimates,
    add a mean-to-median income ratio, and save the result as a new CSV.

    Parameters
    ----------
    MYDIR : str
        Directory prefix of the input file. It is concatenated directly with
        file names, so it must end with a path separator.
    YEAR : int
        Year used to build the input file name; also stored in the "year"
        column of the result.

    Returns
    -------
    (csvFile, hhDF)
        Path of the CSV that was written and the processed frame.
        (None, None) is returned, after printing a message, when the expected
        input file is not among the files listed for MYDIR.

    Raises
    ------
    Whatever os.listdir / pandas raise for an unreadable directory or file,
    KeyError when expected columns are absent, and ValueError when the
    mean/median income columns cannot be converted to float64.
    """
    myFile = buildFileName(MYDIR, YEAR)

    if myFile not in [MYDIR + ff for ff in identifyMyFiles(MYDIR)]:
        print(f"""{myFile} not found in the files listed for {MYDIR}""")
        return None, None

    print(f"""Reading data from {myFile}""")

    # header=1: column names are taken from the second row of the file.
    incomeDF = pd.read_csv(myFile, header=1)

    # "X County, State" -> "X, State". regex=False makes the literal match
    # explicit instead of relying on the pandas-version-dependent default.
    incomeDF["county_state"] = incomeDF["Geographic Area Name"].str.replace(
        " County,", ",", regex=False
    )
    # n=1 splits on the first ", " only, so the result never has more than
    # two columns even if an area name contains a further ", ".
    incomeDF[["county", "state"]] = incomeDF["county_state"].str.split(
        ", ", n=1, expand=True
    )

    # Keep the estimate columns plus the identifying columns.
    estimateCols = [cc for cc in incomeDF.columns if "Estimate" in cc]
    estimatesDF = incomeDF[estimateCols].copy()
    estimatesDF["id"] = incomeDF["id"]
    estimatesDF["Geographic Area Name"] = incomeDF["Geographic Area Name"]

    # Sanitise column names; the substitutions are applied in this order.
    for old, new in (("!!", "__"), ("-", "_"), (" ", "_"),
                     ("$", "USD"), ("(", "_"), (")", "_")):
        estimatesDF.columns = [cc.replace(old, new) for cc in estimatesDF.columns]

    keptCols = [cc for cc in estimatesDF.columns if "PERCENT_ALLOCATED" not in cc]
    myIncomeDF = estimatesDF[keptCols].copy()

    # Household columns only: drop nonfamily-household, families and
    # "total_" columns (all matched case-insensitively).
    hhCols = [
        cc for cc in myIncomeDF.columns
        if "household" in cc.lower()
        and "nonfamily" not in cc.lower()
        and "families" not in cc.lower()
        and "total_" not in cc.lower()
    ]
    hhDF = myIncomeDF[hhCols].copy()

    # "(X)" cells become NaN; rows containing NaN are kept.
    hhDF = hhDF.replace("(X)", np.nan)

    # Re-attach the identifying columns removed by the household filter.
    for cc in ["id", "county", "state", "county_state"]:
        hhDF[cc] = incomeDF[cc]

    hhDF = unifyColNames(hhDF)
    for cc in ["__Mean_income__dollars_",
               "__Median_income__dollars_"]:
        hhDF[cc] = hhDF[cc].astype("float64")

    hhDF["mean_to_median_household_income_ratio"] = (
        hhDF["__Mean_income__dollars_"] / hhDF["__Median_income__dollars_"]
    )

    hhDF["year"] = YEAR

    # Final tidy-up: trim leading/trailing underscores (via a round trip
    # through spaces and strip) and remove commas.
    hhDF.columns = [
        cc.replace("_", " ").strip().replace(" ", "_").replace(",", "")
        for cc in hhDF.columns
    ]

    # Derive the output path from the input path.
    csvFile = myFile.replace("ACSST5Y", "hh_income_")
    csvFile = csvFile.replace(".S1901_data_with_overlays_2020-07-11T163216", "_census_data")

    hhDF.to_csv(csvFile, index=False)
    print(f"""Saved hhDF ({len(hhDF)} rows) to {csvFile}""")

    return csvFile, hhDF

In [10]:
# Per-year household income frames keyed by year: Dict[int, pd.DataFrame]
hhDFallHT = {}
for year in range(2010, 2019):
    print(f"""{year}""")
    csvYR, hhDFyr = processCensusIncomeInequalityData(MYDIR, year)
    if hhDFyr is None:
        # The input file for this year was not found (the processing function
        # has already printed a message); do not store a None placeholder.
        continue
    hhDFallHT[year] = hhDFyr

2010
Reading data from ./../../ResearchProposal/income_inequality/census_income_by_county/ACSST5Y2010.S1901_data_with_overlays_2020-07-11T163216.csv
Saved hhDF (3222 rows) to ./../../ResearchProposal/income_inequality/census_income_by_county/hh_income_2010_census_data.csv
2011
Reading data from ./../../ResearchProposal/income_inequality/census_income_by_county/ACSST5Y2011.S1901_data_with_overlays_2020-07-11T163216.csv
Saved hhDF (3222 rows) to ./../../ResearchProposal/income_inequality/census_income_by_county/hh_income_2011_census_data.csv
2012
Reading data from ./../../ResearchProposal/income_inequality/census_income_by_county/ACSST5Y2012.S1901_data_with_overlays_2020-07-11T163216.csv
Saved hhDF (3222 rows) to ./../../ResearchProposal/income_inequality/census_income_by_county/hh_income_2012_census_data.csv
2013
Reading data from ./../../ResearchProposal/income_inequality/census_income_by_county/ACSST5Y2013.S1901_data_with_overlays_2020-07-11T163216.csv
Saved hhDF (3222 rows) to ./../.

In [11]:
# Stack the per-year frames into one long frame; ignore_index discards the
# year keys and the per-frame row labels (the year is kept in the "year" column).
hhIncomeDF = pd.concat(hhDFallHT, ignore_index=True, sort=False)

# Passing an empty YEAR yields the year-less input file name, from which the
# combined output name ("hh_income__census_data.csv") is derived.
csvOut = buildFileName(MYDIR, "")
csvFile = csvOut.replace("ACSST5Y", "hh_income_")
csvFile = csvFile.replace(".S1901_data_with_overlays_2020-07-11T163216", "_census_data")

hhIncomeDF.to_csv(csvFile, index=False)
print(f"""Saved hhIncomeDF ({len(hhIncomeDF)} rows) to {csvFile}""")
hhIncomeDF.head()

Saved hhIncomeDF (28993 rows) to ./../../ResearchProposal/income_inequality/census_income_by_county/hh_income__census_data.csv


Unnamed: 0,Total,Less_than_USD10000,USD10000_to_USD14999,USD15000_to_USD24999,USD25000_to_USD34999,USD35000_to_USD49999,USD50000_to_USD74999,USD75000_to_USD99999,USD100000_to_USD149999,USD150000_to_USD199999,...,Median_income__dollars,Mean_income__dollars,PERCENT_IMPUTED__Household_income_in_the_past_12_months,PERCENT_IMPUTED__Family_income_in_the_past_12_months,id,county,state,county_state,mean_to_median_household_income_ratio,year
0,8626.0,5.0,4.3,11.1,10.6,15.1,22.6,15.7,11.6,2.6,...,53470.0,62728.0,28.4,,0500000US17083,Jersey,Illinois,"Jersey, Illinois",1.173144,2010
1,10001.0,5.6,5.2,10.3,13.2,15.4,21.6,14.4,9.3,3.2,...,50279.0,61591.0,24.9,,0500000US17085,Jo Daviess,Illinois,"Jo Daviess, Illinois",1.224985,2010
2,4396.0,10.6,5.5,14.5,12.9,16.5,20.5,11.1,6.6,1.3,...,41619.0,48435.0,32.3,,0500000US17087,Johnson,Illinois,"Johnson, Illinois",1.163771,2010
3,168980.0,3.6,3.1,7.7,8.2,13.3,18.8,15.0,17.2,7.1,...,67767.0,86358.0,24.6,,0500000US17089,Kane,Illinois,"Kane, Illinois",1.274337,2010
4,40943.0,6.4,6.5,11.2,11.2,14.4,20.1,14.4,11.1,3.0,...,50484.0,61006.0,29.9,,0500000US17091,Kankakee,Illinois,"Kankakee, Illinois",1.208422,2010
