In [1]:
import pandas as pd
import numpy as np

In [3]:
# Load the county-level ACS 5-year CHAS estimates from the local assets folder.
# The path uses a forward slash: the earlier backslash form contained the
# unrecognised escape sequence "\A" (a warning on current Python versions) and
# only resolved on Windows. A forward slash opens the same file on every OS.
# NOTE(review): the describe() output further down reports GEOID, STATE and
# COUNTY as numbers, so any leading zeros in the FIPS codes are not preserved;
# pass dtype=str for those columns if joins need the zero-padded form.
chas = pd.read_csv('assets/ACS_5YR_CHAS_Estimate_Data_by_County_-4190529618979970776.csv')
chas.shape

(3221, 408)

In [5]:
# Columns picked after reading the CHAS data dictionary: these are the fields
# most related to our work.
#
# Join keys (used to join with other data)
#   GEOID  - Geographic identifier: fully concatenated geographic code
#            (State FIPS and County FIPS)
#   STATE  - Census 2-digit FIPS state code
#   COUNTY - 3-digit county FIPS code
#   NAME   - County name
#
# Totals
#   T2_EST1 - Total households
#
# Cost-burden counts from the Department of Housing and Urban Development (HUD)
#   T8_EST69, T8_EST82, T8_EST95, T8_EST108 - cost-burdened households for
#   <= 30%, 30-50%, 50-80% and all
#   NOTE(review): confirm these four codes against the data dictionary; this
#   description is carried over from the original notes and is not verified.
#
# Ready-made percentages
#   T8_CB_PCT   - Percent of all households with cost burden over 30%
#   T8_CB50_PCT - Percent of all households with cost burden over 50%
#   T8_LE30_CB_PCT, T8_LE30_CB50_PCT - Percent of extremely low income
#       households (household income less than or equal to 30% HAMFI) with
#       cost burden over 30% / over 50%
#   T8_LE50_CB_PCT, T8_LE50_CB50_PCT - presumably the same pair for the
#       LE50 income band (TODO confirm in the data dictionary)
#   T8_LE80_CB_PCT, T8_LE80_CB50_PCT - presumably the same pair for the
#       LE80 income band (TODO confirm in the data dictionary)
#
# Renter / owner break-outs - to see who is hurting
#   T7_SF_CB_R - Renter occupied AND household type is small family (2 persons,
#       neither person 62 years or over, or 3 or 4 persons) AND housing cost
#       burden is greater than 30%
#   T7_LF_CB_R - Renter occupied AND household type is large family (5 or more
#       persons) AND housing cost burden is greater than 30%
#   T7_SF_CB_O - Owner occupied AND household type is small family (2 persons,
#       neither person 62 years or over, or 3 or 4 persons) AND housing cost
#       burden is greater than 30%
#   T7_LF_CB_O - Owner occupied AND household type is large family (5 or more
#       persons) AND housing cost burden is greater than 30%

columns_to_keep = [
    # join keys
    "GEOID",
    "STATE",
    "COUNTY",
    "NAME",
    # total households
    "T2_EST1",
    # Table 8 estimate counts
    "T8_EST69",
    "T8_EST82",
    "T8_EST95",
    "T8_EST108",
    # ready-made percentages per income band
    "T8_LE30_CB_PCT",
    "T8_LE30_CB50_PCT",
    "T8_LE50_CB_PCT",
    "T8_LE50_CB50_PCT",
    "T8_LE80_CB_PCT",
    "T8_LE80_CB50_PCT",
    # ready-made percentages over all households
    "T8_CB_PCT",
    "T8_CB50_PCT",
    # renter / owner break-outs by family size
    "T7_SF_CB_R",
    "T7_LF_CB_R",
    "T7_SF_CB_O",
    "T7_LF_CB_O",
]

# Keep only the selected columns; .copy() detaches the result from the full frame.
chas = chas.loc[:, columns_to_keep].copy()


In [6]:
# Count the missing values in each of the selected columns.
subset = chas.loc[:, columns_to_keep]
missing_counts = subset.isnull().sum()

missing_counts

GEOID               0
STATE               0
COUNTY              0
NAME                0
T2_EST1             0
T8_EST69            0
T8_EST82            0
T8_EST95            0
T8_EST108           0
T8_LE30_CB_PCT      1
T8_LE30_CB50_PCT    1
T8_LE50_CB_PCT      1
T8_LE50_CB50_PCT    1
T8_LE80_CB_PCT      0
T8_LE80_CB50_PCT    0
T8_CB_PCT           0
T8_CB50_PCT         0
T7_SF_CB_R          0
T7_LF_CB_R          0
T7_SF_CB_O          0
T7_LF_CB_O          0
dtype: int64

In [7]:
# Map the raw CHAS column codes to descriptive names.
colums_renames = {
    # identifiers keep their original names
    "GEOID": "GEOID",
    "STATE": "STATE",
    "COUNTY": "COUNTY",
    "NAME": "NAME",
    # household counts
    "T2_EST1": "total_households",
    "T8_EST69": "cost_burdened_households_leq_30_percent",
    "T8_EST82": "cost_burdened_households_30_to_50_percent",
    "T8_EST95": "cost_burdened_households_50_to_80_percent",
    "T8_EST108": "cost_burdened_households_all_income_levels_count",
    # ready-made percentages over all households
    "T8_CB_PCT": "Percent_of_all_with_cb_over30_percent",
    "T8_CB50_PCT": "Percent_of_all_with_cb_over50_percent",
    # ready-made percentages per income band (LE30 / LE50 / LE80)
    # NOTE(review): LE80 is labelled "high_income" here; confirm that this
    # matches the band definition in the data dictionary.
    "T8_LE30_CB_PCT": "Percent_of_low_income_cb_over30_percent",
    "T8_LE30_CB50_PCT": "Percent_of_low_income_cb_over50_percent",
    "T8_LE50_CB_PCT": "Percent_of_medi_income_cb_over30_percent",
    "T8_LE50_CB50_PCT": "Percent_of_medi_income_cb_over50_percent",
    "T8_LE80_CB_PCT": "Percent_of_high_income_cb_over30_percent",
    "T8_LE80_CB50_PCT": "Percent_of_high_income_cb_over50_percent",
    # renter / owner break-outs by family size
    # NOTE(review): the column notes in the selection cell describe both the
    # SF and LF fields as "cost burden greater than 30%", yet the large-family
    # names end in "cb_over50"; the names also carry both "Percent" and
    # "count". Left unchanged because later code may rely on these names.
    "T7_SF_CB_R": "Percent_renter_small_family_count_cb_over30",
    "T7_LF_CB_R": "Percent_renter_large_family_count_cb_over50",
    "T7_SF_CB_O": "Percent_owner_small_family_count_cb_over30",
    "T7_LF_CB_O": "Percent_owner_large_family_count_cb_over50",
}

# Apply the new names; keys that are not present in the frame are ignored.
chas = chas.rename(columns=colums_renames)

In [8]:
# Ten columns with the most missing values after the rename.
null_counts = chas.isna().sum()
print(null_counts.sort_values(ascending=False).head(10))

# Min / max / mean of every numeric column, one row per column.
numeric_summary = chas.describe().T
print(numeric_summary[['min', 'max', 'mean']])

Percent_of_low_income_cb_over50_percent        1
Percent_of_medi_income_cb_over30_percent       1
Percent_of_low_income_cb_over30_percent        1
Percent_of_medi_income_cb_over50_percent       1
Percent_owner_small_family_count_cb_over30     0
Percent_renter_large_family_count_cb_over50    0
Percent_renter_small_family_count_cb_over30    0
Percent_of_all_with_cb_over50_percent          0
Percent_of_all_with_cb_over30_percent          0
Percent_of_high_income_cb_over50_percent       0
dtype: int64
                                                     min         max  \
GEOID                                             1001.0    72153.00   
STATE                                                1.0       72.00   
COUNTY                                               1.0      840.00   
total_households                                    60.0  3332505.00   
cost_burdened_households_leq_30_percent              0.0   531195.00   
cost_burdened_households_30_to_50_percent            0.0   331775

In [9]:
# chas.to_csv('clean_data/clean_chas.csv', index= False)