In [12]:
# Dependencies
import pandas as pd
import numpy as np
import requests
from pathlib import Path
import matplotlib.pyplot as plt
from scipy.stats import linregress

# US Census setup
from census import Census
from us import states
from config import census_key

In [28]:
census_library = Census(census_key, year=2021)
state_code = states.CA.fips
variables = [
    'NAME',
    'B01003_001E',  # Population
    'B25106_002E',  # Housing units for owners
    'B25106_003E',  # Housing units for owners with income <20K
    'B25106_006E',  # Housing units for owners with income <20K, housing costs >=30% of income
    'B25106_007E',  # Housing units for owners with income 20-35K
    'B25106_010E',  # Housing units for owners with income 20-35K, housing costs >=30% of income
    'B25106_011E',  # Housing units for owners with income 35-50K
    'B25106_014E',  # Housing units for owners with income 35-50K, housing costs >=30% of income
    'B25106_024E',  # Housing units for renters
    'B25106_025E',  # Housing units for renters with income <20K
    'B25106_028E',  # Housing units for renters with income <20K, housing costs >=30% of income
    'B25106_029E',  # Housing units for renters with income 20-35K
    'B25106_032E',  # Housing units for renters with income 20-35K, housing costs >=30% of income
    'B25106_033E',  # Housing units for renters with income 35-50K
    'B25106_036E',  # Housing units for renters with income 35-50K, housing costs >=30% of income
]

# Retrieve data for all counties in California
data = census_library.acs5.state_county(variables, state_code, Census.ALL)

# Convert to dataframe
housing_df = pd.DataFrame(data)

# Print the DataFrame
housing_df.head()

Unnamed: 0,NAME,B01003_001E,B25106_002E,B25106_003E,B25106_006E,B25106_007E,B25106_010E,B25106_011E,B25106_014E,B25106_024E,B25106_025E,B25106_028E,B25106_029E,B25106_032E,B25106_033E,B25106_036E,state,county
0,"Alameda County, California",1673133.0,313410.0,12117.0,10897.0,15122.0,10589.0,14914.0,8546.0,268273.0,33224.0,29275.0,26673.0,24128.0,23024.0,19821.0,6,1
1,"Alpine County, California",1344.0,355.0,20.0,10.0,41.0,7.0,17.0,6.0,79.0,4.0,4.0,23.0,2.0,6.0,2.0,6,3
2,"Amador County, California",40095.0,12075.0,995.0,870.0,1230.0,685.0,1513.0,797.0,3319.0,568.0,513.0,449.0,449.0,393.0,269.0,6,5
3,"Butte County, California",217884.0,49549.0,4388.0,3418.0,5638.0,2988.0,4819.0,1809.0,35743.0,7396.0,6970.0,6850.0,6126.0,4921.0,3133.0,6,7
4,"Calaveras County, California",45349.0,13698.0,1220.0,1033.0,1458.0,1005.0,1595.0,867.0,3111.0,584.0,550.0,443.0,384.0,367.0,334.0,6,9


In [29]:
# Add columns rolling up owner and renter columns
housing_df['Total Housing Units'] = housing_df['B25106_002E'] + housing_df['B25106_024E']
housing_df['Units Income < 50K'] = (housing_df['B25106_003E'] + housing_df['B25106_007E'] +
                                    housing_df['B25106_011E'] + housing_df['B25106_025E'] +
                                    housing_df['B25106_029E'] + housing_df['B25106_033E']
                                   )
housing_df['Units Costs >= 30%'] = (housing_df['B25106_006E'] + housing_df['B25106_010E'] +
                                    housing_df['B25106_014E'] + housing_df['B25106_028E'] +
                                    housing_df['B25106_032E'] + housing_df['B25106_036E']
                                   )

# Add columns to calculate % of units with costs >= 30%
housing_df['% Units Inc<50K Costs>=30%'] = housing_df['Units Costs >= 30%'] / housing_df['Units Income < 50K'] * 100
housing_df['% Units Costs>=30%'] = housing_df['Units Costs >= 30%'] / housing_df['Total Housing Units'] * 100

# Remove unnessary columns
# https://sparkbyexamples.com/pandas/pandas-drop-columns-by-index/
housing_df = housing_df.drop(housing_df.iloc[:, 2:18], axis = 1)# Rename columns - if I want to keep all columns

housing_df['NAME'] = housing_df['NAME'].replace(', California','')

# Rename existing columns
housing_df = housing_df.rename(
    columns = {
        'NAME': 'County',
        'B01003_001E': 'Population',
    }
)

# Remove California from County
housing_df['County'] = housing_df['County'].str.replace(', California','')

housing_df.head()

Unnamed: 0,County,Population,Total Housing Units,Units Income < 50K,Units Costs >= 30%,% Units Inc<50K Costs>=30%,% Units Costs>=30%
0,Alameda County,1673133.0,581683.0,125074.0,103256.0,82.555927,17.751249
1,Alpine County,1344.0,434.0,111.0,31.0,27.927928,7.142857
2,Amador County,40095.0,15394.0,5148.0,3583.0,69.599845,23.275302
3,Butte County,217884.0,85292.0,34012.0,24444.0,71.868752,28.659194
4,Calaveras County,45349.0,16809.0,5667.0,4173.0,73.636845,24.825986


In [None]:
coc_mapping age_data = Path('resources/experiencing_homelessness_age_demographics.csv')

In [26]:
url_path = Path('Resources/experiencing_homelessness_ethnicity.csv')
homeless_df = pd.read_csv(url_path)

homeless_df.head()

homeless_df.dtypes

CALENDAR_YEAR                 int64
COC_ID                       object
COC_NAME                     object
ETHNICITY                    object
EXPERIENCING_HOMELESSNESS    object
dtype: object

In [33]:
# Change homeless count data type to integer
homeless_df['EXPERIENCING_HOMELESSNESS'] = homeless_df["EXPERIENCING_HOMELESSNESS"].str.replace("*", "0")
homeless_df['EXPERIENCING_HOMELESSNESS'] = homeless_df["EXPERIENCING_HOMELESSNESS"].astype(int)

# Group by year and COC Name and sum homeless counts
homeless_count_df = clean_age_df.groupby(['CALENDAR_YEAR', 'COC_NAME'])['EXPERIENCING_HOMELESSNESS'].sum().reset_index()
homeless_count_df

  homeless_df['EXPERIENCING_HOMELESSNESS'] = homeless_df["EXPERIENCING_HOMELESSNESS"].str.replace("*", "0")


ValueError: cannot convert float NaN to integer

In [None]:
# Housing affordability index 
housing_hai_df = pd.read_excel("resources/housing_hai_data.xlsx")

housing_hai_df