# Setup and Data Loading
Import the required libraries (pandas, numpy, and the `census` API wrapper) and set up API access for Census and HUD data.

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
from census import Census
from us import states
import requests
import os

# Establish API connections
# Both keys are looked up in the process environment; a variable that is not
# set yields None instead of raising.
CENSUS_API_KEY = os.environ.get("CENSUS_API_KEY")
HUD_API_KEY = os.environ.get("HUD_API_KEY")

# Census client shared by the ACS queries in this file
census = Census(CENSUS_API_KEY)

# Function to get HUD data
def get_hud_data(endpoint, params, timeout=30):
    """Call a HUD User API endpoint and return the decoded JSON payload.

    Args:
        endpoint: Path below the public HUD API root; a leading slash is
            tolerated.
        params: Query-string parameters forwarded to the request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body parsed as JSON.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
    """
    base_url = "https://www.huduser.gov/hudapi/public/"
    headers = {"Authorization": f"Bearer {HUD_API_KEY}"}
    response = requests.get(
        # Strip a leading slash so the joined URL never contains "//".
        f"{base_url}{endpoint.lstrip('/')}",
        headers=headers,
        params=params,
        # Without a timeout a stalled connection would block indefinitely.
        timeout=timeout,
    )
    # Surface auth/endpoint errors here instead of handing an error body to
    # callers as if it were data.
    response.raise_for_status()
    return response.json()

# Example usage of get_hud_data function
# NOTE(review): "income_limits" is passed through verbatim as the URL path;
# confirm it matches a route the HUD API actually exposes.
hud_params = dict(year=2023, state="CO")
hud_data = get_hud_data("income_limits", hud_params)

# Census API Data Collection
Query the Census API through the `census` wrapper to collect ACS 1-year Median Family Income (variable B19113_001E) for 2005–2023, skipping 2020, which the year list below leaves out.

In [None]:
# Define the years to query: every year from 2005 through 2023 except 2020
acs_years = [year for year in range(2005, 2024) if year != 2020]

# Function to get ACS data for a given year
def get_acs_data(year):
    """Fetch Colorado's ACS 1-year median family income estimate for one year.

    Args:
        year: Survey year to query.

    Returns:
        Whatever `census.acs1.state` returns for the request; the caller in
        this file iterates it as a sequence of dict records.
    """
    # The trailing comma makes this a real one-element tuple;
    # ('B19113_001E') without it is just a parenthesised string.
    fields = ('B19113_001E',)
    return census.acs1.state(fields, states.CO.fips, year=year)

# Collect ACS data for all years
# Each record returned for a year is copied with that year attached, so rows
# stay distinguishable once every year is stacked into one table.
acs_data = []
for year in acs_years:
    acs_data.extend({**record, 'year': year} for record in get_acs_data(year))

# Convert the data to a DataFrame
acs_df = pd.DataFrame(acs_data)

# Rename columns for clarity
acs_df = acs_df.rename(columns={'B19113_001E': 'median_family_income'})

# Save the DataFrame to a CSV file
# to_csv does not create missing parent directories, so make sure the
# output folder exists first.
os.makedirs('data', exist_ok=True)
acs_df.to_csv('data/acs_mfi.csv', index=False)

# Display the DataFrame
acs_df.head()

# Historical MFI Data Processing
Process historical MFI data from 2000-2004 using pandas to read and transform CSV data to match the API format.

In [None]:
# Process historical MFI data from 2000-2004 using pandas to read and transform CSV data to match the API format

# Read the historical MFI data from CSV
historical_mfi_df = pd.read_csv("MultiYearProfiles0402004.csv")

# Filter the data for "Median family income (dollars)" and "Colorado"
historical_mfi_df = historical_mfi_df[
    (historical_mfi_df['Stub'] == "Median family income (dollars)") &
    (historical_mfi_df['Geographic Name'] == "Colorado")
]

# Reshape from one column per estimate to one row per estimate
estimate_columns = [col for col in historical_mfi_df.columns if "Estimate" in col]
historical_mfi_df = historical_mfi_df.melt(
    id_vars=["Geographic Name"],
    value_vars=estimate_columns,
    var_name="year",
    value_name="estimate"
)

# Extract the year from the column names
# expand=False returns a Series; the default returns a one-column DataFrame,
# which is the wrong shape for a single-column assignment.
historical_mfi_df['year'] = (
    historical_mfi_df['year'].str.extract(r'(\d{4})', expand=False).astype(int)
)

# Rename columns to match the API format
historical_mfi_df = historical_mfi_df.rename(
    columns={"Geographic Name": "NAME", "estimate": "median_family_income"}
)

# Make sure the estimates are numeric so later arithmetic on the combined
# column works.
# NOTE(review): assumes any non-numeric cells are numbers with thousands
# separators; this is a no-op when the CSV already parses as numeric.
historical_mfi_df['median_family_income'] = pd.to_numeric(
    historical_mfi_df['median_family_income']
    .astype(str)
    .str.replace(',', '', regex=False),
    errors='coerce'
)

# Append the historical data to the ACS data
# Keeping one row per year makes this cell safe to re-run: without it every
# execution would append the same years again, and the later year-keyed merge
# would multiply rows. Rows already in acs_df win on a clash.
acs_df = (
    pd.concat(
        [acs_df, historical_mfi_df[['year', 'median_family_income']]],
        ignore_index=True
    )
    .drop_duplicates(subset='year', keep='first')
    .reset_index(drop=True)
)

# Save the combined DataFrame to a CSV file
# to_csv does not create missing parent directories.
os.makedirs('data', exist_ok=True)
acs_df.to_csv('data/acs_mfi_combined.csv', index=False)

# Display the combined DataFrame
acs_df.head()

# IPUMS Data Processing
Use the ipumspy package to fetch and process PUMS data, including household characteristics and income variables.

In [None]:
# Import required libraries for IPUMS data processing
from pathlib import Path

import duckdb
import ipumspy
from ipumspy import IpumsApiClient, MicrodataExtract, readers

# Initialize IPUMS API client
ipums_api_key = os.getenv("IPUMS_API_KEY")
client = IpumsApiClient(ipums_api_key)

# Define the variables to extract from IPUMS
# NOTE(review): mnemonics are submitted as written; verify each one against
# the IPUMS USA variable list (CBPOVERTY in particular).
variables = [
    "YEAR", "STATEFIP", "COUNTYFIP", "PUMA", "NUMPREC", "CPI99", "OWNERSHP", "OWNCOST",
    "RENT", "HHTYPE", "HHINCOME", "INCTOT", "INCWAGE", "INCBUS00", "INCSS", "INCWELFR",
    "INCINVST", "INCSUPP", "INCOTHER", "INCEARN", "POVERTY", "CBPOVERTY", "AGE", "FAMUNIT",
    "FAMSIZE", "NCHILD", "NCHLT5", "FTOTINC", "BEDROOMS", "REPWT"
]

# Define the samples to extract (ACS 1-year samples from 2000 to 2023)
# NOTE(review): confirm that the "a" suffix denotes the ACS sample for every
# year in the range, especially 2000 and 2020.
samples = [f"us{year}a" for year in range(2000, 2024)]

# Create the extract request
extract = MicrodataExtract(
    collection="usa",
    description="ACS 1 year samples in Colorado of income variables, all samples since 2000",
    samples=samples,
    variables=variables
)

# Restrict the extract to Colorado, as the description states; the MFI series
# it is later compared against is Colorado-only.
extract.select_cases("STATEFIP", [states.CO.fips])

# Submit the extract request and wait for it to complete
# Submission, polling and download are methods of the API client that take the
# extract as an argument; the extract object itself has no such methods.
client.submit_extract(extract)
client.wait_for_extract(extract)

# Download the extract data
# The client writes the data file and its DDI codebook into a directory.
ipums_download_dir = Path("data/ipums_extract")
ipums_download_dir.mkdir(parents=True, exist_ok=True)
client.download_extract(extract, download_dir=ipums_download_dir)

# Read the IPUMS data into a DataFrame
# Pick the most recently written codebook in case earlier extracts are still
# in the directory, then read the data file it describes.
ddi_path = max(ipums_download_dir.glob("*.xml"), key=lambda path: path.stat().st_mtime)
ddi = readers.read_ipums_ddi(ddi_path)
ipums_data = readers.read_microdata(
    ddi, ipums_download_dir / ddi.file_description.filename
)

# Convert the IPUMS data to a DuckDB database for efficient querying
# Register the DataFrame under its own name so the table being created cannot
# be confused with it, and replace the table on re-runs instead of failing.
duckdb_conn = duckdb.connect("data/ipums_data.duckdb")
duckdb_conn.register("ipums_extract_df", ipums_data)
duckdb_conn.execute("CREATE OR REPLACE TABLE ipums_data AS SELECT * FROM ipums_extract_df")
duckdb_conn.unregister("ipums_extract_df")

# Query the DuckDB database to filter and process the data
# NOTE(review): nchildren and household_type_description reuse the `nadults`
# alias defined earlier in the same SELECT list; confirm the installed DuckDB
# version accepts that. Rows where nadults is 0 match none of the
# household_type_description branches and come back NULL.
query = """
    SELECT
        YEAR, STATEFIP, COUNTYFIP, PUMA, NUMPREC, CPI99, OWNERSHP, OWNCOST, RENT, HHTYPE,
        HHINCOME, INCTOT, INCWAGE, INCBUS00, INCSS, INCWELFR, INCINVST, INCSUPP, INCOTHER,
        INCEARN, POVERTY, CBPOVERTY, AGE, FAMUNIT, FAMSIZE, NCHILD, NCHLT5, FTOTINC, BEDROOMS,
        REPWT,
        CASE
            WHEN AGE >= 18 THEN NUMPREC - NCHILD
            ELSE 0
        END AS nadults,
        CASE
            WHEN AGE < 18 THEN NUMPREC - nadults
            ELSE 0
        END AS nchildren,
        CASE
            WHEN nadults = 1 AND NCHILD = 0 THEN 'One adult with no children'
            WHEN nadults = 1 AND NCHILD > 0 THEN 'One adult with children'
            WHEN nadults > 1 AND NCHILD = 0 THEN 'More than one adult with no children'
            WHEN nadults > 1 AND NCHILD > 0 THEN 'More than one adult with children'
        END AS household_type_description,
        CASE
            WHEN AGE < 18 THEN 'Under 18'
            WHEN AGE BETWEEN 18 AND 24 THEN '18-24'
            WHEN AGE BETWEEN 25 AND 44 THEN '25-44'
            WHEN AGE BETWEEN 45 AND 64 THEN '45-64'
            ELSE '65 and over'
        END AS age_group_description
    FROM ipums_data
"""

processed_ipums_data = duckdb_conn.execute(query).fetch_df()

# Save the processed IPUMS data to a CSV file
processed_ipums_data.to_csv('data/processed_ipums_data.csv', index=False)

# Display the processed IPUMS data
processed_ipums_data.head()

# Household Size Adjustments
Calculate household size adjustments and create demographic categorizations using pandas operations.

In [None]:
# Household Size Adjustments

# Calculate household size adjustments and create demographic categorizations

# Define the adjustment factor based on household size
def calculate_hhadj(hhsize):
    """Return the income-limit scaling factor for a household of `hhsize`.

    A four-person household is the baseline and maps to 1. Each person above
    four adds 0.08; each person below four subtracts 0.1. Any value that
    compares neither above nor below four (including NaN) also maps to 1.
    """
    if hhsize > 4:
        return 1 + (hhsize - 4) * 0.08
    if hhsize < 4:
        return 1 - (4 - hhsize) * 0.1
    return 1

# Apply the adjustment factor to the processed IPUMS data
processed_ipums_data['hhadj'] = processed_ipums_data['NUMPREC'].apply(calculate_hhadj)

# Merge the ACS median family income data with the processed IPUMS data
# The lookup must hold one row per year: a repeated year would duplicate every
# IPUMS record of that year in the left join.
mfi_by_year = (
    acs_df[['year', 'median_family_income']]
    .drop_duplicates(subset='year', keep='first')
)
merged_data = pd.merge(
    processed_ipums_data,
    mfi_by_year,
    left_on='YEAR',
    right_on='year',
    how='left'
)

# IPUMS USA marks "not applicable" with sentinel codes rather than blanks.
# Treat them as missing so they are not read as real dollar amounts.
HHINCOME_NA = 9999999
OWNCOST_NA = 99999
household_income = merged_data['HHINCOME'].where(merged_data['HHINCOME'] != HHINCOME_NA)
monthly_owner_cost = merged_data['OWNCOST'].where(merged_data['OWNCOST'] != OWNCOST_NA)

# Calculate the adjusted median family income for each household
merged_data['mfi_hh'] = merged_data['median_family_income'] * merged_data['hhadj']

# Calculate the percentage of median family income for each household
merged_data['pct_mfi'] = household_income / merged_data['mfi_hh']

# Define the AMI group based on the percentage of median family income
# Bins are closed on the left, so exactly 30% falls in '30-50'; negative and
# missing ratios fall outside every bin and stay unclassified.
ami_bins = [0, 0.3, 0.5, 0.8, 1, float('inf')]
ami_labels = ['0-30', '30-50', '50-80', '80-100', '100+']
merged_data['ami_group'] = pd.cut(merged_data['pct_mfi'], bins=ami_bins, labels=ami_labels, right=False)

# Define the AMI percentile based on the percentage of median family income
# Rounds the ratio up to the next tenth (0.1 steps).
merged_data['ami_percentile'] = np.ceil(merged_data['pct_mfi'] * 10) / 10

# Calculate housing cost as a percentage of income
# OWNERSHP 2 is rented and 1 is owned; any other code (N/A) gets no housing
# cost instead of silently falling into the owner branch.
annual_housing_cost = np.select(
    [merged_data['OWNERSHP'] == 2, merged_data['OWNERSHP'] == 1],
    [merged_data['RENT'] * 12, monthly_owner_cost * 12],
    default=np.nan
)
merged_data['housing_cost_pct'] = annual_housing_cost / household_income

# Determine cost burdened status
# A missing ratio compares False, so records without a usable ratio are
# reported as not burdened.
merged_data['cost_burdened_30'] = merged_data['housing_cost_pct'] >= 0.3
merged_data['cost_burdened_50'] = merged_data['housing_cost_pct'] >= 0.5

# Save the adjusted data to a CSV file
# to_csv does not create missing parent directories.
os.makedirs('data', exist_ok=True)
merged_data.to_csv('data/adjusted_ipums_data.csv', index=False)

# Display the adjusted data
merged_data.head()

# AMI Calculations
Calculate AMI percentiles, cost burden metrics, and household classifications using numpy and pandas.

In [None]:
# Calculate AMI percentiles, cost burden metrics, and household classifications

# Define the adjustment factor based on household size
def calculate_hhadj(hhsize):
    """Scale factor applied to median family income for a given household size.

    Four persons is the reference size and yields 1. Smaller households lose
    0.1 per missing person; larger households gain 0.08 per extra person. A
    size that is neither below nor above four (NaN included) yields 1.
    """
    below_base = hhsize < 4
    above_base = hhsize > 4
    if not (below_base or above_base):
        return 1
    return 1 - (4 - hhsize) * 0.1 if below_base else 1 + (hhsize - 4) * 0.08

# Summarize the AMI results by year for the export step
#
# `merged_data` already carries hhadj, mfi_hh, pct_mfi, ami_group,
# ami_percentile and the cost-burden flags from the household-size step.
# Repeating that merge here only duplicated work, and nothing in the file
# built the `ami_percentile_by_year` table that the export step reads.

# OWNERSHP 1 is owned and 2 is rented; other codes map to missing and are
# dropped by the groupby below.
tenure_labels = {1: 'owner', 2: 'renter'}

# NOTE(review): the DuckDB projection does not carry a weight column, so
# total_hh falls back to an unweighted record count unless HHWT is present.
# The records also look person-level (AGE, INCTOT); confirm whether the
# summary should be restricted to one record per household.
ami_summary_input = merged_data.assign(
    tenure=merged_data['OWNERSHP'].map(tenure_labels),
    total_hh=merged_data['HHWT'] if 'HHWT' in merged_data.columns else 1,
)

# Long format: one row per year, tenure, AMI group and AMI tenth.
# observed=True keeps only AMI groups that actually occur instead of
# expanding every categorical combination.
ami_percentile_by_year = (
    ami_summary_input
    .groupby(['YEAR', 'tenure', 'ami_group', 'ami_percentile'], observed=True)['total_hh']
    .sum()
    .reset_index()
)

# Display the summary
ami_percentile_by_year.head()

# Data Export
Export processed data to CSV files in various formats for further analysis.

In [None]:
# Data Export

# Export processed data to CSV files in various formats for further analysis
# Expects `ami_percentile_by_year` (long format with YEAR, ami_group, tenure
# and total_hh columns), `processed_ipums_data` and `merged_data` to have been
# built by earlier cells.

# to_csv does not create missing parent directories.
os.makedirs('data', exist_ok=True)

# Save the AMI percentiles by year to a CSV file
ami_percentile_by_year.to_csv('data/ami_percentile_by_year_long.csv', index=False)

# Save the processed IPUMS data to a CSV file
processed_ipums_data.to_csv('data/processed_ipums_data.csv', index=False)

# Save the adjusted IPUMS data to a CSV file
merged_data.to_csv('data/adjusted_ipums_data.csv', index=False)

# Convert data to Eviews friendly format
ami_wide = ami_percentile_by_year.pivot_table(
    index='YEAR',
    columns=['ami_group', 'tenure'],
    values='total_hh',
    aggfunc='sum'
)

# Pivoting on two keys yields two-level column labels, which to_csv writes as
# two header rows. Collapse them into one row of plain names without '-' or
# '+', e.g. "ami_0_30_owner".
# NOTE(review): confirm the naming scheme with whoever imports the file.
ami_wide.columns = [
    f"ami_{group}_{tenure}".replace('-', '_').replace('+', 'plus')
    for group, tenure in ami_wide.columns
]
ami_percentile_by_year_eviews = ami_wide.reset_index()

# Save the Eviews friendly format data to a CSV file
ami_percentile_by_year_eviews.to_csv('data/ami_percentile_by_year_eviews.csv', index=False)

# Display the first few rows of each exported DataFrame
print("AMI Percentile by Year (Long Format):")
print(ami_percentile_by_year.head())

print("\nProcessed IPUMS Data:")
print(processed_ipums_data.head())

print("\nAdjusted IPUMS Data:")
print(merged_data.head())

print("\nAMI Percentile by Year (Eviews Format):")
print(ami_percentile_by_year_eviews.head())