In [1]:
from math import sqrt

import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error
from statsmodels.iolib.summary2 import summary_col

In [2]:
# Read only total_tract_population column from acs_data.csv
tract_pop = pd.read_csv(
    "../data/acs_data.csv",
    usecols=["tract", "total_tract_population"],
    dtype={"total_tract_population": np.float32, "tract": np.int32},
)
tract_pop.columns

Index(['tract', 'total_tract_population'], dtype='object')

# Run Regression 

In [3]:
# Load the data
dtype_dict = {
    "B01003_001E_adj_supply_any_avg_tract": np.float32,
    "B01003_001E_adj_supply_hr_avg_tract": np.float32,
    "B01003_001E_totcost_any_avg_tract": np.float32,
    "state": np.int32,
    "county": np.int32,
    "tract": np.int32,
    "median_household_income": np.float32,
    "gini_index": np.float32,
    "educational_attainment": np.float32,
    "employment_status": np.float32,
    "housing_tenure": np.float32,
    "poverty_status": np.float32,
    "commute_time": np.float32,
    "without_health_insurance": np.float32,
    "white": np.float32,
    "black": np.float32,
    "am_indian_alaska_native": np.float32,
    "asian": np.float32,
    "other_race": np.float32,
    "hispanic_latino": np.float32,
    "density": np.float32,
}


data = pd.read_csv(
    "../data/acs_data_all.csv", dtype=dtype_dict
)  # Replace with the path to your data file

# Merge tract_pop to data
data = data.merge(
    tract_pop, how="left", on= "tract", validate="1:1")

# Scale median_household_income by 10,000
data["median_household_income"] = data["median_household_income"] / 10000

# Rename the variables
data.rename(
    columns={
        "B01003_001E_adj_supply_any_avg_tract": "Average Supply of Any Provider",
        "B01003_001E_adj_supply_hr_avg_tract": "Average Supply of HR Provider",
        "B01003_001E_totcost_any_avg_tract": "Average Total Cost of Any Provider",
        "median_household_income": "Median Household Income",
        "gini_index": "Gini Index",
        "educational_attainment": "Educational Attainment",
        "employment_status": "Employment Status",
        "housing_tenure": "Housing Tenure",
        "poverty_status": "Poverty Status",
        "commute_time": "Commute Time",
        "without_health_insurance": "Without Health Insurance",
        "white": "Share of White Population",
        "black": "Share of Black Population",
        "am_indian_alaska_native": "Share of American Indian/Alaska Native Population",
        "asian": "Share of Asian Population",
        "other_race": "Share of Other Race Population",
        "hispanic_latino": "Share of Hispanic/Latino Population",
        "density": "Population Density",
        "tract_pop": "total_tract_population",
    },
    inplace=True,
)

# calculate state population
state_population = data.groupby("state")["total_tract_population"].transform("sum")

# create tract_weight
data["tract_weight"] = data["total_tract_population"] / state_population

In [4]:
# Preview the first rows of the merged and renamed frame.
# NOTE(review): the saved output below looks stale -- it shows Median Household
# Income unscaled (e.g. 46548.0) although the loading cell divides that column
# by 10,000. Re-run the notebook from a fresh kernel to refresh it.
data.head()

Unnamed: 0,Average Supply of Any Provider,Average Supply of HR Provider,Average Total Cost of Any Provider,state,county,tract,Median Household Income,Gini Index,Educational Attainment,Employment Status,...,Without Health Insurance,Share of White Population,Share of Black Population,Share of American Indian/Alaska Native Population,Share of Asian Population,Share of Other Race Population,Share of Hispanic/Latino Population,Population Density,total_tract_population,tract_weight
0,0.024394,0.00703,140.768204,27,27001,1231966324,46548.0,0.4291,0.142012,0.45598,...,0.0,0.000458,4.2e-05,0.000741,6.338997e-08,3.097119e-08,0.00015,2.178822,2240.0,0.000404
1,0.277115,0.0,49.739838,27,27001,1231966424,48580.0,0.3815,0.084266,0.442195,...,0.008319,0.000446,2e-05,0.001586,4.847469e-09,4.922516e-08,6.7e-05,2.262976,2284.0,0.000411
2,0.48175,0.081783,147.445908,27,27001,1231966524,44543.0,0.4515,0.155082,0.498758,...,0.006657,0.000719,7.9e-05,0.000672,4.847469e-08,2.739537e-09,0.00024,44.62014,3483.0,0.000627
3,0.420769,0.028768,91.856232,27,27001,1231966624,48708.0,0.4339,0.121503,0.485145,...,0.006061,0.000593,7e-05,0.002534,0.0,2.518452e-09,0.000134,2.945787,2970.0,0.000535
4,0.009878,0.0,59.393177,27,27001,1231986725,50110.0,0.4564,0.087353,0.438879,...,0.00105,0.000406,1.1e-05,0.0,5.220351e-08,1.345738e-10,3.3e-05,2.94782,1905.0,0.000343


In [5]:
def run_regression_analysis(covariates, outcome_variables, data, scalar=1):
    """Fit one OLS regression per outcome on a shared set of covariates.

    Parameters
    ----------
    covariates : list of str
        Columns of ``data`` used as regressors. ``sm.add_constant`` is applied
        to them to add an intercept column.
    outcome_variables : iterable of str
        Columns of ``data``; each one is regressed separately on the covariates.
    data : pandas.DataFrame
        Frame holding both the covariate and the outcome columns.
    scalar : number, default 1
        Factor every outcome is multiplied by before fitting.

    Returns
    -------
    regression_results : dict
        Maps each outcome name to its fitted statsmodels results object.
    rmse_results : dict
        Maps each outcome name to the in-sample root mean squared error of the
        fitted values against the (scaled) outcome.
    """
    # The design matrix is the same for every outcome, so build it once instead
    # of re-creating it on each loop iteration.
    design_matrix = sm.add_constant(data[covariates])

    regression_results = {}
    rmse_results = {}

    for outcome in outcome_variables:
        target = data[outcome] * scalar
        fitted = sm.OLS(target, design_matrix).fit()

        regression_results[outcome] = fitted
        # predict() with no arguments returns the in-sample fitted values.
        rmse_results[outcome] = np.sqrt(mean_squared_error(target, fitted.predict()))

    return regression_results, rmse_results

### Access to Any Provider

In [None]:
# Define the covariates
# NOTE(review): `other_covariates` and `race_variables1` are not defined in any
# cell visible above this one -- make sure they are defined earlier in the
# notebook so that Restart & Run All works.
covariates = other_covariates  # This is a list of covariates

# Define the outcome variables
outcome_variables = (
    race_variables1  # racial_groups   This is a list of outcome variables
)

# Define the scalar
scalar = 100  # Replace with your desired scalar

# Call the function to run the regression analysis
regression_results, rmse_results = run_regression_analysis(
    covariates, outcome_variables, data, scalar
)

# Generate the summary table (summary_col is statsmodels.iolib.summary2.summary_col)
summary_table = summary_col(
    list(regression_results.values()),
    stars=True,
    float_format="%.3f",
    model_names=outcome_variables,
    info_dict={
        "N": lambda x: f"{x.nobs:.0f}",
        # Labelled "Adj. R2" because the value reported is rsquared_adj,
        # not the plain rsquared.
        "Adj. R2": lambda x: f"{x.rsquared_adj:.3f}",
    },
)

# Convert RMSE results to a DataFrame and concatenate with summary table
rmse_df = pd.DataFrame(rmse_results, index=["RMSE"])
summary_df1 = pd.concat([summary_table.tables[0], rmse_df])