# Importing Packages

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, r2_score, calinski_harabasz_score
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge
from sklearn.pipeline import Pipeline
from scipy.stats import uniform, randint
from sklearn.exceptions import FitFailedWarning
from sklearn.feature_selection import SelectKBest, chi2, RFE
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import f_regression


pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

# pd.reset_option("all")


import warnings
warnings.filterwarnings("ignore", category=FitFailedWarning)

from matplotlib import pyplot as plt
%matplotlib inline
plt.style.use("fivethirtyeight")


import statsmodels
from statsmodels.stats.outliers_influence import variance_inflation_factor


# Importing Datasets

In [261]:
df = pd.read_csv('data/chr_census_2010_2024.csv', index_col=0, dtype={"geo_code": str, "year": int})
df.head()

Unnamed: 0,geo_code,year,geo_full_name,geo_name,state,state_code,access_to_exercise_opportunities,adult_obesity,adult_smoking,air_pollution_particulate_matter,alcohol_impaired_driving_deaths,children_eligible_for_free_lunch,children_in_poverty,children_in_single_parent_households,college_completion,dentists,diabetes_prevalence,drinking_water_violations,driving_alone_to_work,excessive_drinking,food_environment_index,food_insecurity,frequent_mental_distress,gender_pay_gap,gini_index,high_school_completion,high_school_graduation,hiv_prevalence,homeownership,income_inequality,injury_deaths,insufficient_sleep,life_expectancy,limited_access_to_healthy_foods,long_commute_driving_alone,low_birthweight,mammography_screening,median_age,median_household_income,mental_health_providers,motor_vehicle_crash_deaths,other_pcp,pcp,pct_65_and_older,pct_aapi,pct_agriculture_hunting_and_mining,pct_arts_entertainment_and_accommodation_services,pct_asian,pct_construction,pct_educational_and_health_services,pct_female,pct_finance_insurance_and_real_estate,pct_hispanic,pct_households_with_high_housing_costs,pct_households_with_lack_of_kitchen_or_plumbing_facilities,pct_households_with_overcrowding,pct_indigenous,pct_information,pct_manufacturing,pct_mgmt_bus_sci_art_occupations,pct_nat_res_constr_maint_occupations,pct_nhpi,pct_non_hispanic_black,pct_non_hispanic_white,pct_not_proficient_in_english,pct_other_services,pct_prod_transp_mat_moving_occupations,pct_professional_management_and_scientific_services,pct_public_administration,pct_retail_trade,pct_rural,pct_sales_and_office_occupations,pct_service_occupations,pct_transportation_and_utilities,pct_under_18,pct_wholesale_trade,physical_inactivity,poor_mental_health_days,poor_or_fair_health,poor_physical_health_days,population,poverty,premature_age_adjusted_mortality,premature_death,preventable_hospital_stays,ratio_of_pop_to_dentists,ratio_of_pop_to_mental_health,ratio_of_pop_to_pcp,ratio_of_pop_to_pcp_other_than_physicians,severe_housing_cost_burden,severe_housing_problems,sexually_transmitted_infections,single_parent_households,social_associations,some_college,teen_births,unemployment_rate,uninsured_adults,violent_crime
0,1001,2010,"Autauga County, Alabama",Autauga,Alabama,AL,0.6907,0.3,0.2814,12.57,0.284338,0.161946,0.138,0.2954,0.217477,0.000262,0.11818,0.0,0.868566,0.15404,7.219912,0.13821,0.135139,66.048439,0.405,0.853323,0.71519,184.736484,0.77518,3.930128,72.504788,0.369589,75.607833,0.152482,31.206695,0.0883,0.658644,36.2,53255.0,5e-05,28.8,0.000248,52.224775,0.114006,0.006547,0.009449,0.086767,0.006547,0.0774,0.174315,0.515003,0.065322,0.023196,0.14426,0.009737,0.002029,0.002897,0.011257,0.141284,0.293291,0.11569,0.0,0.178215,0.775524,0.009389,0.04552,0.131096,0.069964,0.116676,0.126659,0.441005,0.290703,0.169221,0.044246,0.27702,0.031141,0.3218,4.14,0.2586,5.48,53155.0,0.214724,441.808519,9778.1,91.803383,4120.317674,37113.10337,1914.8,5074.504751,0.14426,0.096612,370.0,0.179785,13.455842,0.501062,47.513321,0.062076,0.14,256.487012
1,1003,2010,"Baldwin County, Alabama",Baldwin,Alabama,AL,0.6523,0.245,0.2309,11.5998,0.321341,0.20436,0.1933,0.2852,0.268221,0.000464,0.10766,0.253189,0.835134,0.190255,7.651281,0.13214,0.136337,68.766137,0.439,0.875757,0.670437,177.725095,0.767301,4.21444,74.28336,0.335911,77.707111,0.037806,33.724273,0.0877,0.688273,41.0,50147.0,0.000326,23.6,0.000399,70.74137,0.164912,0.00657,0.018619,0.095237,0.006536,0.107106,0.179215,0.51134,0.068492,0.038955,0.174773,0.011083,0.004721,0.006064,0.01732,0.092078,0.315219,0.134832,3.4e-05,0.093793,0.840777,0.022074,0.0493,0.105578,0.098688,0.04595,0.137723,0.500277,0.268785,0.175586,0.05354,0.233084,0.036729,0.25185,4.06,0.1338,3.57,175791.0,0.130669,366.591291,8221.7,65.391299,2177.442032,4917.923172,1413.6,2746.75516,0.174773,0.116501,245.9,0.138206,11.480511,0.57681,31.854171,0.065605,0.209,171.623233
2,1005,2010,"Barbour County, Alabama",Barbour,Alabama,AL,0.3645,0.364,0.2268,12.0202,0.404637,0.362199,0.341,0.5196,0.135124,0.000361,0.15103,0.187097,0.789877,0.110813,5.436531,0.22355,0.161515,74.681632,0.464,0.718629,0.563422,382.090476,0.680449,5.835112,66.512793,0.39882,75.719796,0.094833,26.762889,0.11,0.61296,38.0,33219.0,7.6e-05,30.4,0.000335,40.744262,0.137406,0.001769,0.040973,0.058432,0.001661,0.063478,0.150066,0.471028,0.037945,0.045958,0.171885,0.013783,0.003675,0.003141,0.008174,0.282571,0.265819,0.103441,0.000108,0.464674,0.472111,0.021339,0.03956,0.260773,0.039156,0.062872,0.124836,0.695329,0.213442,0.156524,0.079221,0.224918,0.012716,0.33396,3.84,0.2442,6.11,27699.0,0.213405,485.936754,10686.1,92.297198,2771.724944,16797.22333,2454.333333,4745.385553,0.171885,0.15804,585.7,0.225625,8.042733,0.36485,87.486157,0.095894,0.151,64.00531
3,1007,2010,"Bibb County, Alabama",Bibb,Alabama,AL,0.3021,0.317,0.2746,11.6488,0.275008,0.161418,0.242,0.3549,0.100252,0.000211,0.12091,0.0,0.849911,0.138328,7.649525,0.15697,0.148993,69.83018,0.411,0.745458,0.603834,142.813645,0.82919,4.332317,94.378707,0.393172,73.576596,0.133676,43.250507,0.0911,0.59406,38.3,41770.0,7e-05,41.3,0.000641,42.105263,0.12437,0.001592,0.038191,0.039312,0.001592,0.106426,0.137387,0.462096,0.057236,0.005927,0.116541,0.001881,0.0,0.001769,0.008046,0.180059,0.200835,0.18566,0.0,0.152587,0.83609,0.008245,0.082595,0.252572,0.046746,0.039923,0.137081,0.805111,0.22874,0.132193,0.104288,0.234321,0.022711,0.36869,5.31,0.1817,4.22,22610.0,0.213129,549.21009,13069.5,96.794256,4788.351167,-5432.008789,2375.0,3696.29563,0.116541,0.066255,284.0,0.187878,9.764098,0.322305,30.716724,0.091338,0.177,127.580788
4,1009,2010,"Blount County, Alabama",Blount,Alabama,AL,0.2168,0.315,0.2342,12.5406,0.190252,0.201278,0.185,0.253,0.125272,0.000152,0.12406,0.031967,0.806364,0.055292,8.4289,0.12069,0.146638,73.733161,0.422,0.746593,0.730135,75.734817,0.820044,4.310364,93.838475,0.368693,75.623747,0.125429,62.551583,0.0726,0.610489,38.3,45549.0,2.5e-05,28.6,0.000131,19.987281,0.142066,0.004339,0.031733,0.064802,0.004339,0.109973,0.17518,0.504692,0.046264,0.074896,0.152476,0.007959,0.005047,0.003845,0.014248,0.167813,0.24658,0.157573,0.0,0.011554,0.894959,0.042746,0.058609,0.179268,0.066947,0.040759,0.11568,0.907806,0.256294,0.160285,0.069619,0.247795,0.038371,0.34918,4.47,0.2493,5.62,56692.0,0.220741,443.943605,8929.7,102.021063,7943.818894,52884.508833,5003.181818,10954.451751,0.152476,0.068867,85.1,0.142587,8.418282,0.382355,36.640361,0.07527,0.208,93.782854


In [262]:
df.columns.tolist()

['geo_code',
 'year',
 'geo_full_name',
 'geo_name',
 'state',
 'state_code',
 'access_to_exercise_opportunities',
 'adult_obesity',
 'adult_smoking',
 'air_pollution_particulate_matter',
 'alcohol_impaired_driving_deaths',
 'children_eligible_for_free_lunch',
 'children_in_poverty',
 'children_in_single_parent_households',
 'college_completion',
 'dentists',
 'diabetes_prevalence',
 'drinking_water_violations',
 'driving_alone_to_work',
 'excessive_drinking',
 'food_environment_index',
 'food_insecurity',
 'frequent_mental_distress',
 'gender_pay_gap',
 'gini_index',
 'high_school_completion',
 'high_school_graduation',
 'hiv_prevalence',
 'homeownership',
 'income_inequality',
 'injury_deaths',
 'insufficient_sleep',
 'life_expectancy',
 'limited_access_to_healthy_foods',
 'long_commute_driving_alone',
 'low_birthweight',
 'mammography_screening',
 'median_age',
 'median_household_income',
 'mental_health_providers',
 'motor_vehicle_crash_deaths',
 'other_pcp',
 'pcp',
 'pct_65_and_o

In [263]:
# df = df[~df.year.isin([2023, 2024])]

# Geographic Categories

In [264]:
df['state_fips'] = df['geo_code'].apply(lambda x: str(x)[:2])
df.head()

Unnamed: 0,geo_code,year,geo_full_name,geo_name,state,state_code,access_to_exercise_opportunities,adult_obesity,adult_smoking,air_pollution_particulate_matter,alcohol_impaired_driving_deaths,children_eligible_for_free_lunch,children_in_poverty,children_in_single_parent_households,college_completion,dentists,diabetes_prevalence,drinking_water_violations,driving_alone_to_work,excessive_drinking,food_environment_index,food_insecurity,frequent_mental_distress,gender_pay_gap,gini_index,high_school_completion,high_school_graduation,hiv_prevalence,homeownership,income_inequality,injury_deaths,insufficient_sleep,life_expectancy,limited_access_to_healthy_foods,long_commute_driving_alone,low_birthweight,mammography_screening,median_age,median_household_income,mental_health_providers,motor_vehicle_crash_deaths,other_pcp,pcp,pct_65_and_older,pct_aapi,pct_agriculture_hunting_and_mining,pct_arts_entertainment_and_accommodation_services,pct_asian,pct_construction,pct_educational_and_health_services,pct_female,pct_finance_insurance_and_real_estate,pct_hispanic,pct_households_with_high_housing_costs,pct_households_with_lack_of_kitchen_or_plumbing_facilities,pct_households_with_overcrowding,pct_indigenous,pct_information,pct_manufacturing,pct_mgmt_bus_sci_art_occupations,pct_nat_res_constr_maint_occupations,pct_nhpi,pct_non_hispanic_black,pct_non_hispanic_white,pct_not_proficient_in_english,pct_other_services,pct_prod_transp_mat_moving_occupations,pct_professional_management_and_scientific_services,pct_public_administration,pct_retail_trade,pct_rural,pct_sales_and_office_occupations,pct_service_occupations,pct_transportation_and_utilities,pct_under_18,pct_wholesale_trade,physical_inactivity,poor_mental_health_days,poor_or_fair_health,poor_physical_health_days,population,poverty,premature_age_adjusted_mortality,premature_death,preventable_hospital_stays,ratio_of_pop_to_dentists,ratio_of_pop_to_mental_health,ratio_of_pop_to_pcp,ratio_of_pop_to_pcp_other_than_physicians,severe_housing_cost_burden,severe_housing_problems,sexually_transmitted_infections,single_parent_households,social_associations,some_college,teen_births,unemployment_rate,uninsured_adults,violent_crime,state_fips
0,1001,2010,"Autauga County, Alabama",Autauga,Alabama,AL,0.6907,0.3,0.2814,12.57,0.284338,0.161946,0.138,0.2954,0.217477,0.000262,0.11818,0.0,0.868566,0.15404,7.219912,0.13821,0.135139,66.048439,0.405,0.853323,0.71519,184.736484,0.77518,3.930128,72.504788,0.369589,75.607833,0.152482,31.206695,0.0883,0.658644,36.2,53255.0,5e-05,28.8,0.000248,52.224775,0.114006,0.006547,0.009449,0.086767,0.006547,0.0774,0.174315,0.515003,0.065322,0.023196,0.14426,0.009737,0.002029,0.002897,0.011257,0.141284,0.293291,0.11569,0.0,0.178215,0.775524,0.009389,0.04552,0.131096,0.069964,0.116676,0.126659,0.441005,0.290703,0.169221,0.044246,0.27702,0.031141,0.3218,4.14,0.2586,5.48,53155.0,0.214724,441.808519,9778.1,91.803383,4120.317674,37113.10337,1914.8,5074.504751,0.14426,0.096612,370.0,0.179785,13.455842,0.501062,47.513321,0.062076,0.14,256.487012,1
1,1003,2010,"Baldwin County, Alabama",Baldwin,Alabama,AL,0.6523,0.245,0.2309,11.5998,0.321341,0.20436,0.1933,0.2852,0.268221,0.000464,0.10766,0.253189,0.835134,0.190255,7.651281,0.13214,0.136337,68.766137,0.439,0.875757,0.670437,177.725095,0.767301,4.21444,74.28336,0.335911,77.707111,0.037806,33.724273,0.0877,0.688273,41.0,50147.0,0.000326,23.6,0.000399,70.74137,0.164912,0.00657,0.018619,0.095237,0.006536,0.107106,0.179215,0.51134,0.068492,0.038955,0.174773,0.011083,0.004721,0.006064,0.01732,0.092078,0.315219,0.134832,3.4e-05,0.093793,0.840777,0.022074,0.0493,0.105578,0.098688,0.04595,0.137723,0.500277,0.268785,0.175586,0.05354,0.233084,0.036729,0.25185,4.06,0.1338,3.57,175791.0,0.130669,366.591291,8221.7,65.391299,2177.442032,4917.923172,1413.6,2746.75516,0.174773,0.116501,245.9,0.138206,11.480511,0.57681,31.854171,0.065605,0.209,171.623233,1
2,1005,2010,"Barbour County, Alabama",Barbour,Alabama,AL,0.3645,0.364,0.2268,12.0202,0.404637,0.362199,0.341,0.5196,0.135124,0.000361,0.15103,0.187097,0.789877,0.110813,5.436531,0.22355,0.161515,74.681632,0.464,0.718629,0.563422,382.090476,0.680449,5.835112,66.512793,0.39882,75.719796,0.094833,26.762889,0.11,0.61296,38.0,33219.0,7.6e-05,30.4,0.000335,40.744262,0.137406,0.001769,0.040973,0.058432,0.001661,0.063478,0.150066,0.471028,0.037945,0.045958,0.171885,0.013783,0.003675,0.003141,0.008174,0.282571,0.265819,0.103441,0.000108,0.464674,0.472111,0.021339,0.03956,0.260773,0.039156,0.062872,0.124836,0.695329,0.213442,0.156524,0.079221,0.224918,0.012716,0.33396,3.84,0.2442,6.11,27699.0,0.213405,485.936754,10686.1,92.297198,2771.724944,16797.22333,2454.333333,4745.385553,0.171885,0.15804,585.7,0.225625,8.042733,0.36485,87.486157,0.095894,0.151,64.00531,1
3,1007,2010,"Bibb County, Alabama",Bibb,Alabama,AL,0.3021,0.317,0.2746,11.6488,0.275008,0.161418,0.242,0.3549,0.100252,0.000211,0.12091,0.0,0.849911,0.138328,7.649525,0.15697,0.148993,69.83018,0.411,0.745458,0.603834,142.813645,0.82919,4.332317,94.378707,0.393172,73.576596,0.133676,43.250507,0.0911,0.59406,38.3,41770.0,7e-05,41.3,0.000641,42.105263,0.12437,0.001592,0.038191,0.039312,0.001592,0.106426,0.137387,0.462096,0.057236,0.005927,0.116541,0.001881,0.0,0.001769,0.008046,0.180059,0.200835,0.18566,0.0,0.152587,0.83609,0.008245,0.082595,0.252572,0.046746,0.039923,0.137081,0.805111,0.22874,0.132193,0.104288,0.234321,0.022711,0.36869,5.31,0.1817,4.22,22610.0,0.213129,549.21009,13069.5,96.794256,4788.351167,-5432.008789,2375.0,3696.29563,0.116541,0.066255,284.0,0.187878,9.764098,0.322305,30.716724,0.091338,0.177,127.580788,1
4,1009,2010,"Blount County, Alabama",Blount,Alabama,AL,0.2168,0.315,0.2342,12.5406,0.190252,0.201278,0.185,0.253,0.125272,0.000152,0.12406,0.031967,0.806364,0.055292,8.4289,0.12069,0.146638,73.733161,0.422,0.746593,0.730135,75.734817,0.820044,4.310364,93.838475,0.368693,75.623747,0.125429,62.551583,0.0726,0.610489,38.3,45549.0,2.5e-05,28.6,0.000131,19.987281,0.142066,0.004339,0.031733,0.064802,0.004339,0.109973,0.17518,0.504692,0.046264,0.074896,0.152476,0.007959,0.005047,0.003845,0.014248,0.167813,0.24658,0.157573,0.0,0.011554,0.894959,0.042746,0.058609,0.179268,0.066947,0.040759,0.11568,0.907806,0.256294,0.160285,0.069619,0.247795,0.038371,0.34918,4.47,0.2493,5.62,56692.0,0.220741,443.943605,8929.7,102.021063,7943.818894,52884.508833,5003.181818,10954.451751,0.152476,0.068867,85.1,0.142587,8.418282,0.382355,36.640361,0.07527,0.208,93.782854,1


In [265]:
df['region'] = np.where(
    df['state_fips'].isin(["09", "23", "25", "33", "44", "50", "34", "36", "42"]), "Northeast", np.where(
        df['state_fips'].isin(["17", "18", "26", "39", "55", "19", "20", "27", "29", "31", "38", "46"]), "Midwest", np.where(
            df['state_fips'].isin(["10", "12", "13", "24", "37", "45", "51", "11", "54", "01", "21", "28", "47", "05", "22", "40", "48"]), "South", np.where(
                df['state_fips'].isin(["04", "08", "16", "30", "32", "35", "49", "56", "02", "06", "15", "41", "53"]), "West", np.nan
            )
        )
    )
)

In [266]:
df['division'] = np.where(
    df['state_fips'].isin(["09", "23", "25", "33", "44", "50"]), "New England", np.where(
        df['state_fips'].isin(["34", "36", "42"]), "Middle Atlantic", np.where(
            df['state_fips'].isin(["17", "18", "26", "39", "55"]), "East North Central", np.where(
                df['state_fips'].isin(["19", "20", "27", "29", "31", "38", "46"]), "West North Central", np.where(
                    df['state_fips'].isin(["10", "12", "13", "24", "37", "45", "51", "11", "54"]), "South Atlantic", np.where(
                        df['state_fips'].isin(["01", "21", "28", "47"]), "East South Central", np.where(
                            df['state_fips'].isin(["05", "22", "40", "48"]), "West South Central", np.where(
                                df['state_fips'].isin(["04", "08", "16", "30", "32", "35", "49", "56"]), "Mountain", np.where(
                                    df['state_fips'].isin(["02", "06", "15", "41", "53"]), "Pacific", np.nan
                                )
                            )
                        )
                    )
                )
            )
        )
    )
)

In [267]:
df['pop_quantiles_by_year'] = df.groupby('year')['population'].transform(lambda x: pd.qcut(x, 5, labels=False))
df['pop_quantiles_by_year'] = df['pop_quantiles_by_year'].astype(str)

df['income_quantiles_by_year'] = df.groupby('year')['median_household_income'].transform(lambda x: pd.qcut(x, 5, labels=False))
df['income_quantiles_by_year'] = df['income_quantiles_by_year'].astype(str)

# Initial VIF

In [239]:
numeric_data = df.select_dtypes(include=[np.number])
X = numeric_data.drop("life_expectancy", axis=1)  # Drop the target variable if it’s in the DataFrame
X = StandardScaler().fit_transform(X)  # Standardize features

vif_data = pd.DataFrame()
vif_data["feature"] = numeric_data.columns.drop("life_expectancy")
vif_data["VIF"] = [variance_inflation_factor(X, i) for i in range(X.shape[1])]

# Display the VIF values
pd.DataFrame(vif_data).sort_values(by='VIF', ascending=False)

Unnamed: 0,feature,VIF
53,pct_mgmt_bus_sci_art_occupations,9007199000000000.0
60,pct_prod_transp_mat_moving_occupations,9007199000000000.0
39,pct_agriculture_hunting_and_mining,4503600000000000.0
52,pct_manufacturing,4503600000000000.0
54,pct_nat_res_constr_maint_occupations,4503600000000000.0
66,pct_service_occupations,3002400000000000.0
65,pct_sales_and_office_occupations,2251800000000000.0
43,pct_educational_and_health_services,2251800000000000.0
40,pct_arts_entertainment_and_accommodation_services,1501200000000000.0
61,pct_professional_management_and_scientific_services,1286743000000000.0


# Correlations

In [240]:
numeric_data = df.select_dtypes(include=[np.number])

# Calculating the correlation matrix between life expectancy and all other numeric variables
correlation_matrix = numeric_data.corr()

# Selecting correlations with life expectancy and sorting them
life_expectancy_corr = correlation_matrix["life_expectancy"].sort_values(
    ascending=False
)

# Display the top correlations (both positive and negative) with life expectancy
top_positive_corr = life_expectancy_corr.head(15)
top_negative_corr = life_expectancy_corr.tail(15)

pd.DataFrame(top_positive_corr)

Unnamed: 0,life_expectancy
life_expectancy,1.0
some_college,0.609613
college_completion,0.593858
median_household_income,0.593027
high_school_completion,0.509111
pct_mgmt_bus_sci_art_occupations,0.473965
excessive_drinking,0.4487
food_environment_index,0.447388
access_to_exercise_opportunities,0.339289
pct_asian,0.326327


In [241]:
pd.DataFrame(top_negative_corr)

Unnamed: 0,life_expectancy
low_birthweight,-0.505237
children_eligible_for_free_lunch,-0.529102
adult_obesity,-0.549792
insufficient_sleep,-0.570642
poverty,-0.576057
adult_smoking,-0.576257
poor_physical_health_days,-0.601806
diabetes_prevalence,-0.605657
children_in_poverty,-0.610279
food_insecurity,-0.616252


# Eliminating Redundant Features

- ```'children_eligible_for_free_lunch'``` and ```'children_in_poverty'``` are essentially the same since children in poverty are the very ones that would be eligible for free lunch; there is strong overlap with ```'children_in_single_parent_households'```, but is not synonymous with them.
- ```'gini_index'``` and ```'income_inequality'``` are essentially the same also
- ```'single_parent_households'``` and ```'children_in_single_parent_households'``` seem redundant, but would like input on that matter
- eliminate ```'poverty'``` as it is redundant with ```'children_in_poverty'```
- ```'pct_aapi'``` shoudl be removed since it is just an amalgam of ```'pct_asian'``` and ```'pct_nhpi'```
-  ```'high_school_completion'``` and ```'high_school_graduation'```, as well as ```'some_college'``` and ```'college_completion'```
-  feature enginneer industry categories:
    - ```'pct_agriculture_hunting_and_mining'```, ```'pct_arts_entertainment_and_accommodation_services'```, ```'pct_construction'```, ```'pct_educational_and_health_services'```, ```'pct_finance_insurance_and_real_estate'```, ```'pct_information'```,```'pct_manufacturing'```, ```'pct_public_administration'```, ```'pct_retail_trade'```, ```'pct_other_services'```, ```'pct_transportation_and_utilities'```, ```'pct_wholesale_trade'```, ```'pct_professional_management_and_scientific_services'```,
-  remove occupation categories (having both would be redundant):
    -  ```'pct_mgmt_bus_sci_art_occupations', 'pct_nat_res_constr_maint_occupations',  'pct_prod_transp_mat_moving_occupations', 'pct_sales_and_office_occupations', 'pct_service_occupations'```
- ```'severe_housing_problems'``` is a composite score of the following three:
```'pct_households_with_high_housing_costs','pct_households_with_lack_of_kitchen_or_plumbing_facilities','pct_households_with_overcrowding'```, so we keep one or the other



- Use the CHR website definitions in deciding whether to eliminate redundant features or combining features:
    - remaining Health Behaviors: ```'access_to_exercise_opportunities', 'adult_obesity', 'physical_inactivity', 'adult_smoking', 'alcohol_impaired_driving_deaths', 'excessive_drinking', 'food_environment_index', 'food_insecurity', 'limited_access_to_healthy_foods', 'insufficient_sleep',  'sexually_transmitted_infections', 'teen_births'```, here we have the food and inactivity/obesity features
        - ```'food_insecurity'``` and ```'limited_access_to_healthy_foods'``` are conceptually very similar, perhaps combining them would be best
        - ```'access_to_exercise_opportunities'``` and  ```'physical_inactivity'``` are conceptually similar also, where they are reporting two sides of the coin, where one is reporting no leisure-time physical inactivity and the other access to locations for physical activity
        - we could find the average of the ```'access_to_exercise_opportunities'``` and ```1 - 'physical_inactivity'```
    - Demographics: ```'pct_65_and_older', 'pct_asian', 'pct_female', 'pct_hispanic', 'pct_indigenous', 'pct_non_hispanic_black', 'pct_non_hispanic_white', 'pct_not_proficient_in_english', 'pct_rural', 'pct_under_18'```, we should drop one of the race categories at least and/or form a diversity index
        - keeping individual racial categories will allow us to tie one of the groups to greater life expectancy (we know ```pct_asian``` is more correlated with such counties, we would expect ```pct_white``` to be correlated due to greater wealth amongst those populations)
        - just like we don't have ```pct_urban``` and ```pct_rural``` and ```pct_male``` and ```pct_female```, keeping all the racial categories would create collinearity
    - Social and Economic Factors: ```'children_in_poverty', 'college_completion', 'gender_pay_gap', 'gini_index', 'high_school_completion', 'injury_deaths', 'median_household_income', 'severe_housing_cost_burden', 'single_parent_households', 'social_associations', 'unemployment_rate', 'violent_crime', 'motor_vehicle_crash_deaths```
    - Clinical Care:  ```'dentists', 'mammography_screening', 'mental_health_providers', 'other_pcp', 'pcp', 'preventable_hospital_stays','ratio_of_pop_to_dentists', 'ratio_of_pop_to_mental_health', 'ratio_of_pop_to_pcp', 'ratio_of_pop_to_pcp_other_than_physicians', 'uninsured_adults'``` - we can keep ```'ratio_of_pop...'``` and drop the following:  ```'dentists', 'pcp', 'mental_health_providers', 'other_pcp'```, as they are synonymous with each other
    - Physical Environment: ```'air_pollution_particulate_matter', 'drinking_water_violations', 'driving_alone_to_work', 'homeownership', 'long_commute_driving_alone', 'severe_housing_problems'```
        - here driving alone and long commute driving alone seems redundant, keep ```"driving_alone_to_work"```
    - Health Outcomes: ```'diabetes_prevalence', 'frequent_mental_distress', 'hiv_prevalence', 'low_birthweight', 'poor_mental_health_days', 'poor_or_fair_health', 'poor_physical_health_days',  'premature_age_adjusted_mortality', 'premature_death', 'life_expectancy'```, here we can combine or eliminate the poor health and the premature mortality/death features
        - I know Christine was highlighting the importance of psychological factors in life expectancy, we should keep ```"poor_mental_health_days"```
        - ```"poor_mental_health_days"``` and ```"poor_physical_health_days"``` are representatives of average number of days respondents reporting poor mental or physical health, while ```"poor_or_fair_health"``` is the percentage of adults reporting fair or poor health, eliminate ```"poor_physical_health_days"``` as combining ```"poor_physical_health_days"```
        - ```'premature_age_adjusted_mortality'``` and ```'premature_death'``` are the same concepts where one represents the average number of years lost and the other represents the number of deaths among residents under age 75 per 100,000 population, keep the latter

- Additional thoughts:
    - there could be strong overlap with adult obesity and diabetes prevalence
    - frequent_mental_distress and poor_mental_health_days could also have strong overlap

- Recent Changes:
    - instead of feature engineering the industry categories, I opted to keep the ones that were the most correlated with life expectancy, i.e. ```'pct_finance_insurance_and_real_estate'``` and ```'pct_professional_management_and_scientific_services'```
    - I also opted to keep the racial categories that are the most correlated with life expectancy:  ```'pct_non_hispanic_white', 'pct_hispanic', 'pct_asian'```
    - Upon closer inspection of the features, ```'ratio_to_pop_to_pcp', 'pcp', 'dentists', 'ratio_to_pop_to_dentists', 'mental_health_providers', 'ratio_of_pop_to_mental_health'``` are the same pairs of variables.















In [307]:
data = df.drop(
    columns=[
        "children_in_single_parent_households",
        "children_eligible_for_free_lunch",
        "poverty",
        "pct_aapi",
        "pct_households_with_high_housing_costs",
        "pct_households_with_lack_of_kitchen_or_plumbing_facilities",
        "pct_households_with_overcrowding",
        "severe_housing_cost_burden",
        "gini_index",
        # "dentists",
        # "mental_health_providers",
        "other_pcp",
        # "pcp",
        "ratio_of_pop_to_dentists",
        "ratio_of_pop_to_mental_health",
        "ratio_of_pop_to_pcp",
        "ratio_of_pop_to_pcp_other_than_physicians",
        "long_commute_driving_alone",
        "pct_mgmt_bus_sci_art_occupations",
        "pct_service_occupations",
        "pct_sales_and_office_occupations",
        "pct_nat_res_constr_maint_occupations",
        "pct_prod_transp_mat_moving_occupations",
        "high_school_graduation",
        "some_college",
        "hiv_prevalence",
        "premature_age_adjusted_mortality",
        "alcohol_impaired_driving_deaths",
        "injury_deaths",
        "high_school_completion",
        "population",
        "median_age"
    ]
)

# Feature Engineering

## Industry Categories

In [269]:
# data['blue_collar_industries'] = data['pct_agriculture_hunting_and_mining'] + data['pct_construction'] + data['pct_manufacturing'] + data['pct_transportation_and_utilities']
# data['service_industries'] = data['pct_arts_entertainment_and_accommodation_services'] + data['pct_retail_trade'] + data['pct_wholesale_trade'] + data['pct_public_administration']
# data['white_collar_industries'] = data['pct_finance_insurance_and_real_estate'] + data['pct_information'] + data['pct_professional_management_and_scientific_services'] + data['pct_educational_and_health_services']

In [308]:
data = data.drop(
    columns=[
        "pct_agriculture_hunting_and_mining",
        "pct_construction",
        "pct_manufacturing",
        "pct_wholesale_trade",
        "pct_retail_trade",
        "pct_transportation_and_utilities",
        "pct_information",
        # "pct_finance_insurance_and_real_estate",
        # "pct_professional_management_and_scientific_services",
        "pct_educational_and_health_services",
        "pct_arts_entertainment_and_accommodation_services",
        "pct_public_administration",
        "pct_other_services",
    ]
)

In [309]:
# df['avg_blue_collar_industries'][df['avg_blue_collar_industries'] < 0] = 0
# df['avg_service_industries'][df['avg_service_industries'] < 0] = 0
# df['avg_white_collar_industries'][df['avg_white_collar_industries'] < 0] = 0

## Combining Food Features

In [310]:
data[['food_environment_index', 'food_insecurity', 'limited_access_to_healthy_foods']].head(20)

Unnamed: 0,food_environment_index,food_insecurity,limited_access_to_healthy_foods
0,7.219912,0.13821,0.152482
1,7.651281,0.13214,0.037806
2,5.436531,0.22355,0.094833
3,7.649525,0.15697,0.133676
4,8.4289,0.12069,0.125429
5,4.230172,0.23542,0.198776
6,6.228152,0.21768,0.028559
7,6.462178,0.17231,0.141579
8,6.125775,0.19655,0.055599
9,8.000923,0.14718,0.002399


In [311]:
data['food_access_score'] = (data['food_insecurity'] + data['limited_access_to_healthy_foods'] + data['food_environment_index']) / 3

In [312]:
data.drop(['food_insecurity', 'limited_access_to_healthy_foods', 'food_environment_index'], axis=1, inplace=True)

## Combining Exercise Features





In [313]:
data[['access_to_exercise_opportunities', 'physical_inactivity']].head(20)

Unnamed: 0,access_to_exercise_opportunities,physical_inactivity
0,0.6907,0.3218
1,0.6523,0.25185
2,0.3645,0.33396
3,0.3021,0.36869
4,0.2168,0.34918
5,0.2209,0.31122
6,0.4778,0.36583
7,0.4936,0.32837
8,0.5082,0.35588
9,0.2604,0.34774


In [314]:
for index, row in data.iterrows():
    data.loc[index, "exercise_access_score"] = (
        row["physical_inactivity"] + 
        row["access_to_exercise_opportunities"]) / 2

In [315]:
data = data.drop(['access_to_exercise_opportunities', 'physical_inactivity'], axis=1)

## Forming Diversity Index

In [316]:
# for index, row in data.iterrows():
#     data.loc[index, "diversity_index"] = 1 - (
#         row["pct_hispanic"] ** 2
#         + row["pct_non_hispanic_white"] ** 2
#         + row["pct_non_hispanic_black"] ** 2
#         + row["pct_indigenous"] ** 2
#         + row["pct_asian"] ** 2
#         + row["pct_nhpi"] ** 2
#     )

In [317]:
data = data.drop(
    columns=[
        "pct_indigenous",
        "pct_nhpi",
        "pct_non_hispanic_white",
    ]
)

## Combining Clinicians

In [318]:
data['poor_health'] = (data['poor_mental_health_days'] + data['poor_physical_health_days'] + data['poor_or_fair_health']) / 3

In [319]:
data = data.drop(
    columns=[
        "poor_mental_health_days",
        "poor_physical_health_days",
        "poor_or_fair_health",
    ]
)

## Remaining Features

In [320]:
data.columns.tolist()

['geo_code',
 'year',
 'geo_full_name',
 'geo_name',
 'state',
 'state_code',
 'adult_obesity',
 'adult_smoking',
 'air_pollution_particulate_matter',
 'children_in_poverty',
 'college_completion',
 'dentists',
 'diabetes_prevalence',
 'drinking_water_violations',
 'driving_alone_to_work',
 'excessive_drinking',
 'frequent_mental_distress',
 'gender_pay_gap',
 'homeownership',
 'income_inequality',
 'insufficient_sleep',
 'life_expectancy',
 'low_birthweight',
 'mammography_screening',
 'median_household_income',
 'mental_health_providers',
 'motor_vehicle_crash_deaths',
 'pcp',
 'pct_65_and_older',
 'pct_asian',
 'pct_female',
 'pct_finance_insurance_and_real_estate',
 'pct_hispanic',
 'pct_non_hispanic_black',
 'pct_not_proficient_in_english',
 'pct_professional_management_and_scientific_services',
 'pct_rural',
 'pct_under_18',
 'premature_death',
 'preventable_hospital_stays',
 'severe_housing_problems',
 'sexually_transmitted_infections',
 'single_parent_households',
 'social_asso

# Variance Inflation Factor (VIF)

In [321]:
numeric_data = data.select_dtypes(include=[np.number])
X = numeric_data.drop(columns =
    ["life_expectancy", "year"], axis=1
)  # Drop the target variable if it’s in the DataFrame
X = StandardScaler().fit_transform(X)  # Standardize features

vif_data = pd.DataFrame()
vif_data["feature"] = numeric_data.columns.drop(["life_expectancy", "year"])
vif_data["VIF"] = [variance_inflation_factor(X, i) for i in range(X.shape[1])]

# Display the VIF values
pd.DataFrame(vif_data).sort_values(by="VIF", ascending=False)

Unnamed: 0,feature,VIF
17,median_household_income,5.661567
4,college_completion,5.198475
10,frequent_mental_distress,5.119912
14,insufficient_sleep,4.126679
26,pct_non_hispanic_black,4.06311
21,pct_65_and_older,3.949603
25,pct_hispanic,3.652121
3,children_in_poverty,3.64946
43,poor_health,3.556
29,pct_rural,3.45704


In [322]:
# Calculating the correlation matrix between life expectancy and all other numeric variables
correlation_matrix = numeric_data.corr()

# Selecting correlations with life expectancy and sorting them
life_expectancy_corr = correlation_matrix["life_expectancy"].sort_values(
    ascending=False
)

# Display the top correlations (both positive and negative) with life expectancy
top_positive_corr = life_expectancy_corr.head(20)
top_negative_corr = life_expectancy_corr.tail(20)

pd.DataFrame(top_positive_corr)

Unnamed: 0,life_expectancy
life_expectancy,1.0
college_completion,0.56932
median_household_income,0.529999
food_access_score,0.448931
excessive_drinking,0.429137
pct_asian,0.317789
pct_finance_insurance_and_real_estate,0.305022
pct_professional_management_and_scientific_services,0.294297
mammography_screening,0.281534
dentists,0.218323


In [323]:
pd.DataFrame(top_negative_corr)

Unnamed: 0,life_expectancy
pct_rural,-0.186146
teen_births,-0.205568
violent_crime,-0.236721
income_inequality,-0.258585
uninsured_adults,-0.271751
driving_alone_to_work,-0.314691
unemployment_rate,-0.328807
pct_non_hispanic_black,-0.355995
sexually_transmitted_infections,-0.362211
single_parent_households,-0.436051


In [324]:
data.head()

Unnamed: 0,geo_code,year,geo_full_name,geo_name,state,state_code,adult_obesity,adult_smoking,air_pollution_particulate_matter,children_in_poverty,college_completion,dentists,diabetes_prevalence,drinking_water_violations,driving_alone_to_work,excessive_drinking,frequent_mental_distress,gender_pay_gap,homeownership,income_inequality,insufficient_sleep,life_expectancy,low_birthweight,mammography_screening,median_household_income,mental_health_providers,motor_vehicle_crash_deaths,pcp,pct_65_and_older,pct_asian,pct_female,pct_finance_insurance_and_real_estate,pct_hispanic,pct_non_hispanic_black,pct_not_proficient_in_english,pct_professional_management_and_scientific_services,pct_rural,pct_under_18,premature_death,preventable_hospital_stays,severe_housing_problems,sexually_transmitted_infections,single_parent_households,social_associations,teen_births,unemployment_rate,uninsured_adults,violent_crime,state_fips,region,division,pop_quantiles_by_year,income_quantiles_by_year,food_access_score,exercise_access_score,poor_health
0,1001,2010,"Autauga County, Alabama",Autauga,Alabama,AL,0.3,0.2814,12.57,0.138,0.217477,0.000262,0.11818,0.0,0.868566,0.15404,0.135139,66.048439,0.77518,3.930128,0.369589,75.607833,0.0883,0.658644,53255.0,5e-05,28.8,52.224775,0.114006,0.006547,0.515003,0.065322,0.023196,0.178215,0.009389,0.069964,0.441005,0.27702,9778.1,91.803383,0.096612,370.0,0.179785,13.455842,47.513321,0.062076,0.14,256.487012,1,South,East South Central,3,4,2.503535,0.50625,3.292867
1,1003,2010,"Baldwin County, Alabama",Baldwin,Alabama,AL,0.245,0.2309,11.5998,0.1933,0.268221,0.000464,0.10766,0.253189,0.835134,0.190255,0.136337,68.766137,0.767301,4.21444,0.335911,77.707111,0.0877,0.688273,50147.0,0.000326,23.6,70.74137,0.164912,0.006536,0.51134,0.068492,0.038955,0.093793,0.022074,0.098688,0.500277,0.233084,8221.7,65.391299,0.116501,245.9,0.138206,11.480511,31.854171,0.065605,0.209,171.623233,1,South,East South Central,4,3,2.607076,0.452075,2.587933
2,1005,2010,"Barbour County, Alabama",Barbour,Alabama,AL,0.364,0.2268,12.0202,0.341,0.135124,0.000361,0.15103,0.187097,0.789877,0.110813,0.161515,74.681632,0.680449,5.835112,0.39882,75.719796,0.11,0.61296,33219.0,7.6e-05,30.4,40.744262,0.137406,0.001661,0.471028,0.037945,0.045958,0.464674,0.021339,0.039156,0.695329,0.224918,10686.1,92.297198,0.15804,585.7,0.225625,8.042733,87.486157,0.095894,0.151,64.00531,1,South,East South Central,2,0,1.918305,0.34923,3.398067
3,1007,2010,"Bibb County, Alabama",Bibb,Alabama,AL,0.317,0.2746,11.6488,0.242,0.100252,0.000211,0.12091,0.0,0.849911,0.138328,0.148993,69.83018,0.82919,4.332317,0.393172,73.576596,0.0911,0.59406,41770.0,7e-05,41.3,42.105263,0.12437,0.001592,0.462096,0.057236,0.005927,0.152587,0.008245,0.046746,0.805111,0.234321,13069.5,96.794256,0.066255,284.0,0.187878,9.764098,30.716724,0.091338,0.177,127.580788,1,South,East South Central,2,2,2.646724,0.335395,3.237233
4,1009,2010,"Blount County, Alabama",Blount,Alabama,AL,0.315,0.2342,12.5406,0.185,0.125272,0.000152,0.12406,0.031967,0.806364,0.055292,0.146638,73.733161,0.820044,4.310364,0.368693,75.623747,0.0726,0.610489,45549.0,2.5e-05,28.6,19.987281,0.142066,0.004339,0.504692,0.046264,0.074896,0.011554,0.042746,0.066947,0.907806,0.247795,8929.7,102.021063,0.068867,85.1,0.142587,8.418282,36.640361,0.07527,0.208,93.782854,1,South,East South Central,3,3,2.891673,0.28299,3.446433


In [325]:
data.to_csv('data/chr_census_2010_2024_cleaned.csv')

# Feature Selection

In [None]:
X = data.drop(columns=['life_expectancy', 'geo_code', 'year', 'geo_full_name', 'geo_name', 'state', 'state_code', 'region', 'division'])
y = data['life_expectancy']

# Standardize features (important for some models)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
X_non_neg = data.drop(columns=['life_expectancy', 'geo_code', 'year', 'geo_full_name', 'geo_name', 'state', 'state_code', 'region', 'division', 'gender_pay_gap'])

## SelectKBest Feature Selection

1.   List item
2.   List item



In [None]:
# 1. Univariate Selection (SelectKBest using chi-squared)
select_kbest = SelectKBest(f_regression, k=10)
X_new_kbest = select_kbest.fit_transform(X_non_neg, y)
print("Selected features (SelectKBest):", X_non_neg.columns[select_kbest.get_support()])

Selected features (SelectKBest): Index(['income_quantiles_by_year', 'diabetes_prevalence', 'frequent_mental_distress',
       'children_in_poverty', 'college_completion', 'median_household_income', 'adult_smoking',
       'insufficient_sleep', 'poor_or_fair_health', 'premature_death'],
      dtype='object')


## Recursive Feature Elimination

In [None]:
# 2. Recursive Feature Elimination (RFE)
model_rfe = LinearRegression()
rfe = RFE(model_rfe, n_features_to_select=10)
X_new_rfe = rfe.fit_transform(X_scaled, y)
print("Selected features (RFE):", X.columns[rfe.support_])

Selected features (RFE): Index(['income_quantiles_by_year', 'college_completion', 'injury_deaths', 'pct_65_and_older',
       'pct_not_proficient_in_english', 'pct_rural', 'insufficient_sleep', 'driving_alone_to_work',
       'food_access_score', 'premature_death'],
      dtype='object')


## Random Forest Regressor for Feature Selection

In [None]:
# RandomForest for feature selection
model_rf = RandomForestRegressor(n_estimators=100, random_state=42)
model_rf.fit(X_scaled, y)

# Get feature importance from the trained RandomForest model
feature_importances = model_rf.feature_importances_

# Sort the feature importances in descending order and select the top 2 features
indices_rf = np.argsort(feature_importances)[::-1][:10]

# Print the selected features
print("Selected features (RandomForest):", X.columns[indices_rf])

Selected features (RandomForest): Index(['premature_death', 'driving_alone_to_work', 'insufficient_sleep', 'poor_or_fair_health',
       'college_completion', 'adult_obesity', 'pct_65_and_older', 'preventable_hospital_stays',
       'pct_female', 'diabetes_prevalence'],
      dtype='object')


## L1-Based Feature Selection

In [None]:
from sklearn.linear_model import Lasso

# 4. L1-based Feature Selection (Lasso Regression with L1 penalty)
model_l1 = Lasso(alpha=0.1)  # 'alpha' is the regularization strength, equivalent to 'C' in LogisticRegression
model_l1.fit(X_scaled, y)

# Get the absolute value of the coefficients
coef_l1 = np.abs(model_l1.coef_)

# Sort the coefficients in descending order and select the top 2 features
indices_l1 = np.argsort(coef_l1)[::-1][:10]

# Print the selected features
print("Selected features (L1-based Lasso Regression):", X.columns[indices_l1])

Selected features (L1-based Lasso Regression): Index(['premature_death', 'driving_alone_to_work', 'college_completion', 'adult_obesity',
       'pct_65_and_older', 'income_quantiles_by_year', 'insufficient_sleep',
       'single_parent_households', 'median_household_income', 'adult_smoking'],
      dtype='object')


## L2-Based Feature Selection

In [None]:
from sklearn.linear_model import Ridge

# L2-based Feature Selection (Ridge Regression with L2 penalty)
model_l2 = Ridge(alpha=0.1)  # 'alpha' is the regularization strength, similar to 'C' in LogisticRegression
model_l2.fit(X_scaled, y)

# Get the absolute value of the coefficients
coef_l2 = np.abs(model_l2.coef_)

# Sort the coefficients in descending order and select the top 2 features
indices_l2 = np.argsort(coef_l2)[::-1][:10]

# Print the selected features
print("Selected features (L2-based Ridge Regression):", X.columns[indices_l2])

Selected features (L2-based Ridge Regression): Index(['premature_death', 'pct_non_hispanic_white', 'college_completion', 'pct_65_and_older',
       'injury_deaths', 'driving_alone_to_work', 'income_quantiles_by_year',
       'single_parent_households', 'food_access_score', 'insufficient_sleep'],
      dtype='object')


## Gradient Boosting Regressor for Feature Selection

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# GradientBoosting for feature selection
model_gb = GradientBoostingRegressor(n_estimators=100, random_state=42)
model_gb.fit(X_scaled, y)

# Get feature importance from the trained GradientBoosting model
feature_importances_gb = model_gb.feature_importances_

# Sort the feature importances in descending order and select the top 2 features
indices_gb = np.argsort(feature_importances_gb)[::-1][:10]

# Print the selected features
print("Selected features (GradientBoosting):", X.columns[indices_gb])

Selected features (GradientBoosting): Index(['premature_death', 'driving_alone_to_work', 'college_completion',
       'income_quantiles_by_year', 'adult_obesity', 'poor_or_fair_health', 'insufficient_sleep',
       'pct_65_and_older', 'single_parent_households', 'diabetes_prevalence'],
      dtype='object')


## AdaBoost Regressor for Feature Selection

In [None]:
from sklearn.ensemble import AdaBoostRegressor

# AdaBoost for feature selection
model_ab = AdaBoostRegressor(n_estimators=100, random_state=42)
model_ab.fit(X_scaled, y)

# Get feature importance from the trained AdaBoost model
feature_importances_ab = model_ab.feature_importances_

# Sort the feature importances in descending order and select the top 2 features
indices_ab = np.argsort(feature_importances_ab)[::-1][:10]

# Print the selected features
print("Selected features (AdaBoost):", X.columns[indices_ab])

Selected features (AdaBoost): Index(['premature_death', 'driving_alone_to_work', 'median_household_income', 'injury_deaths',
       'social_associations', 'gender_pay_gap', 'air_pollution_particulate_matter',
       'mammography_screening', 'uninsured_adults', 'pct_finance_insurance_and_real_estate'],
      dtype='object')


# Principal Component Analysis

In [None]:
from sklearn.decomposition import PCA

# Apply PCA to the scaled dataset
pca = PCA(n_components=None)  # We use all components to get feature contributions
pca.fit(X_scaled)

# Get the absolute values of the PCA components (feature importance for each principal component)
component_weights = np.abs(pca.components_)

# Sum the absolute contributions of each feature across all components
feature_importance = np.sum(component_weights, axis=0)

# Get the indices of the top 10 most important features based on PCA
top_15_indices = np.argsort(feature_importance)[::-1][:10]

# Print the names of the top 10 selected features
print("Selected top 15 features based on PCA:", X.columns[top_15_indices])

Selected top 15 features based on PCA: Index(['diabetes_prevalence', 'driving_alone_to_work', 'injury_deaths', 'pct_under_18',
       'homeownership', 'diversity_index', 'excessive_drinking', 'sexually_transmitted_infections',
       'adult_smoking', 'air_pollution_particulate_matter'],
      dtype='object')


In [None]:
results_df = pd.DataFrame({
    'KBest': X_non_neg.columns[select_kbest.get_support()],
    'Recursive Feature Elimination': X.columns[rfe.support_],
    'Random Forest': X.columns[indices_rf],
    'Lasso': X.columns[indices_l1],
    'Ridge': X.columns[indices_l2],
    'Gradient Boosting': X.columns[indices_gb],
    'AdaBoost': X.columns[indices_ab],
    'PCA': X.columns[top_15_indices]
})

results_df

Unnamed: 0,KBest,Recursive Feature Elimination,Random Forest,Lasso,Ridge,Gradient Boosting,AdaBoost,PCA
0,income_quantiles_by_year,income_quantiles_by_year,premature_death,premature_death,premature_death,premature_death,premature_death,diabetes_prevalence
1,diabetes_prevalence,college_completion,driving_alone_to_work,driving_alone_to_work,pct_non_hispanic_white,driving_alone_to_work,driving_alone_to_work,driving_alone_to_work
2,frequent_mental_distress,injury_deaths,insufficient_sleep,college_completion,college_completion,college_completion,median_household_income,injury_deaths
3,children_in_poverty,pct_65_and_older,poor_or_fair_health,adult_obesity,pct_65_and_older,income_quantiles_by_year,injury_deaths,pct_under_18
4,college_completion,pct_not_proficient_in_english,college_completion,pct_65_and_older,injury_deaths,adult_obesity,social_associations,homeownership
5,median_household_income,pct_rural,adult_obesity,income_quantiles_by_year,driving_alone_to_work,poor_or_fair_health,gender_pay_gap,diversity_index
6,adult_smoking,insufficient_sleep,pct_65_and_older,insufficient_sleep,income_quantiles_by_year,insufficient_sleep,air_pollution_particulate_matter,excessive_drinking
7,insufficient_sleep,driving_alone_to_work,preventable_hospital_stays,single_parent_households,single_parent_households,pct_65_and_older,mammography_screening,sexually_transmitted_infections
8,poor_or_fair_health,food_access_score,pct_female,median_household_income,food_access_score,single_parent_households,uninsured_adults,adult_smoking
9,premature_death,premature_death,diabetes_prevalence,adult_smoking,insufficient_sleep,diabetes_prevalence,pct_finance_insurance_and_real_estate,air_pollution_particulate_matter


# Recheck VIF of Subseets

In [None]:
results_df.KBest.values.tolist()

['income_quantiles_by_year',
 'diabetes_prevalence',
 'frequent_mental_distress',
 'low_birthweight',
 'children_in_poverty',
 'college_completion',
 'injury_deaths',
 'median_household_income',
 'motor_vehicle_crash_deaths',
 'adult_smoking',
 'insufficient_sleep',
 'poor_mental_health_days',
 'poor_or_fair_health',
 'adult_obesity',
 'premature_death']

In [None]:
kBest = data[results_df.KBest.values.tolist()]

In [None]:
vif_data = pd.DataFrame()
vif_data["feature"] = kBest.columns
vif_data["VIF"] = [variance_inflation_factor(kBest.values, i) for i in range(len(kBest.columns))]

# Display the VIF values
pd.DataFrame(vif_data).sort_values(by='VIF', ascending=False)

Unnamed: 0,feature,VIF
7,insufficient_sleep,161.718858
2,frequent_mental_distress,107.170764
5,median_household_income,62.932622
8,poor_or_fair_health,36.260263
1,diabetes_prevalence,33.087783
6,adult_smoking,29.183371
9,premature_death,25.721042
3,children_in_poverty,14.733026
4,college_completion,13.516481
0,income_quantiles_by_year,13.023369


In [None]:
rfe = data[results_df['Recursive Feature Elimination'].values.tolist()]

In [None]:
vif_rfe = pd.DataFrame()
vif_rfe["feature"] = rfe.columns
vif_rfe["VIF"] = [variance_inflation_factor(rfe.values, i) for i in range(len(rfe.columns))]

# Display the VIF values
pd.DataFrame(vif_rfe).sort_values(by='VIF', ascending=False)

Unnamed: 0,feature,VIF
6,insufficient_sleep,82.913766
7,driving_alone_to_work,78.783496
9,premature_death,34.237653
2,injury_deaths,24.08352
3,pct_65_and_older,21.736601
8,food_access_score,14.689609
1,college_completion,10.345987
5,pct_rural,7.74355
0,income_quantiles_by_year,6.28901
4,pct_not_proficient_in_english,1.549566


In [None]:
rf = data[results_df['Random Forest'].values.tolist()]

In [None]:
vif_rf = pd.DataFrame()
vif_rf["feature"] = rf.columns
vif_rf["VIF"] = [variance_inflation_factor(rf.values, i) for i in range(len(rf.columns))]

# Display the VIF values
pd.DataFrame(vif_rf).sort_values(by='VIF', ascending=False)

Unnamed: 0,feature,VIF
8,pct_female,242.345035
2,insufficient_sleep,133.585052
1,driving_alone_to_work,120.27582
5,adult_obesity,82.131397
9,diabetes_prevalence,41.422014
3,poor_or_fair_health,28.911754
0,premature_death,22.997222
6,pct_65_and_older,18.448861
4,college_completion,11.887517
7,preventable_hospital_stays,1.884517


In [None]:
lasso = data[results_df['Lasso'].values.tolist()]

In [None]:
vif_lasso = pd.DataFrame()
vif_lasso["feature"] = lasso.columns
vif_lasso["VIF"] = [variance_inflation_factor(lasso.values, i) for i in range(len(lasso.columns))]

# Display the VIF values
pd.DataFrame(vif_lasso).sort_values(by='VIF', ascending=False)

Unnamed: 0,feature,VIF
6,insufficient_sleep,117.99418
1,driving_alone_to_work,87.406984
3,adult_obesity,70.091844
8,median_household_income,61.213419
9,adult_smoking,28.342999
0,premature_death,24.084653
4,pct_65_and_older,15.157413
2,college_completion,13.313224
5,income_quantiles_by_year,11.552403
7,single_parent_households,7.134003


In [None]:
ridge = data[results_df['Ridge'].values.tolist()]

In [None]:
vif_ridge = pd.DataFrame()
vif_ridge["feature"] = ridge.columns
vif_ridge["VIF"] = [variance_inflation_factor(ridge.values, i) for i in range(len(ridge.columns))]

# Display the VIF values
pd.DataFrame(vif_ridge).sort_values(by='VIF', ascending=False)

Unnamed: 0,feature,VIF
5,driving_alone_to_work,99.48638
9,insufficient_sleep,81.549976
0,premature_death,34.734005
1,pct_non_hispanic_white,25.501041
4,injury_deaths,23.508818
3,pct_65_and_older,21.34292
8,food_access_score,15.108516
2,college_completion,9.059944
7,single_parent_households,7.832331
6,income_quantiles_by_year,6.338413


In [None]:
adaboost = data[results_df['AdaBoost'].values.tolist()]

In [None]:
vif_adaboost = pd.DataFrame()
vif_adaboost["feature"] = adaboost.columns
vif_adaboost["VIF"] = [variance_inflation_factor(adaboost.values, i) for i in range(len(adaboost.columns))]

# Display the VIF values
pd.DataFrame(vif_adaboost).sort_values(by='VIF', ascending=False)

Unnamed: 0,feature,VIF
1,driving_alone_to_work,68.678513
6,air_pollution_particulate_matter,31.300758
0,premature_death,26.450924
7,mammography_screening,25.980508
3,injury_deaths,18.353228
2,median_household_income,18.140412
9,pct_finance_insurance_and_real_estate,8.764369
8,uninsured_adults,7.05262
4,social_associations,5.398419
5,gender_pay_gap,1.005325


In [None]:
gboost = data[results_df['Gradient Boosting'].values.tolist()]

In [None]:
vif_gb = pd.DataFrame()
vif_gb["feature"] = gboost.columns
vif_gb["VIF"] = [variance_inflation_factor(gboost.values, i) for i in range(len(gboost.columns))]

# Display the VIF values
pd.DataFrame(vif_gb).sort_values(by='VIF', ascending=False)

Unnamed: 0,feature,VIF
6,insufficient_sleep,121.454848
1,driving_alone_to_work,86.152351
4,adult_obesity,75.176605
9,diabetes_prevalence,42.771325
5,poor_or_fair_health,31.732983
0,premature_death,23.697245
7,pct_65_and_older,15.691464
2,college_completion,9.516644
8,single_parent_households,7.126569
3,income_quantiles_by_year,6.790138


In [None]:
pca = data[results_df['PCA'].values.tolist()]

In [None]:
vif_pca = pd.DataFrame()
vif_pca["feature"] = pca.columns
vif_pca["VIF"] = [variance_inflation_factor(pca.values, i) for i in range(len(pca.columns))]

# Display the VIF values
pd.DataFrame(vif_pca).sort_values(by='VIF', ascending=False)

Unnamed: 0,feature,VIF
1,driving_alone_to_work,118.722211
4,homeownership,100.045335
3,pct_under_18,42.028095
0,diabetes_prevalence,28.63718
9,air_pollution_particulate_matter,27.512726
8,adult_smoking,21.91814
6,excessive_drinking,15.362609
2,injury_deaths,14.426886
5,diversity_index,5.544201
7,sexually_transmitted_infections,4.969056
