<h1><center><font size="6">Feature Engineering Notebook</font></center></h1>

# <a id='0'>Table of Contents</a>

- <a href='#1'>Importing Packages</a>
- <a href='#2'>Importing Datasets</a>
- <a href='#3'>Initial VIF and Correlations</a>
- <a href='#4'>Correlations</a>
- <a href='#5'>Eliminating Redundant Features</a>
- <a href='#6'>Feature Engineering</a>
    - <a href='#71'>Industry Categories</a>
    - <a href='#72'>Combining Food Features</a>
    - <a href='#73'>Combining Exercise Features</a>
    - <a href='#74'>Forming Diversity Index</a>
    - <a href='#75'>Combining Health Features</a>
    - <a href='#76'>Remaining Features</a>
- <a href='#8'>Final VIF and Correlations</a>
 

# Importing Packages

In [71]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, r2_score, calinski_harabasz_score
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge
from sklearn.pipeline import Pipeline
from scipy.stats import uniform, randint
from sklearn.exceptions import FitFailedWarning
from sklearn.feature_selection import SelectKBest, chi2, RFE
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import f_regression


pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

# pd.reset_option("all")


import warnings
warnings.filterwarnings("ignore", category=FitFailedWarning)

from matplotlib import pyplot as plt
%matplotlib inline
plt.style.use("fivethirtyeight")


import statsmodels
from statsmodels.stats.outliers_influence import variance_inflation_factor


# Importing Datasets

In [72]:
df = pd.read_csv('data/chr_census_dataset.csv', index_col=0, dtype={"geo_code": str, "year": int})
df.head()

Unnamed: 0,geo_code,year,geo_full_name,geo_name,state,state_code,access_to_exercise_opportunities,adult_obesity,adult_smoking,air_pollution_particulate_matter,alcohol_impaired_driving_deaths,children_eligible_for_free_lunch,children_in_poverty,children_in_single_parent_households,college_completion,dentists,diabetes_prevalence,drinking_water_violations,driving_alone_to_work,excessive_drinking,food_environment_index,food_insecurity,frequent_mental_distress,frequent_physical_distress,gender_pay_gap,gini_index,high_school_completion,high_school_graduation,hiv_prevalence,homeownership,housing_units,hu_density,income_inequality,injury_deaths,insufficient_sleep,latitude,life_expectancy,limited_access_to_healthy_foods,long_commute_driving_alone,longitude,low_birthweight,mammography_screening,median_age,median_household_income,mental_health_providers,mobility_index_std,motor_vehicle_crash_deaths,other_pcp,pcp,pct_65_and_older,pct_age_18_44,pct_age_45_64,pct_agriculture_hunting_and_mining,pct_aian,pct_arts_entertainment_and_accommodation_services,pct_asian,pct_construction,pct_educational_and_health_services,pct_female,pct_female_pop_0_17,pct_female_pop_18_44,pct_female_pop_45_64,pct_female_pop_65+,pct_finance_insurance_and_real_estate,pct_hispanic,pct_households_with_high_housing_costs,pct_households_with_lack_of_kitchen_or_plumbing_facilities,pct_households_with_overcrowding,pct_indigenous,pct_information,pct_male_pop_0_17,pct_male_pop_18_44,pct_male_pop_45_64,pct_male_pop_65+,pct_manufacturing,pct_mgmt_bus_sci_art_occupations,pct_moved_from_abroad,pct_moved_from_diff_county_diff_state,pct_moved_from_diff_county_same_state,pct_moved_within_same_county,pct_nat_res_constr_maint_occupations,pct_nhpi,pct_non_hispanic_black,pct_non_hispanic_white,pct_not_proficient_in_english,pct_other_services,pct_prod_transp_mat_moving_occupations,pct_professional_management_and_scientific_services,pct_public_administration,pct_retail_trade,pct_rural,pct_sales_and_office_occupations,pct_service_occupations,pct_transportation_and_utilities,pct_under_18,pct_wholesale_trade,physical_inactivity,poor_mental_health_days,poor_or_fair_health,poor_physical_health_days,pop_density,population,poverty,premature_age_adjusted_mortality,premature_death,preventable_hospital_stays,ratio_of_pop_to_dentists,ratio_of_pop_to_mental_health,ratio_of_pop_to_pcp,ratio_of_pop_to_pcp_other_than_physicians,severe_housing_cost_burden,severe_housing_problems,sexually_transmitted_infections,single_parent_households,social_associations,some_college,teen_births,unemployment_rate,uninsured,uninsured_adults,uninsured_children,violent_crime
0,1001,2010,"Autauga County, Alabama",Autauga,Alabama,AL,0.659871,0.3,0.2814,12.9412,0.282668,0.161946,0.138,0.295363,0.217477,0.000259,0.11896,0.0,0.868566,0.160546,7.259172,0.13667,0.128822,0.124827,66.048439,0.405,0.853323,0.71519,184.26061,0.77518,21530.0,36.219406,3.930128,73.1357,0.367626,32.536382,75.607833,0.102426,31.206695,-86.64449,0.0883,0.668719,36.2,53255.0,6.3e-05,16.51,28.8,0.000201,52.224775,0.114006,0.360926,0.248048,0.009449,0.004963,0.086767,0.006547,0.0774,0.174315,0.515003,0.264694,0.361352,0.24632,0.127635,0.065322,0.023196,0.14426,0.009737,0.002029,0.002897,0.011257,0.290109,0.360473,0.249884,0.099535,0.141284,0.293291,0.002,0.03,0.037,0.068,0.11569,0.0,0.178215,0.775524,0.009389,0.04552,0.131096,0.069964,0.116676,0.126659,0.422819,0.290703,0.169221,0.044246,0.27702,0.031141,0.296,4.14,0.2586,5.48,89.42139,53155.0,0.188822,440.531154,9778.1,91.803383,3922.127612,32069.013341,1914.8,5211.48703,0.14426,0.096612,370.0,0.179785,13.448252,0.501062,47.513321,0.062076,0.102987,0.14,0.028895,256.487012
1,1003,2010,"Baldwin County, Alabama",Baldwin,Alabama,AL,0.719029,0.245,0.2309,11.8769,0.311694,0.20436,0.193264,0.285221,0.268221,0.000478,0.11214,0.096951,0.835134,0.177384,7.839907,0.13262,0.133441,0.120058,68.766137,0.439,0.875757,0.670437,176.099286,0.767301,101093.0,63.589498,4.21444,73.58463,0.334602,30.659218,77.707111,0.05317,33.724273,-87.746067,0.0877,0.664884,41.0,50147.0,0.000788,21.87,23.6,0.000383,70.74137,0.164912,0.322036,0.279969,0.018619,0.007428,0.095237,0.006536,0.107106,0.179215,0.51134,0.224343,0.31629,0.286409,0.172958,0.068492,0.038955,0.174773,0.011083,0.004721,0.006064,0.01732,0.24223,0.328048,0.27323,0.156492,0.092078,0.315219,0.004,0.054,0.027,0.086,0.134832,3.4e-05,0.093793,0.840777,0.022074,0.0493,0.105578,0.098688,0.04595,0.137723,0.423983,0.268785,0.175586,0.05354,0.233084,0.036729,0.25143,4.06,0.1338,3.57,110.576018,175791.0,0.130669,361.512003,8221.7,65.391299,2090.375511,1453.563213,1413.6,2761.146058,0.174773,0.116501,245.9,0.138206,11.396557,0.57681,31.854171,0.065605,0.118486,0.209,0.04587,171.623233
2,1005,2010,"Barbour County, Alabama",Barbour,Alabama,AL,0.39743,0.364,0.2268,12.3622,0.437596,0.362199,0.341,0.519583,0.135124,0.000361,0.14666,0.246521,0.789877,0.127674,5.316227,0.22771,0.151423,0.159879,74.681632,0.464,0.718629,0.563422,379.029915,0.680449,12011.0,13.573725,5.835112,57.402919,0.394364,31.87067,75.719796,0.113274,26.762889,-85.405456,0.11,0.635756,38.0,33219.0,5e-05,20.08,30.4,0.000178,40.744262,0.137406,0.36745,0.270226,0.040973,0.005806,0.058432,0.001661,0.063478,0.150066,0.471028,0.23331,0.317621,0.280524,0.168544,0.037945,0.045958,0.171885,0.013783,0.003675,0.003141,0.008174,0.217445,0.411821,0.261057,0.109678,0.282571,0.265819,0.0,0.026,0.07,0.074,0.103441,0.000108,0.464674,0.472111,0.021339,0.03956,0.260773,0.039156,0.062872,0.124836,0.678638,0.213442,0.156524,0.079221,0.224918,0.012716,0.33765,3.84,0.2442,6.11,31.302857,27699.0,0.174806,447.146,10686.1,92.297198,2767.582778,22805.17,2454.333333,6964.169271,0.171885,0.15804,585.7,0.225625,8.38111,0.36485,87.486157,0.095894,0.150615,0.151,0.051303,64.00531
3,1007,2010,"Bibb County, Alabama",Bibb,Alabama,AL,0.324089,0.317,0.249137,12.7501,0.299896,0.161418,0.242,0.354949,0.100252,0.000179,0.12166,0.0,0.849911,0.122094,7.662539,0.15691,0.139013,0.136308,69.83018,0.411,0.745458,0.603834,105.184236,0.82919,8885.0,14.271285,4.332317,87.046578,0.395793,33.015893,73.576596,0.041115,43.250507,-87.127148,0.0911,0.610994,38.3,41770.0,7.3e-05,10.39,41.3,0.000354,42.105263,0.12437,0.379832,0.261477,0.038191,0.003612,0.039312,0.001592,0.106426,0.137387,0.462096,0.224445,0.355475,0.264261,0.155819,0.057236,0.005927,0.116541,0.001881,0.0,0.001769,0.008046,0.242805,0.400756,0.259086,0.097352,0.180059,0.200835,0.002,0.01,0.03,0.053,0.18566,0.0,0.152587,0.83609,0.008245,0.082595,0.252572,0.046746,0.039923,0.137081,0.697744,0.22874,0.132193,0.104288,0.234321,0.022711,0.3561,5.31,0.1817,4.22,36.316686,22610.0,0.173944,540.402163,13069.5,96.794256,5597.575,7277.42302,2375.0,5649.650535,0.116541,0.066255,284.0,0.187878,10.242723,0.322305,30.716724,0.091338,0.13513,0.177,0.034706,127.580788
4,1009,2010,"Blount County, Alabama",Blount,Alabama,AL,0.234746,0.315,0.2342,12.3118,0.178018,0.201278,0.185,0.252963,0.125272,0.00019,0.12005,0.005384,0.806364,0.088601,8.311729,0.12172,0.150667,0.137675,73.733161,0.422,0.746593,0.730135,78.381994,0.820044,23482.0,36.41906,4.310364,94.632943,0.367124,33.977448,75.623747,0.018181,62.551583,-86.567246,0.0726,0.607473,38.3,45549.0,4.6e-05,14.31,28.6,0.000115,19.987281,0.142066,0.345216,0.264923,0.031733,0.006218,0.064802,0.004339,0.109973,0.17518,0.504692,0.235216,0.335593,0.270131,0.159059,0.046264,0.074896,0.152476,0.007959,0.005047,0.003845,0.014248,0.260613,0.355021,0.259615,0.124751,0.167813,0.24658,0.006,0.013,0.037,0.071,0.157573,0.0,0.011554,0.894959,0.042746,0.058609,0.179268,0.066947,0.040759,0.11568,0.899569,0.256294,0.160285,0.069619,0.247795,0.038371,0.31445,4.47,0.2493,5.62,87.925617,56692.0,0.196195,446.840257,8929.7,102.021063,5381.771,21293.650834,5003.181818,11049.19419,0.152476,0.068867,85.1,0.142587,8.432691,0.382355,36.640361,0.07527,0.117416,0.208,0.046817,93.782854


In [73]:
df.columns.tolist()

['geo_code',
 'year',
 'geo_full_name',
 'geo_name',
 'state',
 'state_code',
 'access_to_exercise_opportunities',
 'adult_obesity',
 'adult_smoking',
 'air_pollution_particulate_matter',
 'alcohol_impaired_driving_deaths',
 'children_eligible_for_free_lunch',
 'children_in_poverty',
 'children_in_single_parent_households',
 'college_completion',
 'dentists',
 'diabetes_prevalence',
 'drinking_water_violations',
 'driving_alone_to_work',
 'excessive_drinking',
 'food_environment_index',
 'food_insecurity',
 'frequent_mental_distress',
 'frequent_physical_distress',
 'gender_pay_gap',
 'gini_index',
 'high_school_completion',
 'high_school_graduation',
 'hiv_prevalence',
 'homeownership',
 'housing_units',
 'hu_density',
 'income_inequality',
 'injury_deaths',
 'insufficient_sleep',
 'latitude',
 'life_expectancy',
 'limited_access_to_healthy_foods',
 'long_commute_driving_alone',
 'longitude',
 'low_birthweight',
 'mammography_screening',
 'median_age',
 'median_household_income',
 'me

# Initial VIF

In [74]:
numeric_data = df.select_dtypes(include=[np.number])
X = numeric_data.drop("life_expectancy", axis=1)  # Drop the target variable if it’s in the DataFrame
X = StandardScaler().fit_transform(X)  # Standardize features

vif_data = pd.DataFrame()
vif_data["feature"] = numeric_data.columns.drop("life_expectancy")
vif_data["VIF"] = [variance_inflation_factor(X, i) for i in range(X.shape[1])]

# Display the VIF values
pd.DataFrame(vif_data).sort_values(by='VIF', ascending=False)

Unnamed: 0,feature,VIF
44,pct_age_18_44,314.917265
77,pct_non_hispanic_white,202.65836
45,pct_age_45_64,171.178154
65,pct_male_pop_18_44,154.735859
54,pct_female_pop_18_44,131.22492
76,pct_non_hispanic_black,107.239764
43,pct_65_and_older,99.01937
58,pct_hispanic,93.137492
39,mobility_index_std,86.806845
67,pct_male_pop_65+,81.214402


# Correlations

In [75]:
numeric_data = df.select_dtypes(include=[np.number])

# Calculating the correlation matrix between life expectancy and all other numeric variables
correlation_matrix = numeric_data.corr()

# Selecting correlations with life expectancy and sorting them
life_expectancy_corr = correlation_matrix["life_expectancy"].sort_values(
    ascending=False
)

# Display the top correlations (both positive and negative) with life expectancy
top_positive_corr = life_expectancy_corr.head(15)
top_negative_corr = life_expectancy_corr.tail(15)

pd.DataFrame(top_positive_corr)

Unnamed: 0,life_expectancy
life_expectancy,1.0
college_completion,0.573507
some_college,0.549874
median_household_income,0.531112
high_school_completion,0.476117
pct_mgmt_bus_sci_art_occupations,0.454159
food_environment_index,0.438653
excessive_drinking,0.400704
latitude,0.346018
access_to_exercise_opportunities,0.323685


In [77]:
pd.DataFrame(top_negative_corr)

Unnamed: 0,life_expectancy
children_eligible_for_free_lunch,-0.551991
adult_obesity,-0.56459
insufficient_sleep,-0.572529
food_insecurity,-0.574466
poverty,-0.574923
adult_smoking,-0.580674
poor_physical_health_days,-0.591908
diabetes_prevalence,-0.595842
children_in_poverty,-0.599604
frequent_mental_distress,-0.603338


# Eliminating Redundant Features

- ```'children_eligible_for_free_lunch'``` and ```'children_in_poverty'``` are essentially the same since children in poverty are the very ones that would be eligible for free lunch; there is strong overlap with ```'children_in_single_parent_households'```, but is not synonymous with them.
- ```'gini_index'``` and ```'income_inequality'``` are essentially the same also
- ```'single_parent_households'``` and ```'children_in_single_parent_households'``` seem redundant, but would like input on that matter
- eliminate ```'poverty'``` as it is redundant with ```'children_in_poverty'```
- ```'pct_aapi'``` shoudl be removed since it is just an amalgam of ```'pct_asian'``` and ```'pct_nhpi'```
-  ```'high_school_completion'``` and ```'high_school_graduation'```, as well as ```'some_college'``` and ```'college_completion'```
-  feature enginneer industry categories:
    - ```'pct_agriculture_hunting_and_mining'```, ```'pct_arts_entertainment_and_accommodation_services'```, ```'pct_construction'```, ```'pct_educational_and_health_services'```, ```'pct_finance_insurance_and_real_estate'```, ```'pct_information'```,```'pct_manufacturing'```, ```'pct_public_administration'```, ```'pct_retail_trade'```, ```'pct_other_services'```, ```'pct_transportation_and_utilities'```, ```'pct_wholesale_trade'```, ```'pct_professional_management_and_scientific_services'```,
-  remove occupation categories (having both would be redundant):
    -  ```'pct_mgmt_bus_sci_art_occupations', 'pct_nat_res_constr_maint_occupations',  'pct_prod_transp_mat_moving_occupations', 'pct_sales_and_office_occupations', 'pct_service_occupations'```
- ```'severe_housing_problems'``` is a composite score of the following three:
```'pct_households_with_high_housing_costs','pct_households_with_lack_of_kitchen_or_plumbing_facilities','pct_households_with_overcrowding'```, so we keep one or the other



- Use the CHR website definitions in deciding whether to eliminate redundant features or combining features:
    - remaining Health Behaviors: ```'access_to_exercise_opportunities', 'adult_obesity', 'physical_inactivity', 'adult_smoking', 'alcohol_impaired_driving_deaths', 'excessive_drinking', 'food_environment_index', 'food_insecurity', 'limited_access_to_healthy_foods', 'insufficient_sleep',  'sexually_transmitted_infections', 'teen_births'```, here we have the food and inactivity/obesity features
        - ```'food_insecurity'``` and ```'limited_access_to_healthy_foods'``` are conceptually very similar, perhaps combining them would be best
        - ```'access_to_exercise_opportunities'``` and  ```'physical_inactivity'``` are conceptually similar also, where they are reporting two sides of the coin, where one is reporting no leisure-time physical inactivity and the other access to locations for physical activity
        - we could find the average of the ```'access_to_exercise_opportunities'``` and ```1 - 'physical_inactivity'```
    - Demographics: ```'pct_65_and_older', 'pct_asian', 'pct_female', 'pct_hispanic', 'pct_indigenous', 'pct_non_hispanic_black', 'pct_non_hispanic_white', 'pct_not_proficient_in_english', 'pct_rural', 'pct_under_18'```, we should drop one of the race categories at least and/or form a diversity index
        - keeping individual racial categories will allow us to tie one of the groups to greater life expectancy (we know ```pct_asian``` is more correlated with such counties, we would expect ```pct_white``` to be correlated due to greater wealth amongst those populations)
        - just like we don't have ```pct_urban``` and ```pct_rural``` and ```pct_male``` and ```pct_female```, keeping all the racial categories would create collinearity
    - Social and Economic Factors: ```'children_in_poverty', 'college_completion', 'gender_pay_gap', 'gini_index', 'high_school_completion', 'injury_deaths', 'median_household_income', 'severe_housing_cost_burden', 'single_parent_households', 'social_associations', 'unemployment_rate', 'violent_crime', 'motor_vehicle_crash_deaths```
    - Clinical Care:  ```'dentists', 'mammography_screening', 'mental_health_providers', 'other_pcp', 'pcp', 'preventable_hospital_stays','ratio_of_pop_to_dentists', 'ratio_of_pop_to_mental_health', 'ratio_of_pop_to_pcp', 'ratio_of_pop_to_pcp_other_than_physicians', 'uninsured_adults'``` - we can keep ```'ratio_of_pop...'``` and drop the following:  ```'dentists', 'pcp', 'mental_health_providers', 'other_pcp'```, as they are synonymous with each other
    - Physical Environment: ```'air_pollution_particulate_matter', 'drinking_water_violations', 'driving_alone_to_work', 'homeownership', 'long_commute_driving_alone', 'severe_housing_problems'```
        - here driving alone and long commute driving alone seems redundant, keep ```"driving_alone_to_work"```
    - Health Outcomes: ```'diabetes_prevalence', 'frequent_mental_distress', 'hiv_prevalence', 'low_birthweight', 'poor_mental_health_days', 'poor_or_fair_health', 'poor_physical_health_days',  'premature_age_adjusted_mortality', 'premature_death', 'life_expectancy'```, here we can combine or eliminate the poor health and the premature mortality/death features
        - I know Christine was highlighting the importance of psychological factors in life expectancy, we should keep ```"poor_mental_health_days"```
        - ```"poor_mental_health_days"``` and ```"poor_physical_health_days"``` are representatives of average number of days respondents reporting poor mental or physical health, while ```"poor_or_fair_health"``` is the percentage of adults reporting fair or poor health, eliminate ```"poor_physical_health_days"``` as combining ```"poor_physical_health_days"```
        - ```'premature_age_adjusted_mortality'``` and ```'premature_death'``` are the same concepts where one represents the average number of years lost and the other represents the number of deaths among residents under age 75 per 100,000 population, keep the latter

- Additional thoughts:
    - there could be strong overlap with adult obesity and diabetes prevalence
    - frequent_mental_distress and poor_mental_health_days could also have strong overlap

- Recent Changes:
    - instead of feature engineering the industry categories, I opted to keep the ones that were the most correlated with life expectancy, i.e. ```'pct_finance_insurance_and_real_estate'``` and ```'pct_professional_management_and_scientific_services'```
    - I also opted to keep the racial categories that are the most correlated with life expectancy:  ```'pct_non_hispanic_white', 'pct_hispanic', 'pct_asian'```
    - Upon closer inspection of the features, ```'ratio_to_pop_to_pcp', 'pcp', 'dentists', 'ratio_to_pop_to_dentists', 'mental_health_providers', 'ratio_of_pop_to_mental_health'``` are the same pairs of variables.















In [78]:
data = df.drop(
    columns=[
        "children_in_single_parent_households",
        "children_eligible_for_free_lunch",
        "poverty",
        # "pct_aapi",
        "pct_households_with_high_housing_costs",
        "pct_households_with_lack_of_kitchen_or_plumbing_facilities",
        "pct_households_with_overcrowding",
        "severe_housing_cost_burden",
        "gini_index",
        # "dentists",
        # "mental_health_providers",
        "other_pcp",
        # "pcp",
        "ratio_of_pop_to_dentists",
        "ratio_of_pop_to_mental_health",
        "ratio_of_pop_to_pcp",
        "ratio_of_pop_to_pcp_other_than_physicians",
        "long_commute_driving_alone",
        "pct_mgmt_bus_sci_art_occupations",
        "pct_service_occupations",
        "pct_sales_and_office_occupations",
        "pct_nat_res_constr_maint_occupations",
        "pct_prod_transp_mat_moving_occupations",
        "high_school_graduation",
        "some_college",
        "hiv_prevalence",
        "premature_age_adjusted_mortality",
        "alcohol_impaired_driving_deaths",
        "injury_deaths",
        "high_school_completion",
        "median_age"
    ]
)

# Feature Engineering

## Industry Categories

In [269]:
# data['blue_collar_industries'] = data['pct_agriculture_hunting_and_mining'] + data['pct_construction'] + data['pct_manufacturing'] + data['pct_transportation_and_utilities']
# data['service_industries'] = data['pct_arts_entertainment_and_accommodation_services'] + data['pct_retail_trade'] + data['pct_wholesale_trade'] + data['pct_public_administration']
# data['white_collar_industries'] = data['pct_finance_insurance_and_real_estate'] + data['pct_information'] + data['pct_professional_management_and_scientific_services'] + data['pct_educational_and_health_services']

In [79]:
data = data.drop(
    columns=[
        "pct_agriculture_hunting_and_mining",
        "pct_construction",
        "pct_manufacturing",
        "pct_wholesale_trade",
        "pct_retail_trade",
        "pct_transportation_and_utilities",
        "pct_information",
        # "pct_finance_insurance_and_real_estate",
        # "pct_professional_management_and_scientific_services",
        "pct_educational_and_health_services",
        "pct_arts_entertainment_and_accommodation_services",
        "pct_public_administration",
        "pct_other_services",
    ]
)

In [309]:
# df['avg_blue_collar_industries'][df['avg_blue_collar_industries'] < 0] = 0
# df['avg_service_industries'][df['avg_service_industries'] < 0] = 0
# df['avg_white_collar_industries'][df['avg_white_collar_industries'] < 0] = 0

## Combining Food Features

In [80]:
data[['food_environment_index', 'food_insecurity', 'limited_access_to_healthy_foods']].head(20)

Unnamed: 0,food_environment_index,food_insecurity,limited_access_to_healthy_foods
0,7.259172,0.13667,0.102426
1,7.839907,0.13262,0.05317
2,5.316227,0.22771,0.113274
3,7.662539,0.15691,0.041115
4,8.311729,0.12172,0.018181
5,3.896644,0.2452,0.202017
6,6.217646,0.22405,0.028306
7,6.450438,0.17418,0.105953
8,5.93939,0.20667,0.087866
9,8.051932,0.1478,0.001391


In [81]:
data['food_access_score'] = (data['food_insecurity'] + data['limited_access_to_healthy_foods'] + data['food_environment_index']) / 3

In [82]:
data.drop(['food_insecurity', 'limited_access_to_healthy_foods', 'food_environment_index'], axis=1, inplace=True)

## Combining Exercise Features





In [83]:
data[['access_to_exercise_opportunities', 'physical_inactivity']].head(20)

Unnamed: 0,access_to_exercise_opportunities,physical_inactivity
0,0.659871,0.296
1,0.719029,0.25143
2,0.39743,0.33765
3,0.324089,0.3561
4,0.234746,0.31445
5,0.247052,0.32024
6,0.480381,0.36486
7,0.508596,0.33374
8,0.548273,0.36018
9,0.292137,0.3517


In [84]:
for index, row in data.iterrows():
    data.loc[index, "exercise_access_score"] = (
        row["physical_inactivity"] + 
        row["access_to_exercise_opportunities"]) / 2

In [85]:
data = data.drop(['access_to_exercise_opportunities', 'physical_inactivity'], axis=1)

## Forming Diversity Index

In [86]:
for index, row in data.iterrows():
    data.loc[index, "diversity_index"] = 1 - (
        row["pct_hispanic"] ** 2
        + row["pct_non_hispanic_white"] ** 2
        + row["pct_non_hispanic_black"] ** 2
        + row["pct_indigenous"] ** 2
        + row["pct_asian"] ** 2
        + row["pct_nhpi"] ** 2
    )

In [87]:
data = data.drop(
    columns=[
        "pct_indigenous",
        "pct_nhpi",
        "pct_non_hispanic_white",
    ]
)

## Combining Clinicians

In [88]:
data['poor_health'] = (data['poor_mental_health_days'] + data['poor_physical_health_days'] + data['poor_or_fair_health']) / 3

In [89]:
data = data.drop(
    columns=[
        "poor_mental_health_days",
        "poor_physical_health_days",
        "poor_or_fair_health",
    ]
)

## Remaining Features

In [25]:
data.columns.tolist()

['geo_code',
 'year',
 'geo_full_name',
 'geo_name',
 'state',
 'state_code',
 'adult_obesity',
 'adult_smoking',
 'air_pollution_particulate_matter',
 'children_in_poverty',
 'college_completion',
 'dentists',
 'diabetes_prevalence',
 'drinking_water_violations',
 'driving_alone_to_work',
 'excessive_drinking',
 'frequent_mental_distress',
 'frequent_physical_distress',
 'gender_pay_gap',
 'homeownership',
 'housing_units',
 'hu_density',
 'income_inequality',
 'insufficient_sleep',
 'latitude',
 'life_expectancy',
 'longitude',
 'low_birthweight',
 'mammography_screening',
 'median_household_income',
 'mental_health_providers',
 'mobility_index_std',
 'motor_vehicle_crash_deaths',
 'pcp',
 'pct_65_and_older',
 'pct_age_18_44',
 'pct_age_45_64',
 'pct_aian',
 'pct_asian',
 'pct_female',
 'pct_female_pop_0_17',
 'pct_female_pop_18_44',
 'pct_female_pop_45_64',
 'pct_female_pop_65+',
 'pct_finance_insurance_and_real_estate',
 'pct_hispanic',
 'pct_male_pop_0_17',
 'pct_male_pop_18_44',


# Variance Inflation Factor (VIF)

In [90]:
data = data.drop(
    columns=[
        "pct_age_18_44",
        "pct_age_45_64",
        "pct_65_and_older",
        "pct_under_18",
        "pct_moved_from_diff_county_diff_state",
        "pct_moved_from_diff_county_same_state",
        "pct_moved_within_same_county",
        "pct_male_pop_18_44",
        "pct_female_pop_18_44",
        "pct_female_pop_65+",
        "pct_male_pop_65+",
        "pct_male_pop_0_17",
        "pct_female_pop_0_17",
        "pct_female_pop_45_64",
        "uninsured_adults",
        "frequent_physical_distress",
        "pct_non_hispanic_black",
        "diversity_index",
        "uninsured",
    ]
)

In [91]:
numeric_data = data.select_dtypes(include=[np.number])
X = numeric_data.drop(columns =
    ["life_expectancy", "year"], axis=1
)  # Drop the target variable if it’s in the DataFrame
X = StandardScaler().fit_transform(X)  # Standardize features

vif_data = pd.DataFrame()
vif_data["feature"] = numeric_data.columns.drop(["life_expectancy", "year"])
vif_data["VIF"] = [variance_inflation_factor(X, i) for i in range(X.shape[1])]

# Display the VIF values
pd.DataFrame(vif_data).sort_values(by="VIF", ascending=False)

Unnamed: 0,feature,VIF
36,pop_density,51.572174
14,hu_density,48.953206
13,housing_units,45.726183
37,population,44.638425
4,college_completion,5.248214
21,median_household_income,5.20953
10,frequent_mental_distress,4.728826
16,insufficient_sleep,3.958601
35,pct_rural,3.819973
30,pct_hispanic,3.697267


In [92]:
# Calculating the correlation matrix between life expectancy and all other numeric variables
correlation_matrix = numeric_data.corr()

# Selecting correlations with life expectancy and sorting them
life_expectancy_corr = correlation_matrix["life_expectancy"].sort_values(
    ascending=False
)

# Display the top correlations (both positive and negative) with life expectancy
top_positive_corr = life_expectancy_corr.head(20)
top_negative_corr = life_expectancy_corr.tail(20)

pd.DataFrame(top_positive_corr)

Unnamed: 0,life_expectancy
life_expectancy,1.0
college_completion,0.573507
median_household_income,0.531112
food_access_score,0.448437
excessive_drinking,0.400704
latitude,0.346018
pct_asian,0.320067
pct_finance_insurance_and_real_estate,0.310775
pct_professional_management_and_scientific_services,0.297328
mammography_screening,0.279283


In [93]:
pd.DataFrame(top_negative_corr)

Unnamed: 0,life_expectancy
preventable_hospital_stays,-0.170722
pct_rural,-0.187677
teen_births,-0.204047
pct_aian,-0.209122
violent_crime,-0.231742
income_inequality,-0.263399
driving_alone_to_work,-0.314795
unemployment_rate,-0.331319
motor_vehicle_crash_deaths,-0.372814
sexually_transmitted_infections,-0.373562


In [94]:
data.head()

Unnamed: 0,geo_code,year,geo_full_name,geo_name,state,state_code,adult_obesity,adult_smoking,air_pollution_particulate_matter,children_in_poverty,college_completion,dentists,diabetes_prevalence,drinking_water_violations,driving_alone_to_work,excessive_drinking,frequent_mental_distress,gender_pay_gap,homeownership,housing_units,hu_density,income_inequality,insufficient_sleep,latitude,life_expectancy,longitude,low_birthweight,mammography_screening,median_household_income,mental_health_providers,mobility_index_std,motor_vehicle_crash_deaths,pcp,pct_aian,pct_asian,pct_female,pct_finance_insurance_and_real_estate,pct_hispanic,pct_male_pop_45_64,pct_moved_from_abroad,pct_not_proficient_in_english,pct_professional_management_and_scientific_services,pct_rural,pop_density,population,premature_death,preventable_hospital_stays,severe_housing_problems,sexually_transmitted_infections,single_parent_households,social_associations,teen_births,unemployment_rate,uninsured_children,violent_crime,food_access_score,exercise_access_score,poor_health
0,1001,2010,"Autauga County, Alabama",Autauga,Alabama,AL,0.3,0.2814,12.9412,0.138,0.217477,0.000259,0.11896,0.0,0.868566,0.160546,0.128822,66.048439,0.77518,21530.0,36.219406,3.930128,0.367626,32.536382,75.607833,-86.64449,0.0883,0.668719,53255.0,6.3e-05,16.51,28.8,52.224775,0.004963,0.006547,0.515003,0.065322,0.023196,0.249884,0.002,0.009389,0.069964,0.422819,89.42139,53155.0,9778.1,91.803383,0.096612,370.0,0.179785,13.448252,47.513321,0.062076,0.028895,256.487012,2.499423,0.477935,3.292867
1,1003,2010,"Baldwin County, Alabama",Baldwin,Alabama,AL,0.245,0.2309,11.8769,0.193264,0.268221,0.000478,0.11214,0.096951,0.835134,0.177384,0.133441,68.766137,0.767301,101093.0,63.589498,4.21444,0.334602,30.659218,77.707111,-87.746067,0.0877,0.664884,50147.0,0.000788,21.87,23.6,70.74137,0.007428,0.006536,0.51134,0.068492,0.038955,0.27323,0.004,0.022074,0.098688,0.423983,110.576018,175791.0,8221.7,65.391299,0.116501,245.9,0.138206,11.396557,31.854171,0.065605,0.04587,171.623233,2.675232,0.48523,2.587933
2,1005,2010,"Barbour County, Alabama",Barbour,Alabama,AL,0.364,0.2268,12.3622,0.341,0.135124,0.000361,0.14666,0.246521,0.789877,0.127674,0.151423,74.681632,0.680449,12011.0,13.573725,5.835112,0.394364,31.87067,75.719796,-85.405456,0.11,0.635756,33219.0,5e-05,20.08,30.4,40.744262,0.005806,0.001661,0.471028,0.037945,0.045958,0.261057,0.0,0.021339,0.039156,0.678638,31.302857,27699.0,10686.1,92.297198,0.15804,585.7,0.225625,8.38111,87.486157,0.095894,0.051303,64.00531,1.885737,0.36754,3.398067
3,1007,2010,"Bibb County, Alabama",Bibb,Alabama,AL,0.317,0.249137,12.7501,0.242,0.100252,0.000179,0.12166,0.0,0.849911,0.122094,0.139013,69.83018,0.82919,8885.0,14.271285,4.332317,0.395793,33.015893,73.576596,-87.127148,0.0911,0.610994,41770.0,7.3e-05,10.39,41.3,42.105263,0.003612,0.001592,0.462096,0.057236,0.005927,0.259086,0.002,0.008245,0.046746,0.697744,36.316686,22610.0,13069.5,96.794256,0.066255,284.0,0.187878,10.242723,30.716724,0.091338,0.034706,127.580788,2.620188,0.340095,3.237233
4,1009,2010,"Blount County, Alabama",Blount,Alabama,AL,0.315,0.2342,12.3118,0.185,0.125272,0.00019,0.12005,0.005384,0.806364,0.088601,0.150667,73.733161,0.820044,23482.0,36.41906,4.310364,0.367124,33.977448,75.623747,-86.567246,0.0726,0.607473,45549.0,4.6e-05,14.31,28.6,19.987281,0.006218,0.004339,0.504692,0.046264,0.074896,0.259615,0.006,0.042746,0.066947,0.899569,87.925617,56692.0,8929.7,102.021063,0.068867,85.1,0.142587,8.432691,36.640361,0.07527,0.046817,93.782854,2.81721,0.274598,3.446433


In [95]:
data.to_csv('data/chr_census_cleaned.csv')