In [54]:
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SequentialFeatureSelector, SelectFromModel
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import set_config
set_config(display="diagram")


import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px

In [55]:
# Load the data
states = pd.read_csv("data/qol_states_2024.csv")
states.head()

Unnamed: 0,state,QualityOfLifeTotalScore,QualityOfLifeQualityOfLife,QualityOfLifeAffordability,QualityOfLifeEconomy,QualityOfLifeEducationAndHealth,QualityOfLifeSafety
0,Alabama,45.61,40,1,40,48,32
1,Alaska,40.93,50,42,22,30,45
2,Arizona,48.31,21,25,14,39,40
3,Arkansas,42.42,46,4,34,45,47
4,California,52.03,2,50,15,24,27


In [56]:
# simple cross validation
X = states.drop(columns=['state', 'QualityOfLifeTotalScore'], axis = 1)
y = states['QualityOfLifeTotalScore']
simple_X_train, simple_X_test, simple_y_train, simple_y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [57]:
from sklearn.linear_model import Ridge

# ridge model with standardization
ridge_param_dict = {'ridge__alpha': np.logspace(0, 10, 50)}
ridge_pipe = Pipeline([('scaler', StandardScaler()), 
                      ('ridge', Ridge())])
ridge_grid = GridSearchCV(ridge_pipe, param_grid=ridge_param_dict, scoring='neg_mean_squared_error')
ridge_grid.fit(simple_X_train, simple_y_train)
ridge_train_preds = ridge_grid.predict(simple_X_train)
ridge_test_preds = ridge_grid.predict(simple_X_test)
ridge_train_mse = mean_squared_error(simple_y_train, ridge_train_preds)
ridge_test_mse = mean_squared_error(simple_y_test, ridge_test_preds)

best_ridge_model = ridge_grid.best_estimator_.named_steps['ridge']
best_ridge_coefs = best_ridge_model.coef_
feature_names = ridge_grid.best_estimator_.named_steps['scaler'].get_feature_names_out()

ridge_df = pd.DataFrame({'feature': feature_names, 'coef': best_ridge_coefs})

print(f"Ridge simple cross validation Train MSE: {ridge_train_mse}")
print(f"Ridge simple cross validation Test MSE: {ridge_test_mse}")
print(f"Best Ridge Alpha: {ridge_grid.best_params_}")
ridge_df

Ridge simple cross validation Train MSE: 1.3942594497108203
Ridge simple cross validation Test MSE: 0.5383255811786083
Best Ridge Alpha: {'ridge__alpha': 1.0}


Unnamed: 0,feature,coef
0,QualityOfLifeQualityOfLife,-1.895126
1,QualityOfLifeAffordability,-1.821392
2,QualityOfLifeEconomy,-1.791605
3,QualityOfLifeEducationAndHealth,-2.783066
4,QualityOfLifeSafety,-3.151602


In [60]:
# k-fold cross validation
k_fold_model_finder = GridSearchCV(ridge_pipe, param_grid=ridge_param_dict, cv=5, scoring='neg_mean_squared_error' )
k_fold_model_finder.fit(X, y)
kf_ridge_train_preds = k_fold_model_finder.predict(simple_X_train)
kf_ridge_test_preds = k_fold_model_finder.predict(simple_X_test)
kf_ridge_train_mse = mean_squared_error(simple_y_train, kf_ridge_train_preds)
kf_ridge_test_mse = mean_squared_error(simple_y_test, kf_ridge_test_preds)

best_k_fold_model = k_fold_model_finder.best_estimator_.named_steps['ridge']
best_k_fold_coefs = best_k_fold_model.coef_
feature_names = k_fold_model_finder.best_estimator_.named_steps['scaler'].get_feature_names_out()
kf_df = pd.DataFrame({'feature': feature_names, 'coef': best_k_fold_coefs})

print(f"Ridge k-fold cross validation Train MSE: {kf_ridge_train_mse}")
print(f"Ridge k-fold cross validation Test MSE: {kf_ridge_test_mse}")
print(f"Best Ridge Alpha: {k_fold_model_finder.best_params_}")
kf_df

Ridge k-fold cross validation Train MSE: 1.40335249207729
Ridge k-fold cross validation Test MSE: 0.49155892653814764
Best Ridge Alpha: {'ridge__alpha': 1.0}


Unnamed: 0,feature,coef
0,QualityOfLifeQualityOfLife,-1.853776
1,QualityOfLifeAffordability,-1.75462
2,QualityOfLifeEconomy,-1.713104
3,QualityOfLifeEducationAndHealth,-2.763819
4,QualityOfLifeSafety,-3.089087


In [62]:
# leave one out cross validation
loo_model_finder = GridSearchCV(ridge_pipe, param_grid=ridge_param_dict, cv=50, scoring='neg_mean_squared_error')
loo_model_finder.fit(X, y)

loo_ridge_train_preds = loo_model_finder.predict(simple_X_train)
loo_ridge_test_preds = loo_model_finder.predict(simple_X_test)
loo_ridge_train_mse = mean_squared_error(simple_y_train, loo_ridge_train_preds)
loo_ridge_test_mse = mean_squared_error(simple_y_test, loo_ridge_test_preds)

best_loo_model = loo_model_finder.best_estimator_.named_steps['ridge']
best_loo_coefs = best_loo_model.coef_
feature_names = loo_model_finder.best_estimator_.named_steps['scaler'].get_feature_names_out()
loo_df = pd.DataFrame({'feature': feature_names, 'coef': best_loo_coefs})

print(f"Ridge loo cross validation Train MSE: {loo_ridge_train_mse}")
print(f"Ridge loo cross validation Test MSE: {loo_ridge_test_mse}")
print(f"Best Ridge Alpha: {loo_model_finder.best_params_}")
loo_df

Ridge loo cross validation Train MSE: 1.40335249207729
Ridge loo cross validation Test MSE: 0.49155892653814764
Best Ridge Alpha: {'ridge__alpha': 1.0}


Unnamed: 0,feature,coef
0,QualityOfLifeQualityOfLife,-1.853776
1,QualityOfLifeAffordability,-1.75462
2,QualityOfLifeEconomy,-1.713104
3,QualityOfLifeEducationAndHealth,-2.763819
4,QualityOfLifeSafety,-3.089087
