In [2]:
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SequentialFeatureSelector, SelectFromModel
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import set_config
set_config(display="diagram")


import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px

Matplotlib is building the font cache; this may take a moment.


In [3]:
# Load the data
states = pd.read_csv("data/qol_states_2024.csv")
states.head()

Unnamed: 0,state,QualityOfLifeTotalScore,QualityOfLifeQualityOfLife,QualityOfLifeAffordability,QualityOfLifeEconomy,QualityOfLifeEducationAndHealth,QualityOfLifeSafety
0,Alabama,45.61,40,1,40,48,32
1,Alaska,40.93,50,42,22,30,45
2,Arizona,48.31,21,25,14,39,40
3,Arkansas,42.42,46,4,34,45,47
4,California,52.03,2,50,15,24,27


In [4]:
# simple cross validation
simple_X = states.drop(columns=['state', 'QualityOfLifeTotalScore'], axis = 1)
simple_y = states['QualityOfLifeTotalScore']
simple_X_train, simple_X_test, simple_y_train, simple_y_test = train_test_split(simple_X, simple_y, test_size=0.3, random_state=42)


In [24]:
from sklearn.linear_model import Ridge

# ridge model with standardization
ridge_param_dict = {'ridge__alpha': np.logspace(0, 10, 50)}
ridge_pipe = Pipeline([('scaler', StandardScaler()), 
                      ('ridge', Ridge())])
ridge_grid = GridSearchCV(ridge_pipe, param_grid=ridge_param_dict)
ridge_grid.fit(simple_X_train, simple_y_train)
ridge_train_preds = ridge_grid.predict(simple_X_train)
ridge_test_preds = ridge_grid.predict(simple_X_test)
ridge_train_mse = mean_squared_error(simple_y_train, ridge_train_preds)
ridge_test_mse = mean_squared_error(simple_y_test, ridge_test_preds)

best_ridge_coefs = ridge_grid.best_estimator_.named_steps['ridge'].coef_
feature_names = ridge_grid.best_estimator_.named_steps['scaler'].get_feature_names_out()
ridge_df = pd.DataFrame({'feature': feature_names, 'coef': best_ridge_coefs})

print(f"Ridge simple cross validation Train MSE: {ridge_train_mse}")
print(f"Ridge simple cross validation Test MSE: {ridge_test_mse}")
print(f"Best Ridge Alpha: {ridge_grid.best_params_}")
ridge_df

Ridge simple cross validation Train MSE: 1.3942594497108203
Ridge simple cross validation Test MSE: 0.5383255811786083
Best Ridge Alpha: {'ridge__alpha': 1.0}


Unnamed: 0,feature,coef
0,QualityOfLifeQualityOfLife,-1.895126
1,QualityOfLifeAffordability,-1.821392
2,QualityOfLifeEconomy,-1.791605
3,QualityOfLifeEducationAndHealth,-2.783066
4,QualityOfLifeSafety,-3.151602


In [None]:
# k-fold cross validation
