In [228]:
# Core Libraries
import pandas as pd
import numpy as np
import joblib
import logging
import re
import warnings

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Data Processing
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer

# Feature Selection and Permutation Importance
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from sklearn.inspection import permutation_importance

# Regression Models
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.neural_network import MLPRegressor

# Classification Models (used in feature importance or metrics evaluation)
from sklearn.linear_model import LogisticRegression

# Metrics
from sklearn.metrics import (
    classification_report,
    accuracy_score,
    confusion_matrix,
    recall_score,
    f1_score,
    roc_auc_score,
    make_scorer
)

# General Settings
warnings.filterwarnings('ignore')


In [229]:
# Loading data
train_data = pd.read_csv('C:/Users/Hp/Desktop/Capstone Project/Data/train_features.csv')
train_labels = pd.read_csv('C:/Users/Hp/Desktop/Capstone Project/Data/train_labels.csv')
test_data = pd.read_csv('C:/Users/Hp/Desktop/Capstone Project/Data/test_features.csv')


In [230]:
train_data.head()

Unnamed: 0,uid,age_03,urban_03,married_03,n_mar_03,edu_gru_03,n_living_child_03,migration_03,glob_hlth_03,adl_dress_03,...,rrelgimp_12,rrfcntx_m_12,rsocact_m_12,rrelgwk_12,a16a_12,a21_12,a22_12,a33b_12,a34_12,j11_12
0,aace,,,,,,,,,,...,2.somewhat important,9.Never,9.Never,0.No,,,,,,Concrete 2
1,aanz,,,,,,,,,,...,1.very important,9.Never,1.Almost every day,0.No,,,,,,Concrete 2
2,aape,,,,,,,,,,...,2.somewhat important,6.2 or 3 times a month,2.4 or more times a week,0.No,,,,,,"Wood, mosaic, or other covering 1"
3,aard,1. 50–59,"1. 100,000+",3. Widowed,1.0,3. 7–9 years,1. 1 or 2,0.0,4. Fair,0.0,...,1.very important,4.Once a week,9.Never,1.Yes,,,,,No 2,Concrete 2
4,ablr,,,,,,,,,,...,1.very important,3.2 or 3 times a week,3.2 or 3 times a week,0.No,,,,,,"Wood, mosaic, or other covering 1"


In [231]:
train_labels.head()

Unnamed: 0,uid,year,composite_score
0,aace,2021,175
1,aanz,2021,206
2,aape,2016,161
3,aape,2021,144
4,aard,2021,104


In [232]:
test_data.head()

Unnamed: 0,uid,age_03,urban_03,married_03,n_mar_03,edu_gru_03,n_living_child_03,migration_03,glob_hlth_03,adl_dress_03,...,rrelgimp_12,rrfcntx_m_12,rsocact_m_12,rrelgwk_12,a16a_12,a21_12,a22_12,a33b_12,a34_12,j11_12
0,abxu,,,,,,,,,,...,,,,,,,,,,"Wood, mosaic, or other covering 1"
1,aeol,,,,,,,,,,...,1.very important,9.Never,9.Never,1.Yes,,,,,,Concrete 2
2,afnb,,,,,,,,,,...,1.very important,9.Never,3.2 or 3 times a week,1.Yes,,,,,,"Wood, mosaic, or other covering 1"
3,ajfh,,,,,,,,,,...,2.somewhat important,9.Never,5.4 or more times a month,0.No,,,,,,"Wood, mosaic, or other covering 1"
4,ajvq,2. 60–69,"1. 100,000+",1. Married or in civil union,1.0,4. 10+ years,1. 1 or 2,0.0,,,...,2.somewhat important,1.Almost every day,4.Once a week,0.No,,,,,No 2,"Wood, mosaic, or other covering 1"


In [233]:
# Check the data types of the 'uid' column in each dataframe
print(train_data.shape)
print(train_labels.shape)
print(test_data.shape)


(3276, 184)
(4343, 3)
(819, 184)


In [234]:
combined_data = pd.concat([train_data, test_data], axis=0, ignore_index=True)

In [235]:
# This will add labels only for rows in the training data
combined_data = pd.merge(combined_data, train_labels, on='uid', how='left')

# Check the shape and a few rows to confirm
print("Combined data shape after merging:", combined_data.shape)
print(combined_data.head())

Combined data shape after merging: (5162, 186)
    uid    age_03     urban_03  married_03  n_mar_03    edu_gru_03  \
0  aace       NaN          NaN         NaN       NaN           NaN   
1  aanz       NaN          NaN         NaN       NaN           NaN   
2  aape       NaN          NaN         NaN       NaN           NaN   
3  aape       NaN          NaN         NaN       NaN           NaN   
4  aard  1. 50–59  1. 100,000+  3. Widowed       1.0  3. 7–9 years   

  n_living_child_03  migration_03 glob_hlth_03  adl_dress_03  ...  \
0               NaN           NaN          NaN           NaN  ...   
1               NaN           NaN          NaN           NaN  ...   
2               NaN           NaN          NaN           NaN  ...   
3               NaN           NaN          NaN           NaN  ...   
4         1. 1 or 2           0.0      4. Fair           0.0  ...   

               rsocact_m_12  rrelgwk_12  a16a_12  a21_12  a22_12  a33b_12  \
0                   9.Never        0.No 

In [236]:
# Convert wide to long format
data_long = pd.wide_to_long(
    long_data ,
    stubnames=[
        'age', 'urban', 'married', 'n_mar', 'edu_gru', 'n_living_child', 'migration', 
        'glob_hlth', 'adl_dress', 'adl_walk', 'adl_bath', 'adl_eat', 'adl_bed', 'adl_toilet', 
        'n_adl', 'iadl_money', 'iadl_meds', 'iadl_shop', 'iadl_meals', 'n_iadl', 'depressed', 
        'hard', 'restless', 'happy', 'lonely', 'enjoy', 'sad', 'tired', 'energetic', 'n_depr', 
        'cesd_depressed', 'hypertension', 'diabetes', 'resp_ill', 'arthritis', 'hrt_attack', 
        'stroke', 'cancer', 'n_illnesses', 'bmi', 'exer_3xwk', 'alcohol', 'tobacco', 
        'test_chol', 'test_tuber', 'test_diab', 'test_pres', 'hosp', 'visit_med', 
        'out_proc', 'visit_dental', 'imss', 'issste', 'pem_def_mar', 'insur_private', 
        'insur_other', 'insured', 'decis_famil', 'decis_personal', 'employment', 
        'rjob_hrswk', 'rjlocc_m', 'rjob_end', 'rjobend_reason', 'rearnings', 
        'searnings', 'hincome', 'hinc_business', 'hinc_rent', 'hinc_assets', 'hinc_cap', 
        'rinc_pension', 'sinc_pension', 'rrelgimp', 'rrfcntx_m', 'rsocact_m', 'rrelgwk', 
        'a34', 'j11'
    ],
    i=['uid', 'year', 'composite_score'],  # Use 'uid', 'year', and 'composite_score' as identifiers
    j='time',  # Variable to capture time (03 or 12)
    sep='_',  # Separator used in wide-format column names
    suffix='\\d+'  # Regular expression to capture suffixes (03 or 12)
).reset_index()

# Display the first few rows to verify
data_long.head()


Unnamed: 0,uid,year,composite_score,time,seg_pop_12,comms_tel_comp_12,tv_12,vax_flu_12,table_games_12,satis_excel_12,...,hinc_assets,hinc_cap,rinc_pension,sinc_pension,rrelgimp,rrfcntx_m,rsocact_m,rrelgwk,a34,j11
0,aace,2021.0,175.0,3,1.0,0.0,1.0,0.0,0.0,3.0,...,,,,,,,,,,
1,aace,2021.0,175.0,12,1.0,0.0,1.0,0.0,0.0,3.0,...,0.0,10000.0,0.0,0.0,2.0,9.0,9.0,0.0,,2.0
2,aanz,2021.0,206.0,3,0.0,0.0,1.0,1.0,0.0,2.0,...,,,,,,,,,,
3,aanz,2021.0,206.0,12,0.0,0.0,1.0,1.0,0.0,2.0,...,0.0,0.0,0.0,0.0,1.0,9.0,1.0,0.0,,2.0
4,aape,2016.0,161.0,3,0.0,1.0,1.0,0.0,0.0,1.0,...,,,,,,,,,,


In [237]:
data_long.shape

(10324, 113)

In [238]:
# Printing the missing values sorted in descending order
print(data_long.isna().sum().sort_values(ascending=False).to_list())


[10244, 10208, 10192, 10192, 9036, 8995, 7128, 7040, 6588, 5482, 5470, 5468, 5275, 4952, 4592, 4395, 4395, 3652, 3371, 2270, 2251, 2247, 2247, 2228, 2225, 2223, 2222, 2220, 2216, 2211, 2210, 2210, 2209, 2208, 2207, 2206, 2206, 2205, 2205, 2205, 2205, 2204, 2204, 2204, 1962, 1904, 1884, 1882, 1879, 1874, 1866, 1863, 1859, 1859, 1858, 1851, 1850, 1848, 1847, 1847, 1847, 1844, 1842, 1841, 1841, 1840, 1840, 1840, 1840, 1839, 1839, 1838, 1837, 1837, 1837, 1836, 1836, 1835, 1835, 1835, 1835, 1819, 1819, 1819, 1819, 1638, 1638, 1506, 1250, 840, 790, 722, 704, 692, 658, 642, 640, 640, 624, 622, 620, 618, 618, 616, 616, 616, 612, 612, 612, 266, 0, 0, 0]


In [239]:
# Printing the missing values with column names, sorted in descending order
print(data_long.isna().sum().sort_values(ascending=False))


a16a_12            10244
a22_12             10208
a33b_12            10192
a21_12             10192
rjob_end            9036
                   ...  
attends_club_12      612
seg_pop_12           266
time                   0
ragender               0
uid                    0
Length: 113, dtype: int64


In [240]:
numeric_data = data_long.select_dtypes(include=['float64', 'int64'])
categorical_data = data_long.select_dtypes(exclude=['float64', 'int64'])

In [241]:

missing_percentages = (data_long.isna().sum() / len(data_long)) * 100


high_missing_columns = missing_percentages[missing_percentages > 40].index

data_long = data_long.drop(columns=high_missing_columns)


print("Columns remaining after dropping those with more than 40% missing values:")
print(data_long.columns)


Columns remaining after dropping those with more than 40% missing values:
Index(['uid', 'year', 'composite_score', 'time', 'seg_pop_12',
       'comms_tel_comp_12', 'tv_12', 'vax_flu_12', 'table_games_12',
       'satis_excel_12', 'volunteer_12', 'wouldnt_change_12',
       'attends_club_12', 'reads_12', 'ragender', 'care_child_12',
       'satis_ideal_12', 'games_12', 'satis_fine_12', 'attends_class_12',
       'act_mant_12', 'cosas_imp_12', 'vax_pneu_12', 'memory_12', 'sewing_12',
       'sgender_12', 'care_adult_12', 'rameduc_m', 'rafeduc_m', 'age', 'urban',
       'married', 'n_mar', 'edu_gru', 'n_living_child', 'migration',
       'glob_hlth', 'adl_dress', 'adl_walk', 'adl_bath', 'adl_eat', 'adl_bed',
       'adl_toilet', 'n_adl', 'iadl_money', 'iadl_meds', 'iadl_shop',
       'iadl_meals', 'n_iadl', 'depressed', 'hard', 'restless', 'happy',
       'lonely', 'enjoy', 'sad', 'tired', 'energetic', 'n_depr',
       'cesd_depressed', 'hypertension', 'diabetes', 'resp_ill', 'arthritis'

In [242]:

imputer_numeric = SimpleImputer(strategy='median')
imputer_categorical = SimpleImputer(strategy='most_frequent')


numeric_data_imputed = pd.DataFrame(imputer_numeric.fit_transform(numeric_data), columns=numeric_data.columns)
categorical_data_imputed = pd.DataFrame(imputer_categorical.fit_transform(categorical_data), columns=categorical_data.columns)


data_long_imputed = pd.concat([numeric_data_imputed, categorical_data_imputed], axis=1)


print("Number of missing values after imputation:")


Number of missing values after imputation:


In [243]:
print(data_long_imputed.isna().sum().sum())

0


In [244]:
data_long_imputed.head()

Unnamed: 0,year,composite_score,time,seg_pop_12,comms_tel_comp_12,tv_12,vax_flu_12,table_games_12,satis_excel_12,volunteer_12,...,hinc_cap,rinc_pension,sinc_pension,rrelgimp,rrfcntx_m,rsocact_m,rrelgwk,a34,j11,uid
0,2021.0,175.0,3.0,1.0,0.0,1.0,0.0,0.0,3.0,0.0,...,0.0,0.0,0.0,1.0,4.0,8.0,0.0,2.0,1.0,aace
1,2021.0,175.0,12.0,1.0,0.0,1.0,0.0,0.0,3.0,0.0,...,10000.0,0.0,0.0,2.0,9.0,9.0,0.0,2.0,2.0,aace
2,2021.0,206.0,3.0,0.0,0.0,1.0,1.0,0.0,2.0,0.0,...,0.0,0.0,0.0,1.0,4.0,8.0,0.0,2.0,1.0,aanz
3,2021.0,206.0,12.0,0.0,0.0,1.0,1.0,0.0,2.0,0.0,...,0.0,0.0,0.0,1.0,9.0,1.0,0.0,2.0,2.0,aanz
4,2016.0,161.0,3.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,4.0,8.0,0.0,2.0,1.0,aape


In [245]:
data_long_imputed.dtypes.unique()

array([dtype('float64'), dtype('O')], dtype=object)

In [246]:
data_long_imputed.drop(columns = 'uid', inplace=True)

In [247]:

unique_values = data_long_imputed.apply(lambda x: x.unique())


for col, unique_vals in unique_values.items():
    print(f"Column: {col} - Unique values: {unique_vals}")


Column: year - Unique values: [2021. 2016.]
Column: composite_score - Unique values: [175. 206. 161. 144. 104. 183. 106. 152.  13. 193.  38. 272. 254.  87.
  92. 203. 117.  84.  17.  48.  93. 140.  69. 218.  78. 122. 157. 162.
  40. 126. 178. 137.  76. 202.  86. 133. 225. 131. 231. 154.  71.  64.
 118. 151. 127. 163. 179. 198. 108. 119.  91. 107. 174. 120.  66.  56.
 246. 195. 281. 289. 159. 121. 200. 248. 258. 191. 209.  39. 267. 247.
 150. 223. 226. 205. 158. 185.  60. 146. 167.  63. 204.  52. 181. 220.
 215. 229.  97. 234. 232. 115.  54. 103. 186. 256.  99. 171.  51.  47.
 124. 147.  68. 130. 136. 148. 311. 134. 153. 100. 221. 199.  14.  42.
 143.  25. 164.  74.  20. 216.   4. 235. 141. 264. 269. 173. 155. 125.
 165. 194. 214. 277.  65. 184. 212. 210. 102.  53.  21.  57.  89.  80.
 101. 245. 236.  26. 105. 251.  88. 197.  95. 213. 109. 169. 189.  50.
  45. 240. 135. 177. 252. 156. 145. 176. 149. 116. 170.  98. 201. 237.
 129. 139. 196.  90. 211. 142.  75.  72.  22. 166. 113.  30. 18

In [248]:
X = data_long_imputed.drop(columns=['composite_score'])  # Drop target and any non-feature columns
y = data_long_imputed['composite_score']

In [250]:
from sklearn.preprocessing import MinMaxScaler

# Scale data using MinMaxScaler to keep everything in the range [0, 1] for chi2
min_max_scaler = MinMaxScaler()
X_min_max_scaled = min_max_scaler.fit_transform(X)

# Apply SelectKBest with chi2
chi2_selector = SelectKBest(chi2, k=10)
X_chi2 = chi2_selector.fit_transform(X_min_max_scaled, y)
chi2_selected_features = X.columns[chi2_selector.get_support()].tolist()

# Apply SelectKBest with mutual_info_regression on the original scaled data
mi_selector = SelectKBest(mutual_info_regression, k=10)
X_mi = mi_selector.fit_transform(X_scaled, y)
mi_selected_features = X.columns[mi_selector.get_support()].tolist()

# Apply SelectKBest with f_regression on the original scaled data
f_selector = SelectKBest(f_regression, k=10)
X_f = f_selector.fit_transform(X_scaled, y)
f_selected_features = X.columns[f_selector.get_support()].tolist()

# Combine selected features
all_selected_features = chi2_selected_features + mi_selected_features + f_selected_features
feature_counts = Counter(all_selected_features)
most_common_features = [feature for feature, count in feature_counts.most_common(10)]

print("Most Common 10 Selected Features:", most_common_features)

# Use these features for further modeling
X_selected = X[most_common_features]
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)


Most Common 10 Selected Features: ['table_games_12', 'reads_12', 'games_12', 'rameduc_m', 'edu_gru', 'rafeduc_m', 'n_living_child', 'seg_pop_12', 'attends_club_12', 'care_child_12']


In [251]:
# Use these most common features for further modeling
X_selected = X[most_common_features]
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

In [252]:
X_train

Unnamed: 0,table_games_12,reads_12,games_12,rameduc_m,edu_gru,rafeduc_m,n_living_child,seg_pop_12,attends_club_12,care_child_12
3381,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
10195,0.0,0.0,0.0,2.0,4.0,2.0,1.0,0.0,0.0,1.0
5065,0.0,1.0,1.0,1.0,0.0,4.0,2.0,0.0,0.0,0.0
8398,0.0,1.0,0.0,1.0,0.0,2.0,2.0,0.0,0.0,0.0
828,1.0,1.0,0.0,2.0,1.0,2.0,2.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...
5734,0.0,0.0,0.0,1.0,2.0,2.0,2.0,0.0,0.0,0.0
5191,0.0,1.0,0.0,1.0,2.0,2.0,2.0,0.0,0.0,1.0
5390,0.0,1.0,0.0,1.0,2.0,1.0,1.0,0.0,0.0,0.0
860,0.0,0.0,0.0,1.0,1.0,2.0,2.0,1.0,0.0,0.0


In [253]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import cross_val_score, TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, mean_absolute_error, mean_squared_error, r2_score
import pandas as pd
import numpy as np
from prophet import Prophet

In [257]:
models_new_results = []

In [258]:
# Linear Regression
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
lr_y_pred = lr_model.predict(X_test)

lr_results = {
    'Model': 'Linear Regression',
    'MAE': mean_absolute_error(y_test, lr_y_pred),
    'MSE': mean_squared_error(y_test, lr_y_pred),
    'RMSE': np.sqrt(mean_squared_error(y_test, lr_y_pred)),
    'R-squared': r2_score(y_test, lr_y_pred)
}
models_new_results.append(lr_results)

In [259]:
# Ridge Regression
ridge_model = Ridge()
ridge_param_grid = {'alpha': [0.1, 1.0, 10]}
ridge_grid_search = GridSearchCV(ridge_model, ridge_param_grid, cv=cv, scoring='neg_mean_squared_error', n_jobs=-1)
ridge_grid_search.fit(X_train, y_train)
ridge_best_model = ridge_grid_search.best_estimator_
ridge_y_pred = ridge_best_model.predict(X_test)

ridge_results = {
    'Model': 'Ridge',
    'Best Parameters': ridge_grid_search.best_params_,
    'MAE': mean_absolute_error(y_test, ridge_y_pred),
    'MSE': mean_squared_error(y_test, ridge_y_pred),
    'RMSE': np.sqrt(mean_squared_error(y_test, ridge_y_pred)),
    'R-squared': r2_score(y_test, ridge_y_pred)
}
models_new_results.append(ridge_results)

In [260]:
# Lasso Regression
lasso_model = Lasso()
lasso_param_grid = {'alpha': [0.1, 1.0, 10]}
lasso_grid_search = GridSearchCV(lasso_model, lasso_param_grid, cv=cv, scoring='neg_mean_squared_error', n_jobs=-1)
lasso_grid_search.fit(X_train, y_train)
lasso_best_model = lasso_grid_search.best_estimator_
lasso_y_pred = lasso_best_model.predict(X_test)

lasso_results = {
    'Model': 'Lasso',
    'Best Parameters': lasso_grid_search.best_params_,
    'MAE': mean_absolute_error(y_test, lasso_y_pred),
    'MSE': mean_squared_error(y_test, lasso_y_pred),
    'RMSE': np.sqrt(mean_squared_error(y_test, lasso_y_pred)),
    'R-squared': r2_score(y_test, lasso_y_pred)
}
models_new_results.append(lasso_results)

In [261]:
# ElasticNet Regression
en_model = ElasticNet()
en_param_grid = {'alpha': [0.1, 1.0, 10], 'l1_ratio': [0.2, 0.5, 0.8]}
en_grid_search = GridSearchCV(en_model, en_param_grid, cv=cv, scoring='neg_mean_squared_error', n_jobs=-1)
en_grid_search.fit(X_train, y_train)
en_best_model = en_grid_search.best_estimator_
en_y_pred = en_best_model.predict(X_test)

en_results = {
    'Model': 'ElasticNet',
    'Best Parameters': en_grid_search.best_params_,
    'MAE': mean_absolute_error(y_test, en_y_pred),
    'MSE': mean_squared_error(y_test, en_y_pred),
    'RMSE': np.sqrt(mean_squared_error(y_test, en_y_pred)),
    'R-squared': r2_score(y_test, en_y_pred)
}
models_new_results.append(en_results)

In [262]:
# DecisionTree Regressor
dt_model = DecisionTreeRegressor()
dt_param_grid = {'max_depth': [5, 10, None], 'min_samples_split': [2, 5]}
dt_grid_search = GridSearchCV(dt_model, dt_param_grid, cv=cv, scoring='neg_mean_squared_error', n_jobs=-1)
dt_grid_search.fit(X_train, y_train)
dt_best_model = dt_grid_search.best_estimator_
dt_y_pred = dt_best_model.predict(X_test)

dt_results = {
    'Model': 'DecisionTree',
    'Best Parameters': dt_grid_search.best_params_,
    'MAE': mean_absolute_error(y_test, dt_y_pred),
    'MSE': mean_squared_error(y_test, dt_y_pred),
    'RMSE': np.sqrt(mean_squared_error(y_test, dt_y_pred)),
    'R-squared': r2_score(y_test, dt_y_pred)
}
models_new_results.append(dt_results)

In [263]:
# Random Forest Regressor
rf_model = RandomForestRegressor()
rf_param_grid = {'n_estimators': [50, 100], 'max_depth': [5, 10, None]}
rf_grid_search = GridSearchCV(rf_model, rf_param_grid, cv=cv, scoring='neg_mean_squared_error', n_jobs=-1)
rf_grid_search.fit(X_train, y_train)
rf_best_model = rf_grid_search.best_estimator_
rf_y_pred = rf_best_model.predict(X_test)

rf_results = {
    'Model': 'Random Forest',
    'Best Parameters': rf_grid_search.best_params_,
    'MAE': mean_absolute_error(y_test, rf_y_pred),
    'MSE': mean_squared_error(y_test, rf_y_pred),
    'RMSE': np.sqrt(mean_squared_error(y_test, rf_y_pred)),
    'R-squared': r2_score(y_test, rf_y_pred)
}
models_new_results.append(rf_results)

In [264]:
# Gradient Boosting Regressor
gb_model = GradientBoostingRegressor()
gb_param_grid = {'n_estimators': [50, 100], 'learning_rate': [0.01, 0.1]}
gb_grid_search = GridSearchCV(gb_model, gb_param_grid, cv=cv, scoring='neg_mean_squared_error', n_jobs=-1)
gb_grid_search.fit(X_train, y_train)
gb_best_model = gb_grid_search.best_estimator_
gb_y_pred = gb_best_model.predict(X_test)

gb_results = {
    'Model': 'Gradient Boosting',
    'Best Parameters': gb_grid_search.best_params_,
    'MAE': mean_absolute_error(y_test, gb_y_pred),
    'MSE': mean_squared_error(y_test, gb_y_pred),
    'RMSE': np.sqrt(mean_squared_error(y_test, gb_y_pred)),
    'R-squared': r2_score(y_test, gb_y_pred)
}
models_new_results.append(gb_results)

In [265]:
# XGBoost Regression
xgb_model = XGBRegressor()
xgb_param_grid = {'n_estimators': [50, 100], 'learning_rate': [0.05, 0.1]}
xgb_grid_search = GridSearchCV(xgb_model, xgb_param_grid, cv=cv, scoring='neg_mean_squared_error', n_jobs=-1)
xgb_grid_search.fit(X_train, y_train)
xgb_best_model = xgb_grid_search.best_estimator_
xgb_y_pred = xgb_best_model.predict(X_test)

xgb_results = {
    'Model': 'XGBoost',
    'Best Parameters': xgb_grid_search.best_params_,
    'MAE': mean_absolute_error(y_test, xgb_y_pred),
    'MSE': mean_squared_error(y_test, xgb_y_pred),
    'RMSE': np.sqrt(mean_squared_error(y_test, xgb_y_pred)),
    'R-squared': r2_score(y_test, xgb_y_pred)
}
models_new_results.append(xgb_results)

In [266]:
from lightgbm import LGBMRegressor

In [267]:
# LightGBM Regression
lgbm_model = LGBMRegressor()
lgbm_param_grid = {'n_estimators': [50, 100], 'learning_rate': [0.01, 0.1]}
lgbm_grid_search = GridSearchCV(lgbm_model, lgbm_param_grid, cv=cv, scoring='neg_mean_squared_error', n_jobs=-1)
lgbm_grid_search.fit(X_train, y_train)
lgbm_best_model = lgbm_grid_search.best_estimator_
lgbm_y_pred = lgbm_best_model.predict(X_test)

lgbm_results = {
    'Model': 'LightGBM',
    'Best Parameters': lgbm_grid_search.best_params_,
    'MAE': mean_absolute_error(y_test, lgbm_y_pred),
    'MSE': mean_squared_error(y_test, lgbm_y_pred),
    'RMSE': np.sqrt(mean_squared_error(y_test, lgbm_y_pred)),
    'R-squared': r2_score(y_test, lgbm_y_pred)
}
models_new_results.append(lgbm_results)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000866 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 32
[LightGBM] [Info] Number of data points in the train set: 8259, number of used features: 10
[LightGBM] [Info] Start training from score 157.385882


In [268]:
# CatBoost Regression
catboost_model = CatBoostRegressor(verbose=0)
catboost_model.fit(X_train, y_train)
catboost_y_pred = catboost_model.predict(X_test)

catboost_results = {
    'Model': 'CatBoost',
    'MAE': mean_absolute_error(y_test, catboost_y_pred),
    'MSE': mean_squared_error(y_test, catboost_y_pred),
    'RMSE': np.sqrt(mean_squared_error(y_test, catboost_y_pred)),
    'R-squared': r2_score(y_test, catboost_y_pred)
}
models_new_results.append(catboost_results)

In [269]:
models_new_results

[{'Model': 'Linear Regression',
  'MAE': 36.40066610480813,
  'MSE': 2151.8418088118774,
  'RMSE': 46.38794896103812,
  'R-squared': 0.2904175594984998},
 {'Model': 'Ridge',
  'Best Parameters': {'alpha': 1.0},
  'MAE': 36.40029530992575,
  'MSE': 2151.815732068773,
  'RMSE': 46.38766788779937,
  'R-squared': 0.2904261584572786},
 {'Model': 'Lasso',
  'Best Parameters': {'alpha': 0.1},
  'MAE': 36.37212357596654,
  'MSE': 2149.6960342173643,
  'RMSE': 46.364814614288754,
  'R-squared': 0.2911251412395486},
 {'Model': 'ElasticNet',
  'Best Parameters': {'alpha': 0.1, 'l1_ratio': 0.8},
  'MAE': 36.3460782078874,
  'MSE': 2148.275438201238,
  'RMSE': 46.34949231870009,
  'R-squared': 0.29159359109676475},
 {'Model': 'DecisionTree',
  'Best Parameters': {'max_depth': 5, 'min_samples_split': 5},
  'MAE': 36.24035317697644,
  'MSE': 2130.2171551432352,
  'RMSE': 46.154275588976965,
  'R-squared': 0.29754841570844914},
 {'Model': 'Random Forest',
  'Best Parameters': {'max_depth': 10, 'n_esti

In [274]:
import pandas as pd

# Ensure models_new_results is a DataFrame
models_new_results = pd.DataFrame(models_new_results)  # If models_new_results was initially a list of dictionaries


In [275]:
# Group by model name and select the row with the best R-squared or lowest MAE
best_results = models_new_results.loc[models_new_results.groupby('Model')['RMSE'].idxmax()]

print(best_results)


               Model        MAE          MSE       RMSE  R-squared  \
9           CatBoost  33.418887  1830.733376  42.787070   0.396305   
4       DecisionTree  36.240353  2130.217155  46.154276   0.297548   
3         ElasticNet  36.346078  2148.275438  46.349492   0.291594   
6  Gradient Boosting  35.009639  1959.449813  44.265673   0.353860   
2              Lasso  36.372124  2149.696034  46.364815   0.291125   
8           LightGBM  34.277284  1887.856739  43.449473   0.377468   
0  Linear Regression  36.400666  2151.841809  46.387949   0.290418   
5      Random Forest  33.505940  1840.551190  42.901646   0.393067   
1              Ridge  36.400295  2151.815732  46.387668   0.290426   
7            XGBoost  34.194670  1884.313428  43.408679   0.378637   

                                Best Parameters  
9                                           NaN  
4      {'max_depth': 5, 'min_samples_split': 5}  
3               {'alpha': 0.1, 'l1_ratio': 0.8}  
6   {'learning_rate': 0.1, 'n

In [276]:
import pickle

In [278]:


# Save the trained model to a file
with open('new_ridge_model.pkl', 'wb') as file:
    pickle.dump(ridge_best_model, file)


In [279]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import joblib

In [280]:
X_train.columns

Index(['table_games_12', 'reads_12', 'games_12', 'rameduc_m', 'edu_gru',
       'rafeduc_m', 'n_living_child', 'seg_pop_12', 'attends_club_12',
       'care_child_12'],
      dtype='object')

In [281]:
feature_names = list(ridge_best_model.feature_names_in_)
print(feature_names)


['table_games_12', 'reads_12', 'games_12', 'rameduc_m', 'edu_gru', 'rafeduc_m', 'n_living_child', 'seg_pop_12', 'attends_club_12', 'care_child_12']
