In [2]:
# Core Libraries
import pandas as pd
import numpy as np
import joblib
import logging
import re
import warnings

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Data Processing
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer

# Feature Selection and Permutation Importance
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from sklearn.inspection import permutation_importance

# Regression Models
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.neural_network import MLPRegressor

# Classification Models (used in feature importance or metrics evaluation)
from sklearn.linear_model import LogisticRegression

# Metrics
from sklearn.metrics import (
    classification_report,
    accuracy_score,
    confusion_matrix,
    recall_score,
    f1_score,
    roc_auc_score,
    make_scorer
)

# General Settings
warnings.filterwarnings('ignore')


In [3]:
# Loading data
train_data = pd.read_csv('C:/Users/Hp/Desktop/Capstone Project/Original data/train_features.csv')
train_labels = pd.read_csv('C:/Users/Hp/Desktop/Capstone Project/Original data/train_labels.csv')
test_data = pd.read_csv('C:/Users/Hp/Desktop/Capstone Project/Original data/test_features.csv')


In [4]:
train_data.head()

Unnamed: 0,uid,age_03,urban_03,married_03,n_mar_03,edu_gru_03,n_living_child_03,migration_03,glob_hlth_03,adl_dress_03,...,rrelgimp_12,rrfcntx_m_12,rsocact_m_12,rrelgwk_12,a16a_12,a21_12,a22_12,a33b_12,a34_12,j11_12
0,aace,,,,,,,,,,...,2.somewhat important,9.Never,9.Never,0.No,,,,,,Concrete 2
1,aanz,,,,,,,,,,...,1.very important,9.Never,1.Almost every day,0.No,,,,,,Concrete 2
2,aape,,,,,,,,,,...,2.somewhat important,6.2 or 3 times a month,2.4 or more times a week,0.No,,,,,,"Wood, mosaic, or other covering 1"
3,aard,1. 50–59,"1. 100,000+",3. Widowed,1.0,3. 7–9 years,1. 1 or 2,0.0,4. Fair,0.0,...,1.very important,4.Once a week,9.Never,1.Yes,,,,,No 2,Concrete 2
4,ablr,,,,,,,,,,...,1.very important,3.2 or 3 times a week,3.2 or 3 times a week,0.No,,,,,,"Wood, mosaic, or other covering 1"


In [5]:
train_labels.head()

Unnamed: 0,uid,year,composite_score
0,aace,2021,175
1,aanz,2021,206
2,aape,2016,161
3,aape,2021,144
4,aard,2021,104


In [6]:
test_data.head()

Unnamed: 0,uid,age_03,urban_03,married_03,n_mar_03,edu_gru_03,n_living_child_03,migration_03,glob_hlth_03,adl_dress_03,...,rrelgimp_12,rrfcntx_m_12,rsocact_m_12,rrelgwk_12,a16a_12,a21_12,a22_12,a33b_12,a34_12,j11_12
0,abxu,,,,,,,,,,...,,,,,,,,,,"Wood, mosaic, or other covering 1"
1,aeol,,,,,,,,,,...,1.very important,9.Never,9.Never,1.Yes,,,,,,Concrete 2
2,afnb,,,,,,,,,,...,1.very important,9.Never,3.2 or 3 times a week,1.Yes,,,,,,"Wood, mosaic, or other covering 1"
3,ajfh,,,,,,,,,,...,2.somewhat important,9.Never,5.4 or more times a month,0.No,,,,,,"Wood, mosaic, or other covering 1"
4,ajvq,2. 60–69,"1. 100,000+",1. Married or in civil union,1.0,4. 10+ years,1. 1 or 2,0.0,,,...,2.somewhat important,1.Almost every day,4.Once a week,0.No,,,,,No 2,"Wood, mosaic, or other covering 1"


In [7]:
# Check the data types of the 'uid' column in each dataframe
print(train_data.shape)
print(train_labels.shape)
print(test_data.shape)


(3276, 184)
(4343, 3)
(819, 184)


In [8]:
combined_data = pd.concat([train_data, test_data], axis=0, ignore_index=True)

In [9]:
# This will add labels only for rows in the training data
combined_data = pd.merge(combined_data, train_labels, on='uid', how='left')

# Check the shape and a few rows to confirm
print("Combined data shape after merging:", combined_data.shape)
print(combined_data.head())

Combined data shape after merging: (5162, 186)
    uid    age_03     urban_03  married_03  n_mar_03    edu_gru_03  \
0  aace       NaN          NaN         NaN       NaN           NaN   
1  aanz       NaN          NaN         NaN       NaN           NaN   
2  aape       NaN          NaN         NaN       NaN           NaN   
3  aape       NaN          NaN         NaN       NaN           NaN   
4  aard  1. 50–59  1. 100,000+  3. Widowed       1.0  3. 7–9 years   

  n_living_child_03  migration_03 glob_hlth_03  adl_dress_03  ...  \
0               NaN           NaN          NaN           NaN  ...   
1               NaN           NaN          NaN           NaN  ...   
2               NaN           NaN          NaN           NaN  ...   
3               NaN           NaN          NaN           NaN  ...   
4         1. 1 or 2           0.0      4. Fair           0.0  ...   

               rsocact_m_12  rrelgwk_12  a16a_12  a21_12  a22_12  a33b_12  \
0                   9.Never        0.No 

In [10]:
def extract_numeric(data, columns, position='first'):
    """Extract numeric values from specified columns of a DataFrame.
    
    Args:
        data (pd.DataFrame): The DataFrame to clean.
        columns (list): List of column names to clean.
        position (str): 'first' to extract the first numeric value, 'last' for the last.
    """
    for col in columns:
        # Checking for numeric content
        if combined_data[col].str.contains(r'\d').any():  # Check if there's at least one numeric character
            if position == 'first':
                # Extracting the first numeric part using regex
                combined_data[col] = combined_data[col].astype(str).str.extract(r'(\d+)')[0]
            elif position == 'last':
                # Extracting the last numeric part using regex
                combined_data[col] = combined_data[col].astype(str).str.extract(r'(\d+)(?!.*\d)')[0]
            # Converting to float
            combined_data[col] = combined_data[col].astype(float)

# Selecting all object-type columns for cleaning
columns_to_clean = combined_data.select_dtypes(include=['object']).columns

# Extracting numeric values based on the specified position ('first' or 'last')
extract_numeric(combined_data, columns_to_clean, position='first')  # Change to 'last' if needed

# Displaying the cleaned columns
print(combined_data[columns_to_clean].head(10))  # Printing the first few rows of the cleaned columns



    uid  age_03  urban_03  married_03  edu_gru_03  n_living_child_03  \
0  aace     NaN       NaN         NaN         NaN                NaN   
1  aanz     NaN       NaN         NaN         NaN                NaN   
2  aape     NaN       NaN         NaN         NaN                NaN   
3  aape     NaN       NaN         NaN         NaN                NaN   
4  aard     1.0       1.0         3.0         3.0                1.0   
5  ablr     NaN       NaN         NaN         NaN                NaN   
6  abme     1.0       0.0         1.0         1.0                3.0   
7  abrn     1.0       0.0         1.0         0.0                3.0   
8  acet     1.0       1.0         1.0         3.0                0.0   
9  acgx     NaN       NaN         NaN         NaN                NaN   

   glob_hlth_03  bmi_03  decis_famil_03  employment_03  ...  rjlocc_m_12  \
0           NaN     NaN             NaN            NaN  ...          6.0   
1           NaN     NaN             NaN            NaN 

In [11]:
# Print the first 10 rows of combined_data as a list of lists
print(combined_data.head(10).values.tolist())


[['aace', nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, 2.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 4.0, 0.0, nan, nan, nan, nan, nan, nan, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 3.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 2.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 3.0, 3.0, 1.0, 1.0, 3.0, 2.0, 1.0, 1.0, 1.0, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, 2.0, 66.0, 6.0, nan, nan, 0.0, 0.0, 140000.0, 10000.0, 0.0, 0.0, 10000.0, 0.0, 0.0, 2.0, 9.0, 9.0, 0.0, nan, nan, nan, nan, nan, 2.0, 2021.0, 175.0], ['aanz', nan, nan, nan, nan, nan, nan, nan, n

In [12]:
combined_data.shape

(5162, 186)

In [13]:
long_data = combined_data.copy()

In [14]:

# Dictionary of feature categories
feature_groups = {
    'categorical_nominal': [
        'ragender',  # Gender
        'sgender_03', 'sgender_12',  # Spouse gender
        'urban_03', 'urban_12',  # Rural/Urban
        'married_03', 'married_12',  # Marital status
        'rjlocc_m_03', 'rjlocc_m_12',  # Occupation category
        'rjobend_reason_03', 'rjobend_reason_12',  # Job end reason
        'a22_12',  # US job type
        'a33b_12',  # US residency status
        'j11_12',  # Floor material
    ],
    
    'categorical_ordinal': [
        'age_03', 'age_12',  # Age groups
        'edu_gru_03', 'edu_gru_12',  # Education level
        'n_living_child_03', 'n_living_child_12',  # Number of children
        'glob_hlth_03', 'glob_hlth_12',  # Self-reported health
        'bmi_03', 'bmi_12',  # BMI categories
        'decis_famil_03', 'decis_famil_12',  # Family decision weight
        'decis_personal_03', 'decis_personal_12',  # Personal decision weight
        'employment_03', 'employment_12',  # Employment status
        'rameduc_m', 'rafeduc_m',  # Parents' education
        'rrelgimp_03', 'rrelgimp_12',  # Religion importance
        'rrfcntx_m_12',  # Friend contact frequency
        'rsocact_m_12',  # Social activity frequency
        'rrelgwk_12',  # Religious service participation
        'a34_12',  # English proficiency
        'memory_12',  # Self-reported memory
        'satis_ideal_12', 'satis_excel_12', 'satis_fine_12',
        'cosas_imp_12', 'wouldnt_change_12'  # Life satisfaction measures
    ],
    
    'binary_indicators': [
        'migration_03', 'migration_12',  # US migration
        'adl_dress_03', 'adl_dress_12',  # ADL limitations
        'adl_walk_03', 'adl_walk_12',
        'adl_bath_03', 'adl_bath_12',
        'adl_eat_03', 'adl_eat_12',
        'adl_bed_03', 'adl_bed_12',
        'adl_toilet_03', 'adl_toilet_12',
        'iadl_money_03', 'iadl_money_12',  # IADL limitations
        'iadl_meds_03', 'iadl_meds_12',
        'iadl_shop_03', 'iadl_shop_12',
        'iadl_meals_03', 'iadl_meals_12',
        'depressed_03', 'depressed_12',  # Mental health indicators
        'hard_03', 'hard_12',
        'restless_03', 'restless_12',
        'happy_03', 'happy_12',
        'lonely_03', 'lonely_12',
        'enjoy_03', 'enjoy_12',
        'sad_03', 'sad_12',
        'tired_03', 'tired_12',
        'energetic_03', 'energetic_12',
        'cesd_depressed_03', 'cesd_depressed_12',
        # Health conditions
        'hypertension_03', 'hypertension_12',
        'diabetes_03', 'diabetes_12',
        'resp_ill_03', 'resp_ill_12',
        'arthritis_03', 'arthritis_12',
        'hrt_attack_03', 'hrt_attack_12',
        'stroke_03', 'stroke_12',
        'cancer_03', 'cancer_12',
        # Health behaviors
        'exer_3xwk_03', 'exer_3xwk_12',
        'alcohol_03', 'alcohol_12',
        'tobacco_03', 'tobacco_12',
        # Healthcare utilization
        'test_chol_03', 'test_chol_12',
        'test_tuber_03', 'test_tuber_12',
        'test_diab_03', 'test_diab_12',
        'test_pres_03', 'test_pres_12',
        'hosp_03', 'hosp_12',
        'visit_med_03', 'visit_med_12',
        'out_proc_03', 'out_proc_12',
        'visit_dental_03', 'visit_dental_12',
        # Insurance coverage
        'imss_03', 'imss_12',
        'issste_03', 'issste_12',
        'pem_def_mar_03', 'pem_def_mar_12',
        'insur_private_03', 'insur_private_12',
        'insur_other_03', 'insur_other_12',
        'seg_pop_12',
        'insured_03', 'insured_12',
        # Activities
        'care_adult_12', 'care_child_12',
        'volunteer_12', 'attends_class_12',
        'attends_club_12', 'reads_12',
        'games_12', 'table_games_12',
        'comms_tel_comp_12', 'act_mant_12',
        'tv_12', 'sewing_12'
    ],
    
    'numerical_continuous': [
        'n_mar_03', 'n_mar_12',  # Number of marriages
        'n_adl_03', 'n_adl_12',  # Number of ADL limitations
        'n_iadl_03', 'n_iadl_12',  # Number of IADL limitations
        'n_depr_03', 'n_depr_12',  # Number of depression symptoms
        'n_illnesses_03', 'n_illnesses_12',  # Number of illnesses
        'rjob_hrswk_03', 'rjob_hrswk_12',  # Working hours
        'rjob_end_03', 'rjob_end_12',  # Job end year
        # Income and earnings
        'rearnings_03', 'rearnings_12',
        'searnings_03', 'searnings_12',
        'hincome_03', 'hincome_12',
        'hinc_business_03', 'hinc_business_12',
        'hinc_rent_03', 'hinc_rent_12',
        'hinc_assets_03', 'hinc_assets_12',
        'hinc_cap_03', 'hinc_cap_12',
        'rinc_pension_03', 'rinc_pension_12',
        'sinc_pension_03', 'sinc_pension_12',
        'a16a_12',  # Year first left for US
        'a21_12'  # Total years in US
    ],
    
    'identifier': ['uid']  # Unique identifier
}


In [15]:
# Create a copy of the DataFrame to avoid modifying the original data
combined_data_cleaned = combined_data.copy()

# Calculate the threshold for dropping features
threshold = 0.4 * len(combined_data_cleaned)  # 40% of the total rows

# Identify columns to drop (those with more than 40% missing values)
cols_to_drop = combined_data_cleaned.columns[combined_data_cleaned.isnull().sum() > threshold]

# Drop the identified columns
combined_data_cleaned.drop(columns=cols_to_drop, inplace=True)

# Print the columns that were dropped
print("Dropped columns with more than 40% missing values:", cols_to_drop.tolist())

# Handling NaN values based on variable types after dropping columns
for col in combined_data_cleaned.columns:
    if col in feature_groups['numerical_continuous']:
        # Fill numerical variables with the mean
        combined_data_cleaned[col].fillna(combined_data_cleaned[col].mean(), inplace=True)

    elif col in feature_groups['categorical_ordinal'] or col in feature_groups['categorical_nominal']:
        # For both ordinal and nominal variables, use backward fill
        combined_data_cleaned[col].bfill(inplace=True)

    elif col in feature_groups['binary_indicators']:
        # For binary indicators, fill with the mode
        combined_data_cleaned[col].fillna(combined_data_cleaned[col].mode()[0], inplace=True)

# Optionally, check for remaining NaN values
remaining_nans = combined_data_cleaned.isnull().sum()
print("Remaining NaN values after filling:", remaining_nans[remaining_nans > 0])  # Print remaining columns with NaN values

# Display the cleaned data
print(combined_data_cleaned.head())


Dropped columns with more than 40% missing values: ['bmi_03', 'decis_famil_03', 'sgender_03', 'rjob_hrswk_03', 'rjlocc_m_03', 'rjob_end_03', 'rjobend_reason_03', 'searnings_03', 'sinc_pension_03', 'rjob_hrswk_12', 'rjlocc_m_12', 'rjob_end_12', 'rjobend_reason_12', 'a16a_12', 'a21_12', 'a22_12', 'a33b_12']
Remaining NaN values after filling: vax_flu_12         320
vax_pneu_12        420
rafeduc_m            1
year               819
composite_score    819
dtype: int64
    uid  age_03  urban_03  married_03  n_mar_03  edu_gru_03  \
0  aace     1.0       1.0         3.0  1.133275         3.0   
1  aanz     1.0       1.0         3.0  1.133275         3.0   
2  aape     1.0       1.0         3.0  1.133275         3.0   
3  aape     1.0       1.0         3.0  1.133275         3.0   
4  aard     1.0       1.0         3.0  1.000000         3.0   

   n_living_child_03  migration_03  glob_hlth_03  adl_dress_03  ...  \
0                1.0           0.0           4.0           0.0  ...   
1       

In [16]:
print(combined_data_cleaned.head(10))

    uid  age_03  urban_03  married_03  n_mar_03  edu_gru_03  \
0  aace     1.0       1.0         3.0  1.133275         3.0   
1  aanz     1.0       1.0         3.0  1.133275         3.0   
2  aape     1.0       1.0         3.0  1.133275         3.0   
3  aape     1.0       1.0         3.0  1.133275         3.0   
4  aard     1.0       1.0         3.0  1.000000         3.0   
5  ablr     1.0       0.0         1.0  1.133275         1.0   
6  abme     1.0       0.0         1.0  1.000000         1.0   
7  abrn     1.0       0.0         1.0  2.000000         0.0   
8  acet     1.0       1.0         1.0  1.000000         3.0   
9  acgx     3.0       0.0         1.0  1.133275         0.0   

   n_living_child_03  migration_03  glob_hlth_03  adl_dress_03  ...  \
0                1.0           0.0           4.0           0.0  ...   
1                1.0           0.0           4.0           0.0  ...   
2                1.0           0.0           4.0           0.0  ...   
3                1.0  

In [17]:
combined_data_cleaned.dropna(inplace=True)

In [18]:
print((combined_data_cleaned.isnull().sum()).sort_values(ascending=False))

uid                0
decis_famil_12     0
out_proc_12        0
visit_dental_12    0
imss_12            0
                  ..
employment_03      0
age_12             0
urban_12           0
married_12         0
composite_score    0
Length: 169, dtype: int64


In [19]:
combined_data_cleaned = combined_data_cleaned.drop(columns='uid',axis=1)

In [20]:
print((combined_data_cleaned.columns).to_list())

['age_03', 'urban_03', 'married_03', 'n_mar_03', 'edu_gru_03', 'n_living_child_03', 'migration_03', 'glob_hlth_03', 'adl_dress_03', 'adl_walk_03', 'adl_bath_03', 'adl_eat_03', 'adl_bed_03', 'adl_toilet_03', 'n_adl_03', 'iadl_money_03', 'iadl_meds_03', 'iadl_shop_03', 'iadl_meals_03', 'n_iadl_03', 'depressed_03', 'hard_03', 'restless_03', 'happy_03', 'lonely_03', 'enjoy_03', 'sad_03', 'tired_03', 'energetic_03', 'n_depr_03', 'cesd_depressed_03', 'hypertension_03', 'diabetes_03', 'resp_ill_03', 'arthritis_03', 'hrt_attack_03', 'stroke_03', 'cancer_03', 'n_illnesses_03', 'exer_3xwk_03', 'alcohol_03', 'tobacco_03', 'test_chol_03', 'test_tuber_03', 'test_diab_03', 'test_pres_03', 'hosp_03', 'visit_med_03', 'out_proc_03', 'visit_dental_03', 'imss_03', 'issste_03', 'pem_def_mar_03', 'insur_private_03', 'insur_other_03', 'insured_03', 'decis_personal_03', 'employment_03', 'age_12', 'urban_12', 'married_12', 'n_mar_12', 'edu_gru_12', 'n_living_child_12', 'migration_12', 'glob_hlth_12', 'adl_dre

In [21]:
# Calculate the correlation matrix
correlation_matrix = combined_data_cleaned.corr()

# Count correlations greater than 0.6 or less than -0.6
high_correlations_count = ((correlation_matrix > 0.6) | (correlation_matrix < -0.6)).sum().sum()

print(f"Total correlations greater than 0.6 or less than -0.6: {high_correlations_count}")

# Filter the correlation matrix
filtered_corr = correlation_matrix[(correlation_matrix > 0.6) | (correlation_matrix < -0.6)]

# Drop rows and columns that are entirely NaN
filtered_corr_cleaned = filtered_corr.dropna(axis=0, how='all').dropna(axis=1, how='all')

# Display the cleaned filtered correlation matrix
print("Filtered Correlation Matrix (values > 0.6 or < -0.6):")
print(filtered_corr_cleaned)




Total correlations greater than 0.6 or less than -0.6: 276
Filtered Correlation Matrix (values > 0.6 or < -0.6):
                 age_03  urban_03  married_03  n_mar_03  edu_gru_03  \
age_03              1.0       NaN         NaN       NaN         NaN   
urban_03            NaN       1.0         NaN       NaN         NaN   
married_03          NaN       NaN         1.0       NaN         NaN   
n_mar_03            NaN       NaN         NaN       1.0         NaN   
edu_gru_03          NaN       NaN         NaN       NaN         1.0   
...                 ...       ...         ...       ...         ...   
rrelgwk_12          NaN       NaN         NaN       NaN         NaN   
a34_12              NaN       NaN         NaN       NaN         NaN   
j11_12              NaN       NaN         NaN       NaN         NaN   
year                NaN       NaN         NaN       NaN         NaN   
composite_score     NaN       NaN         NaN       NaN         NaN   

                 n_living_child_03

1. Health and Functionality Features

- Overall Health Score: Combine self-reported global health and the number of ADL/IADL limitations to create a composite health score.

- Formula: overall_health_score = (self_reported_health) - (n_adl + n_iadl)
- Physical Limitations: Create a binary feature indicating any physical limitations.

- Formula: physical_limitations = max(adl_dress, adl_walk, adl_bath, adl_eat, adl_bed, adl_toilet)
- Depression Symptoms Score: Combine multiple depression-related features to create a composite depression score.

- Formula: depression_score = n_depr + (depressed + sad + lonely + tired + restless)

In [22]:
# 1. Health and Functionality Features
# Overall Health Score: Composite health score
combined_data_cleaned['overall_health_score'] = (
    combined_data_cleaned['glob_hlth_03'] - 
    combined_data_cleaned['n_adl_03'] - 
    combined_data_cleaned['n_iadl_03']
)

# Physical Limitations: Binary feature indicating any physical limitations
combined_data_cleaned['physical_limitations'] = (
    (combined_data_cleaned[['adl_dress_03', 'adl_walk_03', 'adl_bath_03', 
                            'adl_eat_03', 'adl_bed_03', 'adl_toilet_03']] > 0).any(axis=1).astype(int)
)

# Depression Symptoms Score: Composite score based on depression-related features
combined_data_cleaned['depression_score'] = (
    combined_data_cleaned['n_depr_03'] + 
    combined_data_cleaned[['depressed_03', 'sad_03', 'lonely_03', 'tired_03', 'restless_03']].sum(axis=1)
)

2. Social Determinants of Health

- Social Isolation Score: Create a score based on social activities, frequency of seeing friends and relatives, and participation in religious services.

- Formula: social_isolation_score = (frequency_of_social_activities + rrelgwk) / 2
- Economic Strain Index: Create an index that reflects financial stability based on income and health coverage.

- ormula: economic_strain_index = (1 - insured) + (1 - (insur_private + insur_other)) + (hincome < threshold)

In [23]:
# 2. Social Determinants of Health
# Social Isolation Score: Score based on social activities and social contact
combined_data_cleaned['social_isolation_score'] = (
    (combined_data_cleaned['rrelgwk_12'] + combined_data_cleaned['seg_pop_12']) / 2
)

# Economic Strain Index: Index reflecting financial stability based on income and health coverage
combined_data_cleaned['economic_strain_index'] = (
    (1 - combined_data_cleaned['insured_12']) + 
    (1 - (combined_data_cleaned['insur_private_12'] + combined_data_cleaned['insur_other_12'])) + 
    (combined_data_cleaned['hincome_12'] < 20000).astype(int)  # Example threshold
)


3. Lifestyle Features

-Healthy Lifestyle Score: Create a score based on exercise, diet (considering alcohol and tobacco use), and health check-ups.

- Formula: healthy_lifestyle_score = (exer_3xwk + (1 - alcohol) + (1 - tobacco) + test_chol + test_diab + test_pres)
- Mental Engagement Score: Measure engagement in activities that can help maintain cognitive function (e.g., reading, games).

Formula: mental_engagement_score = (reads + games + sewing + tv)

In [24]:
# 3. Lifestyle Features
# Healthy Lifestyle Score: Score based on lifestyle factors
combined_data_cleaned['healthy_lifestyle_score'] = (
    combined_data_cleaned['exer_3xwk_12'] + 
    (1 - combined_data_cleaned['alcohol_12']) + 
    (1 - combined_data_cleaned['tobacco_12']) + 
    combined_data_cleaned[['test_chol_12', 'test_diab_12', 'test_pres_12']].sum(axis=1)
)

# Mental Engagement Score: Engagement in activities
combined_data_cleaned['mental_engagement_score'] = (
    combined_data_cleaned[['reads_12', 'games_12', 'sewing_12', 'tv_12']].sum(axis=1)
)

4. Interaction Features

- Interaction of Age and Health: Capture the interaction between age group and self-reported health.

- Formula: age_health_interaction = age * glob_hlth (where age is converted into a numeric value)
- Education and Health: Create an interaction feature between education level and overall health score.

- Formula: education_health_interaction = edu_gru * overall_health_score

In [25]:

# 4. Interaction Features
# Interaction of Age and Health: Interaction between age and self-reported health
combined_data_cleaned['age_health_interaction'] = (
    combined_data_cleaned['age_12'] * combined_data_cleaned['glob_hlth_12']
)

# Education and Health: Interaction feature between education level and overall health score
combined_data_cleaned['education_health_interaction'] = (
    combined_data_cleaned['edu_gru_12'] * combined_data_cleaned['overall_health_score']
)


5. Temporal Features

- Change in Health Over Time: Calculate the change in the number of ADL and IADL limitations from 2003 to 2012.

- Formula: change_in_adl = n_adl_12 - n_adl_03
- Formula: change_in_iadl = n_iadl_12 - n_iadl_03
- Longitudinal Change in Depression: Measure the change in depression score over the years.

- Formula: change_in_depression = depression_score_12 - depression_score_03

In [26]:

# 5. Temporal Features
# Change in Health Over Time: Changes in ADL and IADL limitations from 2003 to 2012
combined_data_cleaned['change_in_adl'] = (
    combined_data_cleaned['n_adl_12'] - combined_data_cleaned['n_adl_03']
)
combined_data_cleaned['change_in_iadl'] = (
    combined_data_cleaned['n_iadl_12'] - combined_data_cleaned['n_iadl_03']
)

# Longitudinal Change in Depression: Change in depression score over the years
combined_data_cleaned['change_in_depression'] = (
    combined_data_cleaned['n_depr_12'] - combined_data_cleaned['n_depr_03']
)

6. Aggregate Features

- Aggregate Illness Count: Create a new feature that counts the number of chronic illnesses reported (hypertension, diabetes, etc.).

- Formula: chronic_illness_count = (hypertension + diabetes + resp_ill + arthritis + hrt_attack + stroke + cancer)
- Total Caregiving Index: A combined measure reflecting the amount of time spent caregiving, considering both adult and child care.

- Formula: total_caregiving_index = care_adult + care_child

In [27]:
# 6. Aggregate Features
# Aggregate Illness Count: Counting chronic illnesses
combined_data_cleaned['chronic_illness_count'] = (
    combined_data_cleaned[['hypertension_03', 'diabetes_03', 'resp_ill_03', 
                           'arthritis_03', 'hrt_attack_03', 'stroke_03', 
                           'cancer_03']].sum(axis=1)
)

# Total Caregiving Index: Time spent caregiving for adults and children
combined_data_cleaned['total_caregiving_index'] = (
    combined_data_cleaned['care_adult_12'] + 
    combined_data_cleaned['care_child_12']
)

In [28]:
combined_data_cleaned.head()

Unnamed: 0,age_03,urban_03,married_03,n_mar_03,edu_gru_03,n_living_child_03,migration_03,glob_hlth_03,adl_dress_03,adl_walk_03,...,economic_strain_index,healthy_lifestyle_score,mental_engagement_score,age_health_interaction,education_health_interaction,change_in_adl,change_in_iadl,change_in_depression,chronic_illness_count,total_caregiving_index
0,1.0,1.0,3.0,1.133275,3.0,1.0,0.0,4.0,0.0,0.0,...,0.0,5.0,2.0,8.0,0.0,0.115188,-0.047181,-3.485267,0.0,0.0
1,1.0,1.0,3.0,1.133275,3.0,1.0,0.0,4.0,0.0,0.0,...,1.0,6.0,3.0,4.0,11.637394,-0.073687,-0.047181,0.514733,0.0,1.0
2,1.0,1.0,3.0,1.133275,3.0,1.0,0.0,4.0,0.0,0.0,...,2.0,5.0,3.0,4.0,3.879131,0.926313,-0.047181,0.514733,0.0,1.0
3,1.0,1.0,3.0,1.133275,3.0,1.0,0.0,4.0,0.0,0.0,...,2.0,5.0,3.0,4.0,3.879131,0.926313,-0.047181,0.514733,0.0,1.0
4,1.0,1.0,3.0,1.0,3.0,1.0,0.0,4.0,0.0,0.0,...,2.0,4.0,3.0,8.0,12.0,0.0,0.0,2.0,1.0,0.0


In [29]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# Assuming `combined_data_cleaned` is already loaded
X = combined_data_cleaned.drop(columns=['composite_score'])
y = combined_data_cleaned['composite_score']

# Preprocessing: Scaling and Handling Outliers
# Pipeline for continuous features
numerical_features = X.select_dtypes(include=['float64', 'int64']).columns
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),  # Handling NaNs
    ('scaler', StandardScaler())  # Scaling the data
])

# Applying the transformation
preprocessor = ColumnTransformer([
    ('num', numerical_pipeline, numerical_features)
])

# Preprocessing the data
X = preprocessor.fit_transform(X)


### Using SelectKBest with Different Scoring Functions

We’ll use SelectKBest with chi2 (requires non-negative data) and mutual_info_regression:

In [30]:
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif, mutual_info_regression, f_regression

In [31]:
# Ensuring non-negative data for chi2
X_non_neg = np.where(X < 0, X - X.min(axis=0), X)

# SelectKBest with chi2
chi2_selector = SelectKBest(score_func=chi2, k='all')
chi2_selector.fit(X_non_neg, y)
chi2_scores = chi2_selector.scores_

# SelectKBest with mutual information
mutual_info_selector = SelectKBest(score_func=mutual_info_regression, k='all')
mutual_info_selector.fit(X, y)
mutual_info_scores = mutual_info_selector.scores_

# SelectKBest with f_regression (ANOVA F-value)
f_reg_selector = SelectKBest(score_func=f_regression, k='all')
f_reg_selector.fit(X, y)
f_reg_scores = f_reg_selector.scores_


In [32]:
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_regression, f_regression
import pandas as pd

# Adjust actual_feature_names to match the number of features in X
if not isinstance(X, pd.DataFrame):
    # Match feature names to the shape of X if they're mismatched
    actual_feature_names = combined_data_cleaned.columns[:X.shape[1]].tolist()
    X_df = pd.DataFrame(X, columns=actual_feature_names)
else:
    X_df = X

# Ensure non-negative data for chi2 by adjusting any negative values
X_non_neg = X_df.apply(lambda x: x - x.min() if x.min() < 0 else x)

# SelectKBest with chi2
chi2_selector = SelectKBest(score_func=chi2, k='all')
chi2_selector.fit(X_non_neg, y)
chi2_scores = chi2_selector.scores_

# SelectKBest with mutual information
mutual_info_selector = SelectKBest(score_func=mutual_info_regression, k='all')
mutual_info_selector.fit(X_df, y)
mutual_info_scores = mutual_info_selector.scores_

# SelectKBest with f_regression (ANOVA F-value)
f_reg_selector = SelectKBest(score_func=f_regression, k='all')
f_reg_selector.fit(X_df, y)
f_reg_scores = f_reg_selector.scores_

# Combining scores into a DataFrame for ranking
feature_scores = pd.DataFrame({
    'feature': X_df.columns,
    'chi2': chi2_scores,
    'mutual_info': mutual_info_scores,
    'f_regression': f_reg_scores
})

# Ranking features in descending order for each score type
feature_scores['chi2_rank'] = feature_scores['chi2'].rank(ascending=False)
feature_scores['mutual_info_rank'] = feature_scores['mutual_info'].rank(ascending=False)
feature_scores['f_regression_rank'] = feature_scores['f_regression'].rank(ascending=False)

# Calculating the average rank across methods
feature_scores['average_rank'] = feature_scores[['chi2_rank', 'mutual_info_rank', 'f_regression_rank']].mean(axis=1)

# Sorting features by their average rank (lower rank means higher importance)
most_important_features = feature_scores.sort_values('average_rank').reset_index(drop=True)

# Displaying the top features based on their average rank
print(most_important_features[['feature', 'average_rank']].head(15))


                    feature  average_rank
0                edu_gru_12     15.000000
1                  games_12     23.000000
2   mental_engagement_score     27.333333
3                 rameduc_m     28.333333
4              rearnings_12     28.666667
5                edu_gru_03     31.333333
6              rearnings_03     31.333333
7            table_games_12     34.000000
8                    j11_12     34.000000
9                  reads_12     35.000000
10          rinc_pension_12     38.333333
11  healthy_lifestyle_score     39.666667
12                  hard_03     42.333333
13                rafeduc_m     43.000000
14    economic_strain_index     43.333333


In [33]:
combined_data_cleaned

Unnamed: 0,age_03,urban_03,married_03,n_mar_03,edu_gru_03,n_living_child_03,migration_03,glob_hlth_03,adl_dress_03,adl_walk_03,...,economic_strain_index,healthy_lifestyle_score,mental_engagement_score,age_health_interaction,education_health_interaction,change_in_adl,change_in_iadl,change_in_depression,chronic_illness_count,total_caregiving_index
0,1.0,1.0,3.0,1.133275,3.0,1.0,0.0,4.0,0.0,0.0,...,0.0,5.0,2.0,8.0,0.000000,0.115188,-0.047181,-3.485267,0.0,0.0
1,1.0,1.0,3.0,1.133275,3.0,1.0,0.0,4.0,0.0,0.0,...,1.0,6.0,3.0,4.0,11.637394,-0.073687,-0.047181,0.514733,0.0,1.0
2,1.0,1.0,3.0,1.133275,3.0,1.0,0.0,4.0,0.0,0.0,...,2.0,5.0,3.0,4.0,3.879131,0.926313,-0.047181,0.514733,0.0,1.0
3,1.0,1.0,3.0,1.133275,3.0,1.0,0.0,4.0,0.0,0.0,...,2.0,5.0,3.0,4.0,3.879131,0.926313,-0.047181,0.514733,0.0,1.0
4,1.0,1.0,3.0,1.000000,3.0,1.0,0.0,4.0,0.0,0.0,...,2.0,4.0,3.0,8.0,12.000000,0.000000,0.000000,2.000000,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4337,1.0,0.0,1.0,1.000000,1.0,4.0,0.0,5.0,0.0,0.0,...,1.0,4.0,3.0,4.0,5.000000,0.000000,0.000000,1.000000,2.0,0.0
4338,1.0,0.0,1.0,1.000000,1.0,4.0,0.0,5.0,0.0,0.0,...,1.0,4.0,3.0,4.0,5.000000,0.000000,0.000000,1.000000,2.0,0.0
4339,1.0,0.0,3.0,1.000000,1.0,1.0,0.0,4.0,0.0,0.0,...,2.0,3.0,0.0,6.0,4.000000,0.000000,0.000000,1.000000,2.0,0.0
4340,1.0,0.0,3.0,1.000000,1.0,1.0,0.0,4.0,0.0,0.0,...,2.0,3.0,0.0,6.0,4.000000,0.000000,0.000000,1.000000,2.0,0.0


### Feature Importances from Tree-Based Models

We’ll use RandomForestRegressor and DecisionTreeRegressor to calculate importance scores:

In [34]:
from sklearn.tree import DecisionTreeRegressor 

In [35]:
# Feature importances from Random Forest
rf = RandomForestRegressor(random_state=42)
rf.fit(X, y)
rf_importances = rf.feature_importances_

# Feature importances from Decision Tree
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X, y)
dt_importances = dt.feature_importances_


### Feature Importances from Lasso Regression

Lasso regression performs feature selection by applying L1 regularization.

In [36]:
from sklearn.linear_model import LassoCV 

In [37]:
# Feature importance from Lasso Regression
lasso = LassoCV(cv=5, random_state=42)
lasso.fit(X, y)
lasso_importances = np.abs(lasso.coef_)


### Organizing Feature Importance Scores 

Combine all feature importance scores into a DataFrame for comparison:

In [38]:
# Assuming you have already run your feature selection code and have your scores calculated

# Combining scores into a DataFrame for ranking
feature_importances_df = pd.DataFrame({
    'Feature': X_df.columns,  # Use X_df to access the column names
    'Chi2': chi2_scores,
    'Mutual_Info': mutual_info_scores,
    'F_Regression': f_reg_scores,
    'DT_Importances': dt_importances,
    'Lasso_Importances': lasso_importances
})

# Ranking features in descending order for each score type
feature_importances_df['chi2_rank'] = feature_importances_df['Chi2'].rank(ascending=False)
feature_importances_df['mutual_info_rank'] = feature_importances_df['Mutual_Info'].rank(ascending=False)
feature_importances_df['f_regression_rank'] = feature_importances_df['F_Regression'].rank(ascending=False)
feature_importances_df['dt_rank'] = feature_importances_df['DT_Importances'].rank(ascending=False)
feature_importances_df['lasso_rank'] = feature_importances_df['Lasso_Importances'].rank(ascending=False)

# Calculating the average rank across methods
feature_importances_df['average_rank'] = feature_importances_df[['chi2_rank', 'mutual_info_rank', 'f_regression_rank', 'dt_rank', 'lasso_rank']].mean(axis=1)

# Sorting features by their average rank (lower rank means higher importance)
most_important_features = feature_importances_df.sort_values('average_rank').reset_index(drop=True)

# Displaying the top features based on their average rank
print(most_important_features[['Feature', 'average_rank']].head(15))


                    Feature  average_rank
0                edu_gru_12           9.4
1                    j11_12          25.0
2                 rameduc_m          26.2
3                    age_12          27.6
4     economic_strain_index          27.6
5            table_games_12          32.4
6                  reads_12          32.4
7              rrfcntx_m_12          32.4
8   healthy_lifestyle_score          36.2
9         n_living_child_12          36.2
10             rearnings_12          40.2
11               hincome_03          40.8
12             rsocact_m_12          41.4
13            care_child_12          41.8
14                   age_03          41.8


Selecting the Most Frequently Ranked Features

Now, we will find the top features that consistently rank highly across different methods.

In [39]:
# Sorting features by average rank
top_features = feature_importances_df.sort_values(by='average_rank').head(15)
top_features_list = top_features['Feature'].tolist()
print("Top features across all methods:", top_features_list)



Top features across all methods: ['edu_gru_12', 'j11_12', 'rameduc_m', 'age_12', 'economic_strain_index', 'table_games_12', 'reads_12', 'rrfcntx_m_12', 'healthy_lifestyle_score', 'n_living_child_12', 'rearnings_12', 'hincome_03', 'rsocact_m_12', 'care_child_12', 'age_03']


Specifying the Number of Top Features: Decide on the number of top features to keep. For example, let’s say we want the top 15 features.

In [40]:

print(most_important_features.columns)
top_features = most_important_features['Feature'].head(15).tolist()

# Extract the top features from X_df based on the identified top features
X_top = X_df[top_features]


Index(['Feature', 'Chi2', 'Mutual_Info', 'F_Regression', 'DT_Importances',
       'Lasso_Importances', 'chi2_rank', 'mutual_info_rank',
       'f_regression_rank', 'dt_rank', 'lasso_rank', 'average_rank'],
      dtype='object')


In [41]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_top_scaled, y, test_size=0.2, random_state=42)


NameError: name 'X_top_scaled' is not defined

In [None]:


scaler = StandardScaler()
X_top_scaled = scaler.fit_transform(X_top)

Recommended Metrics for Different Scenarios

High Focus on Identifying All Sick Individuals:

Recall: This should be your primary metric if maximizing the count of correctly identified sick individuals is most important.
Balance Between Identifying Sick Individuals and Avoiding False Alarms:

F1-score: This is helpful if you want a balance between high recall (catching sick individuals) and high precision (avoiding false positives).
Secondary Metrics for Insight:

Confusion Matrix: To visually check true positives, false positives, and false negatives.
ROC-AUC: To check the model’s overall ability to distinguish sick versus healthy individuals.

In [44]:
from sklearn.metrics import make_scorer, recall_score, f1_score, roc_auc_score, confusion_matrix

In [45]:
# Define Custom Metrics for Evaluation
# ==========================
def recall_scorer(y_true, y_pred):
    return recall_score(y_true, y_pred, average='weighted')

def f1_scorer(y_true, y_pred):
    return f1_score(y_true, y_pred, average='weighted')

# Define the evaluation metrics
scoring = {
    'Recall': make_scorer(recall_scorer),
    'F1-Score': make_scorer(f1_scorer),
}

In [46]:
# Initialize Results Storage
# ==========================
results = []

# ==========================
# Cross-Validation Setup
# ==========================
cv = TimeSeriesSplit(n_splits=5)


In [69]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit

In [71]:
# Ridge Regression
ridge_model = Ridge()
ridge_param_grid = {'alpha': [0.1, 1.0, 10]}
cv = TimeSeriesSplit(n_splits=5)

ridge_grid_search = GridSearchCV(ridge_model, ridge_param_grid, cv=cv, scoring='neg_mean_squared_error', n_jobs=-1)
ridge_grid_search.fit(X_train, y_train)
ridge_best_model = ridge_grid_search.best_estimator_
ridge_y_pred = ridge_best_model.predict(X_test)

# Calculate metrics for Ridge Regression
ridge_mae = mean_absolute_error(y_test, ridge_y_pred)
ridge_mse = mean_squared_error(y_test, ridge_y_pred)
ridge_rmse = np.sqrt(ridge_mse)
ridge_r2 = r2_score(y_test, ridge_y_pred)

# Store results for Ridge Regression
ridge_results = {
    'Model': 'Ridge',
    'Best Parameters': ridge_grid_search.best_params_,
    'MAE': ridge_mae,
    'MSE': ridge_mse,
    'RMSE': ridge_rmse,
    'R-squared': ridge_r2
}

In [72]:
# Lasso Regression
lasso_model = Lasso()
lasso_param_grid = {'alpha': [0.01, 0.1, 1.0]}

lasso_grid_search = GridSearchCV(lasso_model, lasso_param_grid, cv=cv, scoring='neg_mean_squared_error', n_jobs=-1)
lasso_grid_search.fit(X_train, y_train)
lasso_best_model = lasso_grid_search.best_estimator_
lasso_y_pred = lasso_best_model.predict(X_test)

# Calculate metrics for Lasso Regression
lasso_mae = mean_absolute_error(y_test, lasso_y_pred)
lasso_mse = mean_squared_error(y_test, lasso_y_pred)
lasso_rmse = np.sqrt(lasso_mse)
lasso_r2 = r2_score(y_test, lasso_y_pred)

# Store results for Lasso Regression
lasso_results = {
    'Model': 'Lasso',
    'Best Parameters': lasso_grid_search.best_params_,
    'MAE': lasso_mae,
    'MSE': lasso_mse,
    'RMSE': lasso_rmse,
    'R-squared': lasso_r2
}

In [73]:
# K-Neighbors Regression
knn_model = KNeighborsRegressor()
knn_param_grid = {'n_neighbors': [3, 5, 7]}

knn_grid_search = GridSearchCV(knn_model, knn_param_grid, cv=cv, scoring='neg_mean_squared_error', n_jobs=-1)
knn_grid_search.fit(X_train, y_train)
knn_best_model = knn_grid_search.best_estimator_
knn_y_pred = knn_best_model.predict(X_test)

# Calculate metrics for K-Neighbors Regression
knn_mae = mean_absolute_error(y_test, knn_y_pred)
knn_mse = mean_squared_error(y_test, knn_y_pred)
knn_rmse = np.sqrt(knn_mse)
knn_r2 = r2_score(y_test, knn_y_pred)

# Store results for K-Neighbors Regression
knn_results = {
    'Model': 'KNeighbors',
    'Best Parameters': knn_grid_search.best_params_,
    'MAE': knn_mae,
    'MSE': knn_mse,
    'RMSE': knn_rmse,
    'R-squared': knn_r2
}

In [74]:
# Random Forest Regression
rf_model = RandomForestRegressor()
rf_param_grid = {'n_estimators': [50, 100, 200]}

rf_grid_search = GridSearchCV(rf_model, rf_param_grid, cv=cv, scoring='neg_mean_squared_error', n_jobs=-1)
rf_grid_search.fit(X_train, y_train)
rf_best_model = rf_grid_search.best_estimator_
rf_y_pred = rf_best_model.predict(X_test)

# Calculate metrics for Random Forest Regression
rf_mae = mean_absolute_error(y_test, rf_y_pred)
rf_mse = mean_squared_error(y_test, rf_y_pred)
rf_rmse = np.sqrt(rf_mse)
rf_r2 = r2_score(y_test, rf_y_pred)

# Store results for Random Forest Regression
rf_results = {
    'Model': 'Random Forest',
    'Best Parameters': rf_grid_search.best_params_,
    'MAE': rf_mae,
    'MSE': rf_mse,
    'RMSE': rf_rmse,
    'R-squared': rf_r2
}

In [75]:
# Gradient Boosting Regression
gb_model = GradientBoostingRegressor()
gb_param_grid = {'n_estimators': [50, 100], 'learning_rate': [0.05, 0.1]}

gb_grid_search = GridSearchCV(gb_model, gb_param_grid, cv=cv, scoring='neg_mean_squared_error', n_jobs=-1)
gb_grid_search.fit(X_train, y_train)
gb_best_model = gb_grid_search.best_estimator_
gb_y_pred = gb_best_model.predict(X_test)

# Calculate metrics for Gradient Boosting Regression
gb_mae = mean_absolute_error(y_test, gb_y_pred)
gb_mse = mean_squared_error(y_test, gb_y_pred)
gb_rmse = np.sqrt(gb_mse)
gb_r2 = r2_score(y_test, gb_y_pred)

# Store results for Gradient Boosting Regression
gb_results = {
    'Model': 'Gradient Boosting',
    'Best Parameters': gb_grid_search.best_params_,
    'MAE': gb_mae,
    'MSE': gb_mse,
    'RMSE': gb_rmse,
    'R-squared': gb_r2
}

In [76]:
# XGBoost Regression
xgb_model = XGBRegressor()
xgb_param_grid = {'n_estimators': [50, 100], 'learning_rate': [0.05, 0.1]}

xgb_grid_search = GridSearchCV(xgb_model, xgb_param_grid, cv=cv, scoring='neg_mean_squared_error', n_jobs=-1)
xgb_grid_search.fit(X_train, y_train)
xgb_best_model = xgb_grid_search.best_estimator_
xgb_y_pred = xgb_best_model.predict(X_test)

# Calculate metrics for XGBoost Regression
xgb_mae = mean_absolute_error(y_test, xgb_y_pred)
xgb_mse = mean_squared_error(y_test, xgb_y_pred)
xgb_rmse = np.sqrt(xgb_mse)
xgb_r2 = r2_score(y_test, xgb_y_pred)

# Store results for XGBoost Regression
xgb_results = {
    'Model': 'XGBoost',
    'Best Parameters': xgb_grid_search.best_params_,
    'MAE': xgb_mae,
    'MSE': xgb_mse,
    'RMSE': xgb_rmse,
    'R-squared': xgb_r2
}

In [77]:
# LightGBM Regression
lgbm_model = LGBMRegressor()
lgbm_param_grid = {'n_estimators': [50, 100], 'learning_rate': [0.05, 0.1]}

lgbm_grid_search = GridSearchCV(lgbm_model, lgbm_param_grid, cv=cv, scoring='neg_mean_squared_error', n_jobs=-1)
lgbm_grid_search.fit(X_train, y_train)
lgbm_best_model = lgbm_grid_search.best_estimator_
lgbm_y_pred = lgbm_best_model.predict(X_test)

# Calculate metrics for LightGBM Regression
lgbm_mae = mean_absolute_error(y_test, lgbm_y_pred)
lgbm_mse = mean_squared_error(y_test, lgbm_y_pred)
lgbm_rmse = np.sqrt(lgbm_mse)
lgbm_r2 = r2_score(y_test, lgbm_y_pred)

# Store results for LightGBM Regression
lgbm_results = {
    'Model': 'LightGBM',
    'Best Parameters': lgbm_grid_search.best_params_,
    'MAE': lgbm_mae,
    'MSE': lgbm_mse,
    'RMSE': lgbm_rmse,
    'R-squared': lgbm_r2
}


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000401 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 236
[LightGBM] [Info] Number of data points in the train set: 3187, number of used features: 15
[LightGBM] [Info] Start training from score 157.892375


In [78]:
# CatBoost Regression
cat_model = CatBoostRegressor(verbose=0)
cat_param_grid = {'iterations': [50, 100], 'learning_rate': [0.05, 0.1]}

cat_grid_search = GridSearchCV(cat_model, cat_param_grid, cv=cv, scoring='neg_mean_squared_error', n_jobs=-1)
cat_grid_search.fit(X_train, y_train)
cat_best_model = cat_grid_search.best_estimator_
cat_y_pred = cat_best_model.predict(X_test)

# Calculate metrics for CatBoost Regression
cat_mae = mean_absolute_error(y_test, cat_y_pred)
cat_mse = mean_squared_error(y_test, cat_y_pred)
cat_rmse = np.sqrt(cat_mse)
cat_r2 = r2_score(y_test, cat_y_pred)

# Store results for CatBoost Regression
cat_results = {
    'Model': 'CatBoost',
    'Best Parameters': cat_grid_search.best_params_,
    'MAE': cat_mae,
    'MSE': cat_mse,
    'RMSE': cat_rmse,
    'R-squared': cat_r2
}

In [79]:
# Combine all results into a DataFrame
results_df = pd.DataFrame([
    ridge_results,
    lasso_results,
    knn_results,
    rf_results,
    gb_results,
    xgb_results,
    lgbm_results,
    cat_results
])

# Print the results
print(results_df)


               Model                               Best Parameters        MAE  \
0              Ridge                                 {'alpha': 10}  32.194873   
1              Lasso                                {'alpha': 0.1}  32.200801   
2         KNeighbors                            {'n_neighbors': 7}  35.085141   
3      Random Forest                         {'n_estimators': 200}  30.561123   
4  Gradient Boosting    {'learning_rate': 0.1, 'n_estimators': 50}  32.091949   
5            XGBoost  {'learning_rate': 0.05, 'n_estimators': 100}  31.751859   
6           LightGBM  {'learning_rate': 0.05, 'n_estimators': 100}  31.853513   
7           CatBoost     {'iterations': 100, 'learning_rate': 0.1}  31.753972   

           MSE       RMSE  R-squared  
0  1687.586761  41.080248   0.529308  
1  1687.883441  41.083859   0.529225  
2  1996.999155  44.687796   0.443008  
3  1602.757876  40.034459   0.552968  
4  1675.471961  40.932529   0.532687  
5  1661.244342  40.758365   0.536655

In [128]:
# Convert wide to long format
data_long = pd.wide_to_long(
    long_data ,
    stubnames=[
        'age', 'urban', 'married', 'n_mar', 'edu_gru', 'n_living_child', 'migration', 
        'glob_hlth', 'adl_dress', 'adl_walk', 'adl_bath', 'adl_eat', 'adl_bed', 'adl_toilet', 
        'n_adl', 'iadl_money', 'iadl_meds', 'iadl_shop', 'iadl_meals', 'n_iadl', 'depressed', 
        'hard', 'restless', 'happy', 'lonely', 'enjoy', 'sad', 'tired', 'energetic', 'n_depr', 
        'cesd_depressed', 'hypertension', 'diabetes', 'resp_ill', 'arthritis', 'hrt_attack', 
        'stroke', 'cancer', 'n_illnesses', 'bmi', 'exer_3xwk', 'alcohol', 'tobacco', 
        'test_chol', 'test_tuber', 'test_diab', 'test_pres', 'hosp', 'visit_med', 
        'out_proc', 'visit_dental', 'imss', 'issste', 'pem_def_mar', 'insur_private', 
        'insur_other', 'insured', 'decis_famil', 'decis_personal', 'employment', 
        'rjob_hrswk', 'rjlocc_m', 'rjob_end', 'rjobend_reason', 'rearnings', 
        'searnings', 'hincome', 'hinc_business', 'hinc_rent', 'hinc_assets', 'hinc_cap', 
        'rinc_pension', 'sinc_pension', 'rrelgimp', 'rrfcntx_m', 'rsocact_m', 'rrelgwk', 
        'a34', 'j11'
    ],
    i=['uid', 'year', 'composite_score'],  # Use 'uid', 'year', and 'composite_score' as identifiers
    j='time',  # Variable to capture time (03 or 12)
    sep='_',  # Separator used in wide-format column names
    suffix='\\d+'  # Regular expression to capture suffixes (03 or 12)
).reset_index()

# Display the first few rows to verify
data_long.head()


Unnamed: 0,uid,year,composite_score,time,seg_pop_12,comms_tel_comp_12,tv_12,vax_flu_12,table_games_12,satis_excel_12,...,hinc_assets,hinc_cap,rinc_pension,sinc_pension,rrelgimp,rrfcntx_m,rsocact_m,rrelgwk,a34,j11
0,aace,2021.0,175.0,3,1.0,0.0,1.0,0.0,0.0,3.0,...,,,,,,,,,,
1,aace,2021.0,175.0,12,1.0,0.0,1.0,0.0,0.0,3.0,...,0.0,10000.0,0.0,0.0,2.0,9.0,9.0,0.0,,2.0
2,aanz,2021.0,206.0,3,0.0,0.0,1.0,1.0,0.0,2.0,...,,,,,,,,,,
3,aanz,2021.0,206.0,12,0.0,0.0,1.0,1.0,0.0,2.0,...,0.0,0.0,0.0,0.0,1.0,9.0,1.0,0.0,,2.0
4,aape,2016.0,161.0,3,0.0,1.0,1.0,0.0,0.0,1.0,...,,,,,,,,,,


In [129]:
data_long.shape

(10324, 113)

In [130]:
# Printing the missing values sorted in descending order
print(data_long.isna().sum().sort_values(ascending=False).to_list())


[10244, 10208, 10192, 10192, 9036, 8995, 7128, 7040, 6588, 5482, 5470, 5468, 5275, 4952, 4592, 4395, 4395, 3652, 3371, 2270, 2251, 2247, 2247, 2228, 2225, 2223, 2222, 2220, 2216, 2211, 2210, 2210, 2209, 2208, 2207, 2206, 2206, 2205, 2205, 2205, 2205, 2204, 2204, 2204, 1962, 1904, 1884, 1882, 1879, 1874, 1866, 1863, 1859, 1859, 1858, 1851, 1850, 1848, 1847, 1847, 1847, 1844, 1842, 1841, 1841, 1840, 1840, 1840, 1840, 1839, 1839, 1838, 1837, 1837, 1837, 1836, 1836, 1835, 1835, 1835, 1835, 1819, 1819, 1819, 1819, 1638, 1638, 1506, 1250, 840, 790, 722, 704, 692, 658, 642, 640, 640, 624, 622, 620, 618, 618, 616, 616, 616, 612, 612, 612, 266, 0, 0, 0]


In [131]:
# Printing the missing values with column names, sorted in descending order
print(data_long.isna().sum().sort_values(ascending=False))


a16a_12            10244
a22_12             10208
a33b_12            10192
a21_12             10192
rjob_end            9036
                   ...  
attends_club_12      612
seg_pop_12           266
time                   0
ragender               0
uid                    0
Length: 113, dtype: int64


In [132]:
numeric_data = data_long.select_dtypes(include=['float64', 'int64'])
categorical_data = data_long.select_dtypes(exclude=['float64', 'int64'])

In [133]:

missing_percentages = (data_long.isna().sum() / len(data_long)) * 100


high_missing_columns = missing_percentages[missing_percentages > 40].index

data_long = data_long.drop(columns=high_missing_columns)


print("Columns remaining after dropping those with more than 40% missing values:")
print(data_long.columns)


Columns remaining after dropping those with more than 40% missing values:
Index(['uid', 'year', 'composite_score', 'time', 'seg_pop_12',
       'comms_tel_comp_12', 'tv_12', 'vax_flu_12', 'table_games_12',
       'satis_excel_12', 'volunteer_12', 'wouldnt_change_12',
       'attends_club_12', 'reads_12', 'ragender', 'care_child_12',
       'satis_ideal_12', 'games_12', 'satis_fine_12', 'attends_class_12',
       'act_mant_12', 'cosas_imp_12', 'vax_pneu_12', 'memory_12', 'sewing_12',
       'sgender_12', 'care_adult_12', 'rameduc_m', 'rafeduc_m', 'age', 'urban',
       'married', 'n_mar', 'edu_gru', 'n_living_child', 'migration',
       'glob_hlth', 'adl_dress', 'adl_walk', 'adl_bath', 'adl_eat', 'adl_bed',
       'adl_toilet', 'n_adl', 'iadl_money', 'iadl_meds', 'iadl_shop',
       'iadl_meals', 'n_iadl', 'depressed', 'hard', 'restless', 'happy',
       'lonely', 'enjoy', 'sad', 'tired', 'energetic', 'n_depr',
       'cesd_depressed', 'hypertension', 'diabetes', 'resp_ill', 'arthritis'

In [134]:

imputer_numeric = SimpleImputer(strategy='median')
imputer_categorical = SimpleImputer(strategy='most_frequent')


numeric_data_imputed = pd.DataFrame(imputer_numeric.fit_transform(numeric_data), columns=numeric_data.columns)
categorical_data_imputed = pd.DataFrame(imputer_categorical.fit_transform(categorical_data), columns=categorical_data.columns)


data_long_imputed = pd.concat([numeric_data_imputed, categorical_data_imputed], axis=1)


print("Number of missing values after imputation:")


Number of missing values after imputation:


In [135]:
print(data_long_imputed.isna().sum().sum())

0


In [136]:
data_long_imputed.head()

Unnamed: 0,year,composite_score,time,seg_pop_12,comms_tel_comp_12,tv_12,vax_flu_12,table_games_12,satis_excel_12,volunteer_12,...,hinc_cap,rinc_pension,sinc_pension,rrelgimp,rrfcntx_m,rsocact_m,rrelgwk,a34,j11,uid
0,2021.0,175.0,3.0,1.0,0.0,1.0,0.0,0.0,3.0,0.0,...,0.0,0.0,0.0,1.0,4.0,8.0,0.0,2.0,1.0,aace
1,2021.0,175.0,12.0,1.0,0.0,1.0,0.0,0.0,3.0,0.0,...,10000.0,0.0,0.0,2.0,9.0,9.0,0.0,2.0,2.0,aace
2,2021.0,206.0,3.0,0.0,0.0,1.0,1.0,0.0,2.0,0.0,...,0.0,0.0,0.0,1.0,4.0,8.0,0.0,2.0,1.0,aanz
3,2021.0,206.0,12.0,0.0,0.0,1.0,1.0,0.0,2.0,0.0,...,0.0,0.0,0.0,1.0,9.0,1.0,0.0,2.0,2.0,aanz
4,2016.0,161.0,3.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,4.0,8.0,0.0,2.0,1.0,aape


In [137]:
data_long_imputed.dtypes.unique()

array([dtype('float64'), dtype('O')], dtype=object)

In [138]:

unique_values = data_long_imputed.apply(lambda x: x.unique())


for col, unique_vals in unique_values.items():
    print(f"Column: {col} - Unique values: {unique_vals}")


Column: year - Unique values: [2021. 2016.]
Column: composite_score - Unique values: [175. 206. 161. 144. 104. 183. 106. 152.  13. 193.  38. 272. 254.  87.
  92. 203. 117.  84.  17.  48.  93. 140.  69. 218.  78. 122. 157. 162.
  40. 126. 178. 137.  76. 202.  86. 133. 225. 131. 231. 154.  71.  64.
 118. 151. 127. 163. 179. 198. 108. 119.  91. 107. 174. 120.  66.  56.
 246. 195. 281. 289. 159. 121. 200. 248. 258. 191. 209.  39. 267. 247.
 150. 223. 226. 205. 158. 185.  60. 146. 167.  63. 204.  52. 181. 220.
 215. 229.  97. 234. 232. 115.  54. 103. 186. 256.  99. 171.  51.  47.
 124. 147.  68. 130. 136. 148. 311. 134. 153. 100. 221. 199.  14.  42.
 143.  25. 164.  74.  20. 216.   4. 235. 141. 264. 269. 173. 155. 125.
 165. 194. 214. 277.  65. 184. 212. 210. 102.  53.  21.  57.  89.  80.
 101. 245. 236.  26. 105. 251.  88. 197.  95. 213. 109. 169. 189.  50.
  45. 240. 135. 177. 252. 156. 145. 176. 149. 116. 170.  98. 201. 237.
 129. 139. 196.  90. 211. 142.  75.  72.  22. 166. 113.  30. 18

In [139]:

data_long_imputed = pd.get_dummies(data_long_imputed, columns=['uid'])


print(data_long_imputed.head())


     year  composite_score  time  seg_pop_12  comms_tel_comp_12  tv_12  \
0  2021.0            175.0   3.0         1.0                0.0    1.0   
1  2021.0            175.0  12.0         1.0                0.0    1.0   
2  2021.0            206.0   3.0         0.0                0.0    1.0   
3  2021.0            206.0  12.0         0.0                0.0    1.0   
4  2016.0            161.0   3.0         0.0                1.0    1.0   

   vax_flu_12  table_games_12  satis_excel_12  volunteer_12  ...  uid_zytb  \
0         0.0             0.0             3.0           0.0  ...     False   
1         0.0             0.0             3.0           0.0  ...     False   
2         1.0             0.0             2.0           0.0  ...     False   
3         1.0             0.0             2.0           0.0  ...     False   
4         0.0             0.0             1.0           0.0  ...     False   

   uid_zyxh  uid_zzab  uid_zzag  uid_zzci  uid_zzez  uid_zzft  uid_zzhd  \
0     False

In [140]:
X = data_long_imputed.drop(columns=['composite_score'])  # Drop target and any non-feature columns
y = data_long_imputed['composite_score']

In [141]:
X_train, X_test, y_train, y_test =  train_test_split(X,y , test_size=0.3, random_state=42)

In [83]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import cross_val_score, TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, mean_absolute_error, mean_squared_error, r2_score
import pandas as pd
import numpy as np
from prophet import Prophet

In [106]:
# Linear Regression
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
lr_y_pred = lr_model.predict(X_test)

lr_results = {
    'Model': 'Linear Regression',
    'MAE': mean_absolute_error(y_test, lr_y_pred),
    'MSE': mean_squared_error(y_test, lr_y_pred),
    'RMSE': np.sqrt(mean_squared_error(y_test, lr_y_pred)),
    'R-squared': r2_score(y_test, lr_y_pred)
}
model_results.append(lr_results)

In [107]:
# Ridge Regression
ridge_model = Ridge()
ridge_param_grid = {'alpha': [0.1, 1.0, 10]}
ridge_grid_search = GridSearchCV(ridge_model, ridge_param_grid, cv=cv, scoring='neg_mean_squared_error', n_jobs=-1)
ridge_grid_search.fit(X_train, y_train)
ridge_best_model = ridge_grid_search.best_estimator_
ridge_y_pred = ridge_best_model.predict(X_test)

ridge_results = {
    'Model': 'Ridge',
    'Best Parameters': ridge_grid_search.best_params_,
    'MAE': mean_absolute_error(y_test, ridge_y_pred),
    'MSE': mean_squared_error(y_test, ridge_y_pred),
    'RMSE': np.sqrt(mean_squared_error(y_test, ridge_y_pred)),
    'R-squared': r2_score(y_test, ridge_y_pred)
}
model_results.append(ridge_results)

In [108]:
# Lasso Regression
lasso_model = Lasso()
lasso_param_grid = {'alpha': [0.1, 1.0, 10]}
lasso_grid_search = GridSearchCV(lasso_model, lasso_param_grid, cv=cv, scoring='neg_mean_squared_error', n_jobs=-1)
lasso_grid_search.fit(X_train, y_train)
lasso_best_model = lasso_grid_search.best_estimator_
lasso_y_pred = lasso_best_model.predict(X_test)

lasso_results = {
    'Model': 'Lasso',
    'Best Parameters': lasso_grid_search.best_params_,
    'MAE': mean_absolute_error(y_test, lasso_y_pred),
    'MSE': mean_squared_error(y_test, lasso_y_pred),
    'RMSE': np.sqrt(mean_squared_error(y_test, lasso_y_pred)),
    'R-squared': r2_score(y_test, lasso_y_pred)
}
model_results.append(lasso_results)

In [109]:
# ElasticNet Regression
en_model = ElasticNet()
en_param_grid = {'alpha': [0.1, 1.0, 10], 'l1_ratio': [0.2, 0.5, 0.8]}
en_grid_search = GridSearchCV(en_model, en_param_grid, cv=cv, scoring='neg_mean_squared_error', n_jobs=-1)
en_grid_search.fit(X_train, y_train)
en_best_model = en_grid_search.best_estimator_
en_y_pred = en_best_model.predict(X_test)

en_results = {
    'Model': 'ElasticNet',
    'Best Parameters': en_grid_search.best_params_,
    'MAE': mean_absolute_error(y_test, en_y_pred),
    'MSE': mean_squared_error(y_test, en_y_pred),
    'RMSE': np.sqrt(mean_squared_error(y_test, en_y_pred)),
    'R-squared': r2_score(y_test, en_y_pred)
}
model_results.append(en_results)

In [110]:
# DecisionTree Regressor
dt_model = DecisionTreeRegressor()
dt_param_grid = {'max_depth': [5, 10, None], 'min_samples_split': [2, 5]}
dt_grid_search = GridSearchCV(dt_model, dt_param_grid, cv=cv, scoring='neg_mean_squared_error', n_jobs=-1)
dt_grid_search.fit(X_train, y_train)
dt_best_model = dt_grid_search.best_estimator_
dt_y_pred = dt_best_model.predict(X_test)

dt_results = {
    'Model': 'DecisionTree',
    'Best Parameters': dt_grid_search.best_params_,
    'MAE': mean_absolute_error(y_test, dt_y_pred),
    'MSE': mean_squared_error(y_test, dt_y_pred),
    'RMSE': np.sqrt(mean_squared_error(y_test, dt_y_pred)),
    'R-squared': r2_score(y_test, dt_y_pred)
}
model_results.append(dt_results)

In [111]:
# Random Forest Regressor
rf_model = RandomForestRegressor()
rf_param_grid = {'n_estimators': [50, 100], 'max_depth': [5, 10, None]}
rf_grid_search = GridSearchCV(rf_model, rf_param_grid, cv=cv, scoring='neg_mean_squared_error', n_jobs=-1)
rf_grid_search.fit(X_train, y_train)
rf_best_model = rf_grid_search.best_estimator_
rf_y_pred = rf_best_model.predict(X_test)

rf_results = {
    'Model': 'Random Forest',
    'Best Parameters': rf_grid_search.best_params_,
    'MAE': mean_absolute_error(y_test, rf_y_pred),
    'MSE': mean_squared_error(y_test, rf_y_pred),
    'RMSE': np.sqrt(mean_squared_error(y_test, rf_y_pred)),
    'R-squared': r2_score(y_test, rf_y_pred)
}
model_results.append(rf_results)

In [112]:
# Gradient Boosting Regressor
gb_model = GradientBoostingRegressor()
gb_param_grid = {'n_estimators': [50, 100], 'learning_rate': [0.01, 0.1]}
gb_grid_search = GridSearchCV(gb_model, gb_param_grid, cv=cv, scoring='neg_mean_squared_error', n_jobs=-1)
gb_grid_search.fit(X_train, y_train)
gb_best_model = gb_grid_search.best_estimator_
gb_y_pred = gb_best_model.predict(X_test)

gb_results = {
    'Model': 'Gradient Boosting',
    'Best Parameters': gb_grid_search.best_params_,
    'MAE': mean_absolute_error(y_test, gb_y_pred),
    'MSE': mean_squared_error(y_test, gb_y_pred),
    'RMSE': np.sqrt(mean_squared_error(y_test, gb_y_pred)),
    'R-squared': r2_score(y_test, gb_y_pred)
}
model_results.append(gb_results)

In [113]:
# XGBoost Regression
xgb_model = XGBRegressor()
xgb_param_grid = {'n_estimators': [50, 100], 'learning_rate': [0.05, 0.1]}
xgb_grid_search = GridSearchCV(xgb_model, xgb_param_grid, cv=cv, scoring='neg_mean_squared_error', n_jobs=-1)
xgb_grid_search.fit(X_train, y_train)
xgb_best_model = xgb_grid_search.best_estimator_
xgb_y_pred = xgb_best_model.predict(X_test)

xgb_results = {
    'Model': 'XGBoost',
    'Best Parameters': xgb_grid_search.best_params_,
    'MAE': mean_absolute_error(y_test, xgb_y_pred),
    'MSE': mean_squared_error(y_test, xgb_y_pred),
    'RMSE': np.sqrt(mean_squared_error(y_test, xgb_y_pred)),
    'R-squared': r2_score(y_test, xgb_y_pred)
}
model_results.append(xgb_results)

In [116]:
from lightgbm import LGBMRegressor

In [118]:
# LightGBM Regression
lgbm_model = LGBMRegressor()
lgbm_param_grid = {'n_estimators': [50, 100], 'learning_rate': [0.01, 0.1]}
lgbm_grid_search = GridSearchCV(lgbm_model, lgbm_param_grid, cv=cv, scoring='neg_mean_squared_error', n_jobs=-1)
lgbm_grid_search.fit(X_train, y_train)
lgbm_best_model = lgbm_grid_search.best_estimator_
lgbm_y_pred = lgbm_best_model.predict(X_test)

lgbm_results = {
    'Model': 'LightGBM',
    'Best Parameters': lgbm_grid_search.best_params_,
    'MAE': mean_absolute_error(y_test, lgbm_y_pred),
    'MSE': mean_squared_error(y_test, lgbm_y_pred),
    'RMSE': np.sqrt(mean_squared_error(y_test, lgbm_y_pred)),
    'R-squared': r2_score(y_test, lgbm_y_pred)
}
model_results.append(lgbm_results)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004648 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 977
[LightGBM] [Info] Number of data points in the train set: 7226, number of used features: 111
[LightGBM] [Info] Start training from score 157.145862


In [119]:
# CatBoost Regression
catboost_model = CatBoostRegressor(verbose=0)
catboost_model.fit(X_train, y_train)
catboost_y_pred = catboost_model.predict(X_test)

catboost_results = {
    'Model': 'CatBoost',
    'MAE': mean_absolute_error(y_test, catboost_y_pred),
    'MSE': mean_squared_error(y_test, catboost_y_pred),
    'RMSE': np.sqrt(mean_squared_error(y_test, catboost_y_pred)),
    'R-squared': r2_score(y_test, catboost_y_pred)
}
model_results.append(catboost_results)

In [120]:
model_results

[{'Model': 'Linear Regression',
  'MAE': 14.443170892090407,
  'MSE': 661.2595449323884,
  'RMSE': 25.71496733290533,
  'R-squared': 0.7846370306721786},
 {'Model': 'Ridge',
  'Best Parameters': {'alpha': 0.1},
  'MAE': 15.639970190323652,
  'MSE': 636.2851862390061,
  'RMSE': 25.224693977113102,
  'R-squared': 0.7927708293999003},
 {'Model': 'Lasso',
  'Best Parameters': {'alpha': 0.1},
  'MAE': 33.710358005506905,
  'MSE': 1834.7836462733503,
  'RMSE': 42.83437458716247,
  'R-squared': 0.40243667231153835},
 {'Model': 'ElasticNet',
  'Best Parameters': {'alpha': 0.1, 'l1_ratio': 0.5},
  'MAE': 33.803437171661464,
  'MSE': 1848.5305498521022,
  'RMSE': 42.9945409308217,
  'R-squared': 0.3979594984144327},
 {'Model': 'DecisionTree',
  'Best Parameters': {'max_depth': 5, 'min_samples_split': 2},
  'MAE': 35.621103725919184,
  'MSE': 2081.7136198825633,
  'RMSE': 45.625799936905906,
  'R-squared': 0.3220150394745286},
 {'Model': 'Random Forest',
  'Best Parameters': {'max_depth': None, '

In [122]:
results_df = pd.DataFrame(model_results)

results_df = results_df.sort_values(by='RMSE', ascending=True)


print(results_df)

               Model        MAE          MSE       RMSE  R-squared  \
1              Ridge  15.639970   636.285186  25.224694   0.792771   
0  Linear Regression  14.443171   661.259545  25.714967   0.784637   
5      Random Forest  29.837863  1469.670898  38.336287   0.521349   
9           CatBoost  31.144867  1524.810811  39.048826   0.503390   
8           LightGBM  31.413385  1595.362410  39.941988   0.480413   
7            XGBoost  31.915932  1631.759311  40.395041   0.468559   
6  Gradient Boosting  33.096121  1749.791501  41.830509   0.430117   
2              Lasso  33.710358  1834.783646  42.834375   0.402437   
3         ElasticNet  33.803437  1848.530550  42.994541   0.397959   
4       DecisionTree  35.621104  2081.713620  45.625800   0.322015   

                               Best Parameters  
1                               {'alpha': 0.1}  
0                                          NaN  
5     {'max_depth': None, 'n_estimators': 100}  
9                                

In [123]:
import pickle

In [124]:


# Save the trained model to a file
with open('best_ridge_model.pkl', 'wb') as file:
    pickle.dump(ridge_best_model, file)


In [125]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import joblib