In [126]:
# Core Libraries
import pandas as pd
import numpy as np
import joblib
import logging
import re
import warnings

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Data Processing
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer

# Feature Selection and Permutation Importance
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from sklearn.inspection import permutation_importance

# Regression Models
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.neural_network import MLPRegressor

# Classification Models (used in feature importance or metrics evaluation)
from sklearn.linear_model import LogisticRegression

# Metrics
from sklearn.metrics import (
    classification_report,
    accuracy_score,
    confusion_matrix,
    recall_score,
    f1_score,
    roc_auc_score,
    make_scorer
)

# General Settings
warnings.filterwarnings('ignore')


In [127]:
# Loading data
train_data = pd.read_csv('C:/Users/Hp/Desktop/Capstone Project/Data/train_features.csv')
train_labels = pd.read_csv('C:/Users/Hp/Desktop/Capstone Project/Data/train_labels.csv')
test_data = pd.read_csv('C:/Users/Hp/Desktop/Capstone Project/Data/test_features.csv')


In [128]:
train_data.head()

Unnamed: 0,uid,age_03,urban_03,married_03,n_mar_03,edu_gru_03,n_living_child_03,migration_03,glob_hlth_03,adl_dress_03,...,rrelgimp_12,rrfcntx_m_12,rsocact_m_12,rrelgwk_12,a16a_12,a21_12,a22_12,a33b_12,a34_12,j11_12
0,aace,,,,,,,,,,...,2.somewhat important,9.Never,9.Never,0.No,,,,,,Concrete 2
1,aanz,,,,,,,,,,...,1.very important,9.Never,1.Almost every day,0.No,,,,,,Concrete 2
2,aape,,,,,,,,,,...,2.somewhat important,6.2 or 3 times a month,2.4 or more times a week,0.No,,,,,,"Wood, mosaic, or other covering 1"
3,aard,1. 50–59,"1. 100,000+",3. Widowed,1.0,3. 7–9 years,1. 1 or 2,0.0,4. Fair,0.0,...,1.very important,4.Once a week,9.Never,1.Yes,,,,,No 2,Concrete 2
4,ablr,,,,,,,,,,...,1.very important,3.2 or 3 times a week,3.2 or 3 times a week,0.No,,,,,,"Wood, mosaic, or other covering 1"


In [129]:
train_labels.head()

Unnamed: 0,uid,year,composite_score
0,aace,2021,175
1,aanz,2021,206
2,aape,2016,161
3,aape,2021,144
4,aard,2021,104


In [130]:
test_data.head()

Unnamed: 0,uid,age_03,urban_03,married_03,n_mar_03,edu_gru_03,n_living_child_03,migration_03,glob_hlth_03,adl_dress_03,...,rrelgimp_12,rrfcntx_m_12,rsocact_m_12,rrelgwk_12,a16a_12,a21_12,a22_12,a33b_12,a34_12,j11_12
0,abxu,,,,,,,,,,...,,,,,,,,,,"Wood, mosaic, or other covering 1"
1,aeol,,,,,,,,,,...,1.very important,9.Never,9.Never,1.Yes,,,,,,Concrete 2
2,afnb,,,,,,,,,,...,1.very important,9.Never,3.2 or 3 times a week,1.Yes,,,,,,"Wood, mosaic, or other covering 1"
3,ajfh,,,,,,,,,,...,2.somewhat important,9.Never,5.4 or more times a month,0.No,,,,,,"Wood, mosaic, or other covering 1"
4,ajvq,2. 60–69,"1. 100,000+",1. Married or in civil union,1.0,4. 10+ years,1. 1 or 2,0.0,,,...,2.somewhat important,1.Almost every day,4.Once a week,0.No,,,,,No 2,"Wood, mosaic, or other covering 1"


In [131]:
# Check the data types of the 'uid' column in each dataframe
print(train_data.shape)
print(train_labels.shape)
print(test_data.shape)


(3276, 184)
(4343, 3)
(819, 184)


In [132]:
combined_data = pd.concat([train_data, test_data], axis=0, ignore_index=True)

In [133]:
# This will add labels only for rows in the training data
combined_data = pd.merge(combined_data, train_labels, on='uid', how='left')

# Check the shape and a few rows to confirm
print("Combined data shape after merging:", combined_data.shape)
print(combined_data.head())

Combined data shape after merging: (5162, 186)
    uid    age_03     urban_03  married_03  n_mar_03    edu_gru_03  \
0  aace       NaN          NaN         NaN       NaN           NaN   
1  aanz       NaN          NaN         NaN       NaN           NaN   
2  aape       NaN          NaN         NaN       NaN           NaN   
3  aape       NaN          NaN         NaN       NaN           NaN   
4  aard  1. 50–59  1. 100,000+  3. Widowed       1.0  3. 7–9 years   

  n_living_child_03  migration_03 glob_hlth_03  adl_dress_03  ...  \
0               NaN           NaN          NaN           NaN  ...   
1               NaN           NaN          NaN           NaN  ...   
2               NaN           NaN          NaN           NaN  ...   
3               NaN           NaN          NaN           NaN  ...   
4         1. 1 or 2           0.0      4. Fair           0.0  ...   

               rsocact_m_12  rrelgwk_12  a16a_12  a21_12  a22_12  a33b_12  \
0                   9.Never        0.No 

In [134]:
numeric_data = combined_data.select_dtypes(include=['float64', 'int64'])
categorical_data = combined_data.select_dtypes(exclude=['float64', 'int64'])

In [135]:

missing_percentages = (combined_data.isna().sum() / len(combined_data)) * 100


high_missing_columns = missing_percentages[missing_percentages > 40].index

combined_data = combined_data.drop(columns=high_missing_columns)


print("Columns remaining after dropping those with more than 40% missing values:")
print(combined_data.columns)


Columns remaining after dropping those with more than 40% missing values:
Index(['uid', 'age_03', 'urban_03', 'married_03', 'n_mar_03', 'edu_gru_03',
       'n_living_child_03', 'migration_03', 'glob_hlth_03', 'adl_dress_03',
       ...
       'rinc_pension_12', 'sinc_pension_12', 'rrelgimp_12', 'rrfcntx_m_12',
       'rsocact_m_12', 'rrelgwk_12', 'a34_12', 'j11_12', 'year',
       'composite_score'],
      dtype='object', length=169)


In [136]:

imputer_numeric = SimpleImputer(strategy='median')
imputer_categorical = SimpleImputer(strategy='most_frequent')


numeric_data_imputed = pd.DataFrame(imputer_numeric.fit_transform(numeric_data), columns=numeric_data.columns)
categorical_data_imputed = pd.DataFrame(imputer_categorical.fit_transform(categorical_data), columns=categorical_data.columns)


combined_data_imputed = pd.concat([numeric_data_imputed, categorical_data_imputed], axis=1)


print("Number of missing values after imputation:")


Number of missing values after imputation:


In [137]:
print(combined_data_imputed.isna().sum().sum())

0


In [138]:
combined_data_imputed.head()

Unnamed: 0,n_mar_03,migration_03,adl_dress_03,adl_walk_03,adl_bath_03,adl_eat_03,adl_bed_03,adl_toilet_03,n_adl_03,iadl_money_03,...,rjlocc_m_12,rjobend_reason_12,rrelgimp_12,rrfcntx_m_12,rsocact_m_12,rrelgwk_12,a22_12,a33b_12,a34_12,j11_12
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,"6.Workers in Agriculture, Livestock, Forestry,...",8.Other,2.somewhat important,9.Never,9.Never,0.No,Agriculture/ Animal breeding 01,Neither 3,No 2,Concrete 2
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,18.Safety and Security Personnel,8.Other,1.very important,9.Never,1.Almost every day,0.No,Agriculture/ Animal breeding 01,Neither 3,No 2,Concrete 2
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,"8.Artisans and Workers in Production, Repair, ...",8.Other,2.somewhat important,6.2 or 3 times a month,2.4 or more times a week,0.No,Agriculture/ Animal breeding 01,Neither 3,No 2,"Wood, mosaic, or other covering 1"
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,"8.Artisans and Workers in Production, Repair, ...",8.Other,2.somewhat important,6.2 or 3 times a month,2.4 or more times a week,0.No,Agriculture/ Animal breeding 01,Neither 3,No 2,"Wood, mosaic, or other covering 1"
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,"8.Artisans and Workers in Production, Repair, ...",8.Other,1.very important,4.Once a week,9.Never,1.Yes,Agriculture/ Animal breeding 01,Neither 3,No 2,Concrete 2


In [139]:
# Define mappings for each categorical column
mappings = {
    'age_03': {
        '1. 50–59': 1,
        '3. 70–79': 3,
        '2. 60–69': 2,
        '0. 49 or younger': 0,
        '4. 80+': 4
    },
    'urban_03': {
        '1. 100,000+': 1,
        '0. <100,000': 0
    },
    'married_03': {
        '3. Widowed': 3,
        '1. Married or in civil union': 1,
        '4. Single': 4,
        '2. Separated or divorced': 2
    },
    'edu_gru_03': {
        '3. 7–9 years': 3,
        '1. 1–5 years': 1,
        '0. No education': 0,
        '2. 6 years': 2,
        '4. 10+ years': 4
    },
    'n_living_child_03': {
        '1. 1 or 2': 1,
        '3. 5 or 6': 3,
        '0. No children': 0,
        '2. 3 or 4': 2,
        '4. 7+': 4
    },
    'glob_hlth_03': {
        '4. Fair': 4,
        '5. Poor': 5,
        '3. Good': 3,
        '1. Excellent': 1,
        '2. Very good': 2
    },
    'employment_03': {
        '3. Dedicated to household chores': 3,
        '1. Currently Working': 1,
        '2. Currently looking for work': 2,
        '4. Retired, incapacitated, or does not work': 4
    },
    'age_12': {
        '2. 60–69': 2,
        '1. 50–59': 1,
        '4. 80+': 4,
        '3. 70–79': 3,
        '0. 49 or younger': 0
    },
    'urban_12': {
        '0. <100,000': 0,
        '1. 100,000+': 1
    },
    'married_12': {
        '1. Married or in civil union': 1,
        '3. Widowed': 3,
        '2. Separated or divorced': 2,
        '4. Single': 4
    },
    'edu_gru_12': {
        '0. No education': 0,
        '3. 7–9 years': 3,
        '1. 1–5 years': 1,
        '2. 6 years': 2,
        '4. 10+ years': 4
    },
    'n_living_child_12': {
        '1. 1 or 2': 1,
        '3. 5 or 6': 3,
        '0. No children': 0,
        '2. 3 or 4': 2,
        '4. 7+': 4
    },
    'glob_hlth_12': {
        '4. Fair': 4,
        '3. Good': 3,
        '2. Very good': 2,
        '5. Poor': 5,
        '1. Excellent': 1
    },
    'bmi_12': {
        '3. Overweight': 3,
        '4. Obese': 4,
        '2. Normal weight': 2,
        '1. Underweight': 1,
        '5. Morbidly obese': 5
    },
    'decis_famil_12': {
        '2. Approximately equal weight': 2,
        '3. Spouse': 3,
        '1. Respondent': 1
    },
    'decis_personal_12': {
        '1. A lot': 1,
        '2. A little': 2,
        '3. None': 3
    },
    'employment_12': {
        '1. Currently Working': 1,
        '2. Currently looking for work': 2,
        '3. Dedicated to household chores': 3,
        '4. Retired, incapacitated, or does not work': 4
    },
    'satis_ideal_12': {
        '3. Disagrees': 3,
        '1. Agrees': 1,
        '2. Neither agrees nor disagrees': 2
    },
    'satis_excel_12': {
        '3. Disagrees': 3,
        '2. Neither agrees nor disagrees': 2,
        '1. Agrees': 1
    },
    'satis_fine_12': {
        '1. Agrees': 1,
        '3. Disagrees': 3,
        '2. Neither agrees nor disagrees': 2
    },
    'cosas_imp_12': {
        '1. Agrees': 1,
        '2. Neither agrees nor disagrees': 2,
        '3. Disagrees': 3
    },
    'wouldnt_change_12': {
        '3. Disagrees': 3,
        '1. Agrees': 1,
        '2. Neither agrees nor disagrees': 2
    },
    'memory_12': {
        '2. Very good': 2,
        '4. Fair': 4,
        '3. Good': 3,
        '5. Poor': 5,
        '1. Excellent': 1
    },
    'ragender': {
        '1.Man': 1,
        '2.Woman': 2
    },
    'rameduc_m': {
        '1.None': 1,
        '2.Some primary': 2,
        '3.Primary': 3,
        '4.More than primary': 4
    },
    'rafeduc_m': {
        '1.None': 1,
        '2.Some primary': 2,
        '4.More than primary': 4,
        '3.Primary': 3
    },
    'sgender_03': {
        '2.Woman': 2,
        '1.Man': 1
    },
    'rrelgimp_03': {
        '1.very important': 1,
        '2.somewhat important': 2,
        '3.not important': 3
    },
    'sgender_12': {
        '2.Woman': 2,
        '1.Man': 1
    },
    'rjlocc_m_12': {
        '6.Workers in Agriculture, Livestock, Forestry, and Fishing': 6,
        '18.Safety and Security Personnel': 18,
        '8.Artisans and Workers in Production, Repair, Maintenance': 8,
        '14.Merchants and Sales Representatives': 14,
        '17.Domestic Workers': 17,
        '9.Operators of Fixed Machinery and Equipment for Ind. Production': 9,
        '15.Traveling Salespeople and Traveling Salespeople of Services': 15,
        '16.Workers in the Service Industry': 16,
        '4.Workers in Art, Shows, and Sports': 4,
        '3.Educators': 3,
        '12.Department Heads/Coordinators/Supervisors in Admin and Service Activities': 12,
        '10.Asst/Laborers etc in Ind. Production, Repair, Maintenance': 10,
        '1.Professionals': 1,
        '13.Administrative Support Staff': 13,
        '11.Drivers and Asst Drivers of Mobile Machinery and Transport Vehicles': 11,
        '7.Bosses/Supervisors etc in Artistic, Ind. Production, Repair, Maintenance Activities': 7,
        '2.Technicians': 2,
        '5.Officials and Directors Public, Private, and Social Sectors': 5
    },
    'rrelgimp_12': {
        '2.somewhat important': 2,
        '1.very important': 1,
        '3.not important': 3
    },
    'rrfcntx_m_12': {
        '9.Never': 9,
        '6.2 or 3 times a month': 6,
        '4.Once a week': 4,
        '3.2 or 3 times a week': 3,
        '8.Almost Never, sporadic': 8,
        '1.Almost every day': 1,
        '2.4 or more times a week': 2,
        '7.Once a month': 7,
        '5.4 or more times a month': 5
    },
    'rsocact_m_12': {
        '9.Never': 9,
        '1.Almost every day': 1,
        '2.4 or more times a week': 2,
        '3.2 or 3 times a week': 3,
        '4.Once a week': 4,
        '8.Almost Never, sporadic': 8,
        '5.2 or 3 times a month': 5,
        '6.Once a month': 6,
        '7.4 or more times a month': 7
    },
    'rrelgwk_12': {
        '0.No': 0,
        '1.Yes': 1
    },
    'a34_12': {
        'No 2': 0,
        'Yes 1': 1
    },
    'j11_12': {
        'Wood, mosaic, or other covering 1': 1,
        'Concrete 2': 2,
        'Mud 3': 3
    }
    }


In [140]:
# Apply mappings only to the columns that exist in the DataFrame
for column, mapping in mappings.items():
    if column in combined_data_imputed.columns:
        combined_data_imputed[column] = combined_data_imputed[column].map(mapping)
    else:
        print(f"Column {column} not found in the DataFrame, skipping mapping.")
        
# Check the updated DataFrame
combined_data_imputed.head()


Unnamed: 0,n_mar_03,migration_03,adl_dress_03,adl_walk_03,adl_bath_03,adl_eat_03,adl_bed_03,adl_toilet_03,n_adl_03,iadl_money_03,...,rjlocc_m_12,rjobend_reason_12,rrelgimp_12,rrfcntx_m_12,rsocact_m_12,rrelgwk_12,a22_12,a33b_12,a34_12,j11_12
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,6,8.Other,2,9,9.0,0,Agriculture/ Animal breeding 01,Neither 3,0,2
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,18,8.Other,1,9,1.0,0,Agriculture/ Animal breeding 01,Neither 3,0,2
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8,8.Other,2,6,2.0,0,Agriculture/ Animal breeding 01,Neither 3,0,1
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8,8.Other,2,6,2.0,0,Agriculture/ Animal breeding 01,Neither 3,0,1
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8,8.Other,1,4,9.0,1,Agriculture/ Animal breeding 01,Neither 3,0,2


In [141]:
combined_data_imputed.dtypes.unique()

array([dtype('float64'), dtype('O'), dtype('int64')], dtype=object)

In [142]:
combined_data_imputed.drop(columns = 'uid', inplace=True)

In [143]:
combined_data_imputed

Unnamed: 0,n_mar_03,migration_03,adl_dress_03,adl_walk_03,adl_bath_03,adl_eat_03,adl_bed_03,adl_toilet_03,n_adl_03,iadl_money_03,...,rjlocc_m_12,rjobend_reason_12,rrelgimp_12,rrfcntx_m_12,rsocact_m_12,rrelgwk_12,a22_12,a33b_12,a34_12,j11_12
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,6,8.Other,2,9,9.0,0,Agriculture/ Animal breeding 01,Neither 3,0,2
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,18,8.Other,1,9,1.0,0,Agriculture/ Animal breeding 01,Neither 3,0,2
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8,8.Other,2,6,2.0,0,Agriculture/ Animal breeding 01,Neither 3,0,1
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8,8.Other,2,6,2.0,0,Agriculture/ Animal breeding 01,Neither 3,0,1
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8,8.Other,1,4,9.0,1,Agriculture/ Animal breeding 01,Neither 3,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5157,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8,8.Other,1,9,,0,Agriculture/ Animal breeding 01,Neither 3,0,2
5158,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8,8.Other,1,3,9.0,0,Agriculture/ Animal breeding 01,Neither 3,0,2
5159,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,16,5.Family reason,1,4,9.0,0,Agriculture/ Animal breeding 01,Neither 3,0,1
5160,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8,8.Other,1,9,3.0,0,Agriculture/ Animal breeding 01,Neither 3,0,1


In [144]:

unique_values = combined_data_imputed.apply(lambda x: x.unique())


for col, unique_vals in unique_values.items():
    print(f"Column: {col} - Unique values: {unique_vals}")


Column: n_mar_03 - Unique values: [1. 2. 3. 0. 4. 5.]
Column: migration_03 - Unique values: [0. 1.]
Column: adl_dress_03 - Unique values: [0. 1.]
Column: adl_walk_03 - Unique values: [0. 1.]
Column: adl_bath_03 - Unique values: [0. 1.]
Column: adl_eat_03 - Unique values: [0. 1.]
Column: adl_bed_03 - Unique values: [0. 1.]
Column: adl_toilet_03 - Unique values: [0. 1.]
Column: n_adl_03 - Unique values: [0. 1. 2. 3. 5. 4.]
Column: iadl_money_03 - Unique values: [0. 1.]
Column: iadl_meds_03 - Unique values: [0. 1.]
Column: iadl_shop_03 - Unique values: [0. 1.]
Column: iadl_meals_03 - Unique values: [0. 1.]
Column: n_iadl_03 - Unique values: [0. 1. 3. 2. 4.]
Column: depressed_03 - Unique values: [0. 1.]
Column: hard_03 - Unique values: [0. 1.]
Column: restless_03 - Unique values: [0. 1.]
Column: happy_03 - Unique values: [1. 0.]
Column: lonely_03 - Unique values: [0. 1.]
Column: enjoy_03 - Unique values: [1. 0.]
Column: sad_03 - Unique values: [0. 1.]
Column: tired_03 - Unique values: [1. 

In [145]:
combined_data_imputed

Unnamed: 0,n_mar_03,migration_03,adl_dress_03,adl_walk_03,adl_bath_03,adl_eat_03,adl_bed_03,adl_toilet_03,n_adl_03,iadl_money_03,...,rjlocc_m_12,rjobend_reason_12,rrelgimp_12,rrfcntx_m_12,rsocact_m_12,rrelgwk_12,a22_12,a33b_12,a34_12,j11_12
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,6,8.Other,2,9,9.0,0,Agriculture/ Animal breeding 01,Neither 3,0,2
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,18,8.Other,1,9,1.0,0,Agriculture/ Animal breeding 01,Neither 3,0,2
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8,8.Other,2,6,2.0,0,Agriculture/ Animal breeding 01,Neither 3,0,1
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8,8.Other,2,6,2.0,0,Agriculture/ Animal breeding 01,Neither 3,0,1
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8,8.Other,1,4,9.0,1,Agriculture/ Animal breeding 01,Neither 3,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5157,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8,8.Other,1,9,,0,Agriculture/ Animal breeding 01,Neither 3,0,2
5158,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8,8.Other,1,3,9.0,0,Agriculture/ Animal breeding 01,Neither 3,0,2
5159,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,16,5.Family reason,1,4,9.0,0,Agriculture/ Animal breeding 01,Neither 3,0,1
5160,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8,8.Other,1,9,3.0,0,Agriculture/ Animal breeding 01,Neither 3,0,1


In [146]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

# Regular expression pattern to match columns with numbers and letters
pattern = r'\d+\.\s*[a-zA-Z]+'

# Step 1: Identify ordinal columns (columns with both numbers and letters)
ordinal_columns = [col for col in combined_data_imputed.columns if combined_data_imputed[col].astype(str).str.contains(pattern).any()]

# Step 2: Filter the DataFrame to select only the ordinal columns
ordinal_data = combined_data_imputed[ordinal_columns]

# Step 3: Ensure consistent data types (convert all to string and handle missing values)
for col in ordinal_columns:
    # Convert all values to string (to avoid mixing numbers and strings)
    combined_data_imputed[col] = combined_data_imputed[col].astype(str)

    # Handle missing values (replace NaN with a placeholder like 'Unknown')
    combined_data_imputed[col] = combined_data_imputed[col].fillna('Unknown')

# Step 4: Apply encoding to the ordinal columns using OrdinalEncoder
encoder = OrdinalEncoder()

# Iterate through the filtered columns and apply encoding
for col in ordinal_columns:
    # Encode the column and assign back to the original DataFrame
    combined_data_imputed[col] = encoder.fit_transform(combined_data_imputed[[col]])

# Display the first few rows of the encoded DataFrame
print(combined_data_imputed[ordinal_columns].head())



   bmi_03  decis_famil_03  rjlocc_m_03  rjobend_reason_03  rjobend_reason_12
0     2.0             1.0         16.0                3.0                4.0
1     2.0             1.0         16.0                3.0                4.0
2     2.0             1.0         16.0                3.0                4.0
3     2.0             1.0         16.0                3.0                4.0
4     2.0             1.0         16.0                2.0                4.0


In [147]:
# Convert the first few rows of the DataFrame into a list
combined_data_imputed.head().values.tolist()


[[1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  1.0,
  0.0,
  1.0,
  0.0,
  3.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  1.0,
  1.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  1.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  1.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  1.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  1.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  1.0,
  1.0,
  0.0,
  44.0,
  1995.0,
  0.0,
  0.0,
  30000.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  66.0,
  1998.0,
  0.0,
  0.0,
  140000.0,
  10000.0,
  0.0,
  0.0,
  10000.0,
  0.0,
  0.0,
  1972.0,

In [148]:
# Find columns that are not numeric
non_numeric_columns = combined_data_imputed.select_dtypes(exclude=['number']).columns
print(non_numeric_columns)


Index(['a22_12', 'a33b_12'], dtype='object')


In [149]:
combined_data_imputed.drop(columns=['a22_12', 'a33b_12','rsocact_m_12'], inplace=True, axis = 1)

In [150]:
combined_data_imputed.head().values.tolist()

[[1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  1.0,
  0.0,
  1.0,
  0.0,
  3.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  1.0,
  1.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  1.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  1.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  1.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  1.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  1.0,
  1.0,
  0.0,
  44.0,
  1995.0,
  0.0,
  0.0,
  30000.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  66.0,
  1998.0,
  0.0,
  0.0,
  140000.0,
  10000.0,
  0.0,
  0.0,
  10000.0,
  0.0,
  0.0,
  1972.0,

In [151]:
print(combined_data_imputed.isnull().sum().sort_values(ascending=False))

n_mar_03         0
tv_12            0
rjob_hrswk_03    0
rjob_end_03      0
rearnings_03     0
                ..
iadl_shop_12     0
iadl_meals_12    0
n_iadl_12        0
depressed_12     0
j11_12           0
Length: 182, dtype: int64


In [152]:
X = combined_data_imputed.drop(columns=['composite_score'])  # Drop target and any non-feature columns
y = combined_data_imputed['composite_score']

In [153]:
X

Unnamed: 0,n_mar_03,migration_03,adl_dress_03,adl_walk_03,adl_bath_03,adl_eat_03,adl_bed_03,adl_toilet_03,n_adl_03,iadl_money_03,...,rjobend_reason_03,rrelgimp_03,sgender_12,rjlocc_m_12,rjobend_reason_12,rrelgimp_12,rrfcntx_m_12,rrelgwk_12,a34_12,j11_12
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,1,2,6,4.0,2,9,0,0,2
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,1,2,18,4.0,1,9,0,0,2
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,1,1,8,4.0,2,6,0,0,1
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,1,1,8,4.0,2,6,0,0,1
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,1,2,8,4.0,1,4,1,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5157,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,1,1,8,4.0,1,9,0,0,2
5158,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,1,1,8,4.0,1,3,0,0,2
5159,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,1,2,16,3.0,1,4,0,0,1
5160,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,1,1,8,4.0,1,9,0,0,1


In [155]:
from sklearn.feature_selection import SelectKBest, mutual_info_regression

In [157]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_regression, f_regression
from sklearn.model_selection import train_test_split
from collections import Counter

# Scale data using MinMaxScaler to keep everything in the range [0, 1] for chi2
min_max_scaler = MinMaxScaler()
X_min_max_scaled = min_max_scaler.fit_transform(X)

# Apply SelectKBest with chi2 on the scaled data
chi2_selector = SelectKBest(chi2, k=10)
X_chi2 = chi2_selector.fit_transform(X_min_max_scaled, y)
chi2_selected_features = X.columns[chi2_selector.get_support()].tolist()

# Apply SelectKBest with mutual_info_regression on the scaled data
mi_selector = SelectKBest(mutual_info_regression, k=10)
X_mi = mi_selector.fit_transform(X_min_max_scaled, y)
mi_selected_features = X.columns[mi_selector.get_support()].tolist()

# Apply SelectKBest with f_regression on the scaled data
f_selector = SelectKBest(f_regression, k=10)
X_f = f_selector.fit_transform(X_min_max_scaled, y)
f_selected_features = X.columns[f_selector.get_support()].tolist()

# Combine selected features from all methods
all_selected_features = chi2_selected_features + mi_selected_features + f_selected_features

# Count the most common features
feature_counts = Counter(all_selected_features)
most_common_features = [feature for feature, count in feature_counts.most_common(10)]

print("Most Common 10 Selected Features:", most_common_features)



Most Common 10 Selected Features: ['edu_gru_12', 'age_03', 'edu_gru_03', 'age_12', 'n_living_child_12', 'rameduc_m', 'adl_walk_03', 'iadl_money_03', 'adl_walk_12', 'iadl_money_12']


In [158]:
# Use these most common features for further modeling
X_selected = X[most_common_features]
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

In [159]:
X_train

Unnamed: 0,edu_gru_12,age_03,edu_gru_03,age_12,n_living_child_12,rameduc_m,adl_walk_03,iadl_money_03,adl_walk_12,iadl_money_12
4669,1,2,1,3,4,2,0.0,0.0,0.0,0.0
4709,0,2,0,3,4,1,0.0,0.0,0.0,1.0
4183,1,3,1,4,1,1,0.0,0.0,0.0,0.0
1670,1,3,1,4,4,1,0.0,0.0,0.0,0.0
99,1,1,1,1,3,1,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
4426,1,1,1,2,4,1,0.0,0.0,0.0,0.0
466,2,1,1,1,1,1,0.0,0.0,0.0,0.0
3092,2,3,2,4,4,2,0.0,0.0,0.0,0.0
3772,1,1,1,1,2,1,0.0,0.0,0.0,1.0


In [160]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import cross_val_score, TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, mean_absolute_error, mean_squared_error, r2_score
import pandas as pd
import numpy as np
from prophet import Prophet

In [161]:
models_new_results = []

In [162]:
# Linear Regression
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
lr_y_pred = lr_model.predict(X_test)

lr_results = {
    'Model': 'Linear Regression',
    'MAE': mean_absolute_error(y_test, lr_y_pred),
    'MSE': mean_squared_error(y_test, lr_y_pred),
    'RMSE': np.sqrt(mean_squared_error(y_test, lr_y_pred)),
    'R-squared': r2_score(y_test, lr_y_pred)
}
models_new_results.append(lr_results)

In [164]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

In [167]:
# Define the cross-validation strategy
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Ridge Regression
ridge_model = Ridge()
ridge_param_grid = {'alpha': [0.1, 1.0, 10]}
ridge_grid_search = GridSearchCV(ridge_model, ridge_param_grid, cv=cv, scoring='neg_mean_squared_error', n_jobs=-1)

# Fit the model
ridge_grid_search.fit(X_train, y_train)

# Get the best model from GridSearchCV
ridge_best_model = ridge_grid_search.best_estimator_

# Predict on the test set
ridge_y_pred = ridge_best_model.predict(X_test)

# Calculate performance metrics
ridge_results = {
    'Model': 'Ridge',
    'Best Parameters': ridge_grid_search.best_params_,
    'MAE': mean_absolute_error(y_test, ridge_y_pred),
    'MSE': mean_squared_error(y_test, ridge_y_pred),
    'RMSE': np.sqrt(mean_squared_error(y_test, ridge_y_pred)),
    'R-squared': r2_score(y_test, ridge_y_pred)
}

# Append the results to the list
models_new_results.append(ridge_results)

# Optionally, print the results
print(ridge_results)

{'Model': 'Ridge', 'Best Parameters': {'alpha': 10}, 'MAE': 32.981869926396165, 'MSE': 1745.5182216459632, 'RMSE': 41.77939948881462, 'R-squared': 0.40514972920686465}


In [168]:
# Lasso Regression
lasso_model = Lasso()
lasso_param_grid = {'alpha': [0.1, 1.0, 10]}
lasso_grid_search = GridSearchCV(lasso_model, lasso_param_grid, cv=cv, scoring='neg_mean_squared_error', n_jobs=-1)
lasso_grid_search.fit(X_train, y_train)
lasso_best_model = lasso_grid_search.best_estimator_
lasso_y_pred = lasso_best_model.predict(X_test)

lasso_results = {
    'Model': 'Lasso',
    'Best Parameters': lasso_grid_search.best_params_,
    'MAE': mean_absolute_error(y_test, lasso_y_pred),
    'MSE': mean_squared_error(y_test, lasso_y_pred),
    'RMSE': np.sqrt(mean_squared_error(y_test, lasso_y_pred)),
    'R-squared': r2_score(y_test, lasso_y_pred)
}
models_new_results.append(lasso_results)

In [169]:
# ElasticNet Regression
en_model = ElasticNet()
en_param_grid = {'alpha': [0.1, 1.0, 10], 'l1_ratio': [0.2, 0.5, 0.8]}
en_grid_search = GridSearchCV(en_model, en_param_grid, cv=cv, scoring='neg_mean_squared_error', n_jobs=-1)
en_grid_search.fit(X_train, y_train)
en_best_model = en_grid_search.best_estimator_
en_y_pred = en_best_model.predict(X_test)

en_results = {
    'Model': 'ElasticNet',
    'Best Parameters': en_grid_search.best_params_,
    'MAE': mean_absolute_error(y_test, en_y_pred),
    'MSE': mean_squared_error(y_test, en_y_pred),
    'RMSE': np.sqrt(mean_squared_error(y_test, en_y_pred)),
    'R-squared': r2_score(y_test, en_y_pred)
}
models_new_results.append(en_results)

In [171]:
# Random Forest Regressor
rf_model = RandomForestRegressor()
rf_param_grid = {'n_estimators': [50, 100], 'max_depth': [5, 10, None]}
rf_grid_search = GridSearchCV(rf_model, rf_param_grid, cv=cv, scoring='neg_mean_squared_error', n_jobs=-1)
rf_grid_search.fit(X_train, y_train)
rf_best_model = rf_grid_search.best_estimator_
rf_y_pred = rf_best_model.predict(X_test)

rf_results = {
    'Model': 'Random Forest',
    'Best Parameters': rf_grid_search.best_params_,
    'MAE': mean_absolute_error(y_test, rf_y_pred),
    'MSE': mean_squared_error(y_test, rf_y_pred),
    'RMSE': np.sqrt(mean_squared_error(y_test, rf_y_pred)),
    'R-squared': r2_score(y_test, rf_y_pred)
}
models_new_results.append(rf_results)

In [264]:
# Gradient Boosting Regressor
gb_model = GradientBoostingRegressor()
gb_param_grid = {'n_estimators': [50, 100], 'learning_rate': [0.01, 0.1]}
gb_grid_search = GridSearchCV(gb_model, gb_param_grid, cv=cv, scoring='neg_mean_squared_error', n_jobs=-1)
gb_grid_search.fit(X_train, y_train)
gb_best_model = gb_grid_search.best_estimator_
gb_y_pred = gb_best_model.predict(X_test)

gb_results = {
    'Model': 'Gradient Boosting',
    'Best Parameters': gb_grid_search.best_params_,
    'MAE': mean_absolute_error(y_test, gb_y_pred),
    'MSE': mean_squared_error(y_test, gb_y_pred),
    'RMSE': np.sqrt(mean_squared_error(y_test, gb_y_pred)),
    'R-squared': r2_score(y_test, gb_y_pred)
}
models_new_results.append(gb_results)

In [265]:
# XGBoost Regression
xgb_model = XGBRegressor()
xgb_param_grid = {'n_estimators': [50, 100], 'learning_rate': [0.05, 0.1]}
xgb_grid_search = GridSearchCV(xgb_model, xgb_param_grid, cv=cv, scoring='neg_mean_squared_error', n_jobs=-1)
xgb_grid_search.fit(X_train, y_train)
xgb_best_model = xgb_grid_search.best_estimator_
xgb_y_pred = xgb_best_model.predict(X_test)

xgb_results = {
    'Model': 'XGBoost',
    'Best Parameters': xgb_grid_search.best_params_,
    'MAE': mean_absolute_error(y_test, xgb_y_pred),
    'MSE': mean_squared_error(y_test, xgb_y_pred),
    'RMSE': np.sqrt(mean_squared_error(y_test, xgb_y_pred)),
    'R-squared': r2_score(y_test, xgb_y_pred)
}
models_new_results.append(xgb_results)

In [266]:
from lightgbm import LGBMRegressor

In [267]:
# LightGBM Regression
lgbm_model = LGBMRegressor()
lgbm_param_grid = {'n_estimators': [50, 100], 'learning_rate': [0.01, 0.1]}
lgbm_grid_search = GridSearchCV(lgbm_model, lgbm_param_grid, cv=cv, scoring='neg_mean_squared_error', n_jobs=-1)
lgbm_grid_search.fit(X_train, y_train)
lgbm_best_model = lgbm_grid_search.best_estimator_
lgbm_y_pred = lgbm_best_model.predict(X_test)

lgbm_results = {
    'Model': 'LightGBM',
    'Best Parameters': lgbm_grid_search.best_params_,
    'MAE': mean_absolute_error(y_test, lgbm_y_pred),
    'MSE': mean_squared_error(y_test, lgbm_y_pred),
    'RMSE': np.sqrt(mean_squared_error(y_test, lgbm_y_pred)),
    'R-squared': r2_score(y_test, lgbm_y_pred)
}
models_new_results.append(lgbm_results)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000866 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 32
[LightGBM] [Info] Number of data points in the train set: 8259, number of used features: 10
[LightGBM] [Info] Start training from score 157.385882


In [268]:
# CatBoost Regression
catboost_model = CatBoostRegressor(verbose=0)
catboost_model.fit(X_train, y_train)
catboost_y_pred = catboost_model.predict(X_test)

catboost_results = {
    'Model': 'CatBoost',
    'MAE': mean_absolute_error(y_test, catboost_y_pred),
    'MSE': mean_squared_error(y_test, catboost_y_pred),
    'RMSE': np.sqrt(mean_squared_error(y_test, catboost_y_pred)),
    'R-squared': r2_score(y_test, catboost_y_pred)
}
models_new_results.append(catboost_results)

In [172]:
models_new_results

[{'Model': 'Linear Regression',
  'MAE': 33.02667097485728,
  'MSE': 1749.332552258816,
  'RMSE': 41.82502303954914,
  'R-squared': 0.4038498541498112},
 {'Model': 'Ridge',
  'Best Parameters': {'alpha': 10},
  'MAE': 32.981869926396165,
  'MSE': 1745.5182216459632,
  'RMSE': 41.77939948881462,
  'R-squared': 0.40514972920686465},
 {'Model': 'Lasso',
  'Best Parameters': {'alpha': 0.1},
  'MAE': 32.961992895233124,
  'MSE': 1744.682008896347,
  'RMSE': 41.7693908130864,
  'R-squared': 0.4054346998100823},
 {'Model': 'ElasticNet',
  'Best Parameters': {'alpha': 0.1, 'l1_ratio': 0.8},
  'MAE': 33.02419762099,
  'MSE': 1745.5894382056972,
  'RMSE': 41.78025177288544,
  'R-squared': 0.4051254595147369},
 {'Model': 'Random Forest',
  'Best Parameters': {'max_depth': 5, 'n_estimators': 100},
  'MAE': 32.806171430221035,
  'MSE': 1670.9827353183682,
  'RMSE': 40.87765569743901,
  'R-squared': 0.4305504690420864}]

In [173]:
import pandas as pd

# Ensure models_new_results is a DataFrame
models_new_results = pd.DataFrame(models_new_results)  # If models_new_results was initially a list of dictionaries


In [174]:
# Group by model name and select the row with the best R-squared or lowest MAE
best_results = models_new_results.loc[models_new_results.groupby('Model')['RMSE'].idxmax()]

print(best_results)


               Model        MAE          MSE       RMSE  R-squared  \
3         ElasticNet  33.024198  1745.589438  41.780252   0.405125   
2              Lasso  32.961993  1744.682009  41.769391   0.405435   
0  Linear Regression  33.026671  1749.332552  41.825023   0.403850   
4      Random Forest  32.806171  1670.982735  40.877656   0.430550   
1              Ridge  32.981870  1745.518222  41.779399   0.405150   

                         Best Parameters  
3        {'alpha': 0.1, 'l1_ratio': 0.8}  
2                         {'alpha': 0.1}  
0                                    NaN  
4  {'max_depth': 5, 'n_estimators': 100}  
1                          {'alpha': 10}  


In [175]:
import pickle

In [177]:


# Save the trained model to a file
with open('final_ridge_model.pkl', 'wb') as file:
    pickle.dump(ridge_best_model, file)


In [279]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import joblib

In [178]:
X_train.columns

Index(['edu_gru_12', 'age_03', 'edu_gru_03', 'age_12', 'n_living_child_12',
       'rameduc_m', 'adl_walk_03', 'iadl_money_03', 'adl_walk_12',
       'iadl_money_12'],
      dtype='object')

In [179]:
feature_names = list(ridge_best_model.feature_names_in_)
print(feature_names)


['edu_gru_12', 'age_03', 'edu_gru_03', 'age_12', 'n_living_child_12', 'rameduc_m', 'adl_walk_03', 'iadl_money_03', 'adl_walk_12', 'iadl_money_12']
