In [None]:
import time
# Wall-clock timestamp captured when this first cell runs. Presumably read at the
# end of the notebook to report total runtime -- the consumer is not visible here.
start_time = time.time()

In [None]:
# Data manipulation
import pandas as pd
import numpy as np

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

# Custom transformers and base classes
from sklearn.base import BaseEstimator, TransformerMixin

# Preprocessing and feature engineering
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Imputation
from sklearn.experimental import enable_iterative_imputer  # Enable experimental features
from sklearn.impute import IterativeImputer, SimpleImputer

# Feature selection
from sklearn.feature_selection import SelectKBest, f_regression

# Model selection and evaluation
from sklearn.model_selection import GridSearchCV, GroupKFold, train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVR


# Machine learning models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import (
    StackingRegressor,
    RandomForestRegressor,
    GradientBoostingRegressor,
    HistGradientBoostingRegressor
)
import xgboost as xgb
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
import lightgbm as lgb

# Model interpretation
import shap

# Visualization 
import matplotlib.pyplot as plt
import seaborn as sns

## Loading data and merging features and labels

In [None]:
# Read the label table and the feature table from the working directory
train_labels = pd.read_csv('train_labels.csv')
train_features = pd.read_csv('train_features.csv')

# Left join on 'uid': every label row is kept and gains the feature columns of
# the matching uid (feature columns are NaN when no uid matches)
data = pd.merge(train_labels, train_features, how='left', on='uid')

# Preview the first rows of the merged frame
data.head()


Unnamed: 0,uid,year,composite_score,age_03,urban_03,married_03,n_mar_03,edu_gru_03,n_living_child_03,migration_03,...,rrelgimp_12,rrfcntx_m_12,rsocact_m_12,rrelgwk_12,a16a_12,a21_12,a22_12,a33b_12,a34_12,j11_12
0,aace,2021,175,,,,,,,,...,2.somewhat important,9.Never,9.Never,0.No,,,,,,Concrete 2
1,aanz,2021,206,,,,,,,,...,1.very important,9.Never,1.Almost every day,0.No,,,,,,Concrete 2
2,aape,2016,161,,,,,,,,...,2.somewhat important,6.2 or 3 times a month,2.4 or more times a week,0.No,,,,,,"Wood, mosaic, or other covering 1"
3,aape,2021,144,,,,,,,,...,2.somewhat important,6.2 or 3 times a month,2.4 or more times a week,0.No,,,,,,"Wood, mosaic, or other covering 1"
4,aard,2021,104,1. 50–59,"1. 100,000+",3. Widowed,1.0,3. 7–9 years,1. 1 or 2,0.0,...,1.very important,4.Once a week,9.Never,1.Yes,,,,,No 2,Concrete 2


## Dropping columns with excessive missing data
Defined as more than 50% of values missing

In [None]:
# Remove every column whose fraction of missing values is above the threshold
missing_threshold = 0.5

# Per-column share of null entries (0.0 = complete, 1.0 = entirely missing)
missing_percentages = data.isna().mean()

too_sparse = missing_percentages.gt(missing_threshold)
cols_to_drop = missing_percentages.loc[too_sparse].index.tolist()

# errors='ignore' keeps this cell re-runnable after the columns are already gone
data = data.drop(columns=cols_to_drop, errors='ignore')

## Identifying numerical and categorical columns

In [None]:
# Partition the remaining columns by storage dtype:
#   int64 / float64 -> numerical, object -> categorical.
# Note that identifier-like and target columns are not filtered out at this point.
numeric_frame = data.select_dtypes(include=['int64', 'float64'])
object_frame = data.select_dtypes(include=['object'])

numerical_cols = numeric_frame.columns.tolist()
categorical_cols = object_frame.columns.tolist()

## Separating ordinal and nominal categorical columns

In [None]:
# Categorical columns treated as ordered, chosen by hand from the data descriptions
ordinal_variables = [
    # 2003 wave
    'age_03',              # Binned age group
    'edu_gru_03',          # Binned education level
    'n_living_child_03',   # Binned number of living children
    'glob_hlth_03',        # Self-reported global health
    # 2012 wave
    'age_12',              # Binned age group
    'edu_gru_12',          # Binned education level
    'n_living_child_12',   # Binned number of living children
    'glob_hlth_12',        # Self-reported global health
    'bmi_12',              # Binned body mass index
    'decis_famil_12',      # Weight in family decisions
    'decis_personal_12',   # Weight over personal decisions
    # Agreement with life satisfaction statements
    'satis_ideal_12',
    'satis_excel_12',
    'satis_fine_12',
    'cosas_imp_12',
    'wouldnt_change_12',
    'memory_12',           # Self-reported memory
    # Parental education
    'rameduc_m',           # Mother's education level
    'rafeduc_m',           # Father's education level
    # Religion and social contact
    'rrelgimp_03',         # Importance of religion
    'rrelgimp_12',
    'rrfcntx_m_12',        # Frequency of seeing friends/relatives
    'rsocact_m_12',        # Frequency of social activities
    'rrelgwk_12',          # Participation in weekly religious services
    'a34_12'               # English proficiency
]

# Every other object-dtype column is nominal; the 'uid' identifier is not a feature
nominal_variables = [
    col for col in categorical_cols
    if col != 'uid' and col not in ordinal_variables
]

# Report both lists with their sizes
print(f"Ordinal variables ({len(ordinal_variables)}): {ordinal_variables}\n")
print(f"Nominal variables ({len(nominal_variables)}): {nominal_variables}\n")

Ordinal variables (25): ['age_03', 'edu_gru_03', 'n_living_child_03', 'glob_hlth_03', 'age_12', 'edu_gru_12', 'n_living_child_12', 'glob_hlth_12', 'bmi_12', 'decis_famil_12', 'decis_personal_12', 'satis_ideal_12', 'satis_excel_12', 'satis_fine_12', 'cosas_imp_12', 'wouldnt_change_12', 'memory_12', 'rameduc_m', 'rafeduc_m', 'rrelgimp_03', 'rrelgimp_12', 'rrfcntx_m_12', 'rsocact_m_12', 'rrelgwk_12', 'a34_12']

Nominal variables (11): ['urban_03', 'married_03', 'employment_03', 'urban_12', 'married_12', 'employment_12', 'ragender', 'sgender_03', 'sgender_12', 'rjlocc_m_12', 'j11_12']



### Creating a custom mapping for ordinal variables

In [None]:
# Mappings for ordinal variables
#
# Each dictionary translates the raw survey label (the exact string stored in the
# CSV, including its numeric prefix and punctuation) to an integer rank.
# A label that is not a key of the relevant dictionary becomes NaN when the
# dictionary is applied with Series.map, so the keys must match the data exactly.

# Age bands: rank rises with age
age_mapping = {
    '0. 49 or younger': 0,
    '1. 50–59': 1,
    '2. 60–69': 2,
    '3. 70–79': 3,
    '4. 80+': 4,
}

# Years of schooling: rank rises with education
education_mapping = {
    '0. No education': 0,
    '1. 1–5 years': 1,
    '2. 6 years': 2,
    '3. 7–9 years': 3,
    '4. 10+ years': 4,
}

# Number of living children: rank rises with the count
n_living_child_mapping = {
    '0. No children': 0,
    '1. 1 or 2': 1,
    '2. 3 or 4': 2,
    '3. 5 or 6': 3,
    '4. 7+': 4,
}

# Self-reported health: the survey code is reversed so that 5 = Excellent, 1 = Poor
glob_health_mapping = {
    '1. Excellent': 5,
    '2. Very good': 4,
    '3. Good': 3,
    '4. Fair': 2,
    '5. Poor': 1,
}

# BMI category: rank follows the survey code (1 = Underweight ... 5 = Morbidly obese)
bmi_mapping = {
    '1. Underweight': 1,
    '2. Normal weight': 2,
    '3. Overweight': 3,
    '4. Obese': 4,
    '5. Morbidly obese': 5,
}

# Weight in family decisions: rank follows the survey code (1 = Respondent ... 3 = Spouse)
decis_famil_mapping = {
    '1. Respondent': 1,
    '2. Approximately equal weight': 2,
    '3. Spouse': 3,
}

# Weight over personal decisions: reversed so that 3 = A lot, 1 = None
decis_personal_mapping = {
    '1. A lot': 3,
    '2. A little': 2,
    '3. None': 1
}

# Agreement scale shared by the life-satisfaction items: reversed so that 3 = Agrees
agreement_mapping = {
    '1. Agrees': 3,
    '2. Neither agrees nor disagrees': 2,
    '3. Disagrees': 1,
}

# Self-reported memory: same labels and ranks as glob_health_mapping
memory_mapping = {
    '1. Excellent': 5,
    '2. Very good': 4,
    '3. Good': 3,
    '4. Fair': 2,
    '5. Poor': 1,
}

# Parental education: note these labels have no space after the dot
parent_education_mapping = {
    '1.None': 1,
    '2.Some primary': 2,
    '3.Primary': 3,
    '4.More than primary': 4,
}

# Importance of religion: reversed so that 3 = very important
religion_importance_mapping = {
    '1.very important': 3,
    '2.somewhat important': 2,
    '3.not important': 1,
}

# Contact / activity frequency: reversed so that 9 = almost every day, 1 = never
frequency_mapping = {
    '1.Almost every day': 9,
    '2.4 or more times a week': 8,
    '3.2 or 3 times a week': 7,
    '4.Once a week': 6,
    '5.4 or more times a month': 5,
    '6.2 or 3 times a month': 4,
    '7.Once a month': 3,
    '8.Almost Never, sporadic': 2,
    '9.Never': 1,
}

# Binary yes/no item encoded as 1/0
religious_services_mapping = {
    '1.Yes': 1,
    '0.No': 0,
}

# Binary item whose labels carry the code as a suffix ('Yes 1' / 'No 2'); encoded as 1/0
english_proficiency_mapping = {
    'Yes 1': 1,
    'No 2': 0,
}

# Compile all mappings into a dictionary for easy access
# Keys are full column names (wave suffix included), not base feature names.
ordinal_mappings = {
    'age_03': age_mapping,
    'age_12': age_mapping,
    'edu_gru_03': education_mapping,
    'edu_gru_12': education_mapping,
    'n_living_child_03': n_living_child_mapping,
    'n_living_child_12': n_living_child_mapping,
    'glob_hlth_03': glob_health_mapping,
    'glob_hlth_12': glob_health_mapping,
    'bmi_12': bmi_mapping,
    'decis_famil_12': decis_famil_mapping,
    'decis_personal_12': decis_personal_mapping,
    'satis_ideal_12': agreement_mapping,
    'satis_excel_12': agreement_mapping,
    'satis_fine_12': agreement_mapping,
    'cosas_imp_12': agreement_mapping,
    'wouldnt_change_12': agreement_mapping,
    'memory_12': memory_mapping,
    'rameduc_m': parent_education_mapping,
    'rafeduc_m': parent_education_mapping,
    'rrelgimp_03': religion_importance_mapping,
    'rrelgimp_12': religion_importance_mapping,
    'rrfcntx_m_12': frequency_mapping,
    'rsocact_m_12': frequency_mapping,
    'rrelgwk_12': religious_services_mapping,
    'a34_12': english_proficiency_mapping,
}


## Feature engineering

### 1. Creating Temporal Features:
- Change Over Time: For individuals with data from both 2003 and 2012, calculate the change or rate of change in features over time.
- Duration Since Last Measurement: Include the time gap between the last available feature data and the target year.

#### Identify Features for Change Calculation
We'll focus on numerical and ordinal variables suitable for calculating changes.

In [None]:
# Base names (wave suffix stripped) of features that exist for BOTH the 2003 and
# the 2012 wave in the current frame.
common_features = {col[:-3] for col in data.columns if col.endswith('_03')} & \
                  {col[:-3] for col in data.columns if col.endswith('_12')}

# Keep the base features for which at least one of the two wave columns is numeric.
# Iterate in sorted order: a set of strings is traversed in an order that changes
# from one kernel start to the next (string hash randomisation), which would make
# the engineered column order -- and this cell's output -- non-reproducible.
numerical_common_features = [
    feature for feature in sorted(common_features)
    if feature + '_03' in numerical_cols or feature + '_12' in numerical_cols
]
# No further "both columns present" filter is needed: membership in
# common_features already guarantees that feature_03 and feature_12 exist.

# Names of the columns the temporal feature engineering step will create
temporal_features = [feature + '_change' for feature in numerical_common_features] + ['time_gap']

# Show both lists for verification
temporal_features, numerical_common_features


(['sad_change',
  'cancer_change',
  'depressed_change',
  'hypertension_change',
  'iadl_shop_change',
  'hard_change',
  'happy_change',
  'iadl_money_change',
  'out_proc_change',
  'restless_change',
  'hincome_change',
  'adl_eat_change',
  'rinc_pension_change',
  'arthritis_change',
  'rearnings_change',
  'adl_dress_change',
  'visit_med_change',
  'iadl_meals_change',
  'cesd_depressed_change',
  'insur_other_change',
  'exer_3xwk_change',
  'test_pres_change',
  'stroke_change',
  'adl_bath_change',
  'decis_personal_change',
  'visit_dental_change',
  'insured_change',
  'adl_walk_change',
  'pem_def_mar_change',
  'diabetes_change',
  'test_diab_change',
  'imss_change',
  'alcohol_change',
  'migration_change',
  'iadl_meds_change',
  'test_tuber_change',
  'hosp_change',
  'adl_toilet_change',
  'adl_bed_change',
  'n_depr_change',
  'hinc_cap_change',
  'n_adl_change',
  'tired_change',
  'hinc_assets_change',
  'sinc_pension_change',
  'issste_change',
  'n_iadl_change'

### Creating a custom transformer that builds the temporal features

In [None]:
class TemporalFeatureEngineer(BaseEstimator, TransformerMixin):
    """Add 2003 -> 2012 change columns and the gap between last survey wave and target year.

    Parameters
    ----------
    numerical_common_features : list of str
        Base feature names (no wave suffix). For each name ``f`` the columns
        ``f_03`` and ``f_12`` are converted to numbers and ``f_change`` is added.
    ordinal_mappings : dict
        Label -> rank dictionaries. A mapping is looked up first under the full
        column name (``'edu_gru_03'``) and then under the base name (``'edu_gru'``).

    Output of ``transform``
    -----------------------
    A copy of ``X`` with one ``<feature>_change`` column per listed feature
    (``f_12 - f_03``; NaN where either side is missing or not numeric) and a
    ``time_gap`` column (``year`` minus 2012 or 2003, whichever is the latest wave
    with a non-null age; NaN when neither age is present).

    Source columns are overwritten with their numeric form, except text columns
    that have no usable mapping and would become entirely NaN: those are left
    untouched so later encoding steps still see the original labels.
    """

    def __init__(self, numerical_common_features, ordinal_mappings):
        self.numerical_common_features = numerical_common_features
        self.ordinal_mappings = ordinal_mappings

    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned from the data."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the change columns and ``time_gap`` appended.

        Raises ``KeyError`` if ``X`` has no ``'year'`` column.
        """
        X = X.copy()

        # Decide which wave each row has BEFORE any column is re-encoded, so that
        # an age label missing from the mapping is not mistaken for "no data".
        last_feature_year = self._last_feature_year(X)

        for feature in self.numerical_common_features:
            col_03 = feature + '_03'
            col_12 = feature + '_12'
            change_col = feature + '_change'

            numeric = {}
            for col in (col_03, col_12):
                if col not in X.columns:
                    continue
                values, keep_original = self._as_numeric(X[col], col, feature)
                numeric[col] = values
                if not keep_original:
                    X[col] = values

            # Change: 2012 value - 2003 value (NaN column when a wave is absent)
            if col_03 in numeric and col_12 in numeric:
                X[change_col] = numeric[col_12] - numeric[col_03]
            else:
                X[change_col] = np.nan

        # Years between the latest available survey wave and the target year
        X['time_gap'] = X['year'] - last_feature_year

        return X

    def _as_numeric(self, series, column, base_feature):
        """Convert one wave column to numbers.

        Returns ``(numeric_series, keep_original)``. ``keep_original`` is True when
        the conversion would turn a column that holds data into all-NaN, i.e. the
        column is free text with no applicable mapping.
        """
        mappings = self.ordinal_mappings or {}
        # ordinal mappings are keyed by column name; the base name is a fallback
        mapping = mappings.get(column)
        if mapping is None:
            mapping = mappings.get(base_feature)

        converted = series
        # Only map when the column still holds raw labels; a column that was
        # already mapped would otherwise be turned into NaN on a second pass.
        if mapping and series.isin(list(mapping)).any():
            converted = series.map(mapping)

        converted = pd.to_numeric(converted, errors='coerce')
        wiped_out = bool(series.notna().any()) and not bool(converted.notna().any())
        return converted, wiped_out

    @staticmethod
    def _last_feature_year(X):
        """Vectorised form of ``get_last_feature_year``: one float per row of ``X``."""
        def observed(col):
            if col in X.columns:
                return X[col].notna().to_numpy()
            return np.zeros(len(X), dtype=bool)

        return np.where(
            observed('age_12'), 2012.0,
            np.where(observed('age_03'), 2003.0, np.nan),
        )

    @staticmethod
    def get_last_feature_year(row):
        """Latest wave (2012, else 2003) with a non-null age for one row, else NaN.

        Kept for callers that apply it row by row; ``transform`` uses the
        vectorised equivalent.
        """
        if not pd.isnull(row.get('age_12')):
            return 2012
        elif not pd.isnull(row.get('age_03')):
            return 2003
        else:
            return np.nan  # No data available


### More custom transformers for feature engineering based on domain knowledge

#### Education

In [None]:
# Education Progression
class EducationProgressionTransformer(BaseEstimator, TransformerMixin):
    """Add ``education_transition`` = ``edu_gru_12 - edu_gru_03``.

    The subtraction is applied to the columns as they are, so both must already
    hold numeric values when ``transform`` is called. If either column is absent
    the new column is filled with NaN.
    """

    def __init__(self):
        """No configuration."""

    def fit(self, X, y=None):
        """Stateless: returns the transformer unchanged."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the ``education_transition`` column added."""
        out = X.copy()
        if 'edu_gru_03' not in out.columns or 'edu_gru_12' not in out.columns:
            out['education_transition'] = np.nan
            return out
        level_then = out['edu_gru_03']
        level_now = out['edu_gru_12']
        out['education_transition'] = level_now - level_then
        return out

#### Marital status

In [None]:
# Marital status columns
married_cols_03 = [col for col in data.columns if 'married_03' in col]
married_cols_12 = [col for col in data.columns if 'married_12' in col]

In [None]:
# Marital status stability
class MaritalTransitionTransformer(BaseEstimator, TransformerMixin):
    """Flag rows whose marital-status columns differ between 2003 and 2012.

    Parameters
    ----------
    married_cols_03, married_cols_12 : list of str
        Marital-status columns of each wave.

    ``marital_transition`` is 1 where the row-wise sum over the 2003 columns is
    not equal to the row-wise sum over the 2012 columns, otherwise 0. When either
    list is empty the column is set to 0 for every row.
    """

    def __init__(self, married_cols_03, married_cols_12):
        self.married_cols_03 = married_cols_03
        self.married_cols_12 = married_cols_12

    def fit(self, X, y=None):
        """Stateless: returns the transformer unchanged."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the ``marital_transition`` column added."""
        out = X.copy()
        if not (self.married_cols_03 and self.married_cols_12):
            out['marital_transition'] = 0
            return out
        status_03 = out[self.married_cols_03].sum(axis=1)
        status_12 = out[self.married_cols_12].sum(axis=1)
        changed = status_03 != status_12
        out['marital_transition'] = changed.astype(int)
        return out


#### Chronic Illnesses

In [None]:
# Chronic illness columns
chronic_illness_cols_03 = ['hypertension_03', 'diabetes_03', 'resp_ill_03', 'arthritis_03', 'hrt_attack_03', 'stroke_03', 'cancer_03']
chronic_illness_cols_12 = ['hypertension_12', 'diabetes_12', 'resp_ill_12', 'arthritis_12', 'hrt_attack_12', 'stroke_12', 'cancer_12']

In [None]:
# Count of chronic illnesses
class ChronicIllnessTransformer(BaseEstimator, TransformerMixin):
    """Add a per-wave total over the chronic illness columns.

    Parameters
    ----------
    chronic_illness_cols_03, chronic_illness_cols_12 : list of str
        Columns summed row-wise into ``chronic_illness_count_03`` and
        ``chronic_illness_count_12``. All of them must be present in ``X``.

    Missing values are skipped by the sum, so a row with no data gets 0.
    """

    def __init__(self, chronic_illness_cols_03, chronic_illness_cols_12):
        self.chronic_illness_cols_03 = chronic_illness_cols_03
        self.chronic_illness_cols_12 = chronic_illness_cols_12

    def fit(self, X, y=None):
        """Stateless: returns the transformer unchanged."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the two count columns added."""
        out = X.copy()
        per_wave = (
            ('chronic_illness_count_03', self.chronic_illness_cols_03),
            ('chronic_illness_count_12', self.chronic_illness_cols_12),
        )
        for count_name, illness_cols in per_wave:
            out[count_name] = out[illness_cols].sum(axis=1)
        return out


#### Limitations of activities of daily living

In [None]:
# ADL and IADL columns
adl_cols_03 = ['adl_dress_03', 'adl_walk_03', 'adl_bath_03', 'adl_eat_03', 'adl_bed_03', 'adl_toilet_03']
adl_cols_12 = ['adl_dress_12', 'adl_walk_12', 'adl_bath_12', 'adl_eat_12', 'adl_bed_12', 'adl_toilet_12']
iadl_cols_03 = ['iadl_money_03', 'iadl_meds_03', 'iadl_shop_03', 'iadl_meals_03']
iadl_cols_12 = ['iadl_money_12', 'iadl_meds_12', 'iadl_shop_12', 'iadl_meals_12']

In [None]:
# Limitations of Activities of daily living count and progression
class ADLIADLTransformer(BaseEstimator, TransformerMixin):
    """Add ADL / IADL limitation totals per wave and their combined change.

    Parameters
    ----------
    adl_cols_03, adl_cols_12, iadl_cols_03, iadl_cols_12 : list of str
        Columns summed row-wise into ``total_adl_limitations_<wave>`` and
        ``total_iadl_limitations_<wave>``. All of them must be present in ``X``.

    ``adl_iadl_progression`` is (ADL + IADL total in 2012) minus
    (ADL + IADL total in 2003). Missing values are skipped by the sums.
    """

    def __init__(self, adl_cols_03, adl_cols_12, iadl_cols_03, iadl_cols_12):
        self.adl_cols_03 = adl_cols_03
        self.adl_cols_12 = adl_cols_12
        self.iadl_cols_03 = iadl_cols_03
        self.iadl_cols_12 = iadl_cols_12

    def fit(self, X, y=None):
        """Stateless: returns the transformer unchanged."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the four totals and the progression column."""
        out = X.copy()

        # Row-wise totals, created in a fixed order: ADL 03, ADL 12, IADL 03, IADL 12
        totals = (
            ('total_adl_limitations_03', self.adl_cols_03),
            ('total_adl_limitations_12', self.adl_cols_12),
            ('total_iadl_limitations_03', self.iadl_cols_03),
            ('total_iadl_limitations_12', self.iadl_cols_12),
        )
        for total_name, source_cols in totals:
            out[total_name] = out[source_cols].sum(axis=1)

        # Combined burden in each wave, then the difference 2012 - 2003
        burden_12 = out['total_adl_limitations_12'] + out['total_iadl_limitations_12']
        burden_03 = out['total_adl_limitations_03'] + out['total_iadl_limitations_03']
        out['adl_iadl_progression'] = burden_12 - burden_03
        return out


#### Self Reported Health

In [None]:
# Self Reported Health Change
class HealthAssessmentChangeTransformer(BaseEstimator, TransformerMixin):
    """Add ``health_self_assessment_change`` = ``glob_hlth_12 - glob_hlth_03``.

    The subtraction is applied to the columns as they are, so both must already
    hold numeric values when ``transform`` is called. If either column is absent
    the new column is filled with NaN.
    """

    def __init__(self):
        """No configuration."""

    def fit(self, X, y=None):
        """Stateless: returns the transformer unchanged."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the health change column added."""
        out = X.copy()
        if 'glob_hlth_03' not in out.columns or 'glob_hlth_12' not in out.columns:
            out['health_self_assessment_change'] = np.nan
            return out
        rating_then = out['glob_hlth_03']
        rating_now = out['glob_hlth_12']
        out['health_self_assessment_change'] = rating_now - rating_then
        return out


#### Mood and depressive symptoms

In [None]:
# Define mood columns
positive_mood_cols_03 = ['happy_03', 'enjoy_03', 'energetic_03']
positive_mood_cols_12 = ['happy_12', 'enjoy_12', 'energetic_12']
negative_mood_cols_03 = ['depressed_03', 'restless_03', 'lonely_03', 'sad_03', 'tired_03']
negative_mood_cols_12 = ['depressed_12', 'restless_12', 'lonely_12', 'sad_12', 'tired_12']


In [None]:
# Custom transformer to engineer positive and negative mood scores
class MoodScoreTransformer(BaseEstimator, TransformerMixin):
    """Add positive / negative mood totals per wave and their 2003 -> 2012 change.

    Parameters
    ----------
    positive_mood_cols_03, positive_mood_cols_12 : list of str
        Columns summed row-wise into ``positive_mood_score_<wave>``.
    negative_mood_cols_03, negative_mood_cols_12 : list of str
        Columns summed row-wise into ``negative_mood_score_<wave>``.

    All listed columns must be present in ``X``. Missing values are skipped by
    the sums. ``positive_mood_change`` and ``negative_mood_change`` are the 2012
    score minus the 2003 score.
    """

    def __init__(self, positive_mood_cols_03, positive_mood_cols_12, negative_mood_cols_03, negative_mood_cols_12):
        self.positive_mood_cols_03 = positive_mood_cols_03
        self.positive_mood_cols_12 = positive_mood_cols_12
        self.negative_mood_cols_03 = negative_mood_cols_03
        self.negative_mood_cols_12 = negative_mood_cols_12

    def fit(self, X, y=None):
        """Stateless: returns the transformer unchanged."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the four scores and two change columns."""
        out = X.copy()

        # Aggregate scores: 2003 first (positive, negative), then 2012
        score_sources = (
            ('positive_mood_score_03', self.positive_mood_cols_03),
            ('negative_mood_score_03', self.negative_mood_cols_03),
            ('positive_mood_score_12', self.positive_mood_cols_12),
            ('negative_mood_score_12', self.negative_mood_cols_12),
        )
        for score_name, mood_cols in score_sources:
            out[score_name] = out[mood_cols].sum(axis=1)

        # Change over time for each valence
        for valence in ('positive', 'negative'):
            later = out[valence + '_mood_score_12']
            earlier = out[valence + '_mood_score_03']
            out[valence + '_mood_change'] = later - earlier

        return out

#### Exercise

In [None]:
# Consistency of exercise tracking
class ConsistentExerciseTransformer(BaseEstimator, TransformerMixin):
    """Add ``consistent_exercise``: 1 when ``exer_3xwk`` equals 1 in both waves.

    Rows where either wave is different from 1 (including missing) get 0.
    If either column is absent from ``X`` the new column is filled with NaN.
    """

    def __init__(self):
        """No configuration."""

    def fit(self, X, y=None):
        """Stateless: returns the transformer unchanged."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the ``consistent_exercise`` column added."""
        out = X.copy()
        if 'exer_3xwk_03' not in out.columns or 'exer_3xwk_12' not in out.columns:
            out['consistent_exercise'] = np.nan
            return out
        active_03 = out['exer_3xwk_03'].eq(1)
        active_12 = out['exer_3xwk_12'].eq(1)
        out['consistent_exercise'] = (active_03 & active_12).astype(int)
        return out


#### Alcohol and smoking history

In [None]:
# Lifestyle columns
lifestyle_cols_03 = ['alcohol_03', 'tobacco_03']
lifestyle_cols_12 = ['alcohol_12', 'tobacco_12']

In [None]:
# Alcohol and smoking tracking
class LifestyleHealthIndexTransformer(BaseEstimator, TransformerMixin):
    """Add a per-wave total over the lifestyle columns.

    Parameters
    ----------
    lifestyle_cols_03, lifestyle_cols_12 : list of str
        Columns summed row-wise into ``lifestyle_health_index_03`` and
        ``lifestyle_health_index_12``. All of them must be present in ``X``.

    Missing values are skipped by the sum.
    """

    def __init__(self, lifestyle_cols_03, lifestyle_cols_12):
        self.lifestyle_cols_03 = lifestyle_cols_03
        self.lifestyle_cols_12 = lifestyle_cols_12

    def fit(self, X, y=None):
        """Stateless: returns the transformer unchanged."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the two index columns added."""
        out = X.copy()
        per_wave = (
            ('lifestyle_health_index_03', self.lifestyle_cols_03),
            ('lifestyle_health_index_12', self.lifestyle_cols_12),
        )
        for index_name, habit_cols in per_wave:
            out[index_name] = out[habit_cols].sum(axis=1)
        return out


#### Income and insurance


In [None]:
# Income columns
income_cols_03 = ['rearnings_03', 'searnings_03', 'hincome_03', 'hinc_business_03', 'hinc_rent_03', 'hinc_assets_03', 'hinc_cap_03', 'rinc_pension_03', 'sinc_pension_03']
income_cols_12 = ['rearnings_12', 'searnings_12', 'hincome_12', 'hinc_business_12', 'hinc_rent_12', 'hinc_assets_12', 'hinc_cap_12', 'rinc_pension_12', 'sinc_pension_12']

# Insurance columns
insurance_cols_03 = ['imss_03', 'issste_03', 'pem_def_mar_03', 'insur_private_03', 'insur_other_03', 'insured_03']
insurance_cols_12 = ['imss_12', 'issste_12', 'pem_def_mar_12', 'insur_private_12', 'insur_other_12', 'insured_12']


In [None]:
# Income and insurance coverage
class SocioeconomicFeaturesTransformer(BaseEstimator, TransformerMixin):
    """Add income totals, insurance totals and an insurance continuity flag.

    Parameters
    ----------
    income_cols_03, income_cols_12 : list of str
        Columns summed row-wise into ``aggregate_income_<wave>``.
    insurance_cols_03, insurance_cols_12 : list of str
        Columns summed row-wise into ``insurance_coverage_depth_<wave>``.

    All listed columns must be present in ``X``. Missing values are skipped by
    the sums. ``insurance_continuity`` is 1 when the coverage depth is greater
    than zero in both waves, otherwise 0.
    """

    def __init__(self, income_cols_03, income_cols_12, insurance_cols_03, insurance_cols_12):
        self.income_cols_03 = income_cols_03
        self.income_cols_12 = income_cols_12
        self.insurance_cols_03 = insurance_cols_03
        self.insurance_cols_12 = insurance_cols_12

    def fit(self, X, y=None):
        """Stateless: returns the transformer unchanged."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the five socioeconomic columns added."""
        out = X.copy()

        # Row-wise totals: income first, then insurance, 2003 before 2012
        totals = (
            ('aggregate_income_03', self.income_cols_03),
            ('aggregate_income_12', self.income_cols_12),
            ('insurance_coverage_depth_03', self.insurance_cols_03),
            ('insurance_coverage_depth_12', self.insurance_cols_12),
        )
        for total_name, source_cols in totals:
            out[total_name] = out[source_cols].sum(axis=1)

        # Covered in both waves
        covered_03 = out['insurance_coverage_depth_03'].gt(0)
        covered_12 = out['insurance_coverage_depth_12'].gt(0)
        out['insurance_continuity'] = (covered_03 & covered_12).astype(int)
        return out


#### Social Engagement

In [None]:
# Define social engagement columns
social_engagement_cols = [
    'attends_class_12', 'attends_club_12', 'reads_12', 'games_12', 'table_games_12',
    'comms_tel_comp_12', 'tv_12', 'sewing_12', 'act_mant_12',
    'volunteer_12', 'care_adult_12', 'care_child_12',
    'rrfcntx_m_12', 'rsocact_m_12', 'rrelgwk_12'
]


In [None]:
# Social Engagement score
class SocialEngagementTransformer(BaseEstimator, TransformerMixin):
    """Add ``social_engagement_12``: the row-wise sum of the engagement columns.

    NOTE: a class with this same name is defined again in a later cell of this
    notebook; once that cell has run, the name refers to the later definition.

    Parameters
    ----------
    social_engagement_cols : list of str
        Columns to sum. They are summed as stored, with no numeric conversion.
    """

    def __init__(self, social_engagement_cols):
        self.social_engagement_cols = social_engagement_cols

    def fit(self, X, y=None):
        """Stateless: returns the transformer unchanged."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the ``social_engagement_12`` column added."""
        out = X.copy()
        engagement_total = out[self.social_engagement_cols].sum(axis=1)
        out['social_engagement_12'] = engagement_total
        return out


In [None]:
class SocialEngagementTransformer(BaseEstimator, TransformerMixin):
    """Add ``social_engagement_12``: the row-wise sum of the engagement columns.

    Parameters
    ----------
    social_engagement_cols : list of str
        Columns to aggregate. All of them must be present in ``X``.

    Each column is converted to numbers on a temporary copy; values that cannot
    be parsed as numbers (for example text labels such as ``'9.Never'``) and
    missing values contribute 0 to the score.

    The source columns themselves are returned unchanged. Writing the coerced,
    zero-filled values back into ``X`` would replace text-coded columns with
    zeros and destroy them for every later encoding step.
    """

    def __init__(self, social_engagement_cols):
        self.social_engagement_cols = social_engagement_cols

    def fit(self, X, y=None):
        """Stateless: returns the transformer unchanged."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the ``social_engagement_12`` column added."""
        X = X.copy()
        # Numeric view used only for the score; unparseable entries become NaN
        activity = X[self.social_engagement_cols].apply(pd.to_numeric, errors='coerce')
        # NaN counts as "no engagement" (0) for the purpose of the total
        X['social_engagement_12'] = activity.fillna(0).sum(axis=1)
        return X


#### Healthcare

In [None]:
# Preventive care and health service usage columns
preventive_care_cols_03 = ['test_chol_03', 'test_tuber_03', 'test_diab_03', 'test_pres_03']
preventive_care_cols_12 = ['test_chol_12', 'test_tuber_12', 'test_diab_12', 'test_pres_12']
health_service_usage_cols_03 = ['visit_med_03', 'out_proc_03', 'visit_dental_03']
health_service_usage_cols_12 = ['visit_med_12', 'out_proc_12', 'visit_dental_12', 'hosp_12']


In [None]:
# Custom transformer to create preventive care index and health services usage
class HealthServicesTransformer(BaseEstimator, TransformerMixin):
    """Add preventive care and health service usage totals plus their change.

    Parameters
    ----------
    preventive_care_cols_03, preventive_care_cols_12 : list of str
        Columns summed row-wise into ``preventive_care_index_<wave>``.
    health_service_usage_cols_03, health_service_usage_cols_12 : list of str
        Columns summed row-wise into ``health_service_usage_<wave>``.

    All listed columns must be present in ``X``. Missing values are skipped by
    the sums. ``preventive_care_change`` and ``health_service_usage_change`` are
    the 2012 total minus the 2003 total.
    """

    def __init__(self, preventive_care_cols_03, preventive_care_cols_12, health_service_usage_cols_03, health_service_usage_cols_12):
        self.preventive_care_cols_03 = preventive_care_cols_03
        self.preventive_care_cols_12 = preventive_care_cols_12
        self.health_service_usage_cols_03 = health_service_usage_cols_03
        self.health_service_usage_cols_12 = health_service_usage_cols_12

    def fit(self, X, y=None):
        """Stateless: returns the transformer unchanged."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the four totals and two change columns."""
        out = X.copy()

        # Row-wise totals: preventive care first, then service usage
        totals = (
            ('preventive_care_index_03', self.preventive_care_cols_03),
            ('preventive_care_index_12', self.preventive_care_cols_12),
            ('health_service_usage_03', self.health_service_usage_cols_03),
            ('health_service_usage_12', self.health_service_usage_cols_12),
        )
        for total_name, source_cols in totals:
            out[total_name] = out[source_cols].sum(axis=1)

        # Differences between the two waves (2012 - 2003)
        care_delta = out['preventive_care_index_12'] - out['preventive_care_index_03']
        usage_delta = out['health_service_usage_12'] - out['health_service_usage_03']
        out['preventive_care_change'] = care_delta
        out['health_service_usage_change'] = usage_delta

        return out


## Splitting data into training, validation and testing sets

In [None]:
from sklearn.model_selection import GroupKFold

def split_data(data, features, target):
    """Split ``data`` into train / validation / test sets without sharing a uid.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``'uid'`` column plus the feature and target columns.
    features : list of str
        Columns used as model inputs.
    target : str
        Name of the target column.

    Returns
    -------
    X_train, X_val, X_test, y_train, y_val, y_test

    The first fold of a 5-fold ``GroupKFold`` (grouped by uid) is held out and the
    other four folds form the training set. The held-out fold is then divided into
    validation and test on uid boundaries: the first half of its distinct uids (in
    order of appearance) goes to validation, the rest to test. Cutting the fold at
    a row position instead could place rows of one uid on both sides, leaking a
    person between validation and test.
    """
    # Rows belonging to the same uid must never be separated
    groups = data['uid']

    X = data[features]
    y = data[target]

    gkf = GroupKFold(n_splits=5)

    # Use the first of the five splits: ~80% train, ~20% held out
    train_idx, holdout_idx = next(gkf.split(X, y, groups=groups))

    # Halve the held-out fold by uid rather than by row position
    holdout_groups = groups.iloc[holdout_idx]
    holdout_uids = pd.unique(holdout_groups)
    val_uids = holdout_uids[:len(holdout_uids) // 2]
    in_val = holdout_groups.isin(val_uids).to_numpy()
    val_idx = holdout_idx[in_val]
    test_idx = holdout_idx[~in_val]

    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]
    X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

    return X_train, X_val, X_test, y_train, y_val, y_test


## Preprocessing

In [None]:
# Function to map ordinal variables
def map_ordinal_variables(X, ordinal_cols, ordinal_mappings):
    """Return a copy of ``X`` with ordinal label columns replaced by their ranks.

    Parameters
    ----------
    X : pandas.DataFrame
        Input frame; it is not modified.
    ordinal_cols : iterable of str
        Columns to encode. Names that are not columns of ``X`` are ignored.
    ordinal_mappings : dict
        ``{column name: {label: rank}}``.

    A column listed in ``ordinal_cols`` but without an entry (or with an empty
    entry) in ``ordinal_mappings`` is left as it is. Mapping it through an empty
    dictionary would silently turn every value into NaN.
    Labels that are not keys of the column's mapping become NaN.
    """
    X = X.copy()
    for col in ordinal_cols:
        if col not in X.columns:
            continue
        mapping = ordinal_mappings.get(col)
        if not mapping:
            # Nothing to translate with: keep the original values
            continue
        X[col] = X[col].map(mapping)
    return X


In [None]:
# Custom Transformer to remove outliers
class OutlierRemover(BaseEstimator, TransformerMixin):
    """Drop rows that fall outside the 1.5 * IQR fences learned in ``fit``.

    Parameters
    ----------
    columns : list of str
        Candidate columns. Only the numeric ones among them are used.

    Attributes set by ``fit``
    -------------------------
    numeric_columns : the numeric subset of ``columns``.
    Q1, Q3, IQR : per-column 25th percentile, 75th percentile and their difference.

    NOTE(review): ``transform`` changes the number of rows. A scikit-learn
    ``Pipeline`` calls ``transform(X)`` without ``y``, so inside a pipeline the
    target would not be filtered along with ``X`` -- confirm how this is used.
    """

    def __init__(self, columns):
        self.columns = columns  # List of numerical columns for outlier removal

    def fit(self, X, y=None):
        """Learn the quartiles of the numeric columns of ``X``."""
        # Restrict to numeric columns: quantiles and the comparisons in
        # transform are undefined for text columns
        self.numeric_columns = X[self.columns].select_dtypes(include=['number']).columns

        numeric = X[self.numeric_columns]
        self.Q1 = numeric.quantile(0.25)
        self.Q3 = numeric.quantile(0.75)
        self.IQR = self.Q3 - self.Q1
        return self

    def transform(self, X, y=None):
        """Return the rows of ``X`` (and of ``y`` when given) with no outlier value.

        A row is removed when any numeric column is below ``Q1 - 1.5 * IQR`` or
        above ``Q3 + 1.5 * IQR``. Missing values are never treated as outliers.
        """
        numeric = X[self.numeric_columns]
        lower_fence = self.Q1 - 1.5 * self.IQR
        upper_fence = self.Q3 + 1.5 * self.IQR

        # True for rows to keep
        mask = ~((numeric < lower_fence) | (numeric > upper_fence)).any(axis=1)

        if y is not None:
            # Positional boolean mask: works for arrays and for Series whose
            # index does not match X
            return X[mask], y[mask.to_numpy()]
        return X[mask]

In [None]:
# Create a custom transformer that applies all custom transformations
class CustomFeatureEngineer(BaseEstimator, TransformerMixin):
    """Run every custom feature-engineering transformer of this notebook in sequence.

    Each constructor argument is stored under the same attribute name and is
    forwarded to the transformer that needs it. The individual transformers are
    built once, in ``__init__``, and applied in a fixed order by ``transform``:
    temporal features, education, marital status, chronic illness, ADL/IADL,
    self-reported health, mood, exercise, lifestyle, socioeconomic, social
    engagement, health services.
    """

    def __init__(self, numerical_common_features, ordinal_mappings, married_cols_03, married_cols_12,
                 chronic_illness_cols_03, chronic_illness_cols_12, adl_cols_03, adl_cols_12,
                 iadl_cols_03, iadl_cols_12, positive_mood_cols_03, positive_mood_cols_12,
                 negative_mood_cols_03, negative_mood_cols_12, lifestyle_cols_03, lifestyle_cols_12,
                 income_cols_03, income_cols_12, insurance_cols_03, insurance_cols_12, social_engagement_cols,
                 preventive_care_cols_03, preventive_care_cols_12, health_service_usage_cols_03, health_service_usage_cols_12):

        # --- Constructor parameters, stored verbatim ---
        self.numerical_common_features = numerical_common_features
        self.ordinal_mappings = ordinal_mappings
        self.married_cols_03 = married_cols_03
        self.married_cols_12 = married_cols_12
        self.chronic_illness_cols_03 = chronic_illness_cols_03
        self.chronic_illness_cols_12 = chronic_illness_cols_12
        self.adl_cols_03 = adl_cols_03
        self.adl_cols_12 = adl_cols_12
        self.iadl_cols_03 = iadl_cols_03
        self.iadl_cols_12 = iadl_cols_12
        self.positive_mood_cols_03 = positive_mood_cols_03
        self.positive_mood_cols_12 = positive_mood_cols_12
        self.negative_mood_cols_03 = negative_mood_cols_03
        self.negative_mood_cols_12 = negative_mood_cols_12
        self.lifestyle_cols_03 = lifestyle_cols_03
        self.lifestyle_cols_12 = lifestyle_cols_12
        self.income_cols_03 = income_cols_03
        self.income_cols_12 = income_cols_12
        self.insurance_cols_03 = insurance_cols_03
        self.insurance_cols_12 = insurance_cols_12
        self.social_engagement_cols = social_engagement_cols
        self.preventive_care_cols_03 = preventive_care_cols_03
        self.preventive_care_cols_12 = preventive_care_cols_12
        self.health_service_usage_cols_03 = health_service_usage_cols_03
        self.health_service_usage_cols_12 = health_service_usage_cols_12

        # --- Component transformers ---
        self.temporal_features = TemporalFeatureEngineer(numerical_common_features, ordinal_mappings)
        self.education_progression = EducationProgressionTransformer()
        self.marital_transition = MaritalTransitionTransformer(married_cols_03, married_cols_12)
        self.chronic_illness = ChronicIllnessTransformer(chronic_illness_cols_03, chronic_illness_cols_12)
        self.adl_iadl = ADLIADLTransformer(adl_cols_03, adl_cols_12, iadl_cols_03, iadl_cols_12)
        self.health_assessment_change = HealthAssessmentChangeTransformer()
        self.mood_score = MoodScoreTransformer(
            positive_mood_cols_03, positive_mood_cols_12,
            negative_mood_cols_03, negative_mood_cols_12,
        )
        self.consistent_exercise = ConsistentExerciseTransformer()
        self.lifestyle_health_index = LifestyleHealthIndexTransformer(lifestyle_cols_03, lifestyle_cols_12)
        self.socioeconomic_features = SocioeconomicFeaturesTransformer(
            income_cols_03, income_cols_12,
            insurance_cols_03, insurance_cols_12,
        )
        self.social_engagement = SocialEngagementTransformer(social_engagement_cols)
        self.health_services = HealthServicesTransformer(
            preventive_care_cols_03, preventive_care_cols_12,
            health_service_usage_cols_03, health_service_usage_cols_12,
        )

    def fit(self, X, y=None):
        """Nothing is learned: the component transformers are not fitted here."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` after applying every component transformer in order."""
        ordered_steps = (
            self.temporal_features,
            self.education_progression,
            self.marital_transition,
            self.chronic_illness,
            self.adl_iadl,
            self.health_assessment_change,
            self.mood_score,
            self.consistent_exercise,
            self.lifestyle_health_index,
            self.socioeconomic_features,
            self.social_engagement,
            self.health_services,
        )
        engineered = X.copy()
        for step in ordered_steps:
            engineered = step.transform(engineered)
        return engineered

## Preprocessing Pipeline

In [None]:
def _paired_03_12_columns(candidates_03, available_columns):
    """
    Keep the '_03' candidates present in the data and derive their '_12' twins.

    Parameters:
    - candidates_03: list of candidate column names containing '_03'.
    - available_columns: collection of column names supporting `in`
      (e.g. `data.columns`).

    Returns:
    - (cols_03, cols_12): the candidates found in `available_columns`, and the
      '_12' counterparts of those candidates that are also found. The two lists
      can differ in length when a '_12' counterpart is absent.
    """
    cols_03 = [col for col in candidates_03 if col in available_columns]
    cols_12 = [
        twin
        for twin in (col.replace('_03', '_12') for col in cols_03)
        if twin in available_columns
    ]
    return cols_03, cols_12


def get_preprocessing_pipeline(data, ordinal_mappings):
    """
    Create a preprocessing pipeline: ordinal mapping, custom feature
    engineering, then imputation / scaling / one-hot encoding.

    All column lists are derived from `data` when this function is called, so
    the returned pipeline expects frames carrying the same columns.

    Parameters:
    - data: pandas DataFrame containing the data. Only its column names and
      dtypes are inspected here; nothing is fitted.
    - ordinal_mappings: dictionary of mappings for ordinal variables. Its keys
      are used as the list of ordinal column names.

    Returns:
    - preprocessor: unfitted scikit-learn Pipeline object.
    """
    columns = data.columns

    # Identify numerical and categorical columns
    numerical_cols = data.select_dtypes(include=['int64', 'float64']).columns.tolist()
    # Exclude 'composite_score' from numerical_cols
    numerical_cols = [col for col in numerical_cols if col != 'composite_score']

    categorical_cols = data.select_dtypes(include=['object']).columns.tolist()

    # Identify ordinal and nominal variables
    ordinal_cols = list(ordinal_mappings.keys())
    nominal_cols = [col for col in categorical_cols if col not in ordinal_cols and col != 'uid']

    # Base names of every feature that exists with both a '_03' and a '_12' suffix
    numerical_common_features = [
        col[:-3] for col in columns
        if col.endswith('_03') and col[:-3] + '_12' in columns
    ]

    # ==============================
    # Column groups required by the custom transformers
    # ==============================

    # The marital columns are passed as-is (not filtered against `data`)
    married_cols_03 = ['married_03']
    married_cols_12 = ['married_12']

    chronic_illness_cols_03, chronic_illness_cols_12 = _paired_03_12_columns(
        ['hypertension_03', 'diabetes_03', 'resp_ill_03', 'arthritis_03',
         'hrt_attack_03', 'stroke_03', 'cancer_03'],
        columns,
    )

    adl_cols_03, adl_cols_12 = _paired_03_12_columns(
        ['adl_dress_03', 'adl_walk_03', 'adl_bath_03', 'adl_eat_03', 'adl_bed_03', 'adl_toilet_03'],
        columns,
    )

    iadl_cols_03, iadl_cols_12 = _paired_03_12_columns(
        ['iadl_money_03', 'iadl_meds_03', 'iadl_shop_03', 'iadl_meals_03'],
        columns,
    )

    positive_mood_cols_03, positive_mood_cols_12 = _paired_03_12_columns(
        ['happy_03', 'enjoy_03', 'energetic_03'],
        columns,
    )

    negative_mood_cols_03, negative_mood_cols_12 = _paired_03_12_columns(
        ['depressed_03', 'restless_03', 'lonely_03', 'sad_03', 'tired_03'],
        columns,
    )

    lifestyle_cols_03, lifestyle_cols_12 = _paired_03_12_columns(
        ['alcohol_03', 'tobacco_03'],
        columns,
    )

    income_cols_03, income_cols_12 = _paired_03_12_columns(
        ['rearnings_03', 'searnings_03', 'hincome_03', 'hinc_business_03', 'hinc_rent_03',
         'hinc_assets_03', 'hinc_cap_03', 'rinc_pension_03', 'sinc_pension_03'],
        columns,
    )

    insurance_cols_03, insurance_cols_12 = _paired_03_12_columns(
        ['imss_03', 'issste_03', 'pem_def_mar_03', 'insur_private_03', 'insur_other_03', 'insured_03'],
        columns,
    )

    social_engagement_cols = [
        'attends_class_12', 'attends_club_12', 'reads_12', 'games_12', 'table_games_12',
        'comms_tel_comp_12', 'tv_12', 'sewing_12', 'act_mant_12',
        'volunteer_12', 'care_adult_12', 'care_child_12',
        'rrfcntx_m_12', 'rsocact_m_12', 'rrelgwk_12'
    ]
    social_engagement_cols = [col for col in social_engagement_cols if col in columns]

    preventive_care_cols_03, preventive_care_cols_12 = _paired_03_12_columns(
        ['test_chol_03', 'test_tuber_03', 'test_diab_03', 'test_pres_03'],
        columns,
    )

    health_service_usage_cols_03, health_service_usage_cols_12 = _paired_03_12_columns(
        ['visit_med_03', 'out_proc_03', 'visit_dental_03'],
        columns,
    )
    # NOTE(review): 'hosp_12' is added to the '_12' group but there is no
    # matching 'hosp_03' entry in the '_03' group, so the two groups are not
    # symmetric. Confirm this is intended before comparing them.
    if 'hosp_12' in columns:
        health_service_usage_cols_12.append('hosp_12')

    # Ordinal Mapper Transformer
    ordinal_mapper_transformer = FunctionTransformer(
        map_ordinal_variables,
        kw_args={'ordinal_cols': ordinal_cols, 'ordinal_mappings': ordinal_mappings}
    )

    # Nominal Transformer
    nominal_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
        ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore'))
    ])

    # Numerical Transformer
    numerical_transformer = Pipeline(steps=[
        ('imputer', IterativeImputer(estimator=HistGradientBoostingRegressor(random_state=42))),
        ('scaler', StandardScaler())
    ])

    # ==============================
    # Instantiate Custom Transformers
    # ==============================

    custom_feature_engineer = CustomFeatureEngineer(
        numerical_common_features, ordinal_mappings,
        married_cols_03, married_cols_12,
        chronic_illness_cols_03, chronic_illness_cols_12,
        adl_cols_03, adl_cols_12,
        iadl_cols_03, iadl_cols_12,
        positive_mood_cols_03, positive_mood_cols_12,
        negative_mood_cols_03, negative_mood_cols_12,
        lifestyle_cols_03, lifestyle_cols_12,
        income_cols_03, income_cols_12,
        insurance_cols_03, insurance_cols_12,
        social_engagement_cols,
        preventive_care_cols_03, preventive_care_cols_12,
        health_service_usage_cols_03, health_service_usage_cols_12
    )

    # Outlier Remover Transformer for numerical columns
    # NOTE(review): this instance is created but never added to the pipeline
    # below. Confirm whether it should become a step or be deleted.
    outlier_remover = OutlierRemover(columns=numerical_cols)

    # ==============================
    # Create the Preprocessing Pipeline
    # ==============================

    # Combine all numerical columns including new features from custom transformers
    # Since custom transformers create new features, we need to specify them
    new_numerical_cols = [
        # From TemporalFeatureEngineer
        'time_gap'
    ] + [f"{feature}_change" for feature in numerical_common_features] + [
        'education_transition',
        'marital_transition',
        'chronic_illness_count_03', 'chronic_illness_count_12',
        'total_adl_limitations_03', 'total_adl_limitations_12',
        'total_iadl_limitations_03', 'total_iadl_limitations_12',
        'adl_iadl_progression',
        'health_self_assessment_change',
        'positive_mood_score_03', 'positive_mood_score_12', 'positive_mood_change',
        'negative_mood_score_03', 'negative_mood_score_12', 'negative_mood_change',
        'consistent_exercise',
        'lifestyle_health_index_03', 'lifestyle_health_index_12',
        'aggregate_income_03', 'aggregate_income_12',
        'insurance_coverage_depth_03', 'insurance_coverage_depth_12', 'insurance_continuity',
        'social_engagement_12',
        'preventive_care_index_03', 'preventive_care_index_12', 'preventive_care_change',
        'health_service_usage_03', 'health_service_usage_12', 'health_service_usage_change'
    ]

    # Remove duplicates with dict.fromkeys, which keeps first-seen order.
    # list(set(...)) would order the string names by their per-process
    # randomised hashes, so the feature order (and therefore seeded imputation,
    # feature selection and model fits) would change between kernel restarts.
    # The ordinal columns are mapped to numbers upstream, so they join the
    # numerical group and are de-duplicated together with it.
    numeric_like_cols = list(dict.fromkeys(numerical_cols + new_numerical_cols + ordinal_cols))

    # Define the complete preprocessing pipeline
    preprocessor = Pipeline(steps=[
        # Map ordinal variables
        ('ordinal_mapper', ordinal_mapper_transformer),
        # Apply custom feature engineering
        ('custom_feature_engineering', custom_feature_engineer),

        # Apply ColumnTransformer after custom features are added
        ('preprocessing', ColumnTransformer(transformers=[
            ('num', numerical_transformer, numeric_like_cols),
            # No need for 'ord' transformer as ordinals are mapped and included in numerical
            ('nom', nominal_transformer, nominal_cols)
        ], remainder='drop'))  # Drop any remaining columns
    ])

    return preprocessor


## Modeling

In [None]:
def build_modeling_pipeline(preprocessor, k_best=50, random_state=42):
    """
    Builds a modeling pipeline with preprocessing, feature selection, and a stacked model.

    Parameters:
    - preprocessor: scikit-learn Pipeline object containing preprocessing steps.
    - k_best: number of features kept by SelectKBest (default 50).
    - random_state: seed passed to every base model (default 42), so that
      repeated fits on the same data use the same seeds.

    Returns:
    - pipeline: unfitted scikit-learn Pipeline object ready for fitting.
    """

    # Define base models for stacking
    base_models = [
        # Seeded like the other base models: with subsample=0.8 and
        # max_features='sqrt' this estimator draws random rows/features,
        # so an unseeded instance gives different fits on every run.
        ('gbr', GradientBoostingRegressor(
            learning_rate=0.01, max_depth=5, max_features='sqrt',
            n_estimators=500, subsample=0.8, random_state=random_state
        )),
        ('xgb', XGBRegressor(
            colsample_bytree=0.8, learning_rate=0.01, max_depth=5,
            n_estimators=500, subsample=0.8, random_state=random_state
        )),
        ('catboost', CatBoostRegressor(
            iterations=1000, learning_rate=0.01, depth=6,
            verbose=0, random_state=random_state
        )),
        ('lightgbm', lgb.LGBMRegressor(
            num_leaves=31, learning_rate=0.05, n_estimators=100, random_state=random_state
        )),
        ('rf', RandomForestRegressor(
            max_depth=20, min_samples_leaf=2, min_samples_split=10,
            n_estimators=500, random_state=random_state
        ))
    ]

    # Stacking regressor with Linear Regression as final estimator
    stacked_model = StackingRegressor(
        estimators=base_models,
        final_estimator=LinearRegression(),
        n_jobs=-1
    )

    # Construct the complete pipeline
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('feature_selection', SelectKBest(score_func=f_regression, k=k_best)),
        ('model', stacked_model)
    ])

    return pipeline

# Putting it all together

## Data preparation


In [None]:
# Name the target column; every other column except the 'uid' identifier is a feature
target = 'composite_score'
features = data.columns.drop([target, 'uid'])

# Assemble the (unfitted) preprocessing pipeline from the labelled frame
preprocessor = get_preprocessing_pipeline(data, ordinal_mappings)

## Splitting the data

In [None]:
# Split the data into train / validation / test features and targets using split_data.
# NOTE(review): split_data is not defined in this part of the notebook — confirm how it
# partitions rows (e.g. whether rows sharing a uid are kept within a single split).
X_train, X_val, X_test, y_train, y_val, y_test = split_data(data, features, target)


## Building the Modeling Pipeline

In [None]:
# Build the (unfitted) modeling pipeline: preprocessing -> SelectKBest -> stacked regressor
pipeline = build_modeling_pipeline(preprocessor)

## Fitting the Pipeline

In [None]:
# Fit the whole pipeline (preprocessing, feature selection, stacked model) on the
# training split only; X_val and X_test are used later for evaluation.
pipeline.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001836 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3237
[LightGBM] [Info] Number of data points in the train set: 3474, number of used features: 50
[LightGBM] [Info] Start training from score 157.084917
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001044 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3091
[LightGBM] [Info] Number of data points in the train set: 2779, number of used features: 50
[LightGBM] [Info] Start training from score 157.426412
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002860 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not eno

## Evaluating the model using the validation set

In [None]:
# Make predictions on the validation set
y_pred_val = pipeline.predict(X_val)

# Evaluate the model.
# RMSE is computed as sqrt(MSE) instead of mean_squared_error(..., squared=False):
# the `squared` argument is deprecated and then removed in newer scikit-learn
# releases, while this form works on every version.
# (mean_squared_error and r2_score come from the import cell at the top.)
rmse_val = np.sqrt(mean_squared_error(y_val, y_pred_val))
r2_val = r2_score(y_val, y_pred_val)
print(f"Validation RMSE: {rmse_val:.2f}")
print(f"Validation R2 : {r2_val:.2f}")

Validation RMSE: 40.20
Validation R2 : 0.58


## Testing the model on the test set

In [None]:
# Make predictions on the test set
y_pred_test = pipeline.predict(X_test)

# Evaluate the model.
# RMSE is computed as sqrt(MSE) instead of mean_squared_error(..., squared=False):
# the `squared` argument is deprecated and then removed in newer scikit-learn
# releases, while this form works on every version.
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))
r2_test = r2_score(y_test, y_pred_test)
print(f"Test RMSE: {rmse_test:.2f}")
print(f"Test R2 : {r2_test:.2f}")


Test RMSE: 43.61
Test R2 : 0.52


# Creating a Test Submission

### Loading the submission_format data

In [None]:
# Load the submission format file; it lists the (uid, year) rows that need a
# composite_score prediction
submission_format = pd.read_csv('submission_format.csv')

# Display the first few rows to verify
print(submission_format.head())


    uid  year  composite_score
0  abxu  2016                0
1  aeol  2016                0
2  aeol  2021                0
3  afnb  2016                0
4  afnb  2021                0


### Loading the test_features data

In [None]:
# Load the test features (joined to the submission rows on 'uid' in the next cell)
test_features = pd.read_csv('test_features.csv')

### Prepare the Test Data for Prediction
We need to merge submission_format with test_features to ensure we have all the necessary features for the required uid and year combinations.

In [None]:
# Merge submission_format with test_features to get the test data for prediction.
# validate='many_to_one' makes pandas raise if a uid appears more than once in
# test_features; without it, duplicate uids would silently multiply rows and the
# predictions would no longer line up one-to-one with submission_format.
test_data = submission_format[['uid', 'year']].merge(
    test_features, on=['uid'], how='left', validate='many_to_one'
)
# Check for missing values after the merge
missing_values = test_data.isnull().sum()
print("Missing values in test data before pipeline:\n", missing_values)


Missing values in test data before pipeline:
 uid              0
year             0
age_03         359
urban_03       359
married_03     359
              ... 
a21_12        1090
a22_12        1091
a33b_12       1090
a34_12         390
j11_12          29
Length: 185, dtype: int64


In [None]:
# Prepare features for prediction
features_to_drop = ['uid']  # Do not drop 'year' since it's needed by the pipeline
# NOTE(review): this rebinds X_test, replacing the hold-out split returned by
# split_data. Re-running the test-set evaluation cell after this one would pair
# these submission rows with y_test; consider a distinct name.
X_test = test_data.drop(columns=features_to_drop)

# Ensure that X_test has the same features as used in training
expected_features = X_train.columns.tolist()
# Remove 'uid' from expected_features if present
expected_features = [col for col in expected_features if col != 'uid']

# reindex() silently creates an all-NaN column for every requested label that is
# absent, which would then be imputed and predicted on without any error.
# Fail loudly instead if the test file lacks a feature the pipeline was fitted on.
missing_features = [col for col in expected_features if col not in X_test.columns]
if missing_features:
    raise ValueError(f"Test data is missing features used in training: {missing_features}")

# Reindex X_test to match the expected features (same columns, same order)
X_test = X_test.reindex(columns=expected_features)

# Confirm lengths before prediction
print(f"Length of submission_format: {len(submission_format)}")
print(f"Length of test_data: {len(test_data)}")
print(f"Length of X_test: {len(X_test)}")

Length of submission_format: 1105
Length of test_data: 1105
Length of X_test: 1105


### Making Predictions


In [None]:
# Make predictions using the trained pipeline.
# X_test here is the submission feature frame rebuilt from test_data above,
# not the hold-out split returned by split_data.
y_pred = pipeline.predict(X_test)

# Confirm the number of predictions
print(f"Length of y_pred: {len(y_pred)}")


Length of y_pred: 1105


### Adding the predictions to the submission_format DataFrame.

In [None]:
# Round the predictions to the nearest integer and convert to int64.
# np.int64 is spelled out because plain `int` maps to a platform-dependent
# width on some NumPy builds, which would not match the int64 intent.
y_pred_int = np.round(y_pred).astype(np.int64)

# Assign integer predictions to submission DataFrame (row order matches
# submission_format, which is what the prediction features were built from)
submission = submission_format.copy()
submission['composite_score'] = y_pred_int

# Ensure the submission has the correct columns
submission = submission[['uid', 'year', 'composite_score']]

# Save the submission file
submission.to_csv('submission.csv', index=False)

print("Submission file 'submission.csv' created successfully!")

Submission file 'submission.csv' created successfully!


In [None]:
# Wall-clock time elapsed since start_time was recorded in the first cell, in minutes
end_time = time.time()
print(f'Time taken to run the notebook: {(end_time - start_time) / 60} minutes')

Time taken to run the notebook: 38.90929878155391 minutes


In [None]:
# Saving the pipeline as a pkl: the whole fitted pipeline object (preprocessing,
# feature selection and stacked model) is written to a single file.
# NOTE(review): restoring it presumably requires the notebook's custom transformer
# classes and helper functions to be importable at load time — confirm before reuse.
import joblib

with open('stacked_model.pkl', 'wb') as f:
    joblib.dump(pipeline, f)

In [None]:
# Debug aid: list the training feature columns (taken from X_train, minus 'uid')
# that the prediction frame was reindexed to. Prints the full list.
print(f"Expected features: {expected_features}")

Expected features: ['year', 'age_03', 'urban_03', 'married_03', 'n_mar_03', 'edu_gru_03', 'n_living_child_03', 'migration_03', 'glob_hlth_03', 'adl_dress_03', 'adl_walk_03', 'adl_bath_03', 'adl_eat_03', 'adl_bed_03', 'adl_toilet_03', 'n_adl_03', 'iadl_money_03', 'iadl_meds_03', 'iadl_shop_03', 'iadl_meals_03', 'n_iadl_03', 'depressed_03', 'hard_03', 'restless_03', 'happy_03', 'lonely_03', 'enjoy_03', 'sad_03', 'tired_03', 'energetic_03', 'n_depr_03', 'cesd_depressed_03', 'hypertension_03', 'diabetes_03', 'resp_ill_03', 'arthritis_03', 'hrt_attack_03', 'stroke_03', 'cancer_03', 'n_illnesses_03', 'exer_3xwk_03', 'alcohol_03', 'tobacco_03', 'test_chol_03', 'test_tuber_03', 'test_diab_03', 'test_pres_03', 'hosp_03', 'visit_med_03', 'out_proc_03', 'visit_dental_03', 'imss_03', 'issste_03', 'pem_def_mar_03', 'insur_private_03', 'insur_other_03', 'insured_03', 'decis_personal_03', 'employment_03', 'age_12', 'urban_12', 'married_12', 'n_mar_12', 'edu_gru_12', 'n_living_child_12', 'migration_12