In [None]:
import os
import warnings
from concurrent.futures import ThreadPoolExecutor

import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline

from sklearn.base import clone
from sklearn.metrics import cohen_kappa_score
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import VotingRegressor, RandomForestRegressor, GradientBoostingRegressor

from scipy.optimize import minimize


from tqdm import tqdm
from IPython.display import clear_output

warnings.filterwarnings('ignore')  
pd.options.display.max_columns = None  



## Method 1

In [2]:
train = pd.read_csv('dataset/train.csv')  
test = pd.read_csv('dataset/test.csv') 
sample = pd.read_csv('dataset/sample_submission.csv')  
def process_file(filename, dirname):
    df = pd.read_parquet(os.path.join(dirname, filename, 'part-0.parquet'))  
    df.drop('step', axis=1, inplace=True)  
    return df.describe().values.reshape(-1), filename.split('=')[1] 
def load_time_series(dirname) -> pd.DataFrame:
    ids = os.listdir(dirname) 

    with ThreadPoolExecutor() as executor:
        results = list(tqdm(executor.map(lambda fname: process_file(fname, dirname), ids), total=len(ids)))  

    stats, indexes = zip(*results)  

    df = pd.DataFrame(stats, columns=[f"stat_{i}" for i in range(len(stats[0]))])  
    df['id'] = indexes  
    return df

In [3]:
train_ts = load_time_series("dataset/series_train.parquet")  
test_ts = load_time_series("dataset/series_test.parquet") 

time_series_cols = train_ts.columns.tolist()
time_series_cols.remove("id")  

train = pd.merge(train, train_ts, how="left", on='id')  
test = pd.merge(test, test_ts, how="left", on='id')  

train = train.drop('id', axis=1) 
test = test.drop('id', axis=1)   

100%|██████████| 111/111 [00:02<00:00, 54.32it/s]
100%|██████████| 2/2 [00:00<00:00, 22.82it/s]


In [4]:
# Select Relevant Features and Handle Missing Values
featuresCols = ['Basic_Demos-Enroll_Season', 'Basic_Demos-Age', 'Basic_Demos-Sex',
                'CGAS-Season', 'CGAS-CGAS_Score', 'Physical-Season', 'Physical-BMI',
                'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
                'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
                'Fitness_Endurance-Season', 'Fitness_Endurance-Max_Stage',
                'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec',
                'FGC-Season', 'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND',
                'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU',
                'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR',
                'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone', 'BIA-Season',
                'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
                'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
                'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
                'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM',
                'BIA-BIA_TBW', 'PAQ_A-Season', 'PAQ_A-PAQ_A_Total', 'PAQ_C-Season',
                'PAQ_C-PAQ_C_Total', 'SDS-Season', 'SDS-SDS_Total_Raw',
                'SDS-SDS_Total_T', 'PreInt_EduHx-Season',
                'PreInt_EduHx-computerinternet_hoursday', 'sii']


featuresCols += time_series_cols 

train = train[featuresCols] 
train = train.dropna(subset='sii')  


cat_c = ['Basic_Demos-Enroll_Season', 'CGAS-Season', 'Physical-Season', 
         'Fitness_Endurance-Season', 'FGC-Season', 'BIA-Season', 
         'PAQ_A-Season', 'PAQ_C-Season', 'SDS-Season', 'PreInt_EduHx-Season'] 

def update(df):
    global cat_c
    for c in cat_c: 
        df[c] = df[c].fillna('Missing')  
        df[c] = df[c].astype('category') 
    return df
        
train = update(train)  
test = update(test)  

### Feature Extraction 

In [5]:
def create_mapping(column, dataset):
    unique_values = dataset[column].unique() 
    # to {feat0: 0, feat1: 1, feat2: 2, ...}
    return {value: idx for idx, value in enumerate(unique_values)}  


for col in cat_c:
    mapping = create_mapping(col, train)  
    mappingTe = create_mapping(col, test) 
    
    train[col] = train[col].replace(mapping).astype(int) 
    test[col] = test[col].replace(mappingTe).astype(int)  

### Training model

In [6]:
def quadratic_weighted_kappa(y_true, y_pred):
    return cohen_kappa_score(y_true, y_pred, weights='quadratic') 

def threshold_Rounder(oof_non_rounded, thresholds):
    return np.where(oof_non_rounded < thresholds[0], 0,
                    np.where(oof_non_rounded < thresholds[1], 1, 
                             np.where(oof_non_rounded < thresholds[2], 2, 3))) 

def evaluate_predictions(thresholds, y_true, oof_non_rounded):
    rounded_p = threshold_Rounder(oof_non_rounded, thresholds)
    return -quadratic_weighted_kappa(y_true, rounded_p)

In [7]:
SEED = 42
n_splits = 5

In [8]:
def TrainML(model_class, test_data):
    X = train.drop(['sii'], axis=1)  
    y = train['sii']  

    SKF = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)  
    
    train_S = []
    test_S = []
    
    oof_non_rounded = np.zeros(len(y), dtype=float)  
    oof_rounded = np.zeros(len(y), dtype=int)  
    test_preds = np.zeros((len(test_data), n_splits)) 

    for fold, (train_idx, test_idx) in enumerate(tqdm(SKF.split(X, y), desc="Training Folds", total=n_splits)):
        X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]  
        y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]  
        model = clone(model_class)  
        model.fit(X_train, y_train)

        y_train_pred = model.predict(X_train)  
        y_val_pred = model.predict(X_val)  

        oof_non_rounded[test_idx] = y_val_pred  
        y_val_pred_rounded = y_val_pred.round(0).astype(int) 
        oof_rounded[test_idx] = y_val_pred_rounded  

        train_kappa = quadratic_weighted_kappa(y_train, y_train_pred.round(0).astype(int))  
        val_kappa = quadratic_weighted_kappa(y_val, y_val_pred_rounded)  

        train_S.append(train_kappa) 
        test_S.append(val_kappa)  
        
        test_preds[:, fold] = model.predict(test_data)  
        
        clear_output(wait=True)  

    
    KappaOPtimizer = minimize(evaluate_predictions,
                              x0=[0.5, 1.5, 2.5], args=(y, oof_non_rounded), 
                              method='Nelder-Mead')
    assert KappaOPtimizer.success, "Optimization did not converge." 
    
    oof_tuned = threshold_Rounder(oof_non_rounded, KappaOPtimizer.x)  
    tKappa = quadratic_weighted_kappa(y, oof_tuned)  

    tpm = test_preds.mean(axis=1)  
    tpTuned = threshold_Rounder(tpm, KappaOPtimizer.x)  
    
    submission = pd.DataFrame({
        'id': sample['id'],
        'sii': tpTuned
    }) 
    return submission

In [9]:
# LightGBM
Params = {
    'learning_rate': 0.046, 
    'max_depth': 12, 
    'num_leaves': 478, 
    'min_data_in_leaf': 13, 
    'feature_fraction': 0.893, 
    'bagging_fraction': 0.784, 
    'bagging_freq': 4, 
    'lambda_l1': 10,
    'lambda_l2': 0.01, 
}

XGB_Params = {
    'learning_rate': 0.05, 
    'max_depth': 6, 
    'n_estimators': 200, 
    'subsample': 0.8, 
    'colsample_bytree': 0.8, 
    'reg_alpha': 1, 
    'reg_lambda': 5, 
    'random_state': SEED, 
}

CatBoost_Params = {
    'learning_rate': 0.05, 
    'depth': 6, 
    'iterations': 200, 
    'random_seed': SEED, 
    'cat_features': cat_c, 
    'verbose': 0, 
    'l2_leaf_reg': 10, 
}


from collections import Counter

class_counts = Counter(train['sii'])  
total_samples = len(train)  
# w = total_sample / class_sample
class_weights = {cls: total_samples / count for cls, count in class_counts.items()} 


Params_with_weights = {
    **Params,
    'class_weight': class_weights
}


Light = LGBMRegressor(**Params_with_weights, random_state=SEED, verbose=-1, n_estimators=300)
XGB_Model = XGBRegressor(**XGB_Params) 
CatBoost_Model = CatBoostRegressor(**CatBoost_Params)  

voting_model = VotingRegressor(estimators=[
    ('lightgbm', Light),
    ('xgboost', XGB_Model),
    ('catboost', CatBoost_Model)
])

# Train the ensemble model
Submission1 = TrainML(voting_model, test) 

Submission1

Training Folds: 100%|██████████| 5/5 [00:27<00:00,  5.49s/it]


Unnamed: 0,id,sii
0,00008ff9,2
1,000fd460,0
2,00105258,0
3,00115b9f,1
4,0016bb22,1
5,001f3379,1
6,0038ba98,0
7,0068a485,0
8,0069fbed,1
9,0083e397,1


## Method 2

In [10]:
# Load data
train = pd.read_csv('dataset/train.csv') 
test = pd.read_csv('dataset/test.csv') 
sample = pd.read_csv('dataset/sample_submission.csv') 

In [11]:
# Merge and Drop Columns
train_ts = load_time_series("dataset/series_train.parquet") 
test_ts = load_time_series("dataset/series_test.parquet")  

time_series_cols = train_ts.columns.tolist()
time_series_cols.remove("id")  

train = pd.merge(train, train_ts, how="left", on='id')  
test = pd.merge(test, test_ts, how="left", on='id')  
train = train.drop('id', axis=1)  
test = test.drop('id', axis=1)  

100%|██████████| 111/111 [00:02<00:00, 53.38it/s]
100%|██████████| 2/2 [00:00<00:00, 20.19it/s]


In [12]:
imputer = KNNImputer(n_neighbors=5) 

numeric_cols = train.select_dtypes(include=['int32', 'int64', 'float64', 'int64']).columns 
imputed_data = imputer.fit_transform(train[numeric_cols])  
train_imputed = pd.DataFrame(imputed_data, columns=numeric_cols)  
train_imputed['sii'] = train_imputed['sii'].round().astype(int)  
for col in train.columns:
    if col not in numeric_cols:
        train_imputed[col] = train[col]  
        
train = train_imputed  

In [13]:
def feature_engineering(df):

    season_cols = [col for col in df.columns if 'Season' in col]  
    df = df.drop(season_cols, axis=1)  # Drop Season (too many missing values)
    df['BMI_Age'] = df['Physical-BMI'] * df['Basic_Demos-Age']  # BMI and age interactions 
    df['Internet_Hours_Age'] = df['PreInt_EduHx-computerinternet_hoursday'] * df['Basic_Demos-Age']  # Internet hours and age interactions
    df['BMI_Internet_Hours'] = df['Physical-BMI'] * df['PreInt_EduHx-computerinternet_hoursday']  # BMI and Internet hours interactions
    df['BFP_BMI'] = df['BIA-BIA_Fat'] / df['BIA-BIA_BMI']  # Fat and BMI ratio
    df['FFMI_BFP'] = df['BIA-BIA_FFMI'] / df['BIA-BIA_Fat']  # FFMI and Fat ratio
    df['FMI_BFP'] = df['BIA-BIA_FMI'] / df['BIA-BIA_Fat']  # FMI and Fat ratio
    df['LST_TBW'] = df['BIA-BIA_LST'] / df['BIA-BIA_TBW']  # LST and TBW ratio
    df['BFP_BMR'] = df['BIA-BIA_Fat'] * df['BIA-BIA_BMR']  # Fat and BMR interactions
    df['BFP_DEE'] = df['BIA-BIA_Fat'] * df['BIA-BIA_DEE']  # Fat and DEE interactions
    df['BMR_Weight'] = df['BIA-BIA_BMR'] / df['Physical-Weight']  # BMR and Weight ratio
    df['DEE_Weight'] = df['BIA-BIA_DEE'] / df['Physical-Weight']  # DEE and Weight ratio
    df['SMM_Height'] = df['BIA-BIA_SMM'] / df['Physical-Height']  # SMM and Height ratio
    df['Muscle_to_Fat'] = df['BIA-BIA_SMM'] / df['BIA-BIA_FMI']  # Muscle and Fat ratio
    df['Hydration_Status'] = df['BIA-BIA_TBW'] / df['Physical-Weight']  #TBW and Weight ratio
    df['ICW_TBW'] = df['BIA-BIA_ICW'] / df['BIA-BIA_TBW']  # ICW and TBW ratio
    df['BMI_PHR'] = df['Physical-BMI'] * df['Physical-HeartRate']  # BMI and Heart rate interaction
    return df

train = feature_engineering(train)  
train = train.dropna(thresh=10, axis=0)  # Keep rows with at least 10 non-missing values
test = feature_engineering(test) 

In [14]:
featuresCols = ['Basic_Demos-Age', 'Basic_Demos-Sex',
                'CGAS-CGAS_Score', 'Physical-BMI',
                'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
                'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
                'Fitness_Endurance-Max_Stage',
                'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec',
                'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND',
                'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU',
                'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR',
                'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone',
                'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
                'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
                'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
                'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM',
                'BIA-BIA_TBW', 'PAQ_A-PAQ_A_Total',
                'PAQ_C-PAQ_C_Total', 'SDS-SDS_Total_Raw',
                'SDS-SDS_Total_T',
                'PreInt_EduHx-computerinternet_hoursday', 'BMI_Age', 'Internet_Hours_Age', 'BMI_Internet_Hours',
                'BFP_BMI', 'FFMI_BFP', 'FMI_BFP', 'LST_TBW', 'BFP_BMR', 'BFP_DEE', 'BMR_Weight', 'DEE_Weight', 'SMM_Height', 'Muscle_to_Fat', 'Hydration_Status', 'ICW_TBW', 'BMI_PHR',
                ]

train = train[featuresCols + time_series_cols + ['sii']]
train = train.dropna(subset='sii') 


test = test[featuresCols + time_series_cols]

In [15]:
if np.any(np.isinf(train)):
    train = train.replace([np.inf, -np.inf], np.nan) 

### Training model

In [16]:
def TrainML(model_class, test_data):
    X = train.drop(['sii'], axis=1)  
    y = train['sii']  

    SKF = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)  
    
    train_S = []
    test_S = []
    
    oof_non_rounded = np.zeros(len(y), dtype=float)  
    oof_rounded = np.zeros(len(y), dtype=int) 
    test_preds = np.zeros((len(test_data), n_splits)) 

    for fold, (train_idx, test_idx) in enumerate(tqdm(SKF.split(X, y), desc="Training Folds", total=n_splits)):
        X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]  
        y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]  

        model = clone(model_class)  
        model.fit(X_train, y_train) 

        y_train_pred = model.predict(X_train)  
        y_val_pred = model.predict(X_val)  

        oof_non_rounded[test_idx] = y_val_pred  
        y_val_pred_rounded = y_val_pred.round(0).astype(int)  
        oof_rounded[test_idx] = y_val_pred_rounded  

        train_kappa = quadratic_weighted_kappa(y_train, y_train_pred.round(0).astype(int))  
        val_kappa = quadratic_weighted_kappa(y_val, y_val_pred_rounded)  

        train_S.append(train_kappa)  
        test_S.append(val_kappa)  
        
        test_preds[:, fold] = model.predict(test_data)
        
        clear_output(wait=True) 

    # Maxmize Kappa score optimization
    KappaOPtimizer = minimize(evaluate_predictions,
                              x0=[0.5, 1.5, 2.5], args=(y, oof_non_rounded), 
                              method='Nelder-Mead')
    assert KappaOPtimizer.success, "Optimization did not converge."  
    
    oof_tuned = threshold_Rounder(oof_non_rounded, KappaOPtimizer.x)  
    tKappa = quadratic_weighted_kappa(y, oof_tuned) 

    tpm = test_preds.mean(axis=1)  
    tp_rounded = threshold_Rounder(tpm, KappaOPtimizer.x)  

    return tp_rounded  

In [17]:
# Ensemble Model
imputer = SimpleImputer(strategy='median')  

ensemble = VotingRegressor(estimators=[
    ('lgb', Pipeline(steps=[('imputer', imputer), ('regressor', LGBMRegressor(random_state=SEED))])),  # LightGBM
    ('xgb', Pipeline(steps=[('imputer', imputer), ('regressor', XGBRegressor(random_state=SEED))])),  # XGBoost
    ('cat', Pipeline(steps=[('imputer', imputer), ('regressor', CatBoostRegressor(random_state=SEED, silent=True))])),  # CatBoost
    ('rf', Pipeline(steps=[('imputer', imputer), ('regressor', RandomForestRegressor(random_state=SEED))])),  # Random Forest
    ('gb', Pipeline(steps=[('imputer', imputer), ('regressor', GradientBoostingRegressor(random_state=SEED))]))  # Gradient Boosting
])

Submission2 = TrainML(ensemble, test)  

Training Folds: 100%|██████████| 5/5 [00:59<00:00, 11.88s/it]


In [18]:
Submission2 = pd.DataFrame({
    'id': sample['id'],
    'sii': Submission2
})  

Submission2

Unnamed: 0,id,sii
0,00008ff9,2
1,000fd460,0
2,00105258,0
3,00115b9f,1
4,0016bb22,0
5,001f3379,1
6,0038ba98,0
7,0068a485,0
8,0069fbed,1
9,0083e397,0


In [19]:
sub1 = Submission1  
sub2 = Submission2  
sub1 = sub1.sort_values(by='id').reset_index(drop=True)  
sub2 = sub2.sort_values(by='id').reset_index(drop=True)  

combined = pd.DataFrame({
    'id': sub1['id'],
    'sii_1': sub1['sii'],
    'sii_2': sub2['sii']
})  

def majority_vote(row):
    """
    For each row of predictions, perform majority voting. 
    If there are multiple modes, take their average and round to the nearest integer.

    Parameters:
    - row: A row of prediction values

    Returns:
    - The final predicted 'sii' value
    """
    return row.mode()[0] if len(row.mode()) == 1 else row.mean().round().astype(int)

combined['final_sii'] = combined[['sii_1', 'sii_2']].apply(majority_vote, axis=1)  

final_submission = combined[['id', 'final_sii']].rename(columns={'final_sii': 'sii'})  
final_submission.to_csv('submission.csv', index=False)  