In [58]:
import numpy as np
import pandas as pd
import os
import copy
from sklearn.base import clone
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import StratifiedKFold
from scipy.optimize import minimize
from tqdm import tqdm

import torch.nn as nn
import torch.optim as optim

from colorama import Fore, Style
from IPython.display import clear_output
import warnings
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.svm import SVR

from sklearn.ensemble import VotingRegressor
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

import random
import torch

warnings.filterwarnings('ignore')
pd.options.display.max_columns = None


In [59]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's legacy global generator and
    PyTorch (CPU and all CUDA devices), exports ``PYTHONHASHSEED`` and puts
    cuDNN into deterministic mode.

    Parameters
    ----------
    seed : int
        Seed value applied to all generators.
    """
    random.seed(seed)
    # Exported for child processes; the running interpreter's hash seed is
    # fixed at start-up and is not changed by this assignment.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner benchmarks several kernels and may pick a different
    # one on each run, so it has to be disabled for reproducible results.
    torch.backends.cudnn.benchmark = False

# Global seed: also used as `random_state` of the StratifiedKFold splitters
# and as CatBoost's `random_seed` further down.
SEED = 42
# Number of cross-validation folds used by TrainML_Sub1 and TrainML.
n_splits = 5
seed_everything(SEED)

# Define functions

## Feature engineering for tabular data

In [60]:
from sklearn.feature_selection import SelectKBest, f_regression

def feature_engineering_v2(df, selector=None, imputer=None, fit=True, k=30, verbose=False):
    """Mean-impute the numeric features and keep the ``k`` best by univariate F-test.

    Candidate features are the ``float64``/``int64`` columns of ``df`` except
    ``'sii'`` and every column whose name contains ``'PCIAT'`` but not
    ``'Season'``. Infinite values are treated as missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. Must contain the target column ``'sii'`` when ``fit`` is
        true. Duplicated column names are dropped (first occurrence kept).
    selector : fitted SelectKBest, optional
        Required when ``fit`` is false; ignored (re-created) otherwise.
    imputer : fitted SimpleImputer, optional
        Required when ``fit`` is false; ignored (re-created) otherwise.
    fit : bool, default True
        Fit a new imputer/selector on ``df`` when true, otherwise only apply
        the supplied ones.
    k : int or 'all', default 30
        Number of features to keep. Capped at the number of available features.
    verbose : bool, default False
        Print the candidate and the selected column names.

    Returns
    -------
    (pandas.DataFrame, SelectKBest, SimpleImputer)
        The selected features (carrying the row index of ``df``) and the two
        fitted transformers.

    Raises
    ------
    ValueError
        If ``fit`` is false and ``selector`` or ``imputer`` is missing.
    """
    df = df.loc[:, ~df.columns.duplicated()]

    numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
    feature_cols = [
        col for col in numeric_cols
        if col != 'sii' and not ('PCIAT' in col and 'Season' not in col)
    ]

    if fit:
        X = df[feature_cols]
    else:
        if imputer is None or selector is None:
            raise ValueError("fit=False requires an already fitted `selector` and `imputer`.")
        # Use exactly the columns (and order) the imputer was fitted on rather
        # than re-deriving them from this frame's dtypes; columns absent from
        # `df` become all-NaN and are filled by the imputer.
        fitted_cols = getattr(imputer, 'feature_names_in_', None)
        if fitted_cols is not None:
            feature_cols = list(fitted_cols)
        X = df.reindex(columns=feature_cols)

    if verbose:
        print(X.keys())

    X = X.replace([np.inf, -np.inf], np.nan)

    if fit:
        y = df['sii']
        imputer = SimpleImputer()
        imputed = np.asarray(imputer.fit_transform(X))
    else:
        imputed = np.asarray(imputer.transform(X))

    # SimpleImputer discards features whose statistic is NaN (columns with no
    # observed value at fit time), so the output can be narrower than the
    # input. Keep the column names in step with what actually came out.
    imputed_cols = list(feature_cols)
    if imputed.shape[1] != len(imputed_cols):
        kept = ~np.isnan(np.asarray(imputer.statistics_, dtype=float))
        imputed_cols = [col for col, keep in zip(imputed_cols, kept) if keep]

    if fit:
        n_keep = k if k == 'all' else min(k, len(imputed_cols))
        selector = SelectKBest(score_func=f_regression, k=n_keep)
        X_imputed = pd.DataFrame(imputed, columns=imputed_cols, index=df.index)
        X_new = selector.fit_transform(X_imputed, y)
    else:
        X_new = selector.transform(imputed)

    selected_features = [
        col for col, keep in zip(imputed_cols, selector.get_support()) if keep
    ]
    # Preserve the caller's row index so the result stays aligned with `y`.
    df_selected = pd.DataFrame(np.asarray(X_new), columns=selected_features, index=df.index)

    if verbose:
        print(df_selected.keys())
    return df_selected, selector, imputer

def feature_engineering_tabular(df):
    """Drop the season columns and append pairwise interaction features.

    Every column whose name contains ``'Season'`` is removed, then sixteen
    product/ratio columns are appended in a fixed order. The input frame is
    not modified; a new frame is returned.
    """
    # (new column, left operand, operator, right operand), in output order.
    recipe = (
        ('BMI_Age', 'Physical-BMI', '*', 'Basic_Demos-Age'),
        ('Internet_Hours_Age', 'PreInt_EduHx-computerinternet_hoursday', '*', 'Basic_Demos-Age'),
        ('BMI_Internet_Hours', 'Physical-BMI', '*', 'PreInt_EduHx-computerinternet_hoursday'),
        ('BFP_BMI', 'BIA-BIA_Fat', '/', 'BIA-BIA_BMI'),
        ('FFMI_BFP', 'BIA-BIA_FFMI', '/', 'BIA-BIA_Fat'),
        ('FMI_BFP', 'BIA-BIA_FMI', '/', 'BIA-BIA_Fat'),
        ('LST_TBW', 'BIA-BIA_LST', '/', 'BIA-BIA_TBW'),
        ('BFP_BMR', 'BIA-BIA_Fat', '*', 'BIA-BIA_BMR'),
        ('BFP_DEE', 'BIA-BIA_Fat', '*', 'BIA-BIA_DEE'),
        ('BMR_Weight', 'BIA-BIA_BMR', '/', 'Physical-Weight'),
        ('DEE_Weight', 'BIA-BIA_DEE', '/', 'Physical-Weight'),
        ('SMM_Height', 'BIA-BIA_SMM', '/', 'Physical-Height'),
        ('Muscle_to_Fat', 'BIA-BIA_SMM', '/', 'BIA-BIA_FMI'),
        ('Hydration_Status', 'BIA-BIA_TBW', '/', 'Physical-Weight'),
        ('ICW_TBW', 'BIA-BIA_ICW', '/', 'BIA-BIA_TBW'),
        ('BMI_PHR', 'Physical-BMI', '*', 'Physical-HeartRate'),
    )

    seasonal = [name for name in df.columns if 'Season' in name]
    df = df.drop(columns=seasonal)

    for new_col, left, op, right in recipe:
        if op == '*':
            df[new_col] = df[left] * df[right]
        else:
            df[new_col] = df[left] / df[right]

    return df

## Utils functions

In [61]:
# Handle non-numeric columns
def update(df):
    """Turn every column listed in the module-level ``cat_c`` into a categorical.

    Missing entries are first replaced by the literal ``'Missing'``. The frame
    is modified in place and also returned.
    """
    for season_col in cat_c:
        df[season_col] = df[season_col].fillna('Missing').astype('category')
    return df

def create_mapping(column, dataset):
    """Map each distinct value of ``dataset[column]`` to its order of first appearance."""
    distinct = dataset[column].unique()
    return dict(zip(distinct, range(len(distinct))))


# Function to evaluate the predictions and optimize the thresholds
def quadratic_weighted_kappa(y_true, y_pred):
    """Return Cohen's kappa between ``y_true`` and ``y_pred`` with quadratic weights."""
    kappa = cohen_kappa_score(y_true, y_pred, weights='quadratic')
    return kappa

def threshold_Rounder(oof_non_rounded, thresholds):
    """Bucket continuous predictions into the classes 0-3 using three cut points.

    A value goes to class 0 if it is below ``thresholds[0]``, otherwise to
    class 1 if below ``thresholds[1]``, otherwise to class 2 if below
    ``thresholds[2]``, and to class 3 in every remaining case.
    """
    # Built from the innermost decision outwards.
    two_or_three = np.where(oof_non_rounded < thresholds[2], 2, 3)
    one_or_above = np.where(oof_non_rounded < thresholds[1], 1, two_or_three)
    return np.where(oof_non_rounded < thresholds[0], 0, one_or_above)

def evaluate_predictions(thresholds, y_true, oof_non_rounded):
    """Objective for threshold tuning: negated QWK of the thresholded predictions.

    The sign is flipped so that a minimiser maximises the kappa score.
    """
    return -quadratic_weighted_kappa(
        y_true,
        threshold_Rounder(oof_non_rounded, thresholds),
    )

## Function to train the model with tabular data processed by SelectKBest

In [62]:
def TrainML_Sub1(model_class, X, y, test_data):
    """Cross-validate a regressor on per-fold selected features and build a submission.

    For each stratified fold the imputer and feature selector are fitted on
    the training part only (via ``feature_engineering_v2``) and then applied
    to the validation part and to ``test_data``. Out-of-fold predictions are
    used to tune three rounding thresholds with Nelder-Mead, and the tuned
    thresholds are applied to the fold-averaged test predictions.

    Parameters
    ----------
    model_class : estimator
        An unfitted estimator *instance* (it is passed to ``sklearn.clone``).
    X : pandas.DataFrame
        Training frame; must still contain ``'sii'`` because
        ``feature_engineering_v2`` reads the target from it when fitting.
    y : pandas.Series
        Target, positionally aligned with ``X``.
    test_data : pandas.DataFrame
        Rows to predict. If it has an ``'id'`` column, those ids label the
        submission; otherwise the module-level ``sample['id']`` is used.

    Returns
    -------
    tuple
        ``(submission, tKappa, oof_tuned, oof_non_rounded, y,
        optimized_thresholds)``.

    Raises
    ------
    AssertionError
        If the threshold optimisation reports failure.
    """
    SKF = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)

    train_S = []
    test_S = []

    oof_non_rounded = np.zeros(len(y), dtype=float)
    test_preds = np.zeros((len(test_data), n_splits))

    for fold, (train_idx, test_idx) in enumerate(tqdm(SKF.split(X, y), desc="Training Folds", total=n_splits)):
        X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]

        # Fit imputer/selector on the training fold only to avoid leakage.
        X_train, selector_tr, imputer_tr = feature_engineering_v2(X_train, fit=True)
        X_val, _, _ = feature_engineering_v2(X_val, selector_tr, imputer_tr, fit=False)

        model = clone(model_class)
        model.fit(X_train, y_train)

        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_val)

        oof_non_rounded[test_idx] = y_val_pred
        y_val_pred_rounded = y_val_pred.round(0).astype(int)

        train_kappa = quadratic_weighted_kappa(y_train, y_train_pred.round(0).astype(int))
        val_kappa = quadratic_weighted_kappa(y_val, y_val_pred_rounded)

        train_S.append(train_kappa)
        test_S.append(val_kappa)

        # Apply this fold's transformers to the test rows.
        test_data_fe, _, _ = feature_engineering_v2(test_data, selector_tr, imputer_tr, fit=False)
        test_preds[:, fold] = model.predict(test_data_fe)

        print(f"Fold {fold+1} - Train QWK: {train_kappa:.4f}, Validation QWK: {val_kappa:.4f}")
        # wait=True defers clearing until the next output arrives.
        clear_output(wait=True)

    print(f"Mean Train QWK --> {np.mean(train_S):.4f}")
    print(f"Mean Validation QWK ---> {np.mean(test_S):.4f}")

    KappaOptimizer = minimize(evaluate_predictions,
                              x0=[0.5, 1.5, 2.5], args=(y, oof_non_rounded),
                              method='Nelder-Mead')
    assert KappaOptimizer.success, "Optimization did not converge."
    print('OPTIMIZED THRESHOLDS', KappaOptimizer.x)

    oof_tuned = threshold_Rounder(oof_non_rounded, KappaOptimizer.x)
    tKappa = quadratic_weighted_kappa(y, oof_tuned)

    print(f"----> || Optimized QWK SCORE :: {Fore.CYAN}{Style.BRIGHT} {tKappa:.3f}{Style.RESET_ALL}")

    tpm = test_preds.mean(axis=1)
    tpTuned = threshold_Rounder(tpm, KappaOptimizer.x)

    # Label predictions with the ids of the rows that were actually predicted;
    # only fall back to the global sample submission when test_data has no ids.
    if 'id' in test_data.columns:
        submission_ids = test_data['id'].to_numpy()
    else:
        submission_ids = sample['id']

    submission = pd.DataFrame({
        'id': submission_ids,
        'sii': tpTuned
    })
    optimized_thresholds = KappaOptimizer.x
    return (submission, tKappa, oof_tuned, oof_non_rounded, y, optimized_thresholds)

## Function to train the model with tabular data processed by FENet

In [63]:
def TrainML(model_class, X, y, test_data):
    """Cross-validate a regressor on ready-made features and build a submission.

    Trains a clone of ``model_class`` on each stratified fold, collects the
    out-of-fold predictions, tunes three rounding thresholds on them with
    Nelder-Mead and applies those thresholds to the fold-averaged predictions
    for ``test_data``. Submission ids are read from the module-level
    ``sample`` frame.

    Returns ``(submission, oof_tuned, oof_non_rounded, y, optimized_thresholds)``.
    """
    n_rows = len(y)
    oof_non_rounded = np.zeros(n_rows, dtype=float)
    oof_rounded = np.zeros(n_rows, dtype=int)
    test_preds = np.zeros((len(test_data), n_splits))

    train_scores, val_scores = [], []

    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    fold_iter = tqdm(splitter.split(X, y), desc="Training Folds", total=n_splits)

    for fold, (fit_rows, holdout_rows) in enumerate(fold_iter):
        X_fit, X_hold = X.iloc[fit_rows], X.iloc[holdout_rows]
        y_fit, y_hold = y.iloc[fit_rows], y.iloc[holdout_rows]

        estimator = clone(model_class)
        estimator.fit(X_fit, y_fit)

        fit_pred = estimator.predict(X_fit)
        hold_pred = estimator.predict(X_hold)
        hold_pred_int = hold_pred.round(0).astype(int)

        # Store raw and naively rounded out-of-fold predictions.
        oof_non_rounded[holdout_rows] = hold_pred
        oof_rounded[holdout_rows] = hold_pred_int

        train_kappa = quadratic_weighted_kappa(y_fit, fit_pred.round(0).astype(int))
        val_kappa = quadratic_weighted_kappa(y_hold, hold_pred_int)
        train_scores.append(train_kappa)
        val_scores.append(val_kappa)

        # One column of test predictions per fold; averaged after the loop.
        test_preds[:, fold] = estimator.predict(test_data)

        print(f"Fold {fold+1} - Train QWK: {train_kappa:.4f}, Validation QWK: {val_kappa:.4f}")
        clear_output(wait=True)

    print(f"Mean Train QWK --> {np.mean(train_scores):.4f}")
    print(f"Mean Validation QWK ---> {np.mean(val_scores):.4f}")

    # Tune the three cut points on the out-of-fold predictions.
    opt_result = minimize(
        evaluate_predictions,
        x0=[0.5, 1.5, 2.5],
        args=(y, oof_non_rounded),
        method='Nelder-Mead',
    )
    assert opt_result.success, "Optimization did not converge."
    optimized_thresholds = opt_result.x
    print('OPTIMIZED THRESHOLDS', optimized_thresholds)

    oof_tuned = threshold_Rounder(oof_non_rounded, optimized_thresholds)
    tKappa = quadratic_weighted_kappa(y, oof_tuned)

    print(f"----> || Optimized QWK SCORE :: {Fore.CYAN}{Style.BRIGHT} {tKappa:.3f}{Style.RESET_ALL}")

    mean_test_pred = test_preds.mean(axis=1)
    submission = pd.DataFrame({
        'id': sample['id'],
        'sii': threshold_Rounder(mean_test_pred, optimized_thresholds),
    })
    return submission, oof_tuned, oof_non_rounded, y, optimized_thresholds

# Define features

## Normal features

In [64]:
# Load the competition files from the Kaggle input directory.
train = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/train.csv')
test = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/test.csv')
# `sample` is read as a global by TrainML_Sub1 / TrainML for the submission ids.
sample = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/sample_submission.csv')

# Every test column except 'id'. Not referenced again in the visible cells.
total_features = list(test.columns)
total_features.remove('id')

# Season columns; read as a global by `update()` to cast them to categoricals.
cat_c = ['Basic_Demos-Enroll_Season', 'CGAS-Season', 'Physical-Season', 
          'Fitness_Endurance-Season', 'FGC-Season', 'BIA-Season', 
          'PAQ_A-Season', 'PAQ_C-Season', 'SDS-Season', 'PreInt_EduHx-Season']

In [65]:
# Non-season base columns followed by the sixteen interaction columns that
# feature_engineering_tabular() appends (from 'BMI_Age' onwards).
# Not referenced by any other visible cell.
noseason_features = ['Basic_Demos-Age', 'Basic_Demos-Sex',
                'CGAS-CGAS_Score', 'Physical-BMI',
                'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
                'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
                'Fitness_Endurance-Max_Stage',
                'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec',
                'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND',
                'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU',
                'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR',
                'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone',
                'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
                'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
                'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
                'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM',
                'BIA-BIA_TBW', 'PAQ_A-PAQ_A_Total',
                'PAQ_C-PAQ_C_Total', 'SDS-SDS_Total_Raw',
                'SDS-SDS_Total_T',
                'PreInt_EduHx-computerinternet_hoursday', 'BMI_Age','Internet_Hours_Age','BMI_Internet_Hours',
                'BFP_BMI', 'FFMI_BFP', 'FMI_BFP', 'LST_TBW', 'BFP_BMR', 'BFP_DEE', 'BMR_Weight', 'DEE_Weight',
                'SMM_Height', 'Muscle_to_Fat', 'Hydration_Status', 'ICW_TBW','BMI_PHR']

# Submission 

In [66]:
# These start out as aliases of the loaded frames, not copies.
train_sub1 = train
test_sub1 = test
# Keep only rows with a known target. dropna returns a new frame, so `train`
# itself still contains the unlabeled rows.
train_sub1 = train_sub1.dropna(subset='sii')

In [67]:
# X deliberately keeps the 'sii' column: feature_engineering_v2 reads the
# target from the frame when fitting and excludes it from the feature set.
X_sub1 = train_sub1
y_sub1 = train_sub1['sii']

In [68]:
# Hyper-parameters for the four candidate regressors.
SVR_Best_Params = {
    'C': 0.1,
    'epsilon': 0.1,
    'kernel': 'rbf',
    'gamma': 'scale',
}

CatBoost_Best_Params = {
    'learning_rate': 0.0021172579310639343,
    'depth': 6,
    'iterations': 130,
    'random_seed': SEED,
    'verbose': 0,
    'l2_leaf_reg': 0.32557701990001503,
    'task_type': 'GPU',  # this configuration needs a GPU runtime
    'devices': '0'
}

# NOTE(review): 'gpu_hist', 'predictor' and 'gpu_id' are believed to be
# deprecated in XGBoost >= 2.0 (replaced by tree_method='hist' plus
# device='cuda') - confirm against the installed version.
XGB_Best_Params = {
    'n_estimators': 700,
    'max_depth': 4,
    'learning_rate': 0.03325152156380898,
    'subsample': 0.25295047248406266,
    'colsample_bytree': 0.9760859719849787,
    'gamma': 0.20085951790463402,
    'min_child_weight': 11,
    'eval_metric': 'rmse',
    'objective': 'reg:squarederror',
    'tree_method': 'gpu_hist',
    'predictor': 'gpu_predictor',
    'gpu_id': 0
}

LightGBM_Best_Params = {
    'max_depth': 3,
    'min_data_in_leaf': 40,
    'num_leaves': 190,
    'learning_rate': 0.05107368421432176,
    'feature_fraction': 0.9918350138636185,
    'bagging_fraction': 0.9331400899763774,
    'bagging_freq': 1,
    'lambda_l1': 9.49641646280519,
    'lambda_l2': 2.446305429623661,
    'min_gain_to_split': 0.05262124930522051,
    'device_type': 'gpu',
    'gpu_device_id': 0,
    'verbosity': -1
}

catboost_model = CatBoostRegressor(**CatBoost_Best_Params)
xgb_model = XGBRegressor(**XGB_Best_Params)
lightgbm_model = LGBMRegressor(**LightGBM_Best_Params)
# Constructed but not referenced again in the visible cells.
svr_model = SVR(**SVR_Best_Params)

# Equal weights, i.e. a plain average of the three boosters. The visible
# cells train `lightgbm_model` alone and never use this ensemble.
final_voting_model = VotingRegressor(estimators=[
    ('lightgbm', lightgbm_model),
    ('xgboost', xgb_model),
    ('catboost', catboost_model),
], weights=[4.0, 4.0, 4.0])

# Built from the raw `train` frame (rows with missing 'sii' included) and not
# used by the visible cells, which work with X_sub1 / y_sub1 instead.
X = train.drop(['sii'], axis=1)
y = train['sii']

In [69]:
# Cross-validate LightGBM on its own; only the submission frame and the
# threshold-tuned out-of-fold QWK are kept from the returned tuple.
submission1, val_score_sub1, _, _, _, _ = TrainML_Sub1(lightgbm_model, X_sub1, y_sub1, test_sub1)

print("Val score sub1 with best parameters:", val_score_sub1)

Training Folds: 100%|██████████| 5/5 [00:06<00:00,  1.22s/it]

Mean Train QWK --> 0.4354
Mean Validation QWK ---> 0.3546





OPTIMIZED THRESHOLDS [0.57815622 1.02536737 2.55343637]
----> || Optimized QWK SCORE :: [36m[1m 0.472[0m
Val score sub1 with best parameters: 0.4724975689845031


# Final

In [70]:
# The single-model submission is used as the final one (alias, not a copy).
final_submission = submission1
# Written to the current working directory without the index column.
final_submission.to_csv('submission.csv', index=False)

print("Submission saved to 'submission.csv'")

Submission saved to 'submission.csv'


In [71]:
# Bare expression: rendered as the cell output for a quick visual check.
final_submission

Unnamed: 0,id,sii
0,00008ff9,1
1,000fd460,0
2,00105258,1
3,00115b9f,0
4,0016bb22,1
5,001f3379,1
6,0038ba98,1
7,0068a485,0
8,0069fbed,2
9,0083e397,1
