# 1. Import library
Import all necessary libraries throughout the project.

In [None]:
import numpy as np
import pandas as pd
import os
import re
import seaborn as sns
from sklearn.base import clone, BaseEstimator, RegressorMixin
from sklearn.metrics import cohen_kappa_score, accuracy_score, mean_squared_error
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.decomposition import PCA
from sklearn.datasets import make_classification
from scipy.optimize import minimize
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import missingno as msno


from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
from keras.models import Model
from keras.layers import Input, Dense
from keras.optimizers import Adam
import torch
import torch.nn as nn
import torch.optim as optim

from colorama import Fore, Style
from IPython.display import clear_output
import warnings
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import VotingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline
warnings.filterwarnings('ignore')
pd.options.display.max_columns = None

from bayes_opt import BayesianOptimization
from sklearn.metrics import mean_squared_error

### Helper function
These functions help us to read and preprocess parquet data.

In [None]:
def process_file(filename, dirname):
    """
    Summarise one volunteer's time series recording as a flat vector of statistics.

    The recording is read from ``<dirname>/<filename>/part-0.parquet``, its 'step'
    column is discarded, and ``DataFrame.describe()`` is computed on what remains.

    Parameters:
        filename (str): Name of the sub-directory that holds 'part-0.parquet'. The name must contain
                        an '=' character; the second piece of the name split on '=' is the volunteer id.
        dirname (str): The directory that contains the `filename` sub-directory.

    Returns:
        tuple: A tuple containing:
            - numpy.ndarray: 1-D array holding the ``describe()`` table flattened row by row.
            - str: The volunteer id extracted from `filename`.
    """
    parquet_path = os.path.join(dirname, filename, 'part-0.parquet')
    recording = pd.read_parquet(parquet_path).drop(columns='step')
    summary_table = recording.describe()
    volunteer_id = filename.split('=')[1]
    return summary_table.values.reshape(-1), volunteer_id

def load_time_series(dirname) -> pd.DataFrame:
    """
      Build one row of summary statistics per entry found in `dirname`.

      Every entry returned by ``os.listdir(dirname)`` is handed to `process_file`
      on a thread pool (with a tqdm progress bar), and the per-entry results are
      stacked into a single table.

      Parameters:
          dirname (str): The directory whose entries are the per-volunteer sub-directories.

      Returns:
          pd.DataFrame: A DataFrame with the following structure:
              - Columns `stat_0`, `stat_1`, ...: the statistics vector returned by `process_file`.
              - Column `id`: the volunteer id returned by `process_file` for that entry.
    """
    entries = os.listdir(dirname)

    def _summarise(entry):
        return process_file(entry, dirname)

    with ThreadPoolExecutor() as pool:
        progress = tqdm(pool.map(_summarise, entries), total=len(entries))
        collected = list(progress)

    feature_rows, volunteer_ids = zip(*collected)

    # The width of the first vector decides how many stat_* columns are created
    n_features = len(feature_rows[0])
    frame = pd.DataFrame(feature_rows, columns=[f"stat_{k}" for k in range(n_features)])
    frame['id'] = volunteer_ids
    return frame

# 2. Read data
This section loads the CSV data and the time series (parquet) data, and extracts summary features from the time series.

In [None]:
# Reading training and test data (CSV), the sample submission and the data dictionary
train = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/train.csv')
test = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/test.csv')
sample = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/sample_submission.csv')
# NOTE(review): the name `dict` shadows the Python builtin for the rest of the notebook,
# and this path is relative ('../input') while the others are absolute — confirm both are intended.
dict = pd.read_csv('../input/child-mind-institute-problematic-internet-use/data_dictionary.csv')

# Reading and preprocessing time series data from .parquet file
# (one row of stat_* summary features plus an 'id' column per directory entry)
train_ts = load_time_series("/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet")
test_ts = load_time_series("/kaggle/input/child-mind-institute-problematic-internet-use/series_test.parquet")

In [None]:
# Check if the timeseries data has labels:
# left-join the 'sii' label from the CSV onto every time series row by 'id'
combine = pd.merge(train_ts, train[['id', 'sii']], on='id', how='left')
# Check the number of data points without labels
# (counts rows whose id is absent from train.csv as well as rows whose 'sii' is NaN there)
combine['sii'].isna().sum()
# => Original author's observation: all data in the timeseries have labels
#    (i.e. the count above is 0) — not verifiable from the code alone

### Checking data's structure

In [None]:
# Display the data dictionary table loaded from data_dictionary.csv
dict

In [None]:
# Print the column names, non-null counts and dtypes of the training table
train.info()

In [None]:
# Print the same summary for the test table
test.info()

# 3. Preprocess data
We perform preprocessing for both CSV data and time series data. Specifically, as follows:

## 3.1 Preprocess csv
We perform some techniques as mentioned in the presentation to preprocess CSV data, such as:
* We remove columns that exist in the training data but not in the testing data.
* We fill in all missing values (NaN).
* We remove columns ('season' features) with string values.
* In addition, we generate new important features.

In [None]:
# Check the data status in the train set:
# bar chart of non-missing counts per column, sorted ascending.
# The iloc slice selects every column, i.e. the whole frame.
msno.bar(train.iloc[:, :train.shape[1]], sort='ascending')

In [None]:
# Check the data status in the test set (same chart as above)
msno.bar(test.iloc[:, :test.shape[1]], sort='ascending')

In [None]:
# Check the extra columns in the train set that are not in the test set
# (iterating a DataFrame yields its column names, so these are sets of column names)
different_columns = set(train) - set(test)
different_columns # The 'sii' column is the label and should be kept

In [None]:
common_columns = train.columns.intersection(test.columns) # Get the common columns between train and test

In [None]:
# Create data containing only the columns from the test set.
# An explicit copy is taken so that adding the label column below writes to an
# independent frame instead of a selection of `train` (avoids chained-assignment
# warnings and any ambiguity about which object is modified).
train_df = train[common_columns].copy()

# Reattach the label (index-aligned with the original training table)
train_df['sii'] = train['sii']
train = train_df

# Same restriction for the test set, also as an independent copy
test = test[common_columns].copy()

### Encode Season
The `season_encode` helper function converts the season strings to integer category codes, so that decision-tree models can learn from these string features.

In [None]:
def season_encode(df, kill_season=False):
    """
    Either drop the season columns of a DataFrame or turn its string columns into season codes.

    Parameters:
        df (pd.DataFrame): The table to process. It must contain an `id` column when
            `kill_season` is `False`.
        kill_season (bool, optional):
            - `True`: every column whose name contains "Season" is dropped.
            - `False`: every object-dtype column other than `id` is mapped through the
              season table below.
            Default is `False`.

    Returns:
        pd.DataFrame:
            - `kill_season=True`: a new DataFrame without the "Season" columns.
            - `kill_season=False`: a new DataFrame with the encoded columns, where `id`
              has been moved to the last position.

    Mapping:
        - 'Spring' -> 1
        - 'Summer' -> 2
        - 'Fall'   -> 3
        - 'Winter' -> 4
        - NaN      -> 0
    """
    if kill_season:
        return df.drop([name for name in df.columns if 'Season' in name], axis=1)

    season_codes = {'Spring': 1, 'Summer': 2, 'Fall': 3, 'Winter': 4, np.nan: 0}

    def _to_codes(column):
        return column.map(season_codes)

    # `id` is set aside so that it is never treated as a season column
    encoded = df.drop(columns='id')
    text_columns = encoded.select_dtypes(include=['object']).columns.tolist()

    # Encode every string column in one pass
    encoded[text_columns] = encoded[text_columns].apply(_to_codes)

    encoded['id'] = df['id']
    return encoded

### Feature Engineering
Based on strong correlations between features in the data, we create new features that are more meaningful for prediction.

In [None]:
def feature_engineering(df_):
    """
    Return a copy of `df_` extended with products and ratios of existing columns.

    The derived columns combine BMI, age and daily internet hours, and several
    bio-impedance (BIA-*) and physical measurements. Ratios are plain divisions,
    so a zero denominator yields inf/NaN in the result.

    Parameters:
        df_ (pd.DataFrame): Table containing every source column read below. It is not modified.

    Returns:
        pd.DataFrame: A new DataFrame with the original columns followed by the derived ones.
    """
    out = df_.copy()

    # Source columns, read once
    age = out['Basic_Demos-Age']
    bmi = out['Physical-BMI']
    net_hours = out['PreInt_EduHx-computerinternet_hoursday']
    weight = out['Physical-Weight']
    height = out['Physical-Height']
    fat = out['BIA-BIA_Fat']
    bia_bmi = out['BIA-BIA_BMI']
    ffmi = out['BIA-BIA_FFMI']
    fmi = out['BIA-BIA_FMI']
    lst = out['BIA-BIA_LST']
    tbw = out['BIA-BIA_TBW']
    bmr = out['BIA-BIA_BMR']
    dee = out['BIA-BIA_DEE']
    smm = out['BIA-BIA_SMM']

    derived = {
        # Interactions between BMI, age and daily internet hours
        'BMI_Age': bmi * age,
        'Internet_Hours_Age': net_hours * age,
        'BMI_Internet_Hours': bmi * net_hours,
        # Body-composition ratios and products
        'BFP_BMI': fat / bia_bmi,
        'FFMI_BFP': ffmi / fat,
        'FMI_BFP': fmi / fat,
        'LST_TBW': lst / tbw,
        'BFP_BMR': fat * bmr,
        'BFP_DEE': fat * dee,
        # Quantities normalised by weight or height
        'BMR_Weight': bmr / weight,
        'DEE_Weight': dee / weight,
        'SMM_Height': smm / height,
        'Muscle_to_Fat': smm / fmi,
        'Hydration_Status': tbw / weight,
    }

    # Append in dictionary order so the column order is stable
    for column_name, values in derived.items():
        out[column_name] = values

    return out

## 3.2 Preprocess parquet (time series)
In this section, we will learn how to extract and process time series data.

In [None]:
# Checking time series data's size
# (the first message is Vietnamese for "the two sets have the same number of features")
print(f"2 tập cùng số features: {train_ts.shape[1] == test_ts.shape[1]}")
# True => train and test have the same number of feature columns
# (the second message is Vietnamese for "number of features")
print(f"Số features: {train_ts.shape[1]}")

### Helper function auto encoder
This function helps us automatically extract features of time series data using the AutoEncoder architecture.



In [None]:
class AutoEncoder(nn.Module):
    """
    A fully connected autoencoder for dimensionality reduction and feature extraction.

    The autoencoder consists of an encoder and a decoder:
    - The encoder compresses the input data into a lower-dimensional latent space.
    - The decoder reconstructs the input data from the compressed representation.

    The decoder output is linear by default. A Sigmoid output confines the
    reconstruction to (0, 1), which cannot match standardized inputs (zero mean,
    unbounded range) and keeps the MSE reconstruction loss from decreasing. Pass
    `output_activation=nn.Sigmoid()` only when the inputs are scaled to [0, 1].

    Attributes:
        encoder (nn.Sequential): Linear layers of width `encoding_dim*3`, `encoding_dim*2`
            and `encoding_dim`, each followed by ReLU.
        decoder (nn.Sequential): Linear layers of width `input_dim*2`, `input_dim*3` and
            `input_dim`, with ReLU between them and the optional `output_activation` at the end.

    Parameters:
        input_dim (int): The dimensionality of the input data.
        encoding_dim (int): The dimensionality of the latent space (encoded representation).
        output_activation (nn.Module, optional): Module applied to the decoder output.
            Default is `None` (no activation, unbounded output).

    Methods:
        forward(x):
            Passes the input data through the encoder and decoder to produce reconstructed output.

            Parameters:
                x (torch.Tensor): The input data tensor with shape (batch_size, input_dim).

            Returns:
                torch.Tensor: The reconstructed data tensor with shape (batch_size, input_dim).
    """
    def __init__(self, input_dim, encoding_dim, output_activation=None):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, encoding_dim*3),
            nn.ReLU(),
            nn.Linear(encoding_dim*3, encoding_dim*2),
            nn.ReLU(),
            nn.Linear(encoding_dim*2, encoding_dim),
            nn.ReLU()
        )

        decoder_layers = [
            nn.Linear(encoding_dim, input_dim*2),
            nn.ReLU(),
            nn.Linear(input_dim*2, input_dim*3),
            nn.ReLU(),
            nn.Linear(input_dim*3, input_dim),
        ]
        # The output activation is appended last, so the indices (and state_dict
        # keys) of the linear layers are the same with or without it.
        if output_activation is not None:
            decoder_layers.append(output_activation)
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

In [None]:
def perform_autoencoder(df, encoding_dim=50, epochs=50, batch_size=32):
    """
    Performs dimensionality reduction using an autoencoder on the given DataFrame.

    This function standardizes the input data, trains an `AutoEncoder` on it with
    mini-batch Adam and an MSE reconstruction loss, and returns the encoder output
    for every input row.

    Parameters:
        df (pd.DataFrame): The input data to be encoded (numeric columns only).
        encoding_dim (int, optional): The dimensionality of the latent space (encoded representation). Default is 50.
        epochs (int, optional): The number of training epochs. Default is 50.
        batch_size (int, optional): The size of each mini-batch during training. Default is 32.

    Returns:
        pd.DataFrame: A DataFrame containing the encoded representation with column names
        `Enc_1, Enc_2, ...`. It carries the same index as `df`, so columns of the
        source table can be reattached by index alignment.
    """

    # Scale the input data to standardize features
    scaler = StandardScaler()
    df_scaled = scaler.fit_transform(df)

    # Convert scaled data into a PyTorch tensor
    data_tensor = torch.FloatTensor(df_scaled)

    # Initialize the autoencoder with input dimensions and encoding dimensions
    n_samples, input_dim = data_tensor.shape
    autoencoder = AutoEncoder(input_dim, encoding_dim)

    # Define the loss function (Mean Squared Error) and the optimizer (Adam)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(autoencoder.parameters())

    # Train the autoencoder model (batches are taken in row order, without shuffling)
    for epoch in range(epochs):
        running_loss = 0.0
        for start in range(0, n_samples, batch_size):
            # Get the current mini-batch
            batch = data_tensor[start : start + batch_size]
            optimizer.zero_grad()  # Reset gradients
            reconstructed = autoencoder(batch)  # Forward pass
            loss = criterion(reconstructed, batch)  # Compute reconstruction loss
            loss.backward()  # Backward pass
            optimizer.step()  # Update weights
            # Weight by batch size so the (possibly smaller) last batch is not over-counted
            running_loss += loss.item() * len(batch)

        # Print the mean loss of the epoch every 10 epochs
        if (epoch + 1) % 10 == 0:
            print(f'Epoch [{epoch + 1}/{epochs}], Loss: {running_loss / n_samples:.4f}')

    # Encode the data using the trained encoder
    with torch.no_grad():
        encoded_data = autoencoder.encoder(data_tensor).numpy()

    # Create a DataFrame for the encoded data, keeping the caller's row labels
    row_index = df.index if isinstance(df, pd.DataFrame) else None
    df_encoded = pd.DataFrame(
        encoded_data,
        columns=[f'Enc_{i + 1}' for i in range(encoded_data.shape[1])],
        index=row_index,
    )

    return df_encoded


### Encode data
Perform data encoding

In [None]:
df_train = train_ts.drop('id', axis=1)
df_test = test_ts.drop('id', axis=1)

# Encode train and test with ONE autoencoder. Two separately trained models
# (each with its own random initialisation and its own scaler) produce latent
# coordinates that are not comparable, so a model fitted on the train Enc_*
# columns could not be applied meaningfully to the test Enc_* columns.
# The autoencoder is unsupervised, so no label information is involved here.
n_train_rows = len(df_train)
df_all = pd.concat([df_train, df_test], axis=0, ignore_index=True)

print("Train + test encode")
all_ts_encoded = perform_autoencoder(df_all, encoding_dim=60, epochs=100, batch_size=32)

# Split the shared encoding back into its two parts, each with a fresh 0..n-1
# index so the 'id' columns of train_ts / test_ts can be reattached by position
train_ts_encoded = all_ts_encoded.iloc[:n_train_rows].reset_index(drop=True)
test_ts_encoded = all_ts_encoded.iloc[n_train_rows:].reset_index(drop=True)

The loss does not decrease, so a large number of epochs is not needed.

In [None]:
# Reattach id column
# (assignment aligns on the row index of the encoded frames and of train_ts / test_ts)
train_ts_encoded["id"]=train_ts["id"]
test_ts_encoded['id']=test_ts["id"]

# Get all time series columns
# NOTE: taken after 'id' was added, so this list contains 'id' as well as the Enc_* columns
time_series_cols = train_ts_encoded.columns.tolist()

## 3.3 Combine data
We merge the encoded time series features with the original (CSV) data on the `id` column.

In [None]:
# Drop "season" columns
# NOTE(review): despite the `_has_season` names, kill_season=True removes every
# column whose name contains "Season" — confirm which behaviour is intended.
train_has_season = season_encode(train, kill_season=True)
test_has_season = season_encode(test, kill_season=True)

# Merge data on "id" column
# (left join: every CSV row is kept; rows without a time series get NaN in the encoded columns)
train_combine = pd.merge(train_has_season, train_ts_encoded, how='left', on='id')
test_combine = pd.merge(test_has_season, test_ts_encoded, how='left', on='id')

#### Fill nan
Fill NaN values by using KNNImputer

In [None]:
# KNN imputation of the int32 / int64 / float64 columns; all other columns are
# carried over unchanged afterwards.
imputer = KNNImputer(n_neighbors=6)

train = train_combine
test = test_combine

# NOTE(review): float32 columns are not selected here. If the encoded time series
# columns are float32 (torch's default), they are left un-imputed — confirm intended.
numeric_cols_train = train.select_dtypes(include=['int32', 'int64', 'float64']).columns
numeric_cols_test = test.select_dtypes(include=['int32', 'int64', 'float64']).columns

# Train: fit and impute on the training rows. 'sii' is among these columns, so
# missing labels are filled from the neighbours as well (rounded to int below).
imputed_train_data = imputer.fit_transform(train[numeric_cols_train])

# Test: take the neighbours from the TRAINING rows instead of refitting on the
# test set, so both sets are imputed from the same reference data and the result
# does not depend on how many rows the test set happens to contain.
test_imputer = KNNImputer(n_neighbors=6)
test_imputer.fit(train[numeric_cols_test])
imputed_test_data = test_imputer.transform(test[numeric_cols_test])

# Keep the source index so the column re-attachment below aligns row by row
train_imputed = pd.DataFrame(imputed_train_data, columns=numeric_cols_train, index=train.index)
test_imputed = pd.DataFrame(imputed_test_data, columns=numeric_cols_test, index=test.index)

train_imputed['sii'] = train_imputed['sii'].round().astype(int)

# Bring back the columns that were not imputed (e.g. 'id')
for col in train.columns:
    if col not in numeric_cols_train:
        train_imputed[col] = train[col]

for col in test.columns:
    if col not in numeric_cols_test:
        test_imputed[col] = test[col]

train = train_imputed
test = test_imputed

In [None]:
## Process data after fill / drop "sii" Nan values
# `train` / `test` are the KNN-imputed frames at this point. Building the features
# from the un-imputed `train_combine` / `test_combine` would discard the imputation
# and leave NaN labels in 'sii', which the stratified cross-validation cannot split.
train_combine = feature_engineering(train)
test_combine = feature_engineering(test)

## 3.4 Select feature
We perform robust feature selection, based on the correlation matrix. Additionally, we merge the processed time series data with the original data.

In [None]:
# Selecting feature (Drop features, which have >70% Nan values)
# The CSV feature names are listed once and shared by train and test.
csv_featuresCols = ['Basic_Demos-Enroll_Season', 'Basic_Demos-Age', 'Basic_Demos-Sex',
                'CGAS-Season', 'CGAS-CGAS_Score', 'Physical-Season', 'Physical-BMI',
                'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
                'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
                'Fitness_Endurance-Season', 'Fitness_Endurance-Max_Stage',
                'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec',
                'FGC-Season', 'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND',
                'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU',
                'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR',
                'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone', 'BIA-Season',
                'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
                'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
                'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
                'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM',
                'BIA-BIA_TBW', 'PAQ_A-Season', 'PAQ_A-PAQ_A_Total', 'PAQ_C-Season',
                'PAQ_C-PAQ_C_Total', 'SDS-Season', 'SDS-SDS_Total_Raw',
                'SDS-SDS_Total_T', 'PreInt_EduHx-Season',
                'PreInt_EduHx-computerinternet_hoursday']

# Merge features from csv and time series features
# (train additionally keeps the 'sii' label, placed before the time series columns)
train_featuresCols = csv_featuresCols + ['sii'] + time_series_cols
test_featuresCols = csv_featuresCols + time_series_cols

# The *-Season columns were removed earlier by season_encode(..., kill_season=True),
# so selecting the full lists would raise KeyError. Keep only the requested
# features that exist in BOTH frames, so train and test end up with identical
# feature columns in the same order.
# NOTE(review): the columns added by feature_engineering are not in the list
# above and are therefore dropped here — confirm this is intended.
available_cols = [col for col in test_featuresCols
                  if col in train_combine.columns and col in test_combine.columns]
skipped_cols = [col for col in test_featuresCols if col not in available_cols]
if skipped_cols:
    print(f"Skipping {len(skipped_cols)} requested feature(s) missing from the data: {skipped_cols}")

available_set = set(available_cols)
train_selected = [col for col in train_featuresCols if col == 'sii' or col in available_set]

# Independent copies, so later in-place edits never touch a selection of another frame
train_combine = train_combine[train_selected].copy()
test_combine = test_combine[available_cols].copy()

In [None]:
## Remove "id" column in both sets
# ('id' is present because time_series_cols was built after 'id' had been reattached)
train_combine.drop(columns=['id'], inplace=True)
test_combine.drop(columns=['id'], inplace=True)

In [None]:
# Presenting the label distribution of the feature-selected training frame
# (value_counts skips NaN by default)
train_combine['sii'].value_counts()

In [None]:
# Presenting the label distribution of `train`.
# NOTE(review): `train` was rebound to the KNN-imputed frame in the imputation
# cell, so these counts include imputed labels — the original captions
# ("after" above / "before" here) may not match what the two frames hold; verify.
train['sii'].value_counts()

In [None]:
# Replacing minus infinity or infinity values to Nan values
# (infinities can come from ratio features with a zero denominator)
train_combine.replace([np.inf, -np.inf], np.nan, inplace=True)
test_combine.replace([np.inf, -np.inf], np.nan, inplace=True)

# 4. Training model

In [None]:
## Hyperparameters
N_SPLITS = 5  # number of folds used by the stratified cross-validation
SEED = 42     # single random seed shared by the CV split and all three models

# Parameter for 3 model
LightGBM_Params = {
    'random_state': SEED,
    'verbose':-1,
    'n_estimators': 200,
    'learning_rate': 0.046,
    'max_depth': 12,
    'num_leaves': 478,
    'min_data_in_leaf': 13,
    'feature_fraction': 0.893,
    'bagging_fraction': 0.784,
    'bagging_freq': 4,
    'lambda_l1': 10,
    'lambda_l2': 0.01,
    'device': 'cpu',
}


# NOTE(review): 'gpu_hist' needs a GPU runtime and is deprecated in recent XGBoost
# releases in favour of tree_method='hist' with device='cuda' — confirm against
# the installed version.
XGB_Params = {
    'learning_rate': 0.05,
    'max_depth': 6,
    'n_estimators': 200,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 1,
    'reg_lambda': 5,
    'random_state': SEED,
    'tree_method': 'gpu_hist',
}


CatBoost_Params = {
    'learning_rate': 0.05,
    'depth': 6,
    'iterations': 200,
    # Use the shared SEED (was a hard-coded 42) so changing SEED affects all models
    'random_seed': SEED,
    'verbose': 0,
    'l2_leaf_reg': 10,
    'task_type': 'GPU'
}


In [None]:
## Useful function
def quadratic_weighted_kappa(y_true, y_pred):
    """
    Calculates the quadratic weighted kappa between the true labels and predicted labels.

    Thin wrapper around `sklearn.metrics.cohen_kappa_score` with `weights='quadratic'`,
    which penalizes disagreements by the squared distance between the two labels.

    Parameters:
        y_true (array-like): The true labels.
        y_pred (array-like): The predicted labels.

    Returns:
        float: The quadratic weighted kappa score. Per the scikit-learn documentation it lies
        between -1 and 1: 1 is perfect agreement, 0 is chance-level agreement, and
        negative values mean agreement worse than chance.
    """
    return cohen_kappa_score(y_true, y_pred, weights='quadratic')


def threshold_Rounder(oof_non_rounded, thresholds):
    """
    Turns continuous predictions into the class labels 0, 1, 2 or 3 using three cut points.

    A prediction gets the label of the first cut point it is strictly below, checked
    in the order `thresholds[0]`, `thresholds[1]`, `thresholds[2]`; predictions below
    none of them (including NaN) get label 3. The cut points are not required to be sorted.

    Parameters:
        oof_non_rounded (array-like): The continuous predictions (must support `<` element-wise).
        thresholds (list or array-like): At least three cut points; only the first three are used.

    Returns:
        numpy.ndarray: An array of class labels with the same shape as the predictions.
    """
    first_cut, second_cut, third_cut = thresholds[0], thresholds[1], thresholds[2]

    below_first = oof_non_rounded < first_cut
    below_second = oof_non_rounded < second_cut
    below_third = oof_non_rounded < third_cut

    # Apply the cuts from last to first so that earlier cuts take precedence
    labels = np.where(below_third, 2, 3)
    labels = np.where(below_second, 1, labels)
    labels = np.where(below_first, 0, labels)
    return labels


def evaluate_predictions(thresholds, y_true, oof_non_rounded):
    """
    Objective function for threshold tuning: negative QWK of the thresholded predictions.

    The continuous predictions are converted to class labels with `threshold_Rounder`,
    scored against the true labels with `quadratic_weighted_kappa`, and the score is
    negated so that a minimizer maximizes the kappa.

    Parameters:
        thresholds (list or array-like): The cut points passed to `threshold_Rounder`.
        y_true (array-like): The true labels.
        oof_non_rounded (array-like): The continuous predictions to be thresholded.

    Returns:
        float: The negated quadratic weighted kappa.
    """
    discrete_labels = threshold_Rounder(oof_non_rounded, thresholds)
    kappa = quadratic_weighted_kappa(y_true, discrete_labels)
    return -kappa


In [None]:
## Train and get predict function
def train_predict(model, train_data, test_data):
    """
    Trains a model using Stratified K-Fold cross-validation, evaluates the performance using
    quadratic weighted kappa (QWK) score, and makes predictions on the test data.

    This function performs the following steps:
    1. Splits the training data into `N_SPLITS` stratified folds and fits a fresh clone of
       `model` on each fold.
    2. Scores each fold on its training and validation part with QWK on predictions rounded
       to the nearest integer, and stores the raw out-of-fold predictions.
    3. Predicts the test data with every fold model and averages the predictions.
    4. Tunes three class thresholds on the out-of-fold predictions with Nelder-Mead,
       maximizing QWK, and applies them to the averaged test predictions.

    Parameters:
        model (sklearn.base.Estimator): The unfitted regressor to clone for each fold.
        train_data (pd.DataFrame): The training data containing features and the target label 'sii'.
        test_data (pd.DataFrame): The test data to make predictions on. It must contain every
            training feature column; extra columns are ignored.

    Returns:
        pd.DataFrame: Submission with the ids of the global `sample` frame and the predicted 'sii'.
        model: The fitted clone from the last fold.
        float: The average validation QWK score across all folds (before threshold tuning).

    Raises:
        KeyError: If `test_data` lacks one of the training feature columns.
    """

    # Align train and test input data: the test frame gets exactly the training
    # feature columns in the same order, so every fold model sees the layout it
    # was fitted on.
    X = train_data.drop(columns=['sii'])
    y = train_data['sii']
    test_data = test_data[X.columns]

    # Define K-Fold cross-validation
    SKF = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

    train_his = []
    val_his = []

    oof_non_rounded = np.zeros(len(y), dtype=float)
    test_preds = np.zeros((len(test_data), N_SPLITS))

    for fold, (train_index, val_index) in enumerate(tqdm(SKF.split(X, y), desc="Train progress", total = N_SPLITS)):
        # Determine the data for the fold
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        # Train the model
        model_ = clone(model)  # Clone the model to ensure independence at each fold
        model_.fit(X_train, y_train)

        # Raw (continuous) predictions for both parts of the fold
        y_train_pred = model_.predict(X_train)
        y_val_pred = model_.predict(X_val)

        # Keep the raw validation predictions for the threshold tuning below
        oof_non_rounded[val_index] = y_val_pred
        y_train_pred_rounded = y_train_pred.round(0).astype(int)
        y_val_pred_rounded = y_val_pred.round(0).astype(int)

        # Evaluate the model performance
        train_kappa = quadratic_weighted_kappa(y_train, y_train_pred_rounded)
        val_kappa = quadratic_weighted_kappa(y_val, y_val_pred_rounded)

        train_his.append(train_kappa)
        val_his.append(val_kappa)

        test_preds[:, fold] = model_.predict(test_data)

        print(f"Fold {fold+1} - Train QWK: {train_kappa:.4f}, Validation QWK: {val_kappa:.4f}")

    print(f"Mean Train QWK --> {np.mean(train_his):.4f}")
    print(f"Mean Validation QWK ---> {np.mean(val_his):.4f}")

    # Tune the three class boundaries on the out-of-fold predictions,
    # starting from plain rounding (0.5 / 1.5 / 2.5)
    KappaOPtimizer = minimize(evaluate_predictions,
                              x0=[0.5, 1.5, 2.5], args=(y, oof_non_rounded),
                              method='Nelder-Mead')
    if not KappaOPtimizer.success:
        # Best effort: the last iterate is still used, but the user should know
        print(f"Warning: threshold optimisation did not converge: {KappaOPtimizer.message}")

    oof_tuned = threshold_Rounder(oof_non_rounded, KappaOPtimizer.x)
    tKappa = quadratic_weighted_kappa(y, oof_tuned)

    print(f"----> || Optimized QWK SCORE :: {Fore.CYAN}{Style.BRIGHT} {tKappa:.3f}{Style.RESET_ALL}")

    # Average the fold models' test predictions, then apply the tuned thresholds
    tpm = test_preds.mean(axis=1)
    tpTuned = threshold_Rounder(tpm, KappaOPtimizer.x)

    # NOTE(review): ids come from the global `sample` frame, which assumes its rows
    # are in the same order (and number) as `test_data` — confirm.
    submission = pd.DataFrame({
        'id': sample['id'],
        'sii': tpTuned
    })

    return submission, model_, np.mean(val_his)


## 4.1 XGboost  + LightGBM + CatBoost
This section initializes the models.

### Model

In [None]:
# Create model instances (unfitted regressors built from the parameter dicts above)
LightGBM_Model = LGBMRegressor(**LightGBM_Params)
XGBoost_Model = XGBRegressor(**XGB_Params)
CatBoost_Model = CatBoostRegressor(**CatBoost_Params)


## Compute the weight of each model for voting

In [None]:
def evaluate_model(model, X_test, y_test):
    """
    Scores a fitted regressor on held-out data.

    Parameters:
        model: A fitted estimator exposing `predict`.
        X_test (array-like): The features of the held-out samples.
        y_test (array-like): The true target values of the held-out samples.

    Returns:
        tuple: `(rmse, mae, r2)` — root mean squared error, mean absolute error and
        R² score of `model.predict(X_test)` against `y_test`.
    """
    # Imported here so the function does not depend on a later notebook cell
    # having been executed: the file-level imports only provide mean_squared_error.
    from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

    # Predicting
    y_pred = model.predict(X_test)

    # Calc score
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    return rmse, mae, r2


In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# Declare models (name -> unfitted regressor) to compare on a single hold-out split
models = {
    "LightGBM_Model": LightGBM_Model,
    "XGBoost_Model": XGBoost_Model,
    "CatBoost_Model": CatBoost_Model,
}

# Features and label of the prepared training frame
X = train_combine.drop(columns=['sii'])
y = train_combine['sii']

# 80 % / 20 % random split (not stratified)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Saving result array: one (name, rmse, mae, r2) tuple per model
results = []

# Training and evaluating models
# NOTE: this fits the model objects themselves (no clone), so they stay fitted afterwards
for name, model in models.items():
    # training model
    model.fit(X_train, y_train)

    # evaluating
    rmse, mae, r2 = evaluate_model(model, X_test, y_test)

    # Saving results
    results.append((name, rmse, mae, r2))

    # Presenting current model's result
    print(f"Model: {name}")
    print(f"  RMSE: {rmse:.4f}")
    print(f"  MAE: {mae:.4f}")
    print(f"  R2: {r2:.4f}")
    print("-" * 30)


In [None]:
# Tabulate the hold-out scores, best (lowest) RMSE first
results_df = pd.DataFrame(results, columns=["Model", "RMSE", "MAE", "R2"]).sort_values(by="RMSE")

print(results_df)

In [None]:
# Combine models using Voting Regressor
# NOTE(review): the weights are hard-coded and are not computed from `results_df`
# in this cell — presumably chosen by hand from the scores above; confirm.
voting_model = VotingRegressor(estimators=[
    ('lightgbm', LightGBM_Model),
    ('xgboost', XGBoost_Model),
    ('catboost', CatBoost_Model),
], weights=[4.0,4.0,5.0])

## Submission


In [None]:
# Cross-validate the voting ensemble and predict the test set.
# Returns the submission frame, the last fold's fitted model and the mean validation QWK.
Submission1, model, val = train_predict(voting_model, train_combine, test_combine)

In [None]:
# Distribution of the predicted classes
Submission1['sii'].value_counts()

In [None]:
# Write the submission file (without the row index)
Submission1.to_csv('submission.csv', index=False)

In [None]:
# Display the submission table
Submission1