In [1]:
import numpy as np
import pandas as pd
import os
import re
import copy
import pickle
from sklearn.base import clone
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import StratifiedKFold, KFold
from scipy.optimize import minimize
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import polars as pl
import polars.selectors as cs
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator, FormatStrFormatter, PercentFormatter
import seaborn as sns

from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from keras.models import Model
from keras.layers import Input, Dense
from keras.optimizers import Adam
import torch
import torch.nn as nn
import torch.optim as optim

from colorama import Fore, Style
from IPython.display import clear_output
import warnings
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import VotingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline
import shap
import plotly.express as px

warnings.filterwarnings('ignore')
pd.options.display.max_columns = None
SEED = 42
n_splits = 5

In [2]:
import datetime
import gc
import os
import sys
from glob import glob
import matplotlib.pyplot as plt
from pathlib import Path

from glob import glob
import numpy as np
import pandas as pd
import polars as pl
import torch
import yaml
from tqdm import tqdm

# TRAIN_OR_TEST = "train"

# paths = glob(
#     f"/kaggle/input/child-mind-institute-problematic-internet-use/series_{TRAIN_OR_TEST}.parquet/id=*/part-0.parquet"
# )
# print(len(paths))

In [3]:
import random
import torch
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True
seed_everything(100)

# Define function

## Feature engineer for sub 1,2,3,4,5

In [4]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest, f_regression
def feature_engineering_v2(df, selector=None, imputer=None, fit=True):
    df = df.loc[:, ~df.columns.duplicated()]
    if fit: 
        y = df['sii']

    numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
    season_cols = [col for col in df.columns if 'Season' in col]
    pciat_cols = [col for col in df.columns if 'PCIAT' in col and 'Season' not in col]
    remaining_numeric_cols = [col for col in numeric_cols if col not in pciat_cols and col not in ['sii']]
    X = df[remaining_numeric_cols]
    if np.any(np.isinf(X)):
        X = X.replace([np.inf, -np.inf], np.nan)
    if fit: 
        imputer = KNNImputer()
        imputed_data = imputer.fit_transform(X)
        train_imputed = pd.DataFrame(imputed_data, columns=remaining_numeric_cols)
        X = train_imputed
    else:
        X = imputer.transform(X)

    if fit:
        # estimator = RandomForestRegressor(random_state=42)
        # selector = RFECV(estimator, min_features_to_select=5, step=3, cv=5)
        selector = SelectKBest(score_func=f_regression, k=30)
        X_new = selector.fit_transform(X, y)
        selected_features = X.columns[selector.get_support()]
    else: 
        X_new = selector.transform(X)
        selected_features = [col for col, selected in zip(remaining_numeric_cols, selector.get_support()) if selected]
    df_selected = pd.DataFrame(X_new, columns=selected_features)
    return df_selected, selector, imputer


## AutoEncoder for Sub 1,2,3,4,5

In [5]:
class AutoEncoder(nn.Module):
    def __init__(self, input_dim, encoding_dim):
        super(AutoEncoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, encoding_dim*3),
            nn.ReLU(),
            nn.Linear(encoding_dim*3, encoding_dim*2),
            nn.ReLU(),
            nn.Linear(encoding_dim*2, encoding_dim),
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(encoding_dim, encoding_dim*2),
            nn.ReLU(),
            nn.Linear(encoding_dim*2, encoding_dim*3),
            nn.ReLU(),
            nn.Linear(encoding_dim*3, input_dim),
            nn.Sigmoid()
        )

        
    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded


def perform_autoencoder(df, encoding_dim=50, epochs=50, batch_size=32):
    scaler = StandardScaler()
    df_scaled = scaler.fit_transform(df)
    
    data_tensor = torch.FloatTensor(df_scaled)
    
    input_dim = data_tensor.shape[1]
    autoencoder = AutoEncoder(input_dim, encoding_dim)
    
    criterion = nn.MSELoss()
    optimizer = optim.Adam(autoencoder.parameters())
    
    for epoch in range(epochs):
        for i in range(0, len(data_tensor), batch_size):
            batch = data_tensor[i : i + batch_size]
            optimizer.zero_grad()
            reconstructed = autoencoder(batch)
            loss = criterion(reconstructed, batch)
            loss.backward()
            optimizer.step()
            
        if (epoch + 1) % 50 == 0:
            print(f'Epoch [{epoch + 1}/{epochs}], Loss: {loss.item():.4f}]')    
    return autoencoder, scaler

def encode_data(autoencoder, scaler, df):
    df_scaled = scaler.transform(df)
    data_tensor = torch.FloatTensor(df_scaled)
    with torch.no_grad():
        encoded_data = autoencoder.encoder(data_tensor).numpy()

    df_encoded = pd.DataFrame(encoded_data, columns=[f'Enc_{i + 1}' for i in range(encoded_data.shape[1])])
    return df_encoded

## TrainML for Sub 2,3,4,5

In [6]:
def process_file(filename, dirname):
    df = pd.read_parquet(os.path.join(dirname, filename, 'part-0.parquet'))
    df.drop('step', axis=1, inplace=True)
    return df.describe().values.reshape(-1), filename.split('=')[1]

def load_time_series(dirname) -> pd.DataFrame:
    ids = os.listdir(dirname)
    with ThreadPoolExecutor() as executor:
        results = list(tqdm(executor.map(lambda fname: process_file(fname, dirname), ids), total=len(ids)))
    stats, indexes = zip(*results)
    df = pd.DataFrame(stats, columns=[f"stat_{i}" for i in range(len(stats[0]))])
    df['id'] = indexes
    return df

def update(df):
    global cat_c
    for c in cat_c: 
        df[c] = df[c].fillna('Missing')
        df[c] = df[c].astype('category')
    return df

def create_mapping(column, dataset):
    unique_values = dataset[column].unique()
    return {value: idx for idx, value in enumerate(unique_values)}

def quadratic_weighted_kappa(y_true, y_pred):
    return cohen_kappa_score(y_true, y_pred, weights='quadratic')

def threshold_Rounder(oof_non_rounded, thresholds):
    return np.where(oof_non_rounded < thresholds[0], 0,
                    np.where(oof_non_rounded < thresholds[1], 1,
                             np.where(oof_non_rounded < thresholds[2], 2, 3)))

def evaluate_predictions(thresholds, y_true, oof_non_rounded):
    rounded_p = threshold_Rounder(oof_non_rounded, thresholds)
    return -quadratic_weighted_kappa(y_true, rounded_p)
    
def TrainML(model_class, X, y, test_data):
    SKF = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    
    train_S = []
    test_S = []
    
    oof_non_rounded = np.zeros(len(y), dtype=float) 
    oof_rounded = np.zeros(len(y), dtype=int) 
    test_preds = np.zeros((len(test_data), n_splits))

    for fold, (train_idx, test_idx) in enumerate(tqdm(SKF.split(X, y), desc="Training Folds", total=n_splits)):
        X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]
        model = clone(model_class)
        model.fit(X_train, y_train)

        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_val)

        oof_non_rounded[test_idx] = y_val_pred
        y_val_pred_rounded = y_val_pred.round(0).astype(int)
        oof_rounded[test_idx] = y_val_pred_rounded

        train_kappa = quadratic_weighted_kappa(y_train, y_train_pred.round(0).astype(int))
        val_kappa = quadratic_weighted_kappa(y_val, y_val_pred_rounded)

        train_S.append(train_kappa)
        test_S.append(val_kappa)
        
        test_preds[:, fold] = model.predict(test_data)
        
        print(f"Fold {fold+1} - Train QWK: {train_kappa:.4f}, Validation QWK: {val_kappa:.4f}")
        clear_output(wait=True)
    print(f"Mean Train QWK --> {np.mean(train_S):.4f}")
    print(f"Mean Validation QWK ---> {np.mean(test_S):.4f}")

    KappaOPtimizer = minimize(evaluate_predictions,
                              x0=[0.5, 1.5, 2.5], args=(y, oof_non_rounded), 
                              method='Nelder-Mead')
    assert KappaOPtimizer.success, "Optimization did not converge."
    print('OPTIMIZED THRESHOLDS', KappaOPtimizer.x)
    oof_tuned = threshold_Rounder(oof_non_rounded, KappaOPtimizer.x)
    tKappa = quadratic_weighted_kappa(y, oof_tuned)

    print(f"----> || Optimized QWK SCORE :: {Fore.CYAN}{Style.BRIGHT} {tKappa:.3f}{Style.RESET_ALL}")

    tpm = test_preds.mean(axis=1)
    tpTuned = threshold_Rounder(tpm, KappaOPtimizer.x)
    
    submission = pd.DataFrame({
        'id': sample['id'],
        'sii': tpTuned
    })
    optimized_thresholds = KappaOPtimizer.x
    return submission, oof_tuned, oof_non_rounded, y, optimized_thresholds

In [7]:
def TrainML_Sub1(model_class, X, y, test_data):
    SKF = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    
    train_S = []
    test_S = []
    
    oof_non_rounded = np.zeros(len(y), dtype=float) 
    oof_rounded = np.zeros(len(y), dtype=int) 
    test_preds = np.zeros((len(test_data), n_splits))

    for fold, (train_idx, test_idx) in enumerate(tqdm(SKF.split(X, y), desc="Training Folds", total=n_splits)):
        X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]
        
        # Feature engineering for training and validation
        X_train, selector_tr, imputer_tr = feature_engineering_v2(X_train, fit=True)
        X_val, _, _ = feature_engineering_v2(X_val, selector_tr, imputer_tr, fit=False)
        # Train the model
        model = clone(model_class)
        model.fit(X_train, y_train)

        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_val)

        oof_non_rounded[test_idx] = y_val_pred
        y_val_pred_rounded = y_val_pred.round(0).astype(int)
        oof_rounded[test_idx] = y_val_pred_rounded

        train_kappa = quadratic_weighted_kappa(y_train, y_train_pred.round(0).astype(int))
        val_kappa = quadratic_weighted_kappa(y_val, y_val_pred_rounded)

        train_S.append(train_kappa)
        test_S.append(val_kappa)
        
        # Feature engineering for test data
        test_data_fe, _, _ = feature_engineering_v2(test_data, selector_tr, imputer_tr, fit=False)
        test_preds[:, fold] = model.predict(test_data_fe)
        
        print(f"Fold {fold+1} - Train QWK: {train_kappa:.4f}, Validation QWK: {val_kappa:.4f}")
        clear_output(wait=True)
    
    print(f"Mean Train QWK --> {np.mean(train_S):.4f}")
    print(f"Mean Validation QWK ---> {np.mean(test_S):.4f}")

    KappaOptimizer = minimize(evaluate_predictions,
                              x0=[0.5, 1.5, 2.5], args=(y, oof_non_rounded), 
                              method='Nelder-Mead')
    assert KappaOptimizer.success, "Optimization did not converge."
    print('OPTIMIZED THRESHOLDS', KappaOptimizer.x)
    
    oof_tuned = threshold_Rounder(oof_non_rounded, KappaOptimizer.x)
    tKappa = quadratic_weighted_kappa(y, oof_tuned)

    print(f"----> || Optimized QWK SCORE :: {Fore.CYAN}{Style.BRIGHT} {tKappa:.3f}{Style.RESET_ALL}")

    tpm = test_preds.mean(axis=1)
    tpTuned = threshold_Rounder(tpm, KappaOptimizer.x)
    
    submission = pd.DataFrame({
        'id': sample['id'],
        'sii': tpTuned
    })
    optimized_thresholds = KappaOptimizer.x
    return (submission, tKappa, oof_tuned, oof_non_rounded, y, optimized_thresholds)

# Define features

## Normal features

In [8]:
train = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/train.csv')
test = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/test.csv')
sample = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/sample_submission.csv')

total_features = list(test.columns)
total_features.remove('id')

cat_c = ['Basic_Demos-Enroll_Season', 'CGAS-Season', 'Physical-Season', 
          'Fitness_Endurance-Season', 'FGC-Season', 'BIA-Season', 
          'PAQ_A-Season', 'PAQ_C-Season', 'SDS-Season', 'PreInt_EduHx-Season']

In [9]:
noseason_features = ['Basic_Demos-Age', 'Basic_Demos-Sex',
                'CGAS-CGAS_Score', 'Physical-BMI',
                'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
                'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
                'Fitness_Endurance-Max_Stage',
                'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec',
                'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND',
                'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU',
                'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR',
                'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone',
                'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
                'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
                'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
                'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM',
                'BIA-BIA_TBW', 'PAQ_A-PAQ_A_Total',
                'PAQ_C-PAQ_C_Total', 'SDS-SDS_Total_Raw',
                'SDS-SDS_Total_T',
                'PreInt_EduHx-computerinternet_hoursday', 'BMI_Age','Internet_Hours_Age','BMI_Internet_Hours',
                'BFP_BMI', 'FFMI_BFP', 'FMI_BFP', 'LST_TBW', 'BFP_BMR', 'BFP_DEE', 'BMR_Weight', 'DEE_Weight',
                'SMM_Height', 'Muscle_to_Fat', 'Hydration_Status', 'ICW_TBW','BMI_PHR']
print(len(noseason_features))

64


## Loading timeseries

In [10]:
train_ts = load_time_series("/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet")
test_ts = load_time_series("/kaggle/input/child-mind-institute-problematic-internet-use/series_test.parquet")

100%|██████████| 996/996 [01:07<00:00, 14.68it/s]
100%|██████████| 2/2 [00:00<00:00, 12.14it/s]


In [11]:
df_train = train_ts.drop('id', axis=1)
df_test = test_ts.drop('id', axis=1)
autoencoder, scaler = perform_autoencoder(df_train, encoding_dim=60, epochs=100, batch_size=32)

Epoch [50/100], Loss: 1.4613]
Epoch [100/100], Loss: 1.4550]


In [12]:
train_ts_encoded = encode_data(autoencoder, scaler, df_train)
test_ts_encoded = encode_data(autoencoder, scaler, df_test)
test_ts_encoded.reset_index(inplace=True, drop=True)

In [13]:
train_ts_encoded["id"]=train_ts["id"]
test_ts_encoded['id']=test_ts["id"]

## Features timeseries

In [14]:
time_series_cols = train_ts.columns.tolist()
time_series_cols.remove("id")
time_encoded_cols = train_ts_encoded.columns.tolist()
time_encoded_cols.remove("id")

# Submission 1

In [15]:
train_sub1 = pd.merge(train, train_ts_encoded, how="left", on='id')
test_sub1 = pd.merge(test, test_ts_encoded, how="left", on='id')
train_sub1 = train_sub1.dropna(subset='sii')

In [16]:
X_sub1 = train_sub1
y_sub1 = train_sub1['sii']

In [17]:
CatBoost_Best_Params = {
    'learning_rate': 0.0021172579310639343,
    'depth': 6,
    'iterations': 130,
    'random_seed': SEED,
    'verbose': 0,
    'l2_leaf_reg': 0.32557701990001503,
}

XGB_Best_Params = {
    'n_estimators': 700,
    'max_depth': 4,
    'learning_rate': 0.03325152156380898,
    'subsample': 0.25295047248406266,
    'colsample_bytree': 0.9760859719849787,
    'gamma': 0.20085951790463402,
    'min_child_weight': 11,
    'eval_metric': 'rmse',
    'objective': 'reg:squarederror',
    # 'tree_method': 'gpu_hist',
    # 'predictor': 'gpu_predictor',
    # 'gpu_id': 0
}

LightGBM_Best_Params = {
    'max_depth': 3,
    'min_data_in_leaf': 40,
    'num_leaves': 190,
    'learning_rate': 0.05107368421432176,
    'feature_fraction': 0.9918350138636185,
    'bagging_fraction': 0.9331400899763774,
    'bagging_freq': 1,
    'lambda_l1': 9.49641646280519,
    'lambda_l2': 2.446305429623661,
    'min_gain_to_split': 0.05262124930522051,
    # 'device_type': 'gpu',
    # 'gpu_device_id': 0,
    'verbosity': -1
}

svm

catboost_model = CatBoostRegressor(**CatBoost_Best_Params)
xgb_model = XGBRegressor(**XGB_Best_Params)
lightgbm_model = LGBMRegressor(**LightGBM_Best_Params)
# tabnet_model = TabNetWrapper(
#     n_d=64, n_a=64, n_steps=5, gamma=1.5, n_independent=2, n_shared=2, 
#     lambda_sparse=1e-4, optimizer_fn=torch.optim.Adam,
#     optimizer_params=dict(lr=2e-2, weight_decay=1e-5), mask_type='entmax',
#     scheduler_params=dict(mode="min", patience=10, min_lr=1e-5, factor=0.5),
#     scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau, verbose=1,
#     device_name='cuda' if torch.cuda.is_available() else 'cpu'
# )

final_voting_model = VotingRegressor(estimators=[
    ('lightgbm', lightgbm_model),
    ('xgboost', xgb_model),
    ('catboost', catboost_model),
    # ('tabnet', tabnet_model)
], weights=[4.0, 4.0, 4.0])

X = train.drop(['sii'], axis=1)
y = train['sii']

In [18]:
submission1, val_score_sub1, _, _, _, _ = TrainML_Sub1(catboost_, X_sub1, y_sub1, test_sub1)

print("Val score sub1 with best parameters:", val_score_sub1)

Training Folds: 100%|██████████| 5/5 [00:12<00:00,  2.53s/it]

Mean Train QWK --> 0.5441
Mean Validation QWK ---> 0.3645





OPTIMIZED THRESHOLDS [0.5785469  0.88500199 2.83677574]
----> || Optimized QWK SCORE :: [36m[1m 0.462[0m
Val score sub1 with best parameters: 0.46241743563848414


# Submision 2: Sleep Detection Pipeline

First, prepare the features used in my sleep detection model. Please refer to the implementation by [@tatamikenn](https://www.kaggle.com/tatamikenn) [here](https://www.kaggle.com/code/tatamikenn/sleep-hdcza-a-pure-heuristic-approach-lb-0-447).

This pipeline processes accelerometer data for sleep detection, utilizing time-series datasets. It generates features to identify sleep episodes, static periods, and motion patterns, inspired by @tatamikenn's implementation.

### transform Function
The transform function processes input data to generate features for analysis. It breaks down the timestamp into components like year, month, day, hour, and weekday. It also groups data by night, adjusting the timestamp if necessary, and creates a unique night_group identifier for each night. Additionally, a cumulative step count (norm_step) is computed for each group to facilitate sequential analysis.

### transform_series Function
enhances the transform function by adding a new feature: detecting clipped ENMO values. It flags instances where the enmo (motion metric) is zero, marking potential data quality issues.

### transform_events Function
processes event data by adding a night column and pivoting the data. The events are rearranged by series_id, group_id, and night to simplify time-series analysis.

### add_feature Function
This function generates advanced features for sleep detection, including:

Difference Features: Computes the differences in anglez (angular motion) and enmo (motion magnitude).

Rolling Median: Calculates rolling medians of anglez_diff and enmo_diff over a 5-minute window.

Critical Threshold: Determines static periods by evaluating anglez_diff variability over a day.

Static and Sleep Blocks: Flags periods with minimal motion (is_static) and identifies sleep blocks over 30-minute windows.

Sleep Episodes: Detects continuous sleep episodes, identifies the longest one, and flags interruptions in sleep.

### create_heuristic Function

The main function processes raw data files by converting timestamps and applying transformations. It calls the transform_series function to prepare the data and the add_feature function to generate sleep-related features. Finally, it saves the processed data into .parquet files for further analysis.

In [19]:
MAX_FILE = 2000

In [20]:
def transform(df, night_offset=20):
    return (
        df.with_columns(
            [
                (pl.col("timestamp").dt.year() - 2000).cast(pl.Int8).alias("year"),
                pl.col("timestamp").dt.month().cast(pl.Int8).alias("month"),
                pl.col("timestamp").dt.day().cast(pl.Int8).alias("day"),
                pl.col("timestamp").dt.hour().cast(pl.Int8).alias("hour"),
                pl.col("timestamp").dt.minute().cast(pl.Int8).alias("minute"),
                pl.col("timestamp").dt.second().cast(pl.Int8).alias("second"),
                pl.col("timestamp").dt.weekday().cast(pl.Int8).alias("weekday"),
            ]
        )
        .with_columns( 
            pl.when(pl.col("hour") < night_offset)
            .then(pl.col("timestamp"))
            .otherwise(pl.col("timestamp") + pl.duration(days=1))
            .dt.date()
            .alias("night_group"),
        )
        .with_columns(
            [
                (
                    pl.col("series_id") + pl.lit("_") + pl.col("night_group").cast(pl.Datetime).dt.strftime("%Y%m%d")
                ).alias("group_id"),
            ]
        )
        .with_columns(
            [
                pl.col("timestamp").cum_count().over("group_id").alias("norm_step"),
            ]
        )
        .drop(["night_group"])
    )


def transform_series(df):
    return transform(df).with_columns(
        [
            (pl.col("enmo") == 0).alias("is_enmo_clipped"),
        ]
    )


def transform_events(df):
    return (
        transform(df)
        .with_columns(
            [
                pl.col("night").cast(pl.UInt32).alias("night"),
            ]
        )
        .pivot(["step", "timestamp", "tz_offset"], ["series_id", "group_id", "night"], "event")
    )


def add_feature(
    df,
    day_group_col="group_id",
    term1=(5 * 60) // 5,
    term2=(30 * 60) // 5,
    term3=(60 * 60) // 5,
    min_threshold=0.005,
    max_threshold=0.04,
    center=True,
):
    return (
        df.with_columns(
            [
                pl.col("anglez").diff(1).abs().alias("anglez_diff"),
                pl.col("enmo").diff(1).abs().alias("enmo_diff"),
            ]
        )
        .with_columns(
            [
                pl.col("anglez_diff")
                .rolling_median(term1, center=center)  # 5 min window
                .alias("anglez_diff_median_5min"),
                pl.col("enmo_diff")
                .rolling_median(term1, center=center)  # 5 min window
                .alias("enmo_diff_median_5min"),
            ]
        )
        .with_columns(
            [
                pl.col("anglez_diff_median_5min")
                .quantile(0.1)
                .clip(min_threshold, max_threshold)
                .over(day_group_col)
                .alias("critical_threshold")
            ]
        )
        .with_columns([(pl.col("anglez_diff_median_5min") < pl.col("critical_threshold") * 15).alias("is_static")])
        .with_columns(
            [
                pl.col("is_static").cast(pl.Int32).rolling_sum(term2, center=center).alias("is_static_sum_30min"),
            ]
        )
        .with_columns([(pl.col("is_static_sum_30min") == ((30 * 60) // 5)).alias("tmp")])
        .with_columns(
            [
                pl.col("tmp").shift(term2 // 2).alias("tmp_left"),
                pl.col("tmp").shift(-(term2 // 2)).alias("tmp_right"),
            ]
        )
        .with_columns(
            [
                (pl.col("tmp_left") | pl.col("tmp_right")).alias("is_sleep_block"),
            ]
        )
        .drop(["tmp", "tmp_left", "tmp_right"])
        .with_columns([pl.col("is_sleep_block").not_().alias("is_gap")])
        .with_columns([pl.col("is_gap").cast(pl.Int32).rolling_sum(term3, center=center).alias("gap_length")])
        .with_columns([(pl.col("gap_length") == term3).alias("tmp")])
        .with_columns(
            [
                pl.col("tmp").shift(term3 // 2).alias("tmp_left"),
                pl.col("tmp").shift(-(term3 // 2)).alias("tmp_right"),
            ]
        )
        .with_columns(
            [
                (pl.col("tmp_left") | pl.col("tmp_right")).alias("is_large_gap"),
            ]
        )
        .drop(["tmp", "tmp_left", "tmp_right"])
        .with_columns([pl.col("is_large_gap").not_().alias("is_sleep_episode")])
        #
        # extract longest sleep episode
        #
        .with_columns(
            [
                # extract false->true transition
                (
                    (
                        pl.col("is_sleep_episode")
                        & pl.col("is_sleep_episode").shift(1, fill_value=pl.lit(False)).not_()
                    )
                    .cum_sum()
                    .over("group_id")
                ).alias("sleep_episode_id")
            ]
        )
        .with_columns(
            [pl.col("is_sleep_episode").sum().over(["group_id", "sleep_episode_id"]).alias("sleep_episode_length")]
        )
        .with_columns([pl.col("sleep_episode_length").max().over(["group_id"]).alias("max_sleep_episode_length")])
        .with_columns(
            [
                (
                    pl.col("is_sleep_episode") & (pl.col("sleep_episode_length") == pl.col("max_sleep_episode_length"))
                ).alias("is_longest_sleep_episode")
            ]
        )
    )


use_columns = [
    "series_id",
    "step",
    "is_longest_sleep_episode",
    "is_sleep_block",
    "is_gap",
    "is_large_gap",
    "is_sleep_episode",
    "is_static",
]

def create_heuristic(paths, train_or_test):
    i = 0
    for path in tqdm(paths):
        i += 1
        if (i == MAX_FILE):
            break
        sdf = pl.read_parquet(path)
    
        # dummy timestamp
        sdf = sdf.with_columns((pl.col("time_of_day") == 0).cast(pl.Int32).cum_sum().alias("day_offset"))
        sdf = sdf.with_columns(
            (
                datetime.datetime(2020, 1, 1)
                + (pl.col("day_offset") * 86400_000_000 + pl.col("time_of_day") / 1000).cast(pl.Duration("us"))
            ).alias("timestamp")
        )
    
        sdf = sdf.with_columns(pl.lit(path.split("/")[-2]).alias("series_id"))
        sdf = sdf.sort("step")
        sdf = transform_series(sdf)
        sdf = add_feature(sdf)
        sdf = sdf[use_columns].fill_null(False)
    
        sidf = path.split("/")[-2]
        save_path = f"/kaggle/working/heuristic_features/{train_or_test}/{sidf}.parquet"
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        sdf.write_parquet(save_path)

### 1. Description
The detection function is designed to detect sleep patterns from accelerometer data using a Transformer-GRU model. It processes the input data by extracting key features such as motion metrics, static periods, and sleep episodes, then applies a pre-trained ensemble model to generate predictions for sleep onset and wakeup times.

### 2. Input Handling and Setup
The function begins by loading paths to the input data (series_train.parquet) and reading model configurations from a config.yaml file. It initializes the Transformer-GRU model (ZzzTransformerGRUModel) and loads pre-trained weights to enable ensemble inference.

### 3. Feature Engineering
Several key features are extracted from the input data. Motion metrics such as anglez and enmo are calculated along with their differences (anglez_diff, enmo_diff). Static periods are identified by features like same_count (repeated motion) and large_diff_count (sudden changes). The longest sleep episodes and blocks are flagged from the heuristic features, providing insight into sleep patterns.

### 4. Splitting Data into Blocks
To facilitate efficient processing, the time-series data is divided into smaller blocks based on the configured BLOCK_SIZE. This approach ensures proper alignment with the model's patch size for input during inference.

### 5. Model Inference
The processed data is converted into a patch-based dataset (ZzzPatchDataset), which is fed into the Transformer-GRU model for inference. The ensemble model generates predictions for sleep onset and wakeup times, which are stored for further analysis.

### 6. Aggregating and Saving Predictions
Once predictions are made, they are grouped by series_id and step. These grouped predictions are aggregated to produce final outputs. The results are saved as .parquet files, organized by the train_or_test mode, ensuring easy retrieval and further analysis.

In [21]:
if True:
    sys.path.append("/kaggle/input/cmi-2023-src")
    from consts import ANGLEZ_MEAN, ANGLEZ_STD, ENMO_MEAN, ENMO_STD
    from torch_models.dataset import ZzzPatchDataset
    from torch_models.models import ZzzConv1dGRUModel, ZzzTransformerGRUModel, ZzzWaveGRUModel

    from utils.feature_contena import Features
    from utils.lightning_utils import MyLightningDataModule, MyLightningModule
    from utils.set_seed import seed_base_torch
    from utils.torch_template import EnsembleModel

In [22]:
def detection(paths=f"/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet/id=*/part-0.parquet", train_or_test="train"):
    MODEL_NAME = "patch_transformer_gru"
    
    PACKAGE_DIR = Path("/kaggle/input/cmi-2023-src")
    CFG = yaml.safe_load(open(PACKAGE_DIR / "config.yaml", "r"))
    BLOCK_SIZE = CFG[MODEL_NAME]["execution"]["block_size"]
    
    CFG["output_dir"] = f"/kaggle/input/cmi-2023-output/{CFG[MODEL_NAME]['execution']['best_exp_id']}"
    
    seed_base_torch(CFG["env"]["seed"])
    
    DEVICE = "cuda"
    
    files = glob(
        paths
    )
    
    features = Features()
    features.add_num_features(["anglez", "enmo"])
    features.add_num_features(["anglez_diff", "enmo_diff"])
    features.add_num_features(["same_count"])
    features.add_num_features(["large_diff_count"])
    features.add_num_features(["same_count_shift_plus", "same_count_shift_minus"])
    features.add_num_features(["is_longest_sleep_episode", "is_sleep_block"])
    
    # transformer + gru
    model = ZzzTransformerGRUModel(
        max_len=BLOCK_SIZE // CFG[MODEL_NAME]["execution"]["patch_size"],
        input_numerical_size=len(features.all_features()) * CFG[MODEL_NAME]["execution"]["patch_size"],
        **CFG[MODEL_NAME]["params"],
    )
    trn_models = [
        MyLightningModule.load_from_checkpoint(
            os.path.join("/kaggle/input/cmi-2023-output/exp_160", f"logs/best_model_fold{fold}.ckpt"),
            model=model,
            map_location=torch.device(DEVICE),
        ).to(DEVICE)
        for fold in range(5 if len(files) > 100 else 1)
    ]
    
    models = trn_models
    model = EnsembleModel(models).to(DEVICE)
    model.eval()
    
    all_oof_dfs = []
    i = 0
    for file in tqdm(files):
        # load file
        i += 1
        if (i == MAX_FILE):
            break
        df = pd.read_parquet(file)
        if len(df) < BLOCK_SIZE:
            continue
        time_of_days = df["time_of_day"].values
    
        # same_count
        DAY_STEPS = 12 * 60 * 24
        n_days = int(len(df) // DAY_STEPS) + 1
        df["same_count"] = 0
        for day in range(-n_days, n_days + 1):
            if day == 0:
                continue
            df["_anglez_diff"] = df["anglez"].diff(DAY_STEPS * day)
            df["_anglez_diff"] = df["_anglez_diff"].fillna(1)
            df["same_count"] += (df["_anglez_diff"] == 0).astype(int)
        df["same_count"] = (df["same_count"].clip(0, 5) - 2.5) / 2.5
    
        SHIFT_STEPS = 12 * 60 * 6  # 6h
        df["same_count_shift_plus"] = df["same_count"].shift(SHIFT_STEPS).fillna(1.0).astype(np.float16)
        df["same_count_shift_minus"] = df["same_count"].shift(-SHIFT_STEPS).fillna(1.0).astype(np.float16)
    
        # features
        df["anglez_diffabs"] = df["anglez"].diff().abs().fillna(0)
        df["large_diff"] = (df["anglez_diffabs"] > 5).astype(int)
        df["large_diff_count"] = df["large_diff"].rolling(10, center=True).mean().fillna(0)
        df["large_diff_count"] = (df["large_diff_count"] - 0.5) * 2
    
        # normalize
        df["anglez"] = (df["anglez"] - ANGLEZ_MEAN) / ANGLEZ_STD
        df["enmo"] = (df["enmo"] - ENMO_MEAN) / ENMO_STD
        df["anglez_diff"] = df["anglez"].diff().fillna(0)
        df["enmo_diff"] = df["enmo"].diff().fillna(0)
    
        # heuristic_features by @bilzard
        sid = file.split("/")[-2]
        df["series_id"] = sid
        path = f"/kaggle/working/heuristic_features/{train_or_test}/{sid}.parquet"
        hdf = pd.read_parquet(path)
        df = pd.concat([df, hdf.drop(columns=["series_id", "step"])], axis=1)
        df[["is_longest_sleep_episode", "is_sleep_block"]] = df[["is_longest_sleep_episode", "is_sleep_block"]] * 2 - 1
    
        # split
        dfs = []
        df = df.sort_values("step").reset_index(drop=True)
        for start in range(0, len(df), BLOCK_SIZE // 8):
            end = start + BLOCK_SIZE
            if end > len(df):
                end = len(df) - len(df) % CFG[MODEL_NAME]["execution"]["patch_size"]
                start = end - BLOCK_SIZE
                assert start >= 0
            assert df.iloc[start]["step"] % CFG[MODEL_NAME]["execution"]["patch_size"] == 0
            dfs.append(df.iloc[start:end])
        gc.collect()
    
        # inference
        train_dataset = ZzzPatchDataset(
            dfs, mode="test", features=features, patch_size=CFG[MODEL_NAME]["execution"]["patch_size"]
        )
        valid_dataset = ZzzPatchDataset(
            dfs, mode="test", features=features, patch_size=CFG[MODEL_NAME]["execution"]["patch_size"]
        )
        data_module = MyLightningDataModule(train_dataset, valid_dataset, batch_size=64)
        preds = []
        with torch.no_grad():
            for X in data_module.val_dataloader():
                pred = torch.sigmoid(model(X.to("cuda"))).detach().cpu().numpy() * 10
                preds.append(pred)
    
        oof_dfs = []
        for pred, df in zip(np.vstack(preds), dfs):
            df = df.iloc[
                CFG[MODEL_NAME]["execution"]["patch_size"] // 2 : len(df) : CFG[MODEL_NAME]["execution"]["patch_size"]
            ].reset_index(drop=True)
            df[["wakeup_oof", "onset_oof"]] = pred
            oof_dfs.append(df[["series_id", "step", "wakeup_oof", "onset_oof"]])
    
        oof_df = pd.concat(oof_dfs)
        oof_df = oof_df.groupby(["series_id", "step"]).mean().reset_index().sort_values(["series_id", "step"])
        oof_df = oof_df[["series_id", "step", "wakeup_oof", "onset_oof"]]
        oof_df["step"] = oof_df["step"].astype(int)
    
        del preds, oof_dfs
        gc.collect()
    
        train = oof_df.reset_index(drop=True)
        train["time_of_day"] = time_of_days[
            CFG[MODEL_NAME]["execution"]["patch_size"] // 2 :: CFG[MODEL_NAME]["execution"]["patch_size"]
        ][: len(train)]
        all_oof_dfs.append(train[["series_id", "step", "wakeup_oof", "onset_oof", "time_of_day"]])
        # del dfs, df
        gc.collect()

    # save
    for df in tqdm(all_oof_dfs):
        save_path = f"/kaggle/working/features/sleep_detection/{train_or_test}/{df['series_id'].iloc[0]}.parquet"
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        df.to_parquet(save_path, index=False)

## Feature Engineering

This function processes sleep detection data by extracting and aggregating meaningful features from sensor and time-series datasets. It reads multiple parquet files containing step and sensor data, filters for valid time intervals, and interpolates missing data points. The function identifies sleep onset and wakeup periods, calculates related metrics (e.g., sleep duration, activity levels, and light intensity), and stores these in a structured (csv in working space)

In [24]:
time_of_day_max = 86400000000000
# all_files = sorted(glob("/kaggle/working/features/sleep_detection/*.parquet"))
# len(all_files)

In [25]:
def feature_engineering(paths="/kaggle/working/features/sleep_detection/train/*.parquet", data_paths="/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet", train_or_test="train"):
    features = []
    debug_count = 0
    all_files = sorted(glob(paths))
    i = 0
    for file in tqdm(all_files):
        i += 1
        if (i == MAX_FILE):
            break
        df = pl.read_parquet(file)
        df = df.with_columns(pl.col("step").cast(pl.UInt32)).drop("time_of_day")
        sid = df["series_id"][0]
    
        sensor_df = pl.read_parquet(
            f"{data_paths}/{sid}/part-0.parquet"
        ).with_columns((pl.col("time_of_day") == 0).cum_sum().alias("day"))
    
        feature = {
            "id": sid,
            "length": df.shape[0],
            "day": sensor_df["relative_date_PCIAT"].max() - sensor_df["relative_date_PCIAT"].min(),
        }
    
        # skip if time step is not 5sec
        diffs = sensor_df["time_of_day"].diff().drop_nulls().unique()
        if set(diffs) != set([-86395000000000, 5000000000]):
            features.append(feature)
            continue
    
        sensor_df = (
            sensor_df.join(df, on="step", how="left")
            .sort("step")
            .with_columns(
                pl.col("onset_oof").interpolate(),
                pl.col("wakeup_oof").interpolate(),
            )
        )
    
        # onset = 15:00~3:00, wakeup = 3:00~15:00
        onset_start = time_of_day_max / 24 * 15  # 15:00
        onset_end = time_of_day_max / 24 * 3  # 3:00
        sensor_df = sensor_df.with_columns(
            ((pl.col("time_of_day") > onset_start) | (pl.col("time_of_day") < onset_end)).alias("onset_duration"),
        ).with_columns(
            pl.col("onset_duration").cast(pl.Int32).diff().fill_null(0).abs().cum_sum().alias("onset_wakeup_duration")
        )
    
        # get sleep period
        sleep_info = []
        for _, df in sensor_df.group_by("onset_wakeup_duration", maintain_order=True):
            is_onset = df["onset_duration"][0]
            if is_onset:
                max_idx = df["onset_oof"].arg_max()
                if max_idx is None:
                    continue
                max_score = df["onset_oof"][max_idx]
                step = df["step"][max_idx]
    
                # date
                start_time = df["time_of_day"][0] / time_of_day_max * 24
                if start_time >= 15:
                    day = df["day"][0]
                    week_day = df["weekday"][0]
                else:
                    day = df["day"][0] - 1
                    week_day = df["weekday"][0] - 1
                    if week_day == 0:
                        week_day = 7
            else:
                max_idx = df["wakeup_oof"].arg_max()
                if max_idx is None:
                    continue
                max_score = df["wakeup_oof"][max_idx]
                step = df["step"][max_idx]
    
                # date
                start_time = df["time_of_day"][0] / time_of_day_max * 24
                day = df["day"][0] - 1
                week_day = df["weekday"][0] - 1
    
            info = {
                "day": day,
                "weekday": week_day,
                "type": "onset" if is_onset else "wakeup",
                "step": step,
                "max_score": max_score,
                "time": df["time_of_day"][max_idx] / time_of_day_max * 24,
            }
            sleep_info.append(info)
        sleep_df = pl.DataFrame(sleep_info)
    
        # merge
        sleep_df = (
            sleep_df.filter(pl.col("type") == "onset")
            .drop("type")
            .rename(
                {
                    "max_score": "onset_score",
                    "step": "onset_step",
                    "time": "onset_time",
                }
            )
            .join(
                sleep_df.filter(pl.col("type") == "wakeup")
                .drop(["type", "weekday"])
                .rename(
                    {
                        "max_score": "wakeup_score",
                        "step": "wakeup_step",
                        "time": "wakeup_time",
                    }
                ),
                on="day",
            )
        ).select(
            ["day", "weekday", "onset_time", "wakeup_time", "onset_step", "wakeup_step", "onset_score", "wakeup_score"]
        )
    
        # feature engineering
        sleep_lengths = []  # wakeup - onset
        sleep_enmo_mean = []  
        sleep_enmo_std = []  
        sleep_light_mean = []
        sleep_light_std = [] 
        for i in range(len(sleep_df)):
            # sleep period
            start = sleep_df["onset_step"][i]
            end = sleep_df["wakeup_step"][i]
            if sleep_df["onset_score"][i] < 1 or sleep_df["wakeup_score"][i] < 1:
                sleep_lengths.append(np.nan)
                sleep_enmo_mean.append(np.nan)
                sleep_enmo_std.append(np.nan)
                sleep_light_mean.append(np.nan)
                sleep_light_std.append(np.nan)
                continue
    
            # sleep length
            length = end - start
            sleep_lengths.append(length * 5 / 60 / 60)  # hour
    
            # enmo
            enmo_mean = sensor_df["enmo"][start:end].mean()
            enmo_std = sensor_df["enmo"][start:end].std()
            sleep_enmo_mean.append(enmo_mean)
            sleep_enmo_std.append(enmo_std)
    
            # light
            light_mean = sensor_df["light"][start:end].mean()
            light_std = sensor_df["light"][start:end].std()
            sleep_light_mean.append(light_mean)
            sleep_light_std.append(light_std)
            
        sleep_df = sleep_df.with_columns(
            pl.DataFrame(
                {
                    "sleep_length": sleep_lengths,
                    "sleep_enmo_mean": sleep_enmo_mean,
                    "sleep_enmo_std": sleep_enmo_std,
                    "sleep_light_mean": sleep_light_mean,
                    "sleep_light_std": sleep_light_std,
                }
            )
        )
        
        # leave only high confidence periods
        sleep_df = sleep_df.filter((pl.col("wakeup_score") > 1) & (pl.col("onset_score") > 1))
        if debug_count < 3:
            display(sleep_df.head())
        debug_count += 1
            
    
        # agg
        feature.update(
            {
                "sleep_measurement_count": sleep_df.shape[0],
                "sleep_length_mean": sleep_df["sleep_length"].mean(),
                "sleep_length_std": sleep_df["sleep_length"].std(),
                "sleep_start_mean": sleep_df["onset_time"].mean(),
                "sleep_start_std": sleep_df["onset_time"].std(),
                "sleep_end_mean": sleep_df["wakeup_time"].mean(),
                "sleep_end_std": sleep_df["wakeup_time"].std(),
                "sleep_enmo_mean_mean": sleep_df["sleep_enmo_mean"].mean(),
                "sleep_enmo_mean_std": sleep_df["sleep_enmo_mean"].std(),
                "sleep_enmo_std_mean": sleep_df["sleep_enmo_std"].mean(),
                "sleep_enmo_std_std": sleep_df["sleep_enmo_std"].std(),
                "sleep_light_mean_mean": sleep_df["sleep_light_mean"].mean(),
                "sleep_light_mean_std": sleep_df["sleep_light_mean"].std(),
                "sleep_light_std_mean": sleep_df["sleep_light_std"].mean(),
                "sleep_light_std_std": sleep_df["sleep_light_std"].std(),
            }
        )
        features.append(feature)
    output_dir = f"/kaggle/working/features/{train_or_test}"
    os.makedirs(output_dir, exist_ok=True)
    feature_df = pl.DataFrame(features).with_columns(pl.col("id").str.slice(3, 8))
    feature_df.write_csv(f"/kaggle/working/features/{train_or_test}/sleep_features.csv")
    print(feature_df.head())

In [26]:
create_heuristic(paths=glob("/kaggle/input/child-mind-institute-problematic-internet-use/series_test.parquet/id=*/part-0.parquet"), train_or_test="test")
detection(paths="/kaggle/input/child-mind-institute-problematic-internet-use/series_test.parquet/id=*/part-0.parquet", train_or_test="test")
feature_engineering(paths="/kaggle/working/features/sleep_detection/test/*.parquet", data_paths="/kaggle/input/child-mind-institute-problematic-internet-use/series_test.parquet", train_or_test="test")


100%|██████████| 2/2 [00:00<00:00,  3.25it/s]
100%|██████████| 2/2 [00:04<00:00,  2.49s/it]
100%|██████████| 2/2 [00:00<00:00, 66.05it/s]
  0%|          | 0/2 [00:00<?, ?it/s]

day,weekday,onset_time,wakeup_time,onset_step,wakeup_step,onset_score,wakeup_score,sleep_length,sleep_enmo_mean,sleep_enmo_std,sleep_light_mean,sleep_light_std
i64,i64,f64,f64,i64,i64,f64,f64,f64,f64,f64,f64,f64
0,2,22.091667,7.041667,7854,14298,5.939998,6.883044,8.95,0.003588,0.008487,2.06153,0.52031
1,3,22.641667,8.141667,25530,32370,7.395478,2.857819,9.5,0.002427,0.006132,2.714605,1.259364
2,4,21.575,7.558333,42042,49230,5.715631,6.638568,9.983333,0.003959,0.007149,6.441472,2.73119
3,5,23.141667,8.241667,60450,67002,8.010484,3.987538,9.1,0.006016,0.007928,9.246452,14.259801
4,6,22.925,7.008333,77574,83394,8.050978,2.848315,8.083333,0.009862,0.012524,0.511077,0.28813


100%|██████████| 2/2 [00:00<00:00, 17.82it/s]

shape: (2, 18)
┌──────────┬────────┬──────┬─────────────┬───┬─────────────┬─────────────┬────────────┬────────────┐
│ id       ┆ length ┆ day  ┆ sleep_measu ┆ … ┆ sleep_light ┆ sleep_light ┆ sleep_ligh ┆ sleep_ligh │
│ ---      ┆ ---    ┆ ---  ┆ rement_coun ┆   ┆ _mean_mean  ┆ _mean_std   ┆ t_std_mean ┆ t_std_std  │
│ str      ┆ i64    ┆ f64  ┆ t           ┆   ┆ ---         ┆ ---         ┆ ---        ┆ ---        │
│          ┆        ┆      ┆ ---         ┆   ┆ f64         ┆ f64         ┆ f64        ┆ f64        │
│          ┆        ┆      ┆ i64         ┆   ┆             ┆             ┆            ┆            │
╞══════════╪════════╪══════╪═════════════╪═══╪═════════════╪═════════════╪════════════╪════════════╡
│ 00115b9f ┆ 3610   ┆ 44.0 ┆ null        ┆ … ┆ null        ┆ null        ┆ null       ┆ null       │
│ 001f3379 ┆ 33033  ┆ 23.0 ┆ 7           ┆ … ┆ 4.917133    ┆ 4.370878    ┆ 3.281733   ┆ 4.990464   │
└──────────┴────────┴──────┴─────────────┴───┴─────────────┴─────────────┴──




In [27]:
import numpy as np
import pandas as pd
import os
import re
import copy
import pickle
from sklearn.base import clone
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import StratifiedKFold
from scipy.optimize import minimize
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import polars as pl
import polars.selectors as cs
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator, FormatStrFormatter, PercentFormatter
import seaborn as sns

from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from keras.models import Model
from keras.layers import Input, Dense
from keras.optimizers import Adam
import torch
import torch.nn as nn
import torch.optim as optim

from colorama import Fore, Style
from IPython.display import clear_output
import warnings
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import VotingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline

import plotly.express as px

warnings.filterwarnings('ignore')
pd.options.display.max_columns = None
SEED = 42
n_splits = 5

In [80]:

def TrainML_sub2(model_class, X, y, test_data):
    SKF = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    train_S = []
    test_S = []
    
    oof_non_rounded = np.zeros(len(y), dtype=float) 
    oof_rounded = np.zeros(len(y), dtype=int) 
    test_preds = np.zeros((len(test_data), n_splits))
    
    X[np.isinf(X)] = 0.0
    test_data[np.isinf(test_data)] = 0.0
    
    for fold, (train_idx, test_idx) in enumerate(tqdm(SKF.split(X, y), desc="Training Folds", total=n_splits)):
        X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]
        model = clone(model_class)
        
        model.fit(X_train, y_train)

        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_val)

        oof_non_rounded[test_idx] = y_val_pred
        y_val_pred_rounded = y_val_pred.round(0).astype(int)
        oof_rounded[test_idx] = y_val_pred_rounded

        train_kappa = quadratic_weighted_kappa(y_train, y_train_pred.round(0).astype(int))
        val_kappa = quadratic_weighted_kappa(y_val, y_val_pred_rounded)

        train_S.append(train_kappa)
        test_S.append(val_kappa)
        
        test_preds[:, fold] = model.predict(test_data)
        
        print(f"Fold {fold+1} - Train QWK: {train_kappa:.4f}, Validation QWK: {val_kappa:.4f}")
        clear_output(wait=True)
    print(f"Mean Train QWK --> {np.mean(train_S):.4f}")
    print(f"Mean Validation QWK ---> {np.mean(test_S):.4f}")

    KappaOPtimizer = minimize(evaluate_predictions,
                              x0=[0.5, 1.5, 2.5], args=(y, oof_non_rounded), 
                              method='Nelder-Mead')
    assert KappaOPtimizer.success, "Optimization did not converge."
    print('OPTIMIZED THRESHOLDS', KappaOPtimizer.x)
    oof_tuned = threshold_Rounder(oof_non_rounded, KappaOPtimizer.x)
    tKappa = quadratic_weighted_kappa(y, oof_tuned)

    print(f"----> || Optimized QWK SCORE :: {Fore.CYAN}{Style.BRIGHT} {tKappa:.3f}{Style.RESET_ALL}")

    tpm = test_preds.mean(axis=1)
    tpTuned = threshold_Rounder(tpm, KappaOPtimizer.x)
    
    submission = pd.DataFrame({
        'id': sample['id'],
        'sii': tpTuned
    })
    optimized_thresholds = KappaOPtimizer.x
    return submission, oof_tuned, oof_non_rounded, y, optimized_thresholds


In [52]:
train_sleep = pd.read_csv("/kaggle/input/sleep-detection/sleep_features.csv")
test_sleep = pd.read_csv("/kaggle/working/features/test/sleep_features.csv")


FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/working/features/test/sleep_features.csv'

In [None]:
sleep_cols = train_sleep.columns.tolist()
sleep_cols.remove("id")

In [53]:
rm -rf /kaggle/working/features


In [54]:
rm -rf /kaggle/working/heuristic_features

In [55]:
def feature_engineering(df):
    season_cols = [col for col in df.columns if 'Season' in col]
    df = df.drop(season_cols, axis=1) 
    df["Feat_0"] = df["Physical-Height"] * df["PAQ_C-PAQ_C_Total"]
    df["Feat_1"] = df["FGC-FGC_TL_Zone"] * df["Physical-Height"]
    df["Feat_2"] = df["PreInt_EduHx-computerinternet_hoursday"] * df["BIA-BIA_Activity_Level_num"]
    df["Feat_3"] = df["Fitness_Endurance-Time_Sec"] / df["PreInt_EduHx-computerinternet_hoursday"]
    df["Feat_4"] = df["CGAS-CGAS_Score"] / df["FGC-FGC_CU_Zone"]
    df["Feat_5"] = df["Basic_Demos-Age"] / df["FGC-FGC_SRR_Zone"]
    df["Feat_7"] = df["PAQ_C-PAQ_C_Total"] * df["BIA-BIA_Frame_num"]
    df["Feat_9"] = df["FGC-FGC_GSD"] / df["SDS-SDS_Total_Raw"]
    df["Feat_10"] = df["PAQ_A-PAQ_A_Total"] / df["PreInt_EduHx-computerinternet_hoursday"]
    df["Feat_11"] = df["BIA-BIA_LDM"] / df["PreInt_EduHx-computerinternet_hoursday"]
    df["Feat_14"] = df["BIA-BIA_BMI"] / df["SDS-SDS_Total_Raw"]
    df["Feat_15"] = df["Physical-Height"] * df["SDS-SDS_Total_T"]
    df["Feat_16"] = df["Physical-Height"] * df["Physical-Height"]
    df["Feat_17"] = df["FGC-FGC_SRL_Zone"] / df["Physical-Weight"]
    df["Feat_18"] = df["Basic_Demos-Sex"] * df["Basic_Demos-Sex"]
    df["Feat_19"] = df["FGC-FGC_GSND_Zone"] / df["BIA-BIA_Fat"]
    
    return df

In [45]:
noseason_features = ['Basic_Demos-Age', 'Basic_Demos-Sex',
                'CGAS-CGAS_Score', 'Physical-BMI',
                'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
                'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
                'Fitness_Endurance-Max_Stage',
                'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec',
                'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND',
                'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU',
                'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR',
                'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone',
                'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
                'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
                'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
                'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM',
                'BIA-BIA_TBW', 'PAQ_A-PAQ_A_Total',
                'PAQ_C-PAQ_C_Total', 'SDS-SDS_Total_Raw',
                'SDS-SDS_Total_T',
                'PreInt_EduHx-computerinternet_hoursday', 'Feat_0', 'Feat_1', 'Feat_2', 'Feat_3', 'Feat_4', 'Feat_5',
        'Feat_7','Feat_9', 'Feat_10', 'Feat_11', 'Feat_14', 'Feat_15',
       'Feat_16', 'Feat_17', 'Feat_18', 'Feat_19']

In [40]:
# Model parameters for LightGBM
Params = {
    'learning_rate': 0.046,
    'max_depth': 12,
    'num_leaves': 478,
    'min_data_in_leaf': 13,
    'feature_fraction': 0.893,
    'bagging_fraction': 0.784,
    'bagging_freq': 4,
    'lambda_l1': 10,  # Increased from 6.59
    'lambda_l2': 0.01,  # Increased from 2.68e-06
    'device': 'cpu'

}


# XGBoost parameters
XGB_Params = {
    'learning_rate': 0.05,
    'max_depth': 6,
    'n_estimators': 200,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 1,  # Increased from 0.1
    'reg_lambda': 5,  # Increased from 1
    'random_state': SEED,
    'tree_method': 'gpu_hist',

}


CatBoost_Params = {
    'learning_rate': 0.05,
    'depth': 6,
    'iterations': 200,
    'random_seed': SEED,
    'verbose': 0,
    'l2_leaf_reg': 10,  # Increase this value
    'task_type': 'GPU'

}

In [42]:
# Create model instances
Light = LGBMRegressor(**Params, random_state=SEED, verbose=-1, n_estimators=300)
XGB_Model = XGBRegressor(**XGB_Params)
CatBoost_Model = CatBoostRegressor(**CatBoost_Params)
# TabNet_Model = TabNetWrapper(**TabNet_Params) 
voting_model = VotingRegressor(estimators=[
    ('lightgbm', Light),
    ('xgboost', XGB_Model),
    ('catboost', CatBoost_Model),
    # ('tabnet', TabNet_Model)
],weights=[4.0,4.0,5.0])


In [69]:
train_sub2 = pd.merge(train, train_sleep, how="left", on='id')
test_sub2 = pd.merge(test, test_sleep, how="left", on='id')

# imputer = KNNImputer(n_neighbors=5)
# numeric_cols = train.select_dtypes(include=['float64', 'int64']).columns
# imputed_data = imputer.fit_transform(train_sub2[numeric_cols])
# train_imputed = pd.DataFrame(imputed_data, columns=numeric_cols)
# train_imputed['sii'] = train_imputed['sii'].round().astype(int)

# for col in train_sub2.columns:
#     if col not in numeric_cols:
#         train_imputed[col] = train_sub2[col]
        
# train_sub2 = train_imputed

train_sub2 = feature_engineering(train_sub2)
train_sub2 = train_sub2.dropna(subset='sii', ignore_index=True)
test_sub2 = feature_engineering(test_sub2)

train_sub2 = train_sub2.drop('id', axis=1)
test_sub2  = test_sub2.drop('id', axis=1)   

In [70]:
features_sub2 = noseason_features + sleep_cols

# train_sub2 = pd.merge(train, train_ts, how="left", on='id')
# test_sub2 = pd.merge(test, test_ts, how="left", on='id')

train_sub2 = train_sub2.dropna(subset='sii')
if np.any(np.isinf(train_sub2)):
    train_sub2 = train_sub2.replace([np.inf, -np.inf], np.nan)

X_sub2 = train_sub2[features_sub2]
y_sub2 = train_sub2['sii']
test_sub2 = test_sub2[features_sub2]


In [81]:
submission2, _, _, _, _= TrainML_sub2(voting_model, X_sub2, y_sub2, test_sub2)

Training Folds: 100%|██████████| 5/5 [00:16<00:00,  3.33s/it]

Mean Train QWK --> 0.7458
Mean Validation QWK ---> 0.4002





OPTIMIZED THRESHOLDS [0.5250443  1.02169925 2.83555884]
----> || Optimized QWK SCORE :: [36m[1m 0.450[0m


# Submission 6: MAE

## Load data

In [None]:
train_ts = load_time_series("../input/child-mind-institute-problematic-internet-use/series_train.parquet")
test_ts = load_time_series("../input/child-mind-institute-problematic-internet-use/series_test.parquet")

In [None]:
train = pd.read_csv('../input/child-mind-institute-problematic-internet-use/train.csv')
test = pd.read_csv('../input/child-mind-institute-problematic-internet-use/test.csv')
sample = pd.read_csv('../input/child-mind-institute-problematic-internet-use/sample_submission.csv')


featuresCols = ['Basic_Demos-Enroll_Season', 'Basic_Demos-Age', 'Basic_Demos-Sex',
                'CGAS-Season', 'CGAS-CGAS_Score', 'Physical-Season', 'Physical-BMI',
                'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
                'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
                'Fitness_Endurance-Season', 'Fitness_Endurance-Max_Stage',
                'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec',
                'FGC-Season', 'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND',
                'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU',
                'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR',
                'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone', 'BIA-Season',
                'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
                'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
                'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
                'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM',
                'BIA-BIA_TBW', 'PAQ_A-Season', 'PAQ_A-PAQ_A_Total', 'PAQ_C-Season',
                'PAQ_C-PAQ_C_Total', 'SDS-Season', 'SDS-SDS_Total_Raw',
                'SDS-SDS_Total_T', 'PreInt_EduHx-Season',
                'PreInt_EduHx-computerinternet_hoursday', 'sii',
               
                 'PCIAT-Season', 'PCIAT-PCIAT_01', 'PCIAT-PCIAT_02', 'PCIAT-PCIAT_03', 'PCIAT-PCIAT_04',
                'PCIAT-PCIAT_05', 'PCIAT-PCIAT_06', 'PCIAT-PCIAT_07', 'PCIAT-PCIAT_08',
                'PCIAT-PCIAT_09', 'PCIAT-PCIAT_10', 'PCIAT-PCIAT_11', 'PCIAT-PCIAT_12',
                'PCIAT-PCIAT_13', 'PCIAT-PCIAT_14', 'PCIAT-PCIAT_15', 'PCIAT-PCIAT_16',
                'PCIAT-PCIAT_17', 'PCIAT-PCIAT_18', 'PCIAT-PCIAT_19', 'PCIAT-PCIAT_20', 'PCIAT-PCIAT_Total',
]

cat_c = ['Basic_Demos-Enroll_Season', 'CGAS-Season', 'Physical-Season', 
          'Fitness_Endurance-Season', 'FGC-Season', 'BIA-Season', 
          'PAQ_A-Season', 'PAQ_C-Season', 'SDS-Season', 'PreInt_EduHx-Season',
        'PCIAT-Season',
        ]


time_series_cols = train_ts.columns.tolist()
time_series_cols.remove("id")

train = pd.merge(train, train_ts, how="left", on='id')
test = pd.merge(test, test_ts, how="left", on='id')

train = train.drop('id', axis=1)
test = test.drop('id', axis=1)

featuresCols += time_series_cols

train = train[featuresCols]
# train = train.dropna(subset='sii')

def update(df):
    global cat_c
    for c in cat_c: 
        if c not in df.columns: continue
        df[c] = df[c].fillna('Missing')
        df[c] = df[c].astype('category')
    return df

train = update(train)
test = update(test)

def create_mapping(column, dataset):
    unique_values = dataset[column].unique()
    return {value: idx for idx, value in enumerate(unique_values)}

for col in cat_c:
    if col in train.columns:
        if 'Season' in col:
            mapping = {
                'Missing': float('nan'),
                'Spring': 0.,
                'Summer': 1.,
                'Fall': 2.,
                'Winter': 3.,
            }
        else:
            mapping = create_mapping(col, train)
            print(f'{col}: {mapping}')
        train[col] = train[col].replace(mapping)
    if col in test.columns:
        if 'Season' in col:
            mappingTe = {
                'Missing': float('nan'),
                'Spring': 0.,
                'Summer': 1.,
                'Fall': 2.,
                'Winter': 3.,
            }
        else:
            mappingTe = create_mapping(col, test)
            print(f'{col}: {mappingTe}')
            
        test[col] = test[col].replace(mappingTe)

def quadratic_weighted_kappa(y_true, y_pred):
    return cohen_kappa_score(y_true, y_pred, weights='quadratic')

def threshold_Rounder(oof_non_rounded, thresholds):
    return np.where(oof_non_rounded < thresholds[0], 0,
                    np.where(oof_non_rounded < thresholds[1], 1,
                             np.where(oof_non_rounded < thresholds[2], 2, 3)))

def evaluate_predictions(thresholds, y_true, oof_non_rounded):
    rounded_p = threshold_Rounder(oof_non_rounded, thresholds)
    return -quadratic_weighted_kappa(y_true, rounded_p)


## Define PyTorch MAE

In [None]:
# current implementation: only support numerical values

from functools import partial
from tkinter import E

import torch
import numpy as np
import torch.nn as nn
import pandas as pd
from timm.models.vision_transformer import Block

# current implementation: only support numerical values
import numpy as np
import torch, os
from torch import nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import math
import argparse
import random

class MaskEmbed(nn.Module):
    """ record to mask embedding
    """
    def __init__(self, rec_len=25, embed_dim=64, norm_layer=None):
        
        super().__init__()
        self.rec_len = rec_len
        self.proj = nn.Conv1d(1, embed_dim, kernel_size=1, stride=1)
        self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()

    def forward(self, x):
        B, _, L = x.shape
        # assert(L == self.rec_len, f"Input data width ({L}) doesn't match model ({self.rec_len}).")
        x = self.proj(x)
        x = x.transpose(1, 2)
        x = self.norm(x)
        return x


class ActiveEmbed(nn.Module):
    """ record to mask embedding
    """
    def __init__(self, rec_len=25, embed_dim=64, norm_layer=None):
        
        super().__init__()
        self.rec_len = rec_len
        self.proj = nn.Conv1d(1, embed_dim, kernel_size=1, stride=1)
        self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()

    def forward(self, x):
        B, _, L = x.shape
        # assert(L == self.rec_len, f"Input data width ({L}) doesn't match model ({self.rec_len}).")
        x = self.proj(x)
        x = torch.sin(x)
        x = x.transpose(1, 2)
        #   x = torch.cat((torch.sin(x), torch.cos(x + math.pi/2)), -1)
        x = self.norm(x)
        return x



def get_1d_sincos_pos_embed(embed_dim, pos, cls_token=False):
    """
    embed_dim: output dimension for each position
    pos: a list of positions to be encoded: size (M,)
    out: (M, D)
    """

    assert embed_dim % 2 == 0
    omega = np.arange(embed_dim // 2, dtype=np.float32)
    omega /= embed_dim / 2.
    omega = 1. / 10000**omega  # (D/2,)

    pos = np.arange(pos)  # (M,)
    out = np.einsum('m,d->md', pos, omega)  # (M, D/2), outer product

    emb_sin = np.sin(out) # (M, D/2)
    emb_cos = np.cos(out) # (M, D/2)

    pos_embed = np.concatenate([emb_sin, emb_cos], axis=1)  # (M, D)

    if cls_token:
        pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)

    return pos_embed


def adjust_learning_rate(optimizer, epoch, lr, min_lr, max_epochs, warmup_epochs):
    """Decay the learning rate with half-cycle cosine after warmup"""
    if epoch < warmup_epochs:
        tmp_lr = lr * epoch / warmup_epochs 
    else:
        tmp_lr = min_lr + (lr - min_lr) * 0.5 * \
            (1. + math.cos(math.pi * (epoch - warmup_epochs) / (max_epochs - warmup_epochs)))
    for param_group in optimizer.param_groups:
        if "lr_scale" in param_group:
            param_group["lr"] = tmp_lr * param_group["lr_scale"]
        else:
            param_group["lr"] = tmp_lr
    return tmp_lr


def get_grad_norm_(parameters, norm_type: float = 2.0) -> torch.Tensor:
    if isinstance(parameters, torch.Tensor):
        parameters = [parameters]
    parameters = [p for p in parameters if p.grad is not None]
    norm_type = float(norm_type)
    if len(parameters) == 0:
        return torch.tensor(0.)
    device = parameters[0].grad.device
    if norm_type == np.inf:
        total_norm = max(p.grad.detach().abs().max().to(device) for p in parameters)
    else:
        total_norm = torch.norm(torch.stack([torch.norm(p.grad.detach(), norm_type).to(device) for p in parameters]), norm_type)
    return total_norm


class NativeScaler:

    state_dict_key = "amp_scaler"
    def __init__(self):
        self._scaler = torch.cuda.amp.GradScaler()

    def __call__(self, loss, optimizer, clip_grad=None, parameters=None, create_graph=False, update_grad=True):
        self._scaler.scale(loss).backward(create_graph=create_graph)
        if update_grad:
            if clip_grad is not None:
                assert parameters is not None
                self._scaler.unscale_(optimizer)  # unscale the gradients of optimizer's assigned params in-place
                norm = torch.nn.utils.clip_grad_norm_(parameters, clip_grad)
            else:
                self._scaler.unscale_(optimizer)
                norm = get_grad_norm_(parameters)
            self._scaler.step(optimizer)
            self._scaler.update()
        else:
            norm = None
        return norm

    def state_dict(self):
        return self._scaler.state_dict()

    def load_state_dict(self, state_dict):
        self._scaler.load_state_dict(state_dict)



class MAEDataset(Dataset):

    def __init__(self, X, M):        
         self.X = X
         self.M = M

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx: int):
        return self.X[idx], self.M[idx]



def get_dataset(dataset : str, path : str):

    if dataset in ['climate', 'compression', 'wine', 'yacht', 'spam', 'letter', 'credit', 'raisin', 'bike', 'obesity', 'airfoil', 'blood', 'yeast', 'health', 'review', 'travel']:
        df = pd.read_csv(os.path.join(path, 'data', dataset + '.csv'))
        last_col = df.columns[-1]
        y = df[last_col]
        X = df.drop(columns=[last_col])
    elif dataset == 'california':
        from sklearn.datasets import fetch_california_housing
        X, y = fetch_california_housing(as_frame=True, return_X_y=True)
    elif dataset == 'diabetes':
        from sklearn.datasets import load_diabetes
        X, y = load_diabetes(as_frame=True, return_X_y=True)
    elif dataset == 'iris':
        # only for testing
        from sklearn.datasets import load_iris
        X, y = load_iris(as_frame=True, return_X_y=True)
    

    return X, y


eps = 1e-6
import torch
import torch.nn as nn
import numpy as np
from torch.nn import TransformerEncoderLayer
from transformers.models.bert.modeling_bert import BertPooler

class MaskedAutoencoder(nn.Module):
    def __init__(self, rec_len=25, embed_dim=64, depth=4, num_heads=4,
                 decoder_embed_dim=64, decoder_depth=2, decoder_num_heads=4,
                 mlp_ratio=4., cls_mlp_dim=64, norm_layer=nn.LayerNorm, norm_field_loss=False,
                 encode_func='linear', dropout=0.0):
        super().__init__()
        
        self.rec_len = rec_len
        self.embed_dim = embed_dim
        self.norm_field_loss = norm_field_loss
        
        # Encoder
        if encode_func == 'active':
            self.mask_embed = ActiveEmbed(rec_len, embed_dim, norm_layer)
        else:
            self.mask_embed = MaskEmbed(rec_len, embed_dim, norm_layer)
        
        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        self.pos_embed = nn.Parameter(torch.zeros(1, rec_len + 1, embed_dim), requires_grad=False)
        
        encoder_layer = TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads, dim_feedforward=int(embed_dim * mlp_ratio),
                        dropout=dropout, batch_first=True)
        self.blocks = nn.TransformerEncoder(encoder_layer, depth)
        self.norm = norm_layer(embed_dim)
        
        
        self.enc_pooler = nn.ModuleList([
            nn.Sequential(
                nn.Linear(embed_dim, 1),
            ),
            nn.Sequential(
                nn.Linear((rec_len + 1) * 1, cls_mlp_dim), nn.ReLU(),
            ),
        ])
        self.enc_lbl_pred = nn.Sequential(
            nn.Linear(cls_mlp_dim, cls_mlp_dim), nn.ReLU(),
            # nn.Linear(cls_mlp_dim, cls_mlp_dim), nn.ReLU(),
            # nn.Dropout(dropout),
            nn.Linear(cls_mlp_dim, cls_mlp_dim), nn.Sigmoid(),
        )
        
        # Decoder
        self.decoder_embed = nn.Linear(embed_dim, decoder_embed_dim, bias=True)
        self.mask_token = nn.Parameter(torch.zeros(1, 1, decoder_embed_dim))
        self.decoder_pos_embed = nn.Parameter(torch.zeros(1, rec_len + 1, decoder_embed_dim), requires_grad=False)
        
        decoder_layer = TransformerEncoderLayer(d_model=decoder_embed_dim, nhead=decoder_num_heads, dim_feedforward=int(decoder_embed_dim * mlp_ratio),
                        dropout=dropout, batch_first=True)
        self.decoder_blocks = nn.TransformerEncoder(decoder_layer, decoder_depth)
        self.decoder_norm = norm_layer(decoder_embed_dim)
        self.decoder_pred = nn.Linear(decoder_embed_dim, 1, bias=True)
        
        
        self.initialize_weights()

    def initialize_weights(self):
        pos_embed = get_1d_sincos_pos_embed(self.pos_embed.shape[-1], self.rec_len, cls_token=True)
        self.pos_embed.data.copy_(torch.from_numpy(pos_embed).float().unsqueeze(0))
        
        decoder_pos_embed = get_1d_sincos_pos_embed(self.decoder_pos_embed.shape[-1], self.rec_len, cls_token=True)
        self.decoder_pos_embed.data.copy_(torch.from_numpy(decoder_pos_embed).float().unsqueeze(0))
        
        torch.nn.init.xavier_uniform_(self.mask_embed.proj.weight.view([self.mask_embed.proj.weight.shape[0], -1]))
        torch.nn.init.normal_(self.cls_token, std=.02)
        torch.nn.init.normal_(self.mask_token, std=.02)
        
        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            torch.nn.init.xavier_uniform_(m.weight)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    def random_masking(self, x, m, mask_ratio, training=None):
        N, L, D = x.shape
        if training is None:
            training = self.training
        if training:
            len_keep = int(L * (1 - mask_ratio))
            noise = torch.rand(N, L, device=x.device)
            noise[m < 1e-6] = 1
            ids_shuffle = torch.argsort(noise, dim=1)
            ids_restore = torch.argsort(ids_shuffle, dim=1)
            ids_keep = ids_shuffle[:, :len_keep]
            mask = torch.ones([N, L], device=x.device)
            mask[:, :len_keep] = 0
            mask = torch.gather(mask, dim=1, index=ids_restore)
            mask = torch.logical_or(mask, ~m.bool())
            nask = ~mask
            return mask, nask
        else:
            mask = ~m.bool()
            nask = m.bool()
            return mask, nask

    def forward_encoder(self, x, m, mask_ratio=0.5, training=None):
        x = self.mask_embed(x)
        x = x + self.pos_embed[:, 1:, :]
        mask, nask = self.random_masking(x, m, mask_ratio, training)
        x = x * (~mask.unsqueeze(-1)).float()
        cls_token = self.cls_token + self.pos_embed[:, :1, :]
        cls_tokens = cls_token.expand(x.shape[0], -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)
        attn_mask = torch.cat((torch.zeros(x.shape[0], 1, device=x.device), mask), dim=1)
        x = self.blocks(src=x, src_key_padding_mask=attn_mask.bool())
        x = self.norm(x)
        return x, mask, nask

    def forward_decoder(self, x, mask):
        x = self.decoder_embed(x)
        x = x + self.decoder_pos_embed
        mask_with_cls = torch.cat((torch.zeros(x.shape[0], 1, device=x.device), mask), dim=1)
        x = self.blocks(src=x, src_key_padding_mask=mask_with_cls.bool())
        
        x = self.decoder_norm(x)
        x = self.decoder_pred(x)
        x = x[:, 1:, :].sigmoid()
        return x

    def forward_loss(self, data, pred, m, mask, nask):
        target = data.squeeze(dim=1)
        # if self.norm_field_loss:
        #     mean = target.mean(dim=-1, keepdim=True)
        #     var = target.var(dim=-1, keepdim=True)
        #     target = (target - mean) / (var + 1e-6) ** 0.5
        rec_mask = mask * m
        loss = (pred.squeeze(dim=2) - target) ** 2
        loss = (loss * rec_mask).sum() / (rec_mask.sum() + 1e-6) + (loss * nask).sum() / (nask.sum() + 1e-6)
        return loss

    def forward(self, data, m):
        x, _, _ = self.forward_encoder(data, m, 0.0, False)
        B = x.shape[0]
        h = self.enc_pooler[1](self.enc_pooler[0](x).reshape(B, -1))
        enc_pred = self.enc_lbl_pred(h)[:, 0]
        return enc_pred

    def forward_selfsl(self, data, m, mask_ratio=0.5, training=None):
        x, mask, nask = self.forward_encoder(data, m, mask_ratio, training)
        pred = self.forward_decoder(x, mask)
        loss = self.forward_loss(data, pred, m, mask, nask)
        return loss, (loss.item(), )

    def forward_sl(self, data, m, lbl_cols):
        num_lbls = len(lbl_cols)
        lbl_mask = m[:, lbl_cols]
        ft_mask = m.clone()
        ft_mask[:, lbl_cols] = 0
        ft = data.clone()
        ft[:, :, lbl_cols] = 0
        x, _, _ = self.forward_encoder(ft, ft_mask, 0.0, False)
        B = x.shape[0]
        h = self.enc_pooler[1](self.enc_pooler[0](x).reshape(B, -1))
        enc_pred = self.enc_lbl_pred(h)[:, :num_lbls]

        tgt = data[:, 0, lbl_cols]
        enc_loss = (((enc_pred - tgt) ** 2) * lbl_mask).sum() / (lbl_mask.sum() + 1e-6)
        
        loss = enc_loss
        
        return loss, (enc_loss.item(), )
      
    def forward_semisl(self, data, m, lbl_cols, ema_model=None, hard=False):
        num_lbls = len(lbl_cols)
        lbl_mask = m[:, lbl_cols]
        nlbl_mask = 1 - m[:, lbl_cols]
        ft_mask = m.clone()
        ft_mask[:, lbl_cols] = 0
        ft = data.clone()
        ft[:, :, lbl_cols] = 0
        
        noise = torch.randn_like(ft)
        noise_norm = torch.norm(noise, p=2, dim=-1, keepdim=True)
        noise = noise / (noise_norm + 1e-8)
        noise = noise * 0.0
        if not hard: noise *= 0
        
        x, _, _ = self.forward_encoder(torch.clamp(ft + noise, min=0.0, max=1.0), ft_mask, 0.0, False)
        B = x.shape[0]
        h = self.enc_pooler[1](self.enc_pooler[0](x).reshape(B, -1))
        enc_pred = self.enc_lbl_pred(h)[:, :num_lbls]
        
        if ema_model is None: raise NotImplementedError()
        # with torch.no_grad():
        #     x_tgt, _, _ = ema_model.forward_encoder(ft, ft_mask, 0.0, False)
            
        #     B = x.shape[0]
        #     h_tgt = ema_model.enc_pooler[1](ema_model.enc_pooler[0](x_tgt).reshape(B, -1))
        #     semisl_tgt = ema_model.enc_lbl_pred(h_tgt)[:, :num_lbls].detach()
        #     if hard:
        #         semisl_tgt[:, 0] = (semisl_tgt[:, 0] * 3.).round() / 3.
        #         semisl_weight = 1.0
        #     else:
        #         semisl_weight = 0.1
        
        
        # semisl_loss = (((enc_pred - semisl_tgt) ** 2) * nlbl_mask).sum() / (nlbl_mask.sum() + 1e-6)

        
        with torch.no_grad():
            if hard:
                x_tgt, _, _ = ema_model.forward_encoder(ft, ft_mask, 0.0, False)
                
                B = x_tgt.shape[0]
                h_tgt = ema_model.enc_pooler[1](ema_model.enc_pooler[0](x_tgt).reshape(B, -1))
                semisl_tgt = ema_model.enc_lbl_pred(h_tgt)[:, :num_lbls].detach()
                semisl_tgt[:, 0] = (semisl_tgt[:, 0] * 3.).round() / 3.
                semisl_weight = 1.0
                semisl_loss = (
                    0.5 * (((enc_pred - semisl_tgt) ** 2) * nlbl_mask).sum() / (nlbl_mask.sum() + 1e-6) # all labels
                    + 0.5 * (((enc_pred[:, 0] - semisl_tgt[:, 0]) ** 2) * nlbl_mask[:, 0]).sum() / (nlbl_mask[:, 0].sum() + 1e-6) # sii only
                )
            else:
                semisl_weight = 0.0
                semisl_loss = torch.tensor([0.0]).to(x.device)
                

        

        sl_tgt = data[:, 0, lbl_cols]
        sl_loss = (((enc_pred - sl_tgt) ** 2) * lbl_mask).sum() / (lbl_mask.sum() + 1e-6)
        
        loss = 1.0 * sl_loss + semisl_weight * semisl_loss
        return loss, (sl_loss.item(), semisl_loss.item(),)
      
      

## Define sklearn-like api

In [None]:
# stdlib
from typing import Any, List, Tuple, Union

# third party
import numpy as np
import math, sys, argparse
import pandas as pd
import torch
from torch import nn
from functools import partial
import time, os, json
from torch.utils.data import DataLoader, RandomSampler
import sys
import timm.optim.optim_factory as optim_factory
import torch.nn.functional as F
import copy


def quadratic_weighted_kappa(y_true, y_pred):
    return cohen_kappa_score(y_true, y_pred, weights='quadratic')



eps = 1e-8
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

from argparse import Namespace
remasker_args = Namespace(
    batch_size=64,
    max_epochs= 600,
    accum_iter=1,
    mask_ratio=0.5,
    embed_dim=32,
    depth=6,
    decoder_depth=4,
    num_heads=4,
    mlp_ratio=4.0,
    encode_func='linear',
    norm_field_loss=False,
    weight_decay=0.05,
    lr=None, blr=0.001,
    min_lr=1e-05,
    warmup_epochs=40,
    device='cuda', seed=SEED, overwrite=True, pin_mem=True
)



def set_dropout_p(m, p):
    if isinstance(m, nn.Dropout):
        m.p = p

def update_ema_variables(model, ema_model, alpha, global_step, max_global_step):
    # Use the true average until the exponential average is more correct
    def f(alpha, t, T):
        A = 1
        B = alpha * T / (T - 1)
        return (B * (1 - A / (t + 1)))
    current_alpha = f(alpha, global_step, max_global_step)
    for ema_param, param in zip(ema_model.parameters(), model.parameters()):
        ema_param.data.mul_(current_alpha).add_(1 - current_alpha, param.data)


class ReMasker:

    def __init__(self, args=remasker_args):

        self.batch_size = args.batch_size
        self.accum_iter = args.accum_iter
        self.min_lr = args.min_lr
        self.norm_field_loss = args.norm_field_loss
        # self.weight_decay = args.weight_decay
        self.lr = args.lr
        self.blr = args.blr
        self.warmup_epochs = max(1, args.max_epochs // 10)
        self.ema_decay = args.ema_decay
        self.model = None
        self.norm_parameters = None

        self.embed_dim = args.embed_dim
        self.depth = args.depth
        self.decoder_depth = args.decoder_depth
        self.num_heads = args.num_heads
        self.mlp_ratio = args.mlp_ratio
        self.cls_mlp_dim = args.cls_mlp_dim
        self.max_epochs = args.max_epochs
        self.mask_ratio = args.mask_ratio
        self.encode_func = args.encode_func
        self.dropout = args.dropout


    def fit(self, X_raw: pd.DataFrame, X_val=None, lbl_cols=None, model=None):
        global dbg_var
        X = X_raw.clone()

        # Parameters
        no = len(X)
        dim = len(X[0, :])

        X = X.cpu()

        min_val = np.zeros(dim)
        max_val = np.zeros(dim)

        for i in range(dim):
            min_val[i] = np.nanmin(X[:, i])
            max_val[i] = np.nanmax(X[:, i])
            X[:, i] = (X[:, i] - min_val[i]) / (max_val[i] - min_val[i] + eps)

        self.norm_parameters = {"min": min_val, "max": max_val}

        # Set missing
        M = 1 - (1 * (np.isnan(X)))
        M = M.float().to(device)

        X = torch.nan_to_num(X)
        X = X.to(device)

        if model is None:
            self.model = MaskedAutoencoder(
                rec_len=dim,
                embed_dim=self.embed_dim,
                depth=self.depth,
                num_heads=self.num_heads,
                decoder_embed_dim=self.embed_dim,
                decoder_depth=self.decoder_depth,
                decoder_num_heads=self.num_heads,
                mlp_ratio=self.mlp_ratio,
                cls_mlp_dim=self.cls_mlp_dim,
                norm_layer=partial(nn.LayerNorm, eps=eps),
                norm_field_loss=self.norm_field_loss,
                encode_func=self.encode_func,
                dropout=self.dropout,
            )
        else:
            self.model = copy.deepcopy(model)
            for param in self.model.blocks.layers[:].parameters():
                param.detach_()
        self.ema_model = copy.deepcopy(self.model)
        self.ema_model.apply(lambda m: set_dropout_p(m, p=0.0))
        for param in self.ema_model.parameters():
            param.detach_()
        

        self.model.to(device)
        self.ema_model.to(device).eval()

        # set optimizers
        # param_groups = optim_factory.add_weight_decay(model, args.weight_decay)
        eff_batch_size = self.batch_size * self.accum_iter
        if self.lr is None:  # only base_lr is specified
            self.lr = self.blr * eff_batch_size / 64
        # param_groups = optim_factory.add_weight_decay(self.model, self.weight_decay)
        # optimizer = torch.optim.AdamW(param_groups, lr=self.lr, betas=(0.9, 0.95))
        optimizer = torch.optim.AdamW(self.model.parameters(), lr=self.lr, betas=(0.9, 0.95))
        loss_scaler = NativeScaler()

        dataset = MAEDataset(X, M)
        dataloader = DataLoader(
            dataset, sampler=RandomSampler(dataset),
            batch_size=self.batch_size,
        )


        best_loss = 1e9
        best_model = copy.deepcopy(self.model)
        for epoch in range(self.max_epochs):
            self.model.train()

            optimizer.zero_grad()
            total_loss = 0
            lbl_loss = 0.
            
            
            import time
            dbgt1 = 0
            dbgt2 = 0
            dbgt3 = 0

            iter = 0
            for iter, (samples, masks) in enumerate(dataloader):

                # we use a per iteration (instead of per epoch) lr scheduler
                if iter % self.accum_iter == 0:
                    adjust_learning_rate(optimizer, iter / len(dataloader) + epoch, self.lr, self.min_lr,
                                         self.max_epochs, self.warmup_epochs)

                samples = samples.unsqueeze(dim=1)
                samples = samples.to(device, non_blocking=True)
                masks = masks.to(device, non_blocking=True)

                # print(samples, masks)

                # with torch.cuda.amp.autocast():

                if lbl_cols is not None:
                    input_samples = samples.clone()
                    input_masks = masks.clone()
                    
                    
                    # selfsl_loss, dbg_selfsl_loss = self.model.forward_selfsl(samples, masks, mask_ratio=self.mask_ratio)
                    
                    # sl_loss, dbg_sl_loss = self.model.forward_sl(input_samples, input_masks, lbl_cols)
                    
                    
                    hard = True if (epoch >= (self.max_epochs // 2)) else False
                    semisl_loss, dbg_semisl_loss = self.model.forward_semisl(input_samples, input_masks, lbl_cols, ema_model=self.ema_model, hard=hard)
                    loss = 1.0 * semisl_loss
                        
                else:

                    selfsl_loss, dbg_selfsl_loss = self.model.forward_selfsl(samples, masks, mask_ratio=self.mask_ratio)
                    loss = selfsl_loss
                
                loss_value = loss.item()
                total_loss += loss_value
                if not math.isfinite(loss_value):
                    print("Loss is {}, stopping training".format(loss_value))
                    dbg_var = (samples, masks)
                    sys.exit(1)

                loss /= self.accum_iter
                loss_scaler(loss, optimizer, parameters=self.model.parameters(),
                            update_grad=(iter + 1) % self.accum_iter == 0)

                if (iter + 1) % self.accum_iter == 0:
                    optimizer.zero_grad()
                    
                
            update_ema_variables(self.model, self.ema_model, self.ema_decay, epoch, self.max_epochs)
            # print(dbgt1)
            # print(dbgt2)
            # print(dbgt3)
            total_loss = (total_loss / (iter + 1))
            self.model.eval()
            if X_val is not None:
                val_loss = self.evaluate(X_val, lbl_cols)
            else:
                val_loss = total_loss
            if val_loss <= best_loss:
                best_loss = val_loss
                best_model = copy.deepcopy(self.model)
            if (epoch + 1) % max(1, self.max_epochs // 10) == 0 or epoch == 0:
                lbl_loss = lbl_loss / (iter + 1)
                
                if lbl_cols is not None:
                    print("Epoch: %d, train;val;best qwk: %.4f;%.4f;%.4f, loss: %.4f, val_loss: %.4f" % 
                        (epoch+1, -self.evaluate(X_raw, lbl_cols), -val_loss, -best_loss, total_loss, val_loss)
                    )
                else:
                    print("Epoch: %d, loss: %.4f" % 
                        (epoch+1, -best_loss)
                    )
                    
                

        self.model = best_model
        print(f'Loaded best model with loss={best_loss:.4f}')
        # torch.save(self.model.state_dict(), self.path)
        return self
      
      
      
    def evaluate(self, X_raw: torch.Tensor, lbl_cols):
        keep_indices = torch.where(~X_raw[:, lbl_cols].isnan())[0]
        X_raw = X_raw[keep_indices]
        gt = X_raw[:, lbl_cols[0]].cpu().numpy().round(0).astype(int)
        X_raw[:, lbl_cols] = float('nan')
        yp = self.predict(X_raw, lbl_cols)
        yp = yp.cpu().numpy().round(0).astype(int)
        return -quadratic_weighted_kappa(gt, yp)
      
      
    def predict(self, X_raw: torch.Tensor, lbl_idx, bs=None):
        X_raw = torch.tensor(X_raw, dtype=torch.float32)
        
        # Normalize the input data
        min_val = self.norm_parameters["min"]
        max_val = self.norm_parameters["max"]
        X = X_raw.clone()
        for i in range(X.shape[1]):
            X[:, i] = (X[:, i] - min_val[i]) / (max_val[i] - min_val[i] + eps)
        
        M = (1 - (1 * torch.isnan(X))).float().to(device)
        
        X = torch.nan_to_num(X)
        X = X.to(device)
        
        if bs == None: bs = self.batch_size
        # Prepare DataLoader
        dataset = MAEDataset(X, M)
        dataloader = DataLoader(dataset, batch_size=bs, shuffle=False)
        
        # Ensure model is in evaluation mode
        self.model.eval()
        
        # Tensor to hold predictions
        predictions = torch.zeros(0).to(device)
        
        with torch.no_grad():
            for batch_samples, batch_masks in dataloader:
                # Prepare input for the model
                batch_samples = batch_samples.unsqueeze(dim=1).to(device)
                batch_masks = batch_masks.to(device)
                
                # Forward pass with training=False
                pred = self.model.forward(batch_samples, batch_masks)
                
                pred = pred.reshape(-1)
                
                predictions = torch.cat((predictions, pred), 0)
        
        return predictions * 3.

    
    def fit_transform(self, X: torch.Tensor) -> torch.Tensor:
        """Imputes the provided dataset using the GAIN strategy.
        Args:
            X: np.ndarray
                A dataset with missing values.
        Returns:
            Xhat: The imputed dataset.
        """
        X = torch.tensor(X.values, dtype=torch.float32)
        return self.fit(X).transform(X).detach().cpu().numpy()

## Load training tensors

In [None]:
full_df = pd.concat([train,test])
X_raw = torch.tensor(full_df.to_numpy()).float()
print(X_raw.shape)

def random_extend(arr, k):
    indices = np.concatenate([np.random.permutation(len(arr)) for _ in range(10)])[:k]
    return arr[indices]

random_extend(X_raw, 5000).shape

for c in full_df.columns:
    if c not in test.columns:
        test[c] = float('nan')

test = test[full_df.columns]

X_tensor_test = torch.tensor(test.to_numpy()).float()


def get_model_size(model):
    model_parameters = filter(lambda p: p.requires_grad, model.parameters())
    model_size = sum([np.prod(p.size()) for p in model_parameters])
    return "{}K".format(round(model_size / 1e1) / 1e2)
  

## Training function

In [None]:

from sklearn.base import clone

def quadratic_weighted_kappa(y_true, y_pred):
    return cohen_kappa_score(y_true, y_pred, weights='quadratic')

def threshold_Rounder(oof_non_rounded, thresholds):
    return np.where(oof_non_rounded < thresholds[0], 0,
                    np.where(oof_non_rounded < thresholds[1], 1,
                             np.where(oof_non_rounded < thresholds[2], 2, 3)))

def evaluate_predictions(thresholds, y_true, oof_non_rounded):
    rounded_p = threshold_Rounder(oof_non_rounded, thresholds)
    return -quadratic_weighted_kappa(y_true, rounded_p)

from tqdm import tqdm
from sklearn.model_selection import KFold


def random_extend(arr, k):
    indices = np.concatenate([np.random.permutation(len(arr)) for _ in range(10)])[:k]
    return arr[indices]


num_folds = 5

def PerformImpute(imputer_args):
    global X_raw, X_tensor_test, num_folds
    train_S = []
    test_S = []
    
    KF = KFold(n_splits=num_folds, shuffle=True, random_state=SEED)

    oof_non_rounded = []
    oof_rounded = []
    oof_gt = []
    test_preds = np.zeros((len(X_tensor_test), num_folds))
    
    
    lbl_cols = [full_df.columns.get_loc(c) for c in full_df.columns if 'PCIAT' in c or 'sii' in c]
    lbl_idx = lbl_cols[0]
    
    lbled_indices = torch.where(~X_raw[:, lbl_idx].isnan())[0]
    
    
    
    X_raw_no_lbl = X_raw.clone()
    X_raw_no_lbl[:, lbl_cols] = float('nan')
    
    pretrain_args = copy.deepcopy(imputer_args)
    pretrain_args.max_epochs = pretrain_args.pretrain_epochs
    import time
    pretrain_tick = time.time()
    imputer_pretrain = ReMasker(pretrain_args)
    imputer_pretrain.fit(X_raw_no_lbl, None, None, None)
    ellapsed_time = time.time() - pretrain_tick
    print(f"Pretrained in {ellapsed_time:.4f}s.")
    
    pretrain_model = imputer_pretrain.model

    

    pbar = tqdm(KF.split(lbled_indices), desc="Training Folds", total=n_splits)    

    for fold, (train_idx_idx, test_idx_idx) in enumerate(pbar):
        train_idx = lbled_indices[train_idx_idx]
        test_idx = lbled_indices[test_idx_idx]

        X_train = X_raw.clone()
        
        
        
        X_train[test_idx.unsqueeze(1), lbl_cols] = float('nan')
        
        X_val = X_raw[test_idx].clone()

        

        X_train = random_extend(X_train, 9000)
        # X_val = random_extend(X_val, 2000)
        
    
        train_nonna_indices = torch.where(~X_train[:, lbl_idx].isnan())[0]
        val_nonna_indices = torch.where(~X_val[:, lbl_idx].isnan())[0]
        if len(train_nonna_indices) == 0 or len(val_nonna_indices)==0: continue
    
        imputer = ReMasker(imputer_args)
        imputer.fit(X_train, X_val, lbl_cols, pretrain_model)

        y_train_ = X_train[train_nonna_indices, lbl_idx].numpy().astype(int)
        y_val_ = X_val[val_nonna_indices, lbl_idx].numpy().astype(int)

        X_train[:, lbl_cols] = float('nan')
        X_val[:, lbl_cols] = float('nan')
        
        y_train_pred = imputer.predict(X_train[train_nonna_indices], lbl_cols).cpu().detach().numpy()
        y_val_pred = imputer.predict(X_val[val_nonna_indices], lbl_cols).cpu().detach().numpy()
        y_test_pred = imputer.predict(X_tensor_test, lbl_cols).cpu().detach().numpy()

        # model = clone(model_init)

        # model.fit(X_train_, y_train_)

        oof_non_rounded += [y_val_pred]
        y_val_pred_rounded = y_val_pred.round(0).astype(int)
        oof_rounded += [y_val_pred_rounded]
        oof_gt += [y_val_]
        
        train_kappa = quadratic_weighted_kappa(y_train_, y_train_pred.round(0).astype(int))
        val_kappa = quadratic_weighted_kappa(y_val_, y_val_pred.round(0).astype(int))

        train_S.append(train_kappa)
        test_S.append(val_kappa)
        
        
        test_preds[:, fold] = y_test_pred

        pbar.set_description_str(
          "Fold %d, Train MSE: %.4f, Val MSE: %.4f, Train QWK: %.4f, Val QWK: %.4f" % (
              fold + 1,
              ((y_train_pred - y_train_) ** 2 / 9.).mean(),
              ((y_val_pred - y_val_) ** 2 / 9.).mean(),
              train_kappa,
              val_kappa
          )
        )
    
    
    print(f"Mean Train QWK --> {np.mean(train_S):.4f}")
    print(f"Mean Validation QWK ---> {np.mean(test_S):.4f}")

    oof_non_rounded = np.concatenate(oof_non_rounded)
    oof_gt = np.concatenate(oof_gt)
    KappaOPtimizer = minimize(evaluate_predictions,
                              x0=[0.5, 1.5, 2.5], args=(oof_gt, oof_non_rounded), 
                              method='Nelder-Mead')
    assert KappaOPtimizer.success, "Optimization did not converge."
    
    oof_tuned = threshold_Rounder(oof_non_rounded, KappaOPtimizer.x)
    tKappa = quadratic_weighted_kappa(oof_gt, oof_tuned)

    print(f"----> || Optimized QWK SCORE :: {Fore.CYAN}{Style.BRIGHT} {tKappa:.3f}{Style.RESET_ALL}")


    # return 0.5 * (np.mean(test_S) + tKappa)
  
  
    tpm = test_preds.mean(axis=1)
    tpTuned = threshold_Rounder(tpm, KappaOPtimizer.x)
    
    sample_sub_df = pd.read_csv('../input/child-mind-institute-problematic-internet-use/sample_submission.csv')
    submission = pd.DataFrame({
        'id': sample_sub_df['id'],
        'sii': tpTuned
    })

    return submission



## Run submission

In [None]:
imputer_args = Namespace(
    batch_size=64,
    max_epochs= 50,
    pretrain_epochs=400,
    accum_iter=1,
    mask_ratio=0.75,
    embed_dim=6,
    depth=8,
    decoder_depth=1,
    num_heads=3,
    mlp_ratio=21.5,
    cls_mlp_dim=48,
    dropout=0.5,
    encode_func='linear',
    norm_field_loss=False,
    ema_decay=0.9,
    weight_decay=0.05,
    lr=None, blr=0.001,
    min_lr=1e-05,
    device='cuda', seed=SEED, overwrite=True, pin_mem=True
)
print(imputer_args)



SEED = random.randint(1, int(2e9))
np.random.seed(SEED)
indices = np.random.permutation(len(X_raw))
X_raw = X_raw[indices]
submission6 = PerformImpute(imputer_args)
submission6

# Final majority voting

In [None]:
sub1 = submission1
sub2 = submission2
sub3 = submission3
sub4 = submission4
sub5 = submission5
sub6 = submission6

sub1 = sub1.sort_values(by='id').reset_index(drop=True)
sub2 = sub2.sort_values(by='id').reset_index(drop=True)
sub3 = sub3.sort_values(by='id').reset_index(drop=True)
sub4 = sub4.sort_values(by='id').reset_index(drop=True)
sub5 = sub5.sort_values(by='id').reset_index(drop=True)
sub6 = sub6.sort_values(by='id').reset_index(drop=True)


combined = pd.DataFrame({
    'id': sub1['id'],
    'sii_1': sub1['sii'],
    'sii_2': sub2['sii'],
    'sii_3': sub3['sii'],
    'sii_4': sub4['sii'],
    'sii_5': sub5['sii'],
    'sii_6': sub6['sii'],
})

def majority_vote(row):
    return row.mode()[0]

combined['final_sii'] = combined[['sii_1', 'sii_2', 'sii_3', 'sii_4', 'sii_5', 'sii_6']].apply(majority_vote, axis=1)

final_submission = combined[['id', 'final_sii']].rename(columns={'final_sii': 'sii'})

final_submission.to_csv('submission.csv', index=False)

print("Majority voting completed and saved to 'Final_Submission.csv'")