# **Artificial Neural Networks and Deep Learning**

---

## ‚öôÔ∏è **Libraries Import**

In [21]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [22]:
# Set seed for reproducibility
SEED = 42

# Import necessary libraries
import os

# Set environment variables before importing modules
os.environ['PYTHONHASHSEED'] = str(SEED)
os.environ['MPLCONFIGDIR'] = os.getcwd() + '/configs/'

# Suppress warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=Warning)

# Import necessary modules
import logging
import random
import numpy as np

# Set seeds for random number generators in NumPy and Python
np.random.seed(SEED)
random.seed(SEED)

# Import PyTorch
import torch
torch.manual_seed(SEED)
from torch import nn
from torchsummary import summary
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import TensorDataset, DataLoader, WeightedRandomSampler
from torch.optim.lr_scheduler import ReduceLROnPlateau, CosineAnnealingWarmRestarts
import torch.nn.functional as F

logs_dir = "tensorboard"
!pkill -f tensorboard
%load_ext tensorboard
!mkdir -p models

if torch.cuda.is_available():
    device = torch.device("cuda")
    torch.cuda.manual_seed_all(SEED)
    torch.backends.cudnn.benchmark = True
else:
    device = torch.device("cpu")

print(f"PyTorch version: {torch.__version__}")
print(f"Device: {device}")

# Import other libraries
import copy
import shutil
from datetime import datetime
from itertools import product
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import train_test_split, StratifiedKFold
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
from pathlib import Path
import pickle

# Configure plot display settings
sns.set(font_scale=1.4)
sns.set_style('white')
plt.rc('font', size=14)
%matplotlib inline

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard
PyTorch version: 2.8.0+cu126
Device: cuda


## ‚è≥ **Data Loading**

In [23]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [24]:
import os


In [25]:
DATASET_ROOT = Path("./dataset")

# --- 2Ô∏è‚É£ Kaggle ---
DATASET_ROOT = Path("/content/drive/MyDrive/pirate-pain")

# --- 3Ô∏è‚É£ Server o cluster privato (es. Westworld/Elysium) ---
# DATASET_ROOT = Path("/multiverse/datasets/private_dataset/pirate_pain")


In [26]:
# Create output directory for models
output_dir = './ensemble_models'
os.makedirs(output_dir, exist_ok=True)

In [27]:
import pandas as pd

# Caricamento dati
X_train = pd.read_csv(DATASET_ROOT / "pirate_pain_train.csv")
X_TRAIN = pd.read_csv(DATASET_ROOT / "pirate_pain_train.csv")

y_train = pd.read_csv(DATASET_ROOT / "pirate_pain_train_labels.csv")
Y_TRAIN = pd.read_csv(DATASET_ROOT / "pirate_pain_train_labels.csv")

X_test  = pd.read_csv(DATASET_ROOT / "pirate_pain_test.csv")

print(f"  X_train: {X_train.shape}")
print(f"  y_train: {y_train.shape}")
print(f"  X_test:  {X_test.shape}")

  X_train: (105760, 40)
  y_train: (661, 2)
  X_test:  (211840, 40)


In [28]:
def dataset_conversion_type_embed_ready(df):
    """
    Minimal, embedding-friendly preprocessing:
    - joints: float32 (continuous features)
    - pain_survey_*: int64 indices (0..2) for embeddings
    - n_legs/hands/eyes: mapped to {0,1} as int64 for embeddings
    Returns: df, meta
    """
    df = df.copy()

    # 1) continuous features
    joint_cols = [c for c in df.columns if c.startswith("joint_")]
    df[joint_cols] = df[joint_cols].astype("float32")

    # 2) surveys as categorical indices (already 0/1/2)
    pain_survey_cols = [c for c in df.columns if c.startswith("pain_survey_")]
    df[pain_survey_cols] = df[pain_survey_cols].astype("int64")

    # 3) 2-way categoricals ‚Üí indices
    legs_map  = {"two": 0, "one+peg_leg": 1}
    hands_map = {"two": 0, "one+hook_hand": 1}
    eyes_map  = {"two": 0, "one+eye_patch": 1}

    if "n_legs" in df.columns:
        df["n_legs"]  = df["n_legs"].map(legs_map).astype("int64")
    if "n_hands" in df.columns:
        df["n_hands"] = df["n_hands"].map(hands_map).astype("int64")
    if "n_eyes" in df.columns:
        df["n_eyes"]  = df["n_eyes"].map(eyes_map).astype("int64")

    # 4) define columns
    cat_two_cols = [c for c in ["n_legs","n_hands","n_eyes"] if c in df.columns]
    cat_cols = pain_survey_cols + cat_two_cols
    cont_cols = joint_cols  # keep only joints as continuous

    # 5) cardinals for embeddings (compute on TRAIN ONLY in CV, reuse for VAL/TEST)
    cardinals = {c: int(df[c].nunique()) for c in cat_cols}
    # suggested tiny dims: 1 for binaries, 2 for 3-class surveys
    emb_dims = {c: (1 if cardinals[c] == 2 else 2) for c in cat_cols}

    meta = {
        "cont_cols": cont_cols,
        "cat_cols":  cat_cols,
        "cardinals": cardinals,
        "emb_dims":  emb_dims,
        "maps": {"n_legs": legs_map, "n_hands": hands_map, "n_eyes": eyes_map},
    }
    return df, meta


In [29]:
def preprocess_joints(df,
                      drop_redundant=False,
                      drop_near_zero=False,
                      drop_low_var=False,
                      verbose=True):
    """
    Simplify joint_* preprocessing based on EDA results.
    Removes constant, redundant, or near-zero-variance joints.

    Returns a (df_out, feature_cols) tuple.
    """
    joint_cols = sorted([c for c in df.columns if c.startswith("joint_")],
                        key=lambda x: int(x.split("_")[1]))
    drop = set()

    # 1 Drop constant joint_30
    if "joint_30" in joint_cols:
        drop.add("joint_30")

    if "joint_11" in joint_cols:
        drop.add("joint_11")

    #  Drop redundant joints (from correlation heatmap)
    if drop_redundant:
        for c in ["joint_01", "joint_02", "joint_05"]:
            if c in joint_cols:
                drop.add(c)

    # Drop near-zero variance joints (joint_13‚Äì25)
    if drop_near_zero:
        for i in range(13, 26):
            c = f"joint_{i:02d}"
            if c in joint_cols:
                drop.add(c)

    # (Optional) Drop low-variance but not-zero joints (joint_26‚Äì29)
    if drop_low_var:
        for i in range(26, 30):
            c = f"joint_{i:02d}"
            if c in joint_cols:
                drop.add(c)

    # apply
    kept = [c for c in joint_cols if c not in drop]
    df_out = df.drop(columns=list(drop), errors="ignore")

    if verbose:
        print(f"[preprocess_joints] start={len(joint_cols)} | kept={len(kept)} | dropped={len(drop)}")
        if drop:
            print("  ‚Ä¢ dropped:", sorted(list(drop)))

    return df_out, kept


## üîÑ **Data Preprocessing**

In [30]:
def build_sequences(
    df: pd.DataFrame,
    y: pd.DataFrame | np.ndarray | None = None,
    window: int | None = None,
    stride: int | None = None,
    pad: bool = False,
    add_time_features: bool = True
):
    """
    Build sequences from the dataset, either:
      - full-length per sample_index (when window/stride are None), or
      - sliding windows with given window and stride.

    Data assumptions for THIS notebook:
      ‚Ä¢ df already normalized/mapped (categoricals numeric; e.g., n_legs/hands/eyes ‚àà {0,1})
      ‚Ä¢ df has columns: ['sample_index','time', joint_*, pain_survey_*, n_legs, n_hands, n_eyes]
      ‚Ä¢ each sample_index has T=160 rows (fixed-length), but we still allow windowing/stride

    Returns:
        dataset: np.ndarray of shape (N,T,F) or (N,window,F)
        labels:  np.ndarray of shape (N,) if y is provided, else None
    """
    # ------------------------------------------------------------------
    # Feature groups (already numeric at this stage)
    joint_cols  = [c for c in df.columns if c.startswith('joint_')]
    pain_cols   = [c for c in df.columns if c.startswith('pain_survey_')]
    static_cols = [c for c in ['n_legs', 'n_hands', 'n_eyes'] if c in df.columns]

    # Keep only the necessary columns in a copy; preserve order
    cols_needed = ['sample_index', 'time'] + joint_cols + pain_cols + static_cols
    df = df[cols_needed].copy()

    # Sort to preserve chronological order within each sequence
    df = df.sort_values(["sample_index", "time"])

    # If labels are provided, build a lookup dictionary: sample_index ‚Üí label
    label_dict = None
    if y is not None:
        if isinstance(y, np.ndarray):
            # Build mapping using the unique order of sample_index in df
            unique_ids = df["sample_index"].unique()
            label_dict = {sid: int(lbl) for sid, lbl in zip(unique_ids, y)}
        elif isinstance(y, pd.DataFrame):
            # Expect columns ['sample_index','label'] with already-int-mapped labels
            label_dict = dict(zip(y["sample_index"], y["label"]))

    # Prepare outputs
    dataset = []
    labels  = []

    # If no window/stride provided ‚Üí fall back to full-length per sequence
    full_length_mode = (window is None or stride is None)

    # Iterate over each sequence
    for sid, group in df.groupby("sample_index", sort=False):
        # --- Extract groups (preserve types for embeddings) ---
        X_joints = group[joint_cols].to_numpy(dtype=np.float32)        # (T, J) - continuous features

        # IMPORTANT: Pain survey features are categorical indices {0,1,2}
        # Keep as int64 first, then convert to float32 to preserve exact integer values
        X_pain = group[pain_cols].to_numpy(dtype=np.int64)             # (T, 4) - categorical indices
        X_pain = X_pain.astype(np.float32)                              # Convert to float32 but keep 0.0, 1.0, 2.0

        # IMPORTANT: Static features are categorical indices {0,1}
        # Keep as int64 first, then convert to float32 to preserve exact integer values
        if static_cols:
            X_static = group[static_cols].to_numpy(dtype=np.int64)     # (T, 3) - categorical indices
            X_static = X_static.astype(np.float32)                      # Convert to float32 but keep 0.0, 1.0
        else:
            X_static = None


        # Time features: extract normalized time + sinusoidal encoding
        if add_time_features:
            time_values = group['time'].to_numpy(dtype=np.float32)
            max_time = time_values.max()
            normalized_time = time_values / max_time if max_time > 0 else time_values
            time_sin = np.sin(2 * np.pi * normalized_time)
            time_cos = np.cos(2 * np.pi * normalized_time)
            X_time = np.stack([normalized_time, time_sin, time_cos], axis=1)  # (T, 3)
        else:
            X_time = None

        # Concatenate all feature groups along last dimension
        if X_static is not None:
            X_full = np.concatenate([X_joints, X_pain, X_static], axis=1)  # (T, F_total)
        else:
            X_full = np.concatenate([X_joints, X_pain], axis=1)            # (T, F_total)

        # Add time features if enabled
        if X_time is not None:
            X_full = np.concatenate([X_full, X_time], axis=1)              # (T, F_total + 3)

        T = X_full.shape[0]

        if full_length_mode:
            # ----- FULL-LENGTH MODE -----
            dataset.append(X_full)
            if label_dict is not None and sid in label_dict:
                labels.append(int(label_dict[sid]))
        else:
            # ----- WINDOWED MODE (window, stride) -----
            W = int(window)
            S = int(stride)
            assert W > 0 and S > 0, "window and stride must be positive integers"

            if pad and T % W != 0:
                # pad at the end with zeros to allow the last partial window
                pad_len = (W - (T % W)) % W
                if pad_len > 0:
                    X_pad = np.zeros((pad_len, X_full.shape[1]), dtype=np.float32)
                    X_seq = np.concatenate([X_full, X_pad], axis=0)
                else:
                    X_seq = X_full
                Tmax = X_seq.shape[0]
                idx = 0
                while idx + W <= Tmax:
                    dataset.append(X_seq[idx:idx+W])
                    if label_dict is not None and sid in label_dict:
                        labels.append(int(label_dict[sid]))
                    idx += S
            else:
                # no padding ‚Üí only windows fully inside the sequence
                idx = 0
                while idx + W <= T:
                    dataset.append(X_full[idx:idx+W])
                    if label_dict is not None and sid in label_dict:
                        labels.append(int(label_dict[sid]))
                    idx += S

    # Convert to numpy arrays
    dataset = np.asarray(dataset, dtype=np.float32) if len(dataset) > 0 else np.empty((0, 0, 0), dtype=np.float32)
    labels  = np.asarray(labels,  dtype=np.int64)   if len(labels)  > 0 else None

    if dataset.size > 0:
        print(f"Built {len(dataset)} sequence{'s' if len(dataset)!=1 else ''}; each shape = {dataset[0].shape}")
    else:
        print("Built 0 sequences (check window/stride vs sequence length).")

    return dataset, labels


In [31]:

def make_loader(ds, batch_size, shuffle, drop_last, sampler=None):
    # Determine optimal number of worker processes for data loading
    cpu_cores = os.cpu_count() or 2
    num_workers = max(2, min(4, cpu_cores))

    final_shuffle = shuffle
    if sampler is not None:
        final_shuffle = False

    # Create DataLoader with performance optimizations
    return DataLoader(
        ds,
        batch_size=batch_size,
        shuffle=final_shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
        sampler=sampler,
        pin_memory=True,  # Faster GPU transfer
        pin_memory_device="cuda" if torch.cuda.is_available() else "",
        prefetch_factor=4,  # Load 4 batches aheads
    )

## üõ†Ô∏è **Model Building**

In [32]:
def recurrent_summary(model, input_size):
    """
    Custom summary function that emulates torchinfo's output while correctly
    counting parameters for RNN/GRU/LSTM layers.

    This function is designed for models whose direct children are
    nn.Linear, nn.RNN, nn.GRU, or nn.LSTM layers.

    Args:
        model (nn.Module): The model to analyze.
        input_size (tuple): Shape of the input tensor (e.g., (seq_len, features)).
    """

    # Dictionary to store output shapes captured by forward hooks
    output_shapes = {}
    # List to track hook handles for later removal
    hooks = []

    def get_hook(name):
        """Factory function to create a forward hook for a specific module."""
        def hook(module, input, output):
            # Handle RNN layer outputs (returns a tuple)
            if isinstance(output, tuple):
                # output[0]: all hidden states with shape (batch, seq_len, hidden*directions)
                shape1 = list(output[0].shape)
                shape1[0] = -1  # Replace batch dimension with -1

                # output[1]: final hidden state h_n (or tuple (h_n, c_n) for LSTM)
                if isinstance(output[1], tuple):  # LSTM case: (h_n, c_n)
                    shape2 = list(output[1][0].shape)  # Extract h_n only
                else:  # RNN/GRU case: h_n only
                    shape2 = list(output[1].shape)

                # Replace batch dimension (middle position) with -1
                shape2[1] = -1

                output_shapes[name] = f"[{shape1}, {shape2}]"

            # Handle standard layer outputs (e.g., Linear)
            else:
                shape = list(output.shape)
                shape[0] = -1  # Replace batch dimension with -1
                output_shapes[name] = f"{shape}"
        return hook

    # 1. Determine the device where model parameters reside
    try:
        device = next(model.parameters()).device
    except StopIteration:
        device = torch.device("cpu")  # Fallback for models without parameters

    # 2. Create a dummy input tensor with batch_size=1
    dummy_input = torch.randn(1, *input_size).to(device)

    # 3. Register forward hooks on target layers
    # Iterate through direct children of the model (e.g., self.rnn, self.classifier)
    for name, module in model.named_children():
        if isinstance(module, (nn.Linear, nn.RNN, nn.GRU, nn.LSTM)):
            # Register the hook and store its handle for cleanup
            hook_handle = module.register_forward_hook(get_hook(name))
            hooks.append(hook_handle)

    # 4. Execute a dummy forward pass in evaluation mode
    model.eval()
    with torch.no_grad():
        try:
            model(dummy_input)
        except Exception as e:
            print(f"Error during dummy forward pass: {e}")
            # Clean up hooks even if an error occurs
            for h in hooks:
                h.remove()
            return

    # 5. Remove all registered hooks
    for h in hooks:
        h.remove()

    # --- 6. Print the summary table ---

    print("-" * 79)
    # Column headers
    print(f"{'Layer (type)':<25} {'Output Shape':<28} {'Param #':<18}")
    print("=" * 79)

    total_params = 0
    total_trainable_params = 0

    # Iterate through modules again to collect and display parameter information
    for name, module in model.named_children():
        if name in output_shapes:
            # Count total and trainable parameters for this module
            module_params = sum(p.numel() for p in module.parameters())
            trainable_params = sum(p.numel() for p in module.parameters() if p.requires_grad)

            total_params += module_params
            total_trainable_params += trainable_params

            # Format strings for display
            layer_name = f"{name} ({type(module).__name__})"
            output_shape_str = str(output_shapes[name])
            params_str = f"{trainable_params:,}"

            print(f"{layer_name:<25} {output_shape_str:<28} {params_str:<15}")

    print("=" * 79)
    print(f"Total params: {total_params:,}")
    print(f"Trainable params: {total_trainable_params:,}")
    print(f"Non-trainable params: {total_params - total_trainable_params:,}")
    print("-" * 79)

In [33]:
class RecurrentClassifier(nn.Module):
    def __init__(
            self,
            input_size,
            hidden_size,
            num_layers,
            num_classes,
            rnn_type='GRU',
            bidirectional=False,
            dropout_rate=0.2,
            rec_dropout_rate=None,
            cnn_channels=None,
            cnn_kernel_size=3,
            cnn_dropout=None,
            use_pain_embeddings=True,
            pain_embedding_dim=4,
            num_joint_features=29,
            num_pain_features=4,
            num_static_features=3,
            num_time_features=3,
            use_attention=True
            ):
        super().__init__()
        self.rnn_type = rnn_type
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.bidirectional = bidirectional
        self.uses_cnn = cnn_channels is not None
        self.use_pain_embeddings = use_pain_embeddings
        self.use_attention = use_attention
        # ---------------------------------------------
        # Store feature split indices (same as before)
        self.num_joint_features = num_joint_features
        self.num_pain_features = num_pain_features
        self.num_static_features = num_static_features
        self.num_time_features = num_time_features
        self.joint_end = num_joint_features
        self.pain_end = self.joint_end + num_pain_features
        self.static_end = self.pain_end + num_static_features
        # ---------------------------------------------
        # Check input size
        expected_input = num_joint_features + num_pain_features + num_static_features + num_time_features
        if input_size != expected_input:
            print(f"WARNING: input_size={input_size} but expected {expected_input}")
        # ---------------------------------------------
        # Embeddings (if enabled)
        if self.use_pain_embeddings:
            self.pain_embeddings = nn.ModuleList([
                nn.Embedding(num_embeddings=3, embedding_dim=pain_embedding_dim)
                for _ in range(num_pain_features)
            ])
            effective_input_size = (num_joint_features + num_pain_features * pain_embedding_dim + num_static_features + num_time_features)
        else:
            effective_input_size = input_size
        # ---------------------------------------------
        # Optionally CNN block
        if self.uses_cnn:
            self.cnn1 = nn.Conv1d(in_channels=effective_input_size,
                                  out_channels=cnn_channels,
                                  kernel_size=cnn_kernel_size,
                                  padding=cnn_kernel_size // 2)
            self.cnn_bn = nn.BatchNorm1d(cnn_channels)
            self.cnn_act = nn.ReLU()
            self.cnn_dropout = nn.Dropout(cnn_dropout if cnn_dropout is not None else dropout_rate)
            rnn_input_size = cnn_channels
        else:
            rnn_input_size = effective_input_size
        # ---------------------------------------------
        # RNN
        rnn_map = {'RNN': nn.RNN, 'LSTM': nn.LSTM, 'GRU': nn.GRU}
        rnn_module = rnn_map[rnn_type]
        dropout_val = dropout_rate if num_layers > 1 else 0
        self.rnn = rnn_module(
            input_size=rnn_input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=bidirectional,
            dropout=dropout_val
        )
        # ---------------------------------------------
        # Classifier layer (after attention output)
        rnn_output_size = hidden_size * (2 if bidirectional else 1)
        self.classifier = nn.Linear(rnn_output_size, num_classes)
        self.rec_dropout = nn.Dropout(rec_dropout_rate) if rec_dropout_rate is not None else None

    def scaled_dot_product_attention(self, query, key, value):
        """
        Args:
            query: (batch, 1, d)
            key, value: (batch, seq_len, d)
        Computes attention weights and context.
        """
        # Compute dot products
        scores = torch.matmul(query, key.transpose(1,2))  # (batch, 1, seq_len)
        scores = scores / torch.sqrt(torch.tensor(query.size(-1), dtype=torch.float32, device=query.device))
        attn_weights = F.softmax(scores, dim=-1)          # (batch, 1, seq_len)
        context = torch.matmul(attn_weights, value)        # (batch, 1, d)
        context = context.squeeze(1)                      # (batch, d)
        return context, attn_weights.squeeze(1)

    def forward(self, x):
        # --- Embedding and feature split ---
        if self.use_pain_embeddings:
            X_joints = x[:, :, :self.joint_end]
            X_pain = x[:, :, self.joint_end:self.pain_end]
            X_static = x[:, :, self.pain_end:self.static_end]
            X_time = x[:, :, self.static_end:]
            pain_embedded_list = [self.pain_embeddings[i](X_pain[:,:,i].long()) for i in range(self.num_pain_features)]
            X_pain_embedded = torch.cat(pain_embedded_list, dim=-1)
            x = torch.cat([X_joints, X_pain_embedded, X_static, X_time], dim=-1)
        # --- Optionally CNN preprocessing ---
        if self.uses_cnn:
            x = x.transpose(1, 2)
            x = self.cnn1(x)
            x = self.cnn_bn(x)
            x = self.cnn_act(x)
            x = self.cnn_dropout(x)
            x = x.transpose(1, 2)
        # --- RNN ---
        rnn_out, hidden = self.rnn(x)  # rnn_out: (batch, seq_len, hidden_size * directions)

        # Use last hidden state as query for attention
        if self.rnn_type == 'LSTM':
            hidden = hidden[0]
        if self.bidirectional:
            hidden = hidden.view(self.num_layers, 2, -1, self.hidden_size)
            final_hidden = torch.cat([hidden[-1,0,:,:], hidden[-1,1,:,:]], dim=1)  # (batch, rnn_output_size)
        else:
            final_hidden = hidden[-1]  # (batch, rnn_output_size)
        # --- Attention: query=final_hidden, key/value=all rnn_out ---
        if self.use_attention:
            query = final_hidden.unsqueeze(1)  # (batch, 1, rnn_output_size)
            context, attn_weights = self.scaled_dot_product_attention(query, rnn_out, rnn_out)
        else:
            context = final_hidden

        # --- Dropout and classification ---
        if self.rec_dropout is not None:
            context = self.rec_dropout(context)
        logits = self.classifier(context)  # (batch, num_classes)
        return logits  # Optionally also return attn_weights if you want visualization


## üß† **Model Training**

In [34]:
def log_metrics_to_tensorboard(writer, epoch, train_loss, train_f1, val_loss, val_f1, model):
    """
    Log training metrics and model parameters to TensorBoard for visualization.

    Args:
        writer (SummaryWriter): TensorBoard SummaryWriter object for logging
        epoch (int): Current epoch number (used as x-axis in TensorBoard plots)
        train_loss (float): Training loss for this epoch
        train_f1 (float): Training f1 score for this epoch
        val_loss (float): Validation loss for this epoch
        val_f1 (float): Validation f1 score for this epoch
        model (nn.Module): The neural network model (for logging weights/gradients)

    Note:
        This function logs scalar metrics (loss/f1 score) and histograms of model
        parameters and gradients, which helps monitor training progress and detect
        issues like vanishing/exploding gradients.
    """
    # Log scalar metrics
    writer.add_scalar('Loss/Training', train_loss, epoch)
    writer.add_scalar('Loss/Validation', val_loss, epoch)
    writer.add_scalar('F1/Training', train_f1, epoch)
    writer.add_scalar('F1/Validation', val_f1, epoch)

    # Log model parameters and gradients
    for name, param in model.named_parameters():
        if param.requires_grad:
            # Check if the tensor is not empty before adding a histogram
            if param.numel() > 0:
                writer.add_histogram(f'{name}/weights', param.data, epoch)
            if param.grad is not None:
                # Check if the gradient tensor is not empty before adding a histogram
                if param.grad.numel() > 0:
                    if param.grad is not None and torch.isfinite(param.grad).all():
                        writer.add_histogram(f'{name}/gradients', param.grad.data, epoch)

In [35]:
# Initialize best model tracking variables
best_model = None
best_performance = float('-inf')

In [36]:
def train_one_epoch(model, train_loader, criterion, optimizer, scaler,
                    device, l1_lambda=0, l2_lambda=0,max_grad_norm=1.0):
    """
    Perform one complete training epoch through the entire training dataset.

    Args:
        model (nn.Module): The neural network model to train
        train_loader (DataLoader): PyTorch DataLoader containing training data batches
        criterion (nn.Module): Loss function (e.g., CrossEntropyLoss, MSELoss)
        optimizer (torch.optim): Optimization algorithm (e.g., Adam, SGD)
        scaler (GradScaler): PyTorch's gradient scaler for mixed precision training
        device (torch.device): Computing device ('cuda' for GPU, 'cpu' for CPU)
        l1_lambda (float): Lambda for L1 regularization
        l2_lambda (float): Lambda for L2 regularization

    Returns:
        tuple: (average_loss, f1 score) - Training loss and f1 score for this epoch
    """
    model.train()  # Set model to training mode

    running_loss = 0.0
    all_predictions = []
    all_targets = []

    # Iterate through training batches
    for batch_idx, (inputs, targets) in enumerate(train_loader):
        # Move data to device (GPU/CPU)
        inputs, targets = inputs.to(device), targets.to(device)

        # Clear gradients from previous step
        optimizer.zero_grad(set_to_none=True)

        # Forward pass with mixed precision (if CUDA available)
        with torch.amp.autocast(device_type=device.type, enabled=(device.type == 'cuda')):
            logits = model(inputs)
            loss = criterion(logits, targets)

            # Add L1 and L2 regularization
            l1_norm = sum(p.abs().sum() for p in model.parameters())
            l2_norm = sum(p.pow(2).sum() for p in model.parameters())
            loss = loss + l1_lambda * l1_norm + l2_lambda * l2_norm


        # Backward pass with gradient scaling
        if scaler is not None and device.type == 'cuda':
            scaler.scale(loss).backward()            # grads are scaled
            scaler.unscale_(optimizer)               # unscale to true grad values
            torch.nn.utils.clip_grad_norm_(          # CLIP true gradients (magnitude cap)
                model.parameters(), max_norm=max_grad_norm
            )
            scaler.step(optimizer)                   # safe optimizer.step() (skips on inf/NaN)
            scaler.update()                          # update scaling factor
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=max_grad_norm)
            optimizer.step()

        # Accumulate metrics
        running_loss += loss.item() * inputs.size(0)
        predictions = logits.argmax(dim=1)
        all_predictions.append(predictions.cpu().numpy())
        all_targets.append(targets.cpu().numpy())

    # Calculate epoch metrics
    epoch_loss = running_loss / len(train_loader.dataset)
    epoch_f1 = f1_score(
        np.concatenate(all_targets),
        np.concatenate(all_predictions),
        average='macro'
    )

    return epoch_loss, epoch_f1

In [37]:
def validate_one_epoch(model, val_loader, criterion, device):
    """
    Perform one complete validation epoch through the entire validation dataset.

    Args:
        model (nn.Module): The neural network model to evaluate (must be in eval mode)
        val_loader (DataLoader): PyTorch DataLoader containing validation data batches
        criterion (nn.Module): Loss function used to calculate validation loss
        device (torch.device): Computing device ('cuda' for GPU, 'cpu' for CPU)

    Returns:
        tuple: (average_loss, accuracy) - Validation loss and accuracy for this epoch

    Note:
        This function automatically sets the model to evaluation mode and disables
        gradient computation for efficiency during validation.
    """
    model.eval()  # Set model to evaluation mode

    running_loss = 0.0
    all_predictions = []
    all_targets = []

    # Disable gradient computation for validation
    with torch.no_grad():
        for inputs, targets in val_loader:
            # Move data to device
            inputs, targets = inputs.to(device), targets.to(device)

            # Forward pass with mixed precision (if CUDA available)
            with torch.amp.autocast(device_type=device.type, enabled=(device.type == 'cuda')):
                logits = model(inputs)
                loss = criterion(logits, targets)

            # Accumulate metrics
            running_loss += loss.item() * inputs.size(0)
            predictions = logits.argmax(dim=1)
            all_predictions.append(predictions.cpu().numpy())
            all_targets.append(targets.cpu().numpy())

    # Calculate epoch metrics
    epoch_loss = running_loss / len(val_loader.dataset)
    epoch_accuracy = f1_score(
        np.concatenate(all_targets),
        np.concatenate(all_predictions),
        average='macro'
    )

    return epoch_loss, epoch_accuracy

In [38]:
def fit(model, train_loader, val_loader, epochs, train_criterion, val_criterion, optimizer, scaler, device,
        l1_lambda=0, l2_lambda=0, patience=0, scheduler=None, # Added scheduler parameter
        evaluation_metric="val_f1", mode='max',
        restore_best_weights=True, writer=None, verbose=1, experiment_name=""):
    """
    Train the neural network model on the training data and validate on the validation data.

    Args:
        model (nn.Module): The neural network model to train
        train_loader (DataLoader): PyTorch DataLoader containing training data batches
        val_loader (DataLoader): PyTorch DataLoader containing validation data batches
        epochs (int): Number of training epochs
        criterion (nn.Module): Loss function (e.g., CrossEntropyLoss, MSELoss)
        optimizer (torch.optim): Optimization algorithm (e.g., Adam, SGD)
        scaler (GradScaler): PyTorch's gradient scaler for mixed precision training
        device (torch.device): Computing device ('cuda' for GPU, 'cpu' for CPU)
        l1_lambda (float): L1 regularization coefficient (default: 0)
        l2_lambda (float): L2 regularization coefficient (default: 0)
        patience (int): Number of epochs to wait for improvement before early stopping (default: 0)
        evaluation_metric (str): Metric to monitor for early stopping (default: "val_f1")
        mode (str): 'max' for maximizing the metric, 'min' for minimizing (default: 'max')
        restore_best_weights (bool): Whether to restore model weights from best epoch (default: True)
        writer (SummaryWriter, optional): TensorBoard SummaryWriter object for logging (default: None)
        verbose (int, optional): Frequency of printing training progress (default: 10)
        experiment_name (str, optional): Experiment name for saving models (default: "")

    Returns:
        tuple: (model, training_history) - Trained model and metrics history
    """

    # Initialize metrics tracking
    training_history = {
        'train_loss': [], 'val_loss': [],
        'train_f1': [], 'val_f1': []
    }

    # Configure early stopping if patience is set
    if patience > 0:
        patience_counter = 0
        best_metric = float('-inf') if mode == 'max' else float('inf')
        best_epoch = 0

    print(f"Training {epochs} epochs...")

    # Main training loop: iterate through epochs
    for epoch in range(1, epochs + 1):

        # Forward pass through training data, compute gradients, update weights
        train_loss, train_f1 = train_one_epoch(
            model, train_loader, train_criterion, optimizer, scaler, device
        )

        # Evaluate model on validation data without updating weights
        if val_loader is not None:
            val_loss, val_f1 = validate_one_epoch(model, val_loader, val_criterion, device)
        else:
            val_loss, val_f1 = None, None


        # Step the scheduler if provided (typically after validation)
        if scheduler is not None:
            if isinstance(scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau) or isinstance(scheduler, torch.optim.lr_scheduler.CosineAnnealingWarmRestarts):
                scheduler.step(val_f1)
            else:
                scheduler.step()

        # Store metrics for plotting and analysis
        training_history['train_loss'].append(train_loss)
        training_history['val_loss'].append(val_loss)
        training_history['train_f1'].append(train_f1)
        training_history['val_f1'].append(val_f1)

        # Write metrics to TensorBoard for visualization
        if writer is not None:
            log_metrics_to_tensorboard(
                writer, epoch, train_loss, train_f1, val_loss, val_f1, model
            )

        # Print progress every N epochs or on first epoch
        if verbose > 0:
            if epoch % verbose == 0 or epoch == 1:
                if val_loss is not None:
                    print(f"Epoch {epoch:3d}/{epochs} | "
                          f"Train: Loss={train_loss:.4f}, F1 Score={train_f1:.4f} | "
                          f"Val: Loss={val_loss:.4f}, F1 Score={val_f1:.4f}")
                else:
                    print(f"Epoch {epoch:3d}/{epochs} | "
                          f"Train: Loss={train_loss:.4f}, F1 Score={train_f1:.4f}")


        # Early stopping logic: monitor metric and save best model
        if patience > 0 and val_loader is not None:
            current_metric = training_history[evaluation_metric][-1]
            is_improvement = (current_metric > best_metric) if mode == 'max' else (current_metric < best_metric)

            if is_improvement:
                best_metric = current_metric
                best_epoch = epoch
                torch.save(model.state_dict(), "models/"+experiment_name+'_model.pt')
                patience_counter = 0
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    print(f"Early stopping triggered after {epoch} epochs.")
                    break


    # Restore best model weights if early stopping was used
    if restore_best_weights and patience > 0:
        model.load_state_dict(torch.load("models/"+experiment_name+'_model.pt'))
        print(f"Best model restored from epoch {best_epoch} with {evaluation_metric} {best_metric:.4f}")

    # Save final model if no early stopping
    if patience == 0:
        torch.save(model.state_dict(), "models/"+experiment_name+'_model.pt')

    if patience > 0:
        training_history['best_epoch'] = best_epoch
        training_history['best_metric'] = best_metric

    # Close TensorBoard writer
    if writer is not None:
        writer.close()

    return model, training_history

## üõ†Ô∏è **Ensembling**

In [39]:
# ============================================================
# UTILITY FUNCTIONS FOR VISUALIZATION
# ============================================================

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import numpy as np
from collections import Counter

def plot_training_history(history, config_name, fold_idx):
    """Plot training and validation loss/F1"""
    fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(18, 5))

    # Loss plot
    ax1.plot(history['train_loss'], label='Training loss', alpha=0.3, color='#ff7f0e', linestyle='--')
    ax1.plot(history['val_loss'], label='Validation loss', alpha=0.9, color='#ff7f0e')
    ax1.set_title(f'{config_name} - Fold {fold_idx} - Loss')
    ax1.set_xlabel('Epoch')
    ax1.set_ylabel('Loss')
    ax1.legend()
    ax1.grid(alpha=0.3)

    # F1 plot
    ax2.plot(history['train_f1'], label='Training F1', alpha=0.3, color='#ff7f0e', linestyle='--')
    ax2.plot(history['val_f1'], label='Validation F1', alpha=0.9, color='#ff7f0e')
    ax2.set_title(f'{config_name} - Fold {fold_idx} - F1 Score')
    ax2.set_xlabel('Epoch')
    ax2.set_ylabel('F1 Score')
    ax2.legend()
    ax2.grid(alpha=0.3)

    plt.tight_layout()
    plt.show()

def evaluate_and_plot_confusion_matrix(model, val_loader, X_val, window, stride, config_name, fold_idx, device):
    """Evaluate model on validation set and plot confusion matrix"""
    # Collect window-level predictions
    val_preds_windows = []
    val_targets_windows = []

    model.eval()
    with torch.no_grad():
        for xb, yb in val_loader:
            xb = xb.to(device)
            logits = model(xb)
            preds = logits.argmax(dim=1).cpu().numpy()

            val_preds_windows.append(preds)
            val_targets_windows.append(yb.numpy())

    val_preds_windows = np.concatenate(val_preds_windows)
    val_targets_windows = np.concatenate(val_targets_windows)

    # Aggregate windows to sequences (majority vote)
    n_windows_per_seq = (160 - window) // stride + 1
    unique_samples = sorted(X_val['sample_index'].unique())

    sequence_preds = {}
    sequence_targets = {}

    for idx, sid in enumerate(unique_samples):
        start_idx = idx * n_windows_per_seq
        end_idx = start_idx + n_windows_per_seq

        window_preds = val_preds_windows[start_idx:end_idx]
        window_targets = val_targets_windows[start_idx:end_idx]

        # Majority vote
        vote_counts = Counter(window_preds)
        final_pred = vote_counts.most_common(1)[0][0]

        # Sanity check
        assert len(np.unique(window_targets)) == 1, f"Sample {sid} has inconsistent labels!"
        final_target = window_targets[0]

        sequence_preds[sid] = final_pred
        sequence_targets[sid] = final_target

    # Convert to arrays
    val_preds = np.array([sequence_preds[sid] for sid in unique_samples])
    val_targets = np.array([sequence_targets[sid] for sid in unique_samples])

    # Calculate metrics
    val_acc = accuracy_score(val_targets, val_preds)
    val_prec = precision_score(val_targets, val_preds, average='macro', zero_division=0)
    val_rec = recall_score(val_targets, val_preds, average='macro', zero_division=0)
    val_f1 = f1_score(val_targets, val_preds, average='macro', zero_division=0)

    print(f"\n  {config_name} - Fold {fold_idx} Validation Metrics (Sequence-Level):")
    print(f"    Accuracy:  {val_acc:.4f}")
    print(f"    Precision: {val_prec:.4f}")
    print(f"    Recall:    {val_rec:.4f}")
    print(f"    F1 Score:  {val_f1:.4f}")

    # Plot confusion matrix
    cm = confusion_matrix(val_targets, val_preds)
    labels = np.array([f"{num}" for num in cm.flatten()]).reshape(cm.shape)

    plt.figure(figsize=(8, 7))
    sns.heatmap(cm, annot=labels, fmt='', cmap='Blues',
                xticklabels=['no_pain', 'low_pain', 'high_pain'],
                yticklabels=['no_pain', 'low_pain', 'high_pain'])
    plt.xlabel('Predicted labels')
    plt.ylabel('True labels')
    plt.title(f'{config_name} - Fold {fold_idx}\nConfusion Matrix ‚Äî Validation Set (Sequence-Level)')
    plt.tight_layout()
    plt.show()

    return val_f1

print("‚úì Utility functions defined")


‚úì Utility functions defined


In [40]:
ENSEMBLE_CONFIGS = [
    # 1. Nessun CNN, nessuna attention => modello grande
    {
        "name": "gru_big_no_cnn_no_att",
        "seed": 101,
        "lr": 3e-4,
        "batch_size": 256,
        "epochs": 100,
        "patience": 20,
        "hidden_layers": 3,
        "hidden_size": 256,
        "dropout": 0.4,
        "rec_dropout": 0.25,
        "l2": 3e-5,
        "window": 8,
        "stride": 2,
        "rnn_type": "GRU",
        "bidirectional": True,
        "cnn_channels": None,
        "cnn_kernel": 3,
        "cnn_dropout": None,
        "use_attention": False,
    },
    {
        "name": "lstm_big_no_cnn_no_att",
        "seed": 102,
        "lr": 3e-4,
        "batch_size": 256,
        "epochs": 100,
        "patience": 20,
        "hidden_layers": 3,
        "hidden_size": 256,
        "dropout": 0.4,
        "rec_dropout": 0.25,
        "l2": 3e-5,
        "window": 8,
        "stride": 2,
        "rnn_type": "LSTM",
        "bidirectional": True,
        "cnn_channels": None,
        "cnn_kernel": 3,
        "cnn_dropout": None,
        "use_attention": False,
    },

    # 2. CNN ma senza attention => modello medio-grande con 2 strati e 256 neuroni
    {
        "name": "gru_medium_large_cnn_no_att",
        "seed": 103,
        "lr": 3e-4,
        "batch_size": 256,
        "epochs": 100,
        "patience": 20,
        "hidden_layers": 2,
        "hidden_size": 256,  # aumentato a 256
        "dropout": 0.3,
        "rec_dropout": 0.2,
        "l2": 2e-5,
        "window": 8,
        "stride": 2,
        "rnn_type": "GRU",
        "bidirectional": True,
        "cnn_channels": 48,
        "cnn_kernel": 3,
        "cnn_dropout": 0.3,
        "use_attention": False,
    },
    {
        "name": "lstm_medium_large_cnn_no_att",
        "seed": 104,
        "lr": 3e-4,
        "batch_size": 256,
        "epochs": 100,
        "patience": 20,
        "hidden_layers": 2,
        "hidden_size": 256,  # aumentato a 256
        "dropout": 0.25,
        "rec_dropout": 0.15,
        "l2": 2e-5,
        "window": 8,
        "stride": 2,
        "rnn_type": "LSTM",
        "bidirectional": True,
        "cnn_channels": 48,
        "cnn_kernel": 3,
        "cnn_dropout": 0.35,
        "use_attention": False,
    },

    # 3. CNN + Attention => modello medio
    {
        "name": "gru_medium_cnn_att",
        "seed": 105,
        "lr": 3e-4,
        "batch_size": 256,
        "epochs": 100,
        "patience": 20,
        "hidden_layers": 2,
        "hidden_size": 192,
        "dropout": 0.25,
        "rec_dropout": 0.15,
        "l2": 2e-5,
        "window": 8,
        "stride": 2,
        "rnn_type": "GRU",
        "bidirectional": True,
        "cnn_channels": 32,
        "cnn_kernel": 3,
        "cnn_dropout": 0.3,
        "use_attention": True,
    },
    {
        "name": "lstm_medium_cnn_att",
        "seed": 106,
        "lr": 3e-4,
        "batch_size": 256,
        "epochs": 100,
        "patience": 20,
        "hidden_layers": 2,
        "hidden_size": 192,
        "dropout": 0.3,
        "rec_dropout": 0.2,
        "l2": 2e-5,
        "window": 8,
        "stride": 2,
        "rnn_type": "LSTM",
        "bidirectional": True,
        "cnn_channels": 32,
        "cnn_kernel": 3,
        "cnn_dropout": 0.35,
        "use_attention": True,
    }
]

label_mapping = {
    'no_pain': 0,
    'low_pain': 1,
    'high_pain': 2
}


In [41]:
# ============================================================
# PHASE 1: 5-FOLD CV WITH SCHEDULER AND VISUALIZATION
# ============================================================

from collections import Counter
from sklearn.model_selection import StratifiedKFold
import numpy as np


config_ensemble_predictions = []
config_mean_epochs = []
config_mean_val_f1 = []
all_cv_submissions = []

for config_idx, config in enumerate(ENSEMBLE_CONFIGS):
    print(f"\n{'='*70}")
    print(f"CONFIG [{config_idx+1}/5]: {config['name']}")
    print(f"{'='*70}")

    fold_predictions = []
    fold_stopped_epochs = []
    fold_val_f1_scores = []

    # Preprocess data once
    DF, _ = preprocess_joints(X_TRAIN.copy())
    X_train_full, _ = dataset_conversion_type_embed_ready(DF)
    y_full = Y_TRAIN.copy()

    sample_indices_unique = y_full['sample_index'].values
    labels_for_split = y_full['label'].map(label_mapping).values

    # 5-Fold CV
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=config['seed'])

    for fold_idx, (train_idx, val_idx) in enumerate(skf.split(sample_indices_unique, labels_for_split)):
        print(f"\n  {'='*60}")
        print(f"  Fold {fold_idx+1}/5 for {config['name']}")
        print(f"  {'='*60}")

        # Set seed
        torch.manual_seed(config['seed'] + fold_idx)
        np.random.seed(config['seed'] + fold_idx)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(config['seed'] + fold_idx)

        # Split data
        train_samples = sample_indices_unique[train_idx]
        val_samples = sample_indices_unique[val_idx]

        X_train = X_train_full[X_train_full['sample_index'].isin(train_samples)]
        y_train = y_full[y_full['sample_index'].isin(train_samples)]

        X_val = X_train_full[X_train_full['sample_index'].isin(val_samples)]
        y_val = y_full[y_full['sample_index'].isin(val_samples)]

        # Normalize
        train_merged = X_train.merge(y_train, on='sample_index')
        train_merged['label'] = train_merged['label'].map(label_mapping)

        val_merged = X_val.merge(y_val, on='sample_index')
        val_merged['label'] = val_merged['label'].map(label_mapping)

        scale_columns = [col for col in train_merged.columns if col.startswith('joint_')]
        mins = train_merged[scale_columns].min()
        maxs = train_merged[scale_columns].max()

        for column in scale_columns:
            train_merged[column] = (train_merged[column] - mins[column]) / (maxs[column] - mins[column])
            val_merged[column] = (val_merged[column] - mins[column]) / (maxs[column] - mins[column])

        # Build sequences
        y_train_df = pd.DataFrame({
            "sample_index": train_merged["sample_index"].unique(),
            "label": train_merged.groupby("sample_index")["label"].first().values
        })

        y_val_df = pd.DataFrame({
            "sample_index": val_merged["sample_index"].unique(),
            "label": val_merged.groupby("sample_index")["label"].first().values
        })

        X_train_seq, y_train_seq = build_sequences(
            train_merged.drop(columns=['label']),
            y_train_df,
            window=config['window'],
            stride=config['stride']
        )

        X_val_seq, y_val_seq = build_sequences(
            val_merged.drop(columns=['label']),
            y_val_df,
            window=config['window'],
            stride=config['stride']
        )

        # Weighted sampler
        labels_np = y_train_seq
        class_counts = np.bincount(labels_np, minlength=3)
        # class_weights_for_sampling = 1.0 / (class_counts + 1e-8)
        # sample_weights = class_weights_for_sampling[labels_np]

                # SQRT dampening
        max_count = class_counts.max()
        class_weights_raw = max_count / class_counts
        # class_weights_dampened = np.sqrt(class_weights_raw)

        # Convert to tensor (CRITICAL!)
        class_weights = torch.tensor(class_weights_raw, dtype=torch.float32)


        # DataLoaders
        train_ds = TensorDataset(
            torch.from_numpy(X_train_seq).float(),
            torch.from_numpy(y_train_seq).long()
        )

        val_ds = TensorDataset(
            torch.from_numpy(X_val_seq).float(),
            torch.from_numpy(y_val_seq).long()
        )

        train_loader = make_loader(train_ds, batch_size=config['batch_size'],
                                   shuffle=False, drop_last=False, sampler=sampler)
        val_loader = make_loader(val_ds, batch_size=config['batch_size'],
                                shuffle=False, drop_last=False)

        # Model
        model = RecurrentClassifier(
            input_size=X_train_seq.shape[2],
            hidden_size=config['hidden_size'],
            num_layers=config['hidden_layers'],
            num_classes=3,
            dropout_rate=config['dropout'],
            rec_dropout_rate=config['rec_dropout'],
            bidirectional=config['bidirectional'],
            cnn_channels=config['cnn_channels'],
            cnn_kernel_size=config['cnn_kernel'],
            cnn_dropout=config['cnn_dropout'],
            rnn_type=config['rnn_type'],
            use_attention=config['use_attention'],
            num_joint_features=29,
            num_pain_features=4,
            num_static_features=3,
            num_time_features=3
        ).to(device)


        # Optimizer & Loss
        optimizer = torch.optim.AdamW(model.parameters(), lr=config['lr'], weight_decay=config['l2'])
        train_criterion = nn.CrossEntropyLoss(weight=class_weights, label_smoothing=0.1).to(device)

        val_criterion = nn.CrossEntropyLoss().to(device)
        scaler = torch.amp.GradScaler(enabled=(device.type == 'cuda'))

        # ‚úì SCHEDULER
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer,
            mode='max',
            factor=0.7,      # For example
            patience=7,
            min_lr=1e-6      # Prevents LR going too low
        )

        # Train
        model, history = fit(
            model=model,
            train_loader=train_loader,
            val_loader=val_loader,
            epochs=config['epochs'],
            train_criterion=train_criterion,
            val_criterion=val_criterion,
            optimizer=optimizer,
            scaler=scaler,
            device=device,
            patience=config['patience'],
            verbose=10,
            evaluation_metric='val_f1',
            mode='max',
            restore_best_weights=True,
            experiment_name=f"{config['name']}_fold{fold_idx+1}",
            scheduler=scheduler
        )

        # ‚úì VISUALIZE
        plot_training_history(history, config['name'], fold_idx+1)
        val_f1 = evaluate_and_plot_confusion_matrix(
            model, val_loader, X_val,
            config['window'], config['stride'],
            config['name'], fold_idx+1, device
        )

        # Track metrics
        stopped_epoch = len(history['val_f1'])
        fold_stopped_epochs.append(stopped_epoch)
        fold_val_f1_scores.append(val_f1)

        os.makedirs("models", exist_ok=True)
        # Save model after training
        save_path = f"models/{config['name']}_fold{fold_idx+1}.pth"
        torch.save(model.state_dict(), save_path)
        print(f"‚úì Saved model: {save_path}")


        # Predict on test
        X_test = pd.read_csv(DATASET_ROOT / "pirate_pain_test.csv")
        DF_test, _ = preprocess_joints(X_test.copy())
        X_test, _ = dataset_conversion_type_embed_ready(DF_test)

        for column in scale_columns:
            X_test[column] = (X_test[column] - mins[column]) / (maxs[column] - mins[column])

        X_test_seq, _ = build_sequences(X_test, None, window=config['window'], stride=config['stride'])

        model.eval()
        with torch.no_grad():
            logits = []
            for i_batch in range(0, len(X_test_seq), 256):
                xb = torch.from_numpy(X_test_seq[i_batch:i_batch+256]).float().to(device)
                logits.append(model(xb).cpu().numpy())
            logits = np.concatenate(logits, axis=0)

        # Aggregate
        n_windows_per_seq = (160 - config['window']) // config['stride'] + 1
        sample_indices = sorted(X_test['sample_index'].unique())

        sequence_preds = []
        for idx in range(len(sample_indices)):
            start_idx = idx * n_windows_per_seq
            end_idx = start_idx + n_windows_per_seq

            window_preds = logits[start_idx:end_idx].argmax(axis=1)
            vote_counts = Counter(window_preds)
            final_pred = vote_counts.most_common(1)[0][0]
            sequence_preds.append(final_pred)

        fold_predictions.append(np.array(sequence_preds))

    # Config summary
    mean_epoch = int(np.mean(fold_stopped_epochs))
    mean_f1 = np.mean(fold_val_f1_scores)
    config_mean_epochs.append(mean_epoch)
    config_mean_val_f1.append(mean_f1)
    print(f"\n  ‚úì {config['name']} SUMMARY: mean_epoch={mean_epoch}, mean_val_f1={mean_f1:.4f}")

    # Level 1 ensemble
    stacked_folds = np.stack(fold_predictions, axis=0)
    config_ensemble_preds = []
    for i in range(stacked_folds.shape[1]):
        votes = stacked_folds[:, i]
        vote_counts = Counter(votes)
        config_ensemble_preds.append(vote_counts.most_common(1)[0][0])

    config_ensemble_predictions.append(np.array(config_ensemble_preds))

    # Save
    idx2label = {0: 'no_pain', 1: 'low_pain', 2: 'high_pain'}
    pred_labels = [idx2label[int(p)] for p in config_ensemble_preds]

    submission = pd.DataFrame({
        'sample_index': [str(sid).zfill(3) for sid in sample_indices],
        'label': pred_labels
    })

    cv_submission_filename = f'{config["name"]}_cv_ensemble.csv'
    submission.to_csv(cv_submission_filename, index=False)
    all_cv_submissions.append(cv_submission_filename)

print("\n‚úì Phase 1 complete")



CONFIG [1/5]: gru_big_no_cnn_no_att
[preprocess_joints] start=31 | kept=29 | dropped=2
  ‚Ä¢ dropped: ['joint_11', 'joint_30']

  Fold 1/5 for gru_big_no_cnn_no_att
Built 40656 sequences; each shape = (8, 39)
Built 10241 sequences; each shape = (8, 39)


NameError: name 'class_sample_counts' is not defined

In [None]:
# ============================================================
# PHASE 2: CV META-ENSEMBLES
# ============================================================

print(f"\n{'='*70}")
print("PHASE 2: CV META-ENSEMBLES")
print(f"{'='*70}")

sorted_indices = np.argsort(config_mean_val_f1)[::-1]
print("\nConfig ranking by mean val_f1:")
for rank, idx in enumerate(sorted_indices, 1):
    print(f"  {rank}. {ENSEMBLE_CONFIGS[idx]['name']}: {config_mean_val_f1[idx]:.4f}")

def create_cv_ensemble(indices, name_suffix, predictions_source):
    """Create ensemble from CV predictions"""
    selected_preds = [predictions_source[i] for i in indices]
    stacked = np.stack(selected_preds, axis=0)

    ensemble_preds = []
    for i in range(stacked.shape[1]):
        votes = stacked[:, i]
        vote_counts = Counter(votes)
        ensemble_preds.append(vote_counts.most_common(1)[0][0])

    idx2label = {0: 'no_pain', 1: 'low_pain', 2: 'high_pain'}
    pred_labels = [idx2label[int(p)] for p in ensemble_preds]

    submission = pd.DataFrame({
        'sample_index': [str(sid).zfill(3) for sid in sample_indices],
        'label': pred_labels
    })

    filename = f'cv_ensemble_{name_suffix}.csv'
    submission.to_csv(filename, index=False)
    print(f"‚úì Saved: {filename}")
    return filename

# Assume you have 6 models: config_ensemble_predictions is length 6
cv_ensemble_files = []
cv_ensemble_files.append(create_cv_ensemble(sorted_indices[:3], "top3", config_ensemble_predictions))
cv_ensemble_files.append(create_cv_ensemble(sorted_indices[:4], "top4", config_ensemble_predictions))
cv_ensemble_files.append(create_cv_ensemble(sorted_indices[:5], "top5", config_ensemble_predictions))
cv_ensemble_files.append(create_cv_ensemble(sorted_indices[:6], "all6", config_ensemble_predictions))




In [None]:
# Example load function for later use
def load_model(config, fold, device):
    model = RecurrentClassifier(
        input_size=YOUR_INPUT_SIZE,  # set as used in training
        hidden_size=config['hidden_size'],
        num_layers=config['hidden_layers'],
        num_classes=3,
        dropout_rate=config['dropout'],
        rec_dropout_rate=config['rec_dropout'],
        bidirectional=config['bidirectional'],
        cnn_channels=config['cnn_channels'],
        cnn_kernel_size=config['cnn_kernel'],
        cnn_dropout=config['cnn_dropout'],
        rnn_type=config['rnn_type'],
        use_attention=config.get('use_attention', True),
        num_joint_features=29,
        num_pain_features=4,
        num_static_features=3,
        num_time_features=3
    ).to(device)

    checkpoint_path = f"models/{config['name']}_fold{fold}.pth"
    model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    model.eval()
    return model
