In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import f1_score
from sklearn.utils.class_weight import compute_class_weight
import re
import warnings

# Ignora avvisi non critici
warnings.filterwarnings('ignore')

# --- Parametri Globali ---
JOINT_COLS = [f'joint_{i:02d}' for i in range(30)]
SURVEY_COLS = ['pain_survey_1', 'pain_survey_2', 'pain_survey_3', 'pain_survey_4']
STATIC_COLS = ['n_legs', 'n_hands', 'n_eyes'] 
TIME_COL = 'time'

WINDOW_SIZE = 40
STRIDE = 10

# --- Parametri Training ---
SEED = 42
BATCH_SIZE = 64
LEARNING_RATE = 1e-3
WEIGHT_DECAY = 1e-4
EPOCHS = 50 
GRADIENT_CLIP_VALUE = 1.0 
K_FOLDS = 5 

# Setup Riproducibilità
np.random.seed(SEED)
torch.manual_seed(SEED)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device in uso: {device}")

Device in uso: cuda


In [2]:
print("--- 1. Caricamento e Pulizia Iniziale ---")

df_features_raw = pd.read_csv('/kaggle/input/pirate/pirate_pain_train.csv')
df_labels_raw = pd.read_csv('/kaggle/input/pirate/pirate_pain_train_labels.csv')
df_test_raw = pd.read_csv('/kaggle/input/pirate/pirate_pain_test.csv')

# 1. FIX: Forziamo le colonne statiche a essere Interi (per evitare che Pandas le legga come stringhe)
for col in STATIC_COLS:
    df_features_raw[col] = pd.to_numeric(df_features_raw[col], errors='coerce').fillna(0).astype(int)
    df_test_raw[col] = pd.to_numeric(df_test_raw[col], errors='coerce').fillna(0).astype(int)

# 2. COMMUNICATION HINT: Cerca una VERA colonna di testo (Team Name)
exclude_cols = ['label', 'sample_index']
string_cols = df_features_raw.select_dtypes(include=['object']).columns.tolist()
string_cols = [c for c in string_cols if c not in exclude_cols]

TEXT_COL = None
TEXT_VOCAB_SIZE = 0

if len(string_cols) > 0:
    TEXT_COL = string_cols[0] 
    print(f"Trovata colonna 'Team Name': {TEXT_COL}")
    
    def clean_team_name(text):
        if pd.isna(text): return "unknown"
        return re.sub(r'[^a-z0-9]', '', str(text).lower())

    df_features_raw[TEXT_COL] = df_features_raw[TEXT_COL].apply(clean_team_name)
    df_test_raw[TEXT_COL] = df_test_raw[TEXT_COL].apply(clean_team_name)
    
    le_text = LabelEncoder()
    all_text = pd.concat([df_features_raw[TEXT_COL], df_test_raw[TEXT_COL]], axis=0)
    le_text.fit(all_text)
    
    df_features_raw[TEXT_COL] = le_text.transform(df_features_raw[TEXT_COL])
    df_test_raw[TEXT_COL] = le_text.transform(df_test_raw[TEXT_COL])
    
    TEXT_VOCAB_SIZE = len(le_text.classes_)
    print(f"Vocabolario Team Name: {TEXT_VOCAB_SIZE} squadre uniche.")
else:
    print("Nessuna colonna 'Team Name' trovata (a parte le feature numeriche).")

# 3. Feature Engineering Delta
def engineer_features(df):
    df_eng = df.copy()
    grouped = df_eng.groupby('sample_index')
    for col in JOINT_COLS:
        df_eng[f'd_{col}'] = grouped[col].diff().fillna(0)
    
    # --- MODIFICA QUI (Advice 12/11) ---
    # Aggiungiamo Seno e Coseno per catturare la ciclicità del tempo
    # Assumiamo un ciclo massimo di circa 160 step (lunghezza tipica sequenza)
    max_time_val = df_eng[TIME_COL].max() + 1 
    df_eng['sin_time'] = np.sin(2 * np.pi * df_eng[TIME_COL] / max_time_val)
    df_eng['cos_time'] = np.cos(2 * np.pi * df_eng[TIME_COL] / max_time_val)
    # ------------------------------------

    if 'joint_30' in df_eng.columns:
        df_eng = df_eng.drop(columns=['joint_30'])
    return df_eng

print("Calcolo Delta Features...")
df_features_engineered = engineer_features(df_features_raw)
df_test_engineered = engineer_features(df_test_raw)

DELTA_JOINT_COLS = [f'd_{col}' for col in JOINT_COLS]
CONTINUOUS_COLS = JOINT_COLS + DELTA_JOINT_COLS + ['sin_time', 'cos_time']

# --- Preparazione Vocabolari per Embedding (CORREZIONE QUI) ---
# Usiamo int(...) invece di .astype(int)
survey_vocab_sizes = [int(df_features_engineered[c].max() + 1) for c in SURVEY_COLS]
time_vocab_size = int(df_features_engineered[TIME_COL].max() + 1)
static_vocab_sizes = [int(df_features_engineered[c].max() + 1) for c in STATIC_COLS]

print("Preprocessing Completato.")

--- 1. Caricamento e Pulizia Iniziale ---
Nessuna colonna 'Team Name' trovata (a parte le feature numeriche).
Calcolo Delta Features...
Preprocessing Completato.


In [3]:
# Mappatura Label
label_mapping = {'no_pain': 0, 'low_pain': 1, 'high_pain': 2}
df_labels_raw['label_encoded'] = df_labels_raw['label'].map(label_mapping)

class PiratePainDataset(Dataset):
    def __init__(self, features_df, labels_df, sample_indices, window_size, stride, text_col=None, augment=False):
        self.features_df = features_df
        # Se labels_df è None, siamo in fase di test
        self.labels_df = labels_df.set_index('sample_index') if labels_df is not None else None
        self.sample_indices = sample_indices
        self.window_size = window_size
        self.stride = stride
        self.text_col = text_col
        
        # Raggruppamento per accesso veloce
        self.grouped_features = dict(tuple(features_df.groupby('sample_index')))
        self.indices = self._create_indices()

        self.augment = augment # Salva il parametro

    def _create_indices(self):
        indices = []
        for sample_idx in self.sample_indices:
            if sample_idx not in self.grouped_features: continue
            data = self.grouped_features[sample_idx]
            n_timesteps = len(data)
            for start in range(0, n_timesteps - self.window_size + 1, self.stride):
                indices.append((sample_idx, start, start + self.window_size))
        return indices

    def __len__(self):
        return len(self.indices)

    def __getitem__(self, idx):
        sample_idx, start, end = self.indices[idx]
        window_data = self.grouped_features[sample_idx].iloc[start:end]

        # 1. Continui (Modifica qui!)
        vals = window_data[CONTINUOUS_COLS].values
        
        # --- INIZIO AUGMENTATION ---
        if self.augment:
            # Aggiunge rumore casuale (Gaussian Noise)
            noise = np.random.normal(0, 0.02, vals.shape) 
            vals = vals + noise
        # --- FINE AUGMENTATION ---

        # 1. Continui
        x_cont = torch.tensor(window_data[CONTINUOUS_COLS].values, dtype=torch.float)
        # 2. Survey (+1 sicurezza)
        x_survey = torch.tensor((window_data[SURVEY_COLS].values + 1), dtype=torch.long)
        # 3. Time (+1 sicurezza)
        x_time = torch.tensor((window_data[TIME_COL].values + 1), dtype=torch.long)
        # 4. Static (Legs, Hands, Eyes) - Prendi la prima riga (sono costanti)
        x_static = torch.tensor((window_data[STATIC_COLS].iloc[0].values + 1), dtype=torch.long)
        
        # 5. Text (Opzionale)
        x_text = torch.tensor(0, dtype=torch.long)
        if self.text_col:
            val = window_data[self.text_col].iloc[0]
            x_text = torch.tensor(val, dtype=torch.long)

        label = torch.tensor(-1, dtype=torch.long)
        if self.labels_df is not None:
            label = torch.tensor(self.labels_df.loc[sample_idx, 'label_encoded'], dtype=torch.long)

        return x_cont, x_survey, x_time, x_static, x_text, label

# --- Weighted Sampler (Per Advice 08/11 Advanced) ---
def get_weighted_sampler(dataset, labels_df):
    # Mappa sample -> label
    sample_to_label = labels_df.set_index('sample_index')['label_encoded'].to_dict()
    # Calcola frequenza inversa classi
    label_counts = labels_df['label_encoded'].value_counts().sort_index()
    class_weights = 1.0 / label_counts
    
    # Assegna peso a ogni finestra nel dataset
    weights = []
    for idx_tuple in dataset.indices:
        s_idx = idx_tuple[0]
        if s_idx in sample_to_label:
            l = sample_to_label[s_idx]
            weights.append(class_weights[l])
        else:
            weights.append(0) # Non dovrebbe accadere in train
            
    return WeightedRandomSampler(weights, num_samples=len(weights), replacement=True)

In [4]:
# --- Focal Loss Custom ---
class FocalLoss(nn.Module):
    # Aggiungiamo il parametro label_smoothing (default 0.0, consigliato 0.1)
    def __init__(self, alpha=None, gamma=2.0, reduction='mean', label_smoothing=0.0):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction
        self.label_smoothing = label_smoothing # Salva il valore

    def forward(self, inputs, targets):
        # QUI applichiamo l'Advice 09/11: label_smoothing=self.label_smoothing
        ce_loss = F.cross_entropy(
            inputs, 
            targets, 
            reduction='none', 
            weight=self.alpha, 
            label_smoothing=self.label_smoothing  # <--- ECCOLO!
        )
        pt = torch.exp(-ce_loss)
        focal_loss = ((1 - pt) ** self.gamma) * ce_loss
        return focal_loss.mean() if self.reduction == 'mean' else focal_loss.sum()

# --- Modello Completo ---
import math

# --- Modulo Positional Encoding (Necessario per i Transformer) ---
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=500):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # x: [Batch, Seq_len, Emebedding_dim]
        x = x + self.pe[:, :x.size(1), :]
        return self.dropout(x)

# --- IL MODELLO "CIOTTO": Transformer Encoder ---
class PirateTransformerModel(nn.Module):
    def __init__(self, n_continuous, survey_vocab_sizes, time_vocab_size, 
                 static_vocab_sizes, text_vocab_size, 
                 d_model=128, nhead=4, num_layers=3, n_classes=3, dropout=0.3):
        super().__init__()
        
        # --- 1. EMBEDDINGS (Uguale a prima) ---
        self.emb_surveys = nn.ModuleList([nn.Embedding(v+2, 4) for v in survey_vocab_sizes])
        self.emb_time = nn.Embedding(time_vocab_size+2, 8)
        self.emb_static = nn.ModuleList([nn.Embedding(v+2, 4) for v in static_vocab_sizes])
        
        self.use_text = (text_vocab_size > 0)
        text_dim = 8 if self.use_text else 0
        if self.use_text:
            self.emb_text = nn.Embedding(text_vocab_size+2, text_dim)
            
        # Calcolo dimensioni input
        total_survey_dim = len(survey_vocab_sizes) * 4
        total_static_dim = len(static_vocab_sizes) * 4
        input_dim = n_continuous + total_survey_dim + 8 + total_static_dim + text_dim
        
        # Proiezione lineare per portare tutto a d_model (es. 128)
        # Questo serve perché il Transformer vuole una dimensione fissa per le "teste"
        self.input_projection = nn.Linear(input_dim, d_model)
        
        # --- 2. POSITIONAL ENCODING ---
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        
        # --- 3. TRANSFORMER ENCODER BLOCK ---
        # "batch_first=True" è cruciale per come abbiamo i dati
        encoder_layers = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, 
                                                    dim_feedforward=d_model*4, 
                                                    dropout=dropout, 
                                                    batch_first=True,
                                                    activation='gelu') # GELU è meglio di RELU
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers=num_layers)
        
        # --- 4. CLASSIFICATION HEAD ---
        self.classifier = nn.Sequential(
            nn.Linear(d_model, d_model // 2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(d_model // 2, n_classes)
        )

    def forward(self, x_cont, x_survey, x_time, x_static, x_text):
        # (Stessa logica di concatenazione della LSTM)
        batch_size, seq_len, _ = x_cont.shape
        
        e_surv = [emb(x_survey[:,:,i]) for i, emb in enumerate(self.emb_surveys)]
        e_time = self.emb_time(x_time)
        
        e_stat = [emb(x_static[:,i]) for i, emb in enumerate(self.emb_static)]
        e_stat_cat = torch.cat(e_stat, dim=1)
        if self.use_text:
            e_txt = self.emb_text(x_text)
            e_stat_cat = torch.cat([e_stat_cat, e_txt], dim=1)
        e_stat_seq = e_stat_cat.unsqueeze(1).repeat(1, seq_len, 1)
        
        # Concatena tutto
        full_input = torch.cat([x_cont] + e_surv + [e_time, e_stat_seq], dim=2)
        
        # --- DIFFERENZA KEY: Proiezione + Transformer ---
        
        # 1. Proietta a d_model (es. 68 features -> 128 dimensioni)
        x = self.input_projection(full_input) 
        
        # 2. Aggiungi info posizionale (il Transformer non sa l'ordine senza questo)
        x = self.pos_encoder(x)
        
        # 3. Passa nel Transformer (Self-Attention)
        # Output: [Batch, Seq, d_model]
        x = self.transformer_encoder(x)
        
        # 4. Pooling Strategy (Attention Pooling o Mean Pooling)
        # Qui usiamo Mean Pooling sull'asse temporale (media di tutti gli step)
        # Alternativa: Prendere solo l'ultimo step x[:, -1, :]
        x_pooled = x.mean(dim=1) 
        
        # 5. Classifica
        logits = self.classifier(x_pooled)
        
        return logits

In [5]:
def train_epoch(model, loader, optimizer, criterion):
    model.train()
    total_loss = 0
    for xc, xs, xt, xst, xtxt, y in loader:
        xc, xs, xt, xst, xtxt, y = xc.to(device), xs.to(device), xt.to(device), xst.to(device), xtxt.to(device), y.to(device)
        
        optimizer.zero_grad()
        logits = model(xc, xs, xt, xst, xtxt)
        loss = criterion(logits, y)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), GRADIENT_CLIP_VALUE)
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

def validate(model, loader, criterion):
    model.eval()
    total_loss = 0
    preds, labels = [], []
    with torch.no_grad():
        for xc, xs, xt, xst, xtxt, y in loader:
            xc, xs, xt, xst, xtxt, y = xc.to(device), xs.to(device), xt.to(device), xst.to(device), xtxt.to(device), y.to(device)
            
            logits = model(xc, xs, xt, xst, xtxt)
            loss = criterion(logits, y)
            total_loss += loss.item()
            
            preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
            labels.extend(y.cpu().numpy())
            
    f1 = f1_score(labels, preds, average='weighted')
    return total_loss / len(loader), f1

In [6]:
# --- FIX: Ripristina le definizioni mancanti ---
all_sample_indices = df_labels_raw['sample_index'].unique()
all_labels_strat = df_labels_raw.set_index('sample_index').loc[all_sample_indices]['label_encoded'].values

# --- SOSTITUISCI TUTTO DA QUI IN POI ---

print("--- Avvio K-Fold con Ensemble Strategy ---")

# Setup per OOF e Ensemble
oof_probs = np.zeros((len(all_sample_indices), 3)) # Matrice (N_Samples, 3_Classi)
oof_targets = np.zeros(len(all_sample_indices))
models_list = [] # Qui salveremo i 5 modelli addestrati

# Mappa SampleIndex -> Posizione nell'array (0, 1, 2...)
sample_to_idx = {s: i for i, s in enumerate(all_sample_indices)}

skf = StratifiedKFold(n_splits=K_FOLDS, shuffle=True, random_state=SEED)

for fold, (train_idx, val_idx) in enumerate(skf.split(all_sample_indices, all_labels_strat)):
    print(f"\n--- Fold {fold+1}/{K_FOLDS} ---")
    
    train_samples = all_sample_indices[train_idx]
    val_samples = all_sample_indices[val_idx]
    
    # 1. Scaling (Standard su Train, applicato a entrambi)
    scaler = StandardScaler()
    train_subset = df_features_engineered[df_features_engineered['sample_index'].isin(train_samples)]
    scaler.fit(train_subset[CONTINUOUS_COLS])
    
    df_fold = df_features_engineered.copy()
    df_fold[CONTINUOUS_COLS] = scaler.transform(df_fold[CONTINUOUS_COLS])
    
    # 2. Dataset & Loader
    train_ds = PiratePainDataset(df_fold, df_labels_raw, train_samples, WINDOW_SIZE, STRIDE, TEXT_COL, augment=True)
    val_ds = PiratePainDataset(df_fold, df_labels_raw, val_samples, WINDOW_SIZE, STRIDE, TEXT_COL, augment=False)
    
    sampler = get_weighted_sampler(train_ds, df_labels_raw)
    train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, sampler=sampler, shuffle=False, drop_last=True)
    val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)
    
    # 3. Modello e Ottimizzatore
    # Modello Transformer (Più parametri, più lento, più potente)
    model = PirateTransformerModel(
        n_continuous=len(CONTINUOUS_COLS), 
        survey_vocab_sizes=survey_vocab_sizes, 
        time_vocab_size=time_vocab_size,
        static_vocab_sizes=static_vocab_sizes, 
        text_vocab_size=TEXT_VOCAB_SIZE, 
        d_model=128,   # Dimensione interna
        nhead=4,       # Numero di teste di attenzione (deve essere divisore di d_model)
        num_layers=3,  # Numero di blocchi transformer (più sono, più è "ciotto")
        dropout=0.3    # Alto dropout per evitare overfitting sui pochi dati
    ).to(device)
    optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
    criterion = FocalLoss(alpha=None, gamma=2.0, label_smoothing=0.1)
    
    # 4. Training Loop con Salvataggio Best Model
    best_v_f1 = 0
    best_model_wts = None
    
    for ep in range(EPOCHS):
        # Train
        model.train()
        for xc, xs, xt, xst, xtxt, y in train_loader:
            xc, xs, xt, xst, xtxt, y = xc.to(device), xs.to(device), xt.to(device), xst.to(device), xtxt.to(device), y.to(device)
            optimizer.zero_grad()
            out = model(xc, xs, xt, xst, xtxt)
            loss = criterion(out, y)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), GRADIENT_CLIP_VALUE)
            optimizer.step()
            
        # Validation (Soft Voting per Sample)
        model.eval()
        val_logits_list = []
        val_sample_indices_list = []
        val_labels_list = [] # Solo per controllo
        
        # Mappa inversa per sapere quale finestra appartiene a quale sample
        # Nota: PiratePainDataset non restituisce sample_idx nel __getitem__, dobbiamo ricostruirlo o fidarci dell'ordine
        # TRUCCO: Usiamo l'ordine del dataset.indices che è deterministico nel val_loader (shuffle=False)
        window_sample_map_val = [x[0] for x in val_ds.indices]
        
        with torch.no_grad():
            batch_start = 0
            for xc, xs, xt, xst, xtxt, y in val_loader:
                xc, xs, xt, xst, xtxt = xc.to(device), xs.to(device), xt.to(device), xst.to(device), xtxt.to(device)
                logits = model(xc, xs, xt, xst, xtxt)
                val_logits_list.extend(logits.cpu().numpy())
                
        # Aggregazione per Sample (Soft Voting)
        df_val_logits = pd.DataFrame(val_logits_list, columns=[0, 1, 2])
        df_val_logits['sample_index'] = window_sample_map_val
        df_val_probs = df_val_logits.groupby('sample_index').mean() # Media Logits
        
        # Calcolo F1 corrente (usando Argmax provvisorio)
        # Dobbiamo allineare le predizioni con le label vere
        current_val_probs = torch.softmax(torch.tensor(df_val_probs.values), dim=1).numpy()
        current_val_preds = np.argmax(current_val_probs, axis=1)
        
        # Recuperiamo le label vere ordinate come nel df_val_probs
        current_val_indices = df_val_probs.index
        current_val_labels = df_labels_raw.set_index('sample_index').loc[current_val_indices]['label_encoded'].values
        
        v_f1 = f1_score(current_val_labels, current_val_preds, average='weighted')
        
        if v_f1 > best_v_f1:
            best_v_f1 = v_f1
            best_model_wts = model.state_dict()
            # Salviamo le probabilità vincenti nell'array OOF globale
            for idx, s_idx in enumerate(current_val_indices):
                global_idx = sample_to_idx[s_idx]
                oof_probs[global_idx] = current_val_probs[idx]
                oof_targets[global_idx] = current_val_labels[idx]

    print(f"Fold {fold+1} Best Val F1: {best_v_f1:.4f}")
    
    # Ricarica il miglior modello e salvalo nella lista ensemble
    model.load_state_dict(best_model_wts)
    models_list.append(model)

# --- OTTIMIZZAZIONE SOGLIE (Threshold Optimization) ---
print("\n--- Ricerca Soglie Ottimali su OOF ---")
best_thresh = (0.0, 0.0)
best_score = 0.0

# Grid Search fine
for t_high in np.arange(0.15, 0.50, 0.01):
    for t_low in np.arange(0.20, 0.55, 0.01):
        if t_low >= t_high: continue # Logica: High ha priorità
        
        preds = []
        for p in oof_probs:
            if p[2] > t_high: preds.append(2)
            elif p[1] > t_low: preds.append(1)
            else: preds.append(0)
            
        s = f1_score(oof_targets, preds, average='weighted')
        if s > best_score:
            best_score = s
            best_thresh = (t_low, t_high)

print(f"Soglie Trovate: Low>{best_thresh[0]:.2f}, High>{best_thresh[1]:.2f} -> OOF F1: {best_score:.4f}")


# --- INFERENZA ENSEMBLE (Test Set) ---
print("\n--- Generazione Submission (Ensemble 5 Models) ---")

# Preparazione Test Set (Scaling) - Nota: Usiamo scaler fittato sull'ultimo fold o rifittiamo su tutto?
# Meglio: Per ogni modello del fold, dovremmo usare il suo scaler. 
# Ma per semplicità (e dato che StandardScaler è robusto), usiamo uno scaler fittato su tutto il train.
final_scaler = StandardScaler()
final_scaler.fit(df_features_engineered[CONTINUOUS_COLS])
df_test_scaled = df_test_engineered.copy()
df_test_scaled[CONTINUOUS_COLS] = final_scaler.transform(df_test_scaled[CONTINUOUS_COLS])

# Dataset Test
sub_indices = pd.read_csv('/kaggle/input/pirate/sample_submission.csv')['sample_index'].unique()
test_ds_final = PiratePainDataset(df_test_scaled, None, sub_indices, WINDOW_SIZE, STRIDE, TEXT_COL, augment=False)
test_loader_final = DataLoader(test_ds_final, batch_size=BATCH_SIZE*2, shuffle=False)
window_sample_map_test = [x[0] for x in test_ds_final.indices]

# Accumulatore Probabilità
ensemble_logits = None

for i, model in enumerate(models_list):
    model.eval()
    fold_logits = []
    with torch.no_grad():
        for xc, xs, xt, xst, xtxt, _ in test_loader_final:
            xc, xs, xt, xst, xtxt = xc.to(device), xs.to(device), xt.to(device), xst.to(device), xtxt.to(device)
            logits = model(xc, xs, xt, xst, xtxt)
            fold_logits.extend(logits.cpu().numpy())
    
    # Aggrega per Sample (Media Logits del modello i)
    df_tmp = pd.DataFrame(fold_logits, columns=[0, 1, 2])
    df_tmp['sample_index'] = window_sample_map_test
    df_avg = df_tmp.groupby('sample_index').mean() # (N_Test_Samples, 3)
    
    # Somma ai logits totali (allineando per indice)
    if ensemble_logits is None:
        ensemble_logits = df_avg
    else:
        ensemble_logits = ensemble_logits.add(df_avg, fill_value=0)

# Media finale dei 5 modelli
ensemble_logits /= K_FOLDS

# Converti in Probabilità
final_probs = torch.softmax(torch.tensor(ensemble_logits.values), dim=1).numpy()

# Applica Soglie Ottimizzate
final_preds_list = []
thr_l, thr_h = best_thresh

for p in final_probs:
    if p[2] > thr_h: final_preds_list.append(2)
    elif p[1] > thr_l: final_preds_list.append(1)
    else: final_preds_list.append(0)

final_series = pd.Series(final_preds_list, index=ensemble_logits.index)

# Salvataggio
inv_map = {v: k for k, v in label_mapping.items()}
submission = final_series.map(inv_map).reset_index()
submission.columns = ['sample_index', 'label']

sample_sub = pd.read_csv('/kaggle/input/pirate/sample_submission.csv')
submission = submission.set_index('sample_index').reindex(sample_sub['sample_index']).reset_index()
submission.to_csv('submission.csv', index=False)
print("Fatto! Ensemble Submission creata.")

--- Avvio K-Fold con Ensemble Strategy ---

--- Fold 1/5 ---
Fold 1 Best Val F1: 0.9507

--- Fold 2/5 ---
Fold 2 Best Val F1: 0.9543

--- Fold 3/5 ---
Fold 3 Best Val F1: 0.9404

--- Fold 4/5 ---
Fold 4 Best Val F1: 0.9393

--- Fold 5/5 ---
Fold 5 Best Val F1: 0.9455

--- Ricerca Soglie Ottimali su OOF ---
Soglie Trovate: Low>0.43, High>0.46 -> OOF F1: 0.9488

--- Generazione Submission (Ensemble 5 Models) ---
Fatto! Ensemble Submission creata.
