In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import f1_score
from sklearn.utils.class_weight import compute_class_weight
import re
import warnings

# Ignora avvisi non critici
warnings.filterwarnings('ignore')

# --- Parametri Globali ---
JOINT_COLS = [f'joint_{i:02d}' for i in range(30)]
SURVEY_COLS = ['pain_survey_1', 'pain_survey_2', 'pain_survey_3', 'pain_survey_4']
STATIC_COLS = ['n_legs', 'n_hands', 'n_eyes'] 
TIME_COL = 'time'

WINDOW_SIZE = 40
STRIDE = 10

# --- Parametri Training ---
SEED = 42
BATCH_SIZE = 64
LEARNING_RATE = 1e-3
WEIGHT_DECAY = 1e-4
EPOCHS = 50 
GRADIENT_CLIP_VALUE = 1.0 
K_FOLDS = 5 

# Setup Riproducibilità
np.random.seed(SEED)
torch.manual_seed(SEED)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device in uso: {device}")

Device in uso: cuda


In [2]:
print("--- 1. Caricamento e Pulizia Iniziale ---")

df_features_raw = pd.read_csv('/kaggle/input/pirate/pirate_pain_train.csv')
df_labels_raw = pd.read_csv('/kaggle/input/pirate/pirate_pain_train_labels.csv')
df_test_raw = pd.read_csv('/kaggle/input/pirate/pirate_pain_test.csv')

# 1. FIX: Forziamo le colonne statiche a essere Interi (per evitare che Pandas le legga come stringhe)
for col in STATIC_COLS:
    df_features_raw[col] = pd.to_numeric(df_features_raw[col], errors='coerce').fillna(0).astype(int)
    df_test_raw[col] = pd.to_numeric(df_test_raw[col], errors='coerce').fillna(0).astype(int)

# 2. COMMUNICATION HINT: Cerca una VERA colonna di testo (Team Name)
exclude_cols = ['label', 'sample_index']
string_cols = df_features_raw.select_dtypes(include=['object']).columns.tolist()
string_cols = [c for c in string_cols if c not in exclude_cols]

TEXT_COL = None
TEXT_VOCAB_SIZE = 0

if len(string_cols) > 0:
    TEXT_COL = string_cols[0] 
    print(f"Trovata colonna 'Team Name': {TEXT_COL}")
    
    def clean_team_name(text):
        if pd.isna(text): return "unknown"
        return re.sub(r'[^a-z0-9]', '', str(text).lower())

    df_features_raw[TEXT_COL] = df_features_raw[TEXT_COL].apply(clean_team_name)
    df_test_raw[TEXT_COL] = df_test_raw[TEXT_COL].apply(clean_team_name)
    
    le_text = LabelEncoder()
    all_text = pd.concat([df_features_raw[TEXT_COL], df_test_raw[TEXT_COL]], axis=0)
    le_text.fit(all_text)
    
    df_features_raw[TEXT_COL] = le_text.transform(df_features_raw[TEXT_COL])
    df_test_raw[TEXT_COL] = le_text.transform(df_test_raw[TEXT_COL])
    
    TEXT_VOCAB_SIZE = len(le_text.classes_)
    print(f"Vocabolario Team Name: {TEXT_VOCAB_SIZE} squadre uniche.")
else:
    print("Nessuna colonna 'Team Name' trovata (a parte le feature numeriche).")

# 3. Feature Engineering Delta
def engineer_features(df):
    df_eng = df.copy()
    grouped = df_eng.groupby('sample_index')
    for col in JOINT_COLS:
        df_eng[f'd_{col}'] = grouped[col].diff().fillna(0)
    
    if 'joint_30' in df_eng.columns:
        df_eng = df_eng.drop(columns=['joint_30'])
    return df_eng

print("Calcolo Delta Features...")
df_features_engineered = engineer_features(df_features_raw)
df_test_engineered = engineer_features(df_test_raw)

DELTA_JOINT_COLS = [f'd_{col}' for col in JOINT_COLS]
CONTINUOUS_COLS = JOINT_COLS + DELTA_JOINT_COLS

# --- Preparazione Vocabolari per Embedding (CORREZIONE QUI) ---
# Usiamo int(...) invece di .astype(int)
survey_vocab_sizes = [int(df_features_engineered[c].max() + 1) for c in SURVEY_COLS]
time_vocab_size = int(df_features_engineered[TIME_COL].max() + 1)
static_vocab_sizes = [int(df_features_engineered[c].max() + 1) for c in STATIC_COLS]

print("Preprocessing Completato.")

--- 1. Caricamento e Pulizia Iniziale ---
Nessuna colonna 'Team Name' trovata (a parte le feature numeriche).
Calcolo Delta Features...
Preprocessing Completato.


In [3]:
# Mappatura Label
label_mapping = {'no_pain': 0, 'low_pain': 1, 'high_pain': 2}
df_labels_raw['label_encoded'] = df_labels_raw['label'].map(label_mapping)

class PiratePainDataset(Dataset):
    def __init__(self, features_df, labels_df, sample_indices, window_size, stride, text_col=None, augment=False):
        self.features_df = features_df
        # Se labels_df è None, siamo in fase di test
        self.labels_df = labels_df.set_index('sample_index') if labels_df is not None else None
        self.sample_indices = sample_indices
        self.window_size = window_size
        self.stride = stride
        self.text_col = text_col
        
        # Raggruppamento per accesso veloce
        self.grouped_features = dict(tuple(features_df.groupby('sample_index')))
        self.indices = self._create_indices()

        self.augment = augment # Salva il parametro

    def _create_indices(self):
        indices = []
        for sample_idx in self.sample_indices:
            if sample_idx not in self.grouped_features: continue
            data = self.grouped_features[sample_idx]
            n_timesteps = len(data)
            for start in range(0, n_timesteps - self.window_size + 1, self.stride):
                indices.append((sample_idx, start, start + self.window_size))
        return indices

    def __len__(self):
        return len(self.indices)

    def __getitem__(self, idx):
        sample_idx, start, end = self.indices[idx]
        window_data = self.grouped_features[sample_idx].iloc[start:end]

        # 1. Continui (Modifica qui!)
        vals = window_data[CONTINUOUS_COLS].values
        
        # --- INIZIO AUGMENTATION ---
        if self.augment:
            # Aggiunge rumore casuale (Gaussian Noise)
            noise = np.random.normal(0, 0.02, vals.shape) 
            vals = vals + noise
        # --- FINE AUGMENTATION ---

        # 1. Continui
        x_cont = torch.tensor(window_data[CONTINUOUS_COLS].values, dtype=torch.float)
        # 2. Survey (+1 sicurezza)
        x_survey = torch.tensor((window_data[SURVEY_COLS].values + 1), dtype=torch.long)
        # 3. Time (+1 sicurezza)
        x_time = torch.tensor((window_data[TIME_COL].values + 1), dtype=torch.long)
        # 4. Static (Legs, Hands, Eyes) - Prendi la prima riga (sono costanti)
        x_static = torch.tensor((window_data[STATIC_COLS].iloc[0].values + 1), dtype=torch.long)
        
        # 5. Text (Opzionale)
        x_text = torch.tensor(0, dtype=torch.long)
        if self.text_col:
            val = window_data[self.text_col].iloc[0]
            x_text = torch.tensor(val, dtype=torch.long)

        label = torch.tensor(-1, dtype=torch.long)
        if self.labels_df is not None:
            label = torch.tensor(self.labels_df.loc[sample_idx, 'label_encoded'], dtype=torch.long)

        return x_cont, x_survey, x_time, x_static, x_text, label

# --- Weighted Sampler (Per Advice 08/11 Advanced) ---
def get_weighted_sampler(dataset, labels_df):
    # Mappa sample -> label
    sample_to_label = labels_df.set_index('sample_index')['label_encoded'].to_dict()
    # Calcola frequenza inversa classi
    label_counts = labels_df['label_encoded'].value_counts().sort_index()
    class_weights = 1.0 / label_counts
    
    # Assegna peso a ogni finestra nel dataset
    weights = []
    for idx_tuple in dataset.indices:
        s_idx = idx_tuple[0]
        if s_idx in sample_to_label:
            l = sample_to_label[s_idx]
            weights.append(class_weights[l])
        else:
            weights.append(0) # Non dovrebbe accadere in train
            
    return WeightedRandomSampler(weights, num_samples=len(weights), replacement=True)

In [4]:
# --- Focal Loss Custom ---
class FocalLoss(nn.Module):
    # Aggiungiamo il parametro label_smoothing (default 0.0, consigliato 0.1)
    def __init__(self, alpha=None, gamma=2.0, reduction='mean', label_smoothing=0.0):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction
        self.label_smoothing = label_smoothing # Salva il valore

    def forward(self, inputs, targets):
        # QUI applichiamo l'Advice 09/11: label_smoothing=self.label_smoothing
        ce_loss = F.cross_entropy(
            inputs, 
            targets, 
            reduction='none', 
            weight=self.alpha, 
            label_smoothing=self.label_smoothing  # <--- ECCOLO!
        )
        pt = torch.exp(-ce_loss)
        focal_loss = ((1 - pt) ** self.gamma) * ce_loss
        return focal_loss.mean() if self.reduction == 'mean' else focal_loss.sum()

# --- Modello Completo ---
class PiratePainModel(nn.Module):
    def __init__(self, n_continuous, survey_vocab_sizes, time_vocab_size, 
                 static_vocab_sizes, text_vocab_size, lstm_hidden=128, n_classes=3):
        super().__init__()
        
        # Embedding Survey (4 features)
        self.emb_surveys = nn.ModuleList([nn.Embedding(v+2, 4) for v in survey_vocab_sizes])
        # Embedding Time
        self.emb_time = nn.Embedding(time_vocab_size+2, 8)
        # Embedding Static (Legs, Hands, Eyes)
        self.emb_static = nn.ModuleList([nn.Embedding(v+2, 4) for v in static_vocab_sizes])
        
        # Embedding Text (Team Name) - Opzionale
        self.use_text = (text_vocab_size > 0)
        text_dim = 0
        if self.use_text:
            self.emb_text = nn.Embedding(text_vocab_size+2, 8)
            text_dim = 8
            
        # Calcolo Input LSTM
        total_survey_dim = len(survey_vocab_sizes) * 4
        total_static_dim = len(static_vocab_sizes) * 4
        
        lstm_input_dim = n_continuous + total_survey_dim + 8 + total_static_dim + text_dim
        
        self.lstm = nn.LSTM(lstm_input_dim, lstm_hidden, num_layers=2, batch_first=True, dropout=0.3)
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(lstm_hidden, n_classes)

    def forward(self, x_cont, x_survey, x_time, x_static, x_text):
        # x_cont: (Batch, Seq, Feat)
        batch_size, seq_len, _ = x_cont.shape
        
        # 1. Time-Varying Embeddings
        e_surv = [emb(x_survey[:,:,i]) for i, emb in enumerate(self.emb_surveys)] # List of (B, S, 4)
        e_time = self.emb_time(x_time) # (B, S, 8)
        
        # 2. Static Embeddings (Processati come vettori statici poi ripetuti)
        e_stat = [emb(x_static[:,i]) for i, emb in enumerate(self.emb_static)] # List of (B, 4)
        e_stat_cat = torch.cat(e_stat, dim=1) # (B, 12)
        
        if self.use_text:
            e_txt = self.emb_text(x_text) # (B, 8)
            e_stat_cat = torch.cat([e_stat_cat, e_txt], dim=1)
            
        # Ripeti statico per ogni timestep
        e_stat_seq = e_stat_cat.unsqueeze(1).repeat(1, seq_len, 1) # (B, S, Tot_Static)
        
        # 3. Concatena
        full_input = torch.cat([x_cont] + e_surv + [e_time, e_stat_seq], dim=2)
        
        # 4. LSTM
        out, _ = self.lstm(full_input)
        
        # 5. Classifica sull'ultimo step
        logits = self.classifier(self.dropout(out[:, -1, :]))
        return logits

In [5]:
def train_epoch(model, loader, optimizer, criterion):
    model.train()
    total_loss = 0
    for xc, xs, xt, xst, xtxt, y in loader:
        xc, xs, xt, xst, xtxt, y = xc.to(device), xs.to(device), xt.to(device), xst.to(device), xtxt.to(device), y.to(device)
        
        optimizer.zero_grad()
        logits = model(xc, xs, xt, xst, xtxt)
        loss = criterion(logits, y)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), GRADIENT_CLIP_VALUE)
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

def validate(model, loader, criterion):
    model.eval()
    total_loss = 0
    preds, labels = [], []
    with torch.no_grad():
        for xc, xs, xt, xst, xtxt, y in loader:
            xc, xs, xt, xst, xtxt, y = xc.to(device), xs.to(device), xt.to(device), xst.to(device), xtxt.to(device), y.to(device)
            
            logits = model(xc, xs, xt, xst, xtxt)
            loss = criterion(logits, y)
            total_loss += loss.item()
            
            preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
            labels.extend(y.cpu().numpy())
            
    f1 = f1_score(labels, preds, average='weighted')
    return total_loss / len(loader), f1

In [6]:
print("--- Avvio K-Fold con Advanced Imbalance Strategy ---")

# Calcolo pesi 'smoothed' per Focal Loss Alpha
labels_array = df_labels_raw['label_encoded'].values
counts = np.bincount(labels_array)
weights_smooth = 1. / np.log1p(counts) 
focal_alpha = torch.tensor(weights_smooth / weights_smooth.sum(), dtype=torch.float).to(device)
print(f"Focal Loss Alpha: {focal_alpha.cpu().numpy()}")

all_sample_indices = df_labels_raw['sample_index'].unique()
all_labels_strat = df_labels_raw.set_index('sample_index').loc[all_sample_indices]['label_encoded'].values

skf = StratifiedKFold(n_splits=K_FOLDS, shuffle=True, random_state=SEED)
fold_scores = []

for fold, (train_idx, val_idx) in enumerate(skf.split(all_sample_indices, all_labels_strat)):
    print(f"\n--- Fold {fold+1}/{K_FOLDS} ---")
    
    train_samples = all_sample_indices[train_idx]
    val_samples = all_sample_indices[val_idx]
    
    # 1. Scaling (Fit su Train del fold)
    scaler = StandardScaler()
    train_subset = df_features_engineered[df_features_engineered['sample_index'].isin(train_samples)]
    scaler.fit(train_subset[CONTINUOUS_COLS])
    
    df_fold = df_features_engineered.copy()
    df_fold[CONTINUOUS_COLS] = scaler.transform(df_fold[CONTINUOUS_COLS])
    
    # 2. Dataset
    train_ds = PiratePainDataset(df_fold, df_labels_raw, train_samples, WINDOW_SIZE, STRIDE, TEXT_COL, augment=True)
    val_ds = PiratePainDataset(df_fold, df_labels_raw, val_samples, WINDOW_SIZE, STRIDE, TEXT_COL, augment=False)
    
    # 3. Sampler & Loader
    # IMPORTANTE: Sampler va solo nel Train. Shuffle deve essere False quando c'è il sampler.
    sampler = get_weighted_sampler(train_ds, df_labels_raw)
    
    train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, sampler=sampler, shuffle=False, drop_last=True)
    val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)
    
    # 4. Init Modello
    model = PiratePainModel(len(CONTINUOUS_COLS), survey_vocab_sizes, time_vocab_size, 
                            static_vocab_sizes, TEXT_VOCAB_SIZE, lstm_hidden=128).to(device)
    
    optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
    criterion = FocalLoss(alpha=None, gamma=2.0, label_smoothing=0.1)
    
    best_fold_f1 = 0
    for ep in range(EPOCHS):
        t_loss = train_epoch(model, train_loader, optimizer, criterion)
        v_loss, v_f1 = validate(model, val_loader, criterion)
        
        if v_f1 > best_fold_f1:
            best_fold_f1 = v_f1
        
        if (ep+1) % 10 == 0:
            print(f"Ep {ep+1}: Train Loss {t_loss:.4f} | Val F1 {v_f1:.4f}")
            
    print(f"Fold {fold+1} Best F1: {best_fold_f1:.4f}")
    fold_scores.append(best_fold_f1)

print(f"Media F1 Score: {np.mean(fold_scores):.4f}")

--- Avvio K-Fold con Advanced Imbalance Strategy ---
Focal Loss Alpha: [0.25556704 0.35009953 0.39433342]

--- Fold 1/5 ---
Ep 10: Train Loss 0.0263 | Val F1 0.9060
Ep 20: Train Loss 0.0201 | Val F1 0.9188
Ep 30: Train Loss 0.0221 | Val F1 0.9386
Ep 40: Train Loss 0.0190 | Val F1 0.9291
Ep 50: Train Loss 0.0189 | Val F1 0.9309
Fold 1 Best F1: 0.9397

--- Fold 2/5 ---
Ep 10: Train Loss 0.0278 | Val F1 0.9073
Ep 20: Train Loss 0.0191 | Val F1 0.9106
Ep 30: Train Loss 0.0190 | Val F1 0.9134
Ep 40: Train Loss 0.0189 | Val F1 0.9128
Ep 50: Train Loss 0.0189 | Val F1 0.9112
Fold 2 Best F1: 0.9353

--- Fold 3/5 ---
Ep 10: Train Loss 0.0254 | Val F1 0.9154
Ep 20: Train Loss 0.0197 | Val F1 0.9138
Ep 30: Train Loss 0.0191 | Val F1 0.9113
Ep 40: Train Loss 0.0190 | Val F1 0.9090
Ep 50: Train Loss 0.0189 | Val F1 0.9100
Fold 3 Best F1: 0.9208

--- Fold 4/5 ---
Ep 10: Train Loss 0.0224 | Val F1 0.9173
Ep 20: Train Loss 0.0232 | Val F1 0.9105
Ep 30: Train Loss 0.0204 | Val F1 0.9124
Ep 40: Train Lo

In [7]:
print("\n--- Training Finale Full Dataset & Submission ---")

# 1. Scaling Finale (Fit su tutto il train)
final_scaler = StandardScaler()
final_scaler.fit(df_features_engineered[CONTINUOUS_COLS])

df_train_final = df_features_engineered.copy()
df_train_final[CONTINUOUS_COLS] = final_scaler.transform(df_train_final[CONTINUOUS_COLS])

df_test_final = df_test_engineered.copy()
df_test_final[CONTINUOUS_COLS] = final_scaler.transform(df_test_final[CONTINUOUS_COLS])

# 2. Dataset Finale
# Anche nel training finale usiamo il Sampler per mantenere l'equilibrio appreso
train_ds_final = PiratePainDataset(df_train_final, df_labels_raw, all_sample_indices, WINDOW_SIZE, STRIDE, TEXT_COL)
sampler_final = get_weighted_sampler(train_ds_final, df_labels_raw)
train_loader_final = DataLoader(train_ds_final, batch_size=BATCH_SIZE, sampler=sampler_final, shuffle=False, drop_last=True)

# 3. Dataset Test
sub_indices = pd.read_csv('/kaggle/input/pirate/sample_submission.csv')['sample_index'].unique()
test_ds_final = PiratePainDataset(df_test_final, None, sub_indices, WINDOW_SIZE, STRIDE, TEXT_COL)
test_loader_final = DataLoader(test_ds_final, batch_size=BATCH_SIZE*2, shuffle=False)

# 4. Modello Finale
final_model = PiratePainModel(len(CONTINUOUS_COLS), survey_vocab_sizes, time_vocab_size, 
                              static_vocab_sizes, TEXT_VOCAB_SIZE, lstm_hidden=128).to(device)
optimizer = optim.AdamW(final_model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
criterion = FocalLoss(alpha=None, gamma=2.0, label_smoothing=0.1)

# 5. Train Loop
final_model.train()
print("Addestramento in corso...")
for ep in range(40): # 40 epoche bastano di solito per il full fit
    loss = train_epoch(final_model, train_loader_final, optimizer, criterion)
    if (ep+1)%10 == 0: print(f"Ep {ep+1}: Loss {loss:.4f}")

# 6. INFERENZA CON SOFT VOTING (Advice 09/11)
print("Generazione Predizioni...")
final_model.eval()
all_logits = []
window_sample_map = [x[0] for x in test_ds_final.indices]

with torch.no_grad():
    for xc, xs, xt, xst, xtxt, _ in test_loader_final:
        xc, xs, xt, xst, xtxt = xc.to(device), xs.to(device), xt.to(device), xst.to(device), xtxt.to(device)
        logits = final_model(xc, xs, xt, xst, xtxt)
        all_logits.extend(logits.cpu().numpy())

# Creazione DataFrame Logits
df_logits = pd.DataFrame(all_logits, columns=[0, 1, 2])
df_logits['sample_index'] = window_sample_map

# Media dei logits per sample_index (Soft Voting)
df_avg_logits = df_logits.groupby('sample_index').mean()

# 1. Converti logits medi in probabilità
probs = torch.softmax(torch.tensor(df_avg_logits.values), dim=1).numpy()

# 2. Applica soglie manuali (Sperimentale: favorisci le classi rare)
final_preds_list = []
for p in probs:
    # Se la probabilità di High Pain è > 25% (non 50%!), predici High Pain
    if p[2] > 0.25:
        final_preds_list.append(2)
    # Altrimenti, se Low Pain è > 30%, predici Low Pain
    elif p[1] > 0.30:
        final_preds_list.append(1)
    # Altrimenti No Pain
    else:
        final_preds_list.append(0)

final_preds = pd.Series(final_preds_list, index=df_avg_logits.index)

# Mappatura Inversa e Salvataggio
inv_map = {v: k for k, v in label_mapping.items()}
submission = final_preds.map(inv_map).reset_index()
submission.columns = ['sample_index', 'label']

# Ordina come da sample_submission
sample_sub = pd.read_csv('/kaggle/input/pirate/sample_submission.csv')
submission = submission.set_index('sample_index').reindex(sample_sub['sample_index']).reset_index()

submission.to_csv('submission.csv', index=False)
print("Fatto! File 'submission.csv' creato.")
print(submission.head())


--- Training Finale Full Dataset & Submission ---
Addestramento in corso...
Ep 10: Loss 0.0281
Ep 20: Loss 0.0215
Ep 30: Loss 0.0190
Ep 40: Loss 0.0189
Generazione Predizioni...
Fatto! File 'submission.csv' creato.
   sample_index    label
0             0  no_pain
1             1  no_pain
2             2  no_pain
3             3  no_pain
4             4  no_pain
