In [12]:
# ============================================================================
# CELL 1: SETUP KAGGLE
# ============================================================================

# Clone repository
#!git clone https://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@github.com/francinze/Ch1_An2DL.git

# Install dependencies
!pip install -q kaggle optuna

# Setup Kaggle credentials
#!mkdir -p ~/.kaggle
#!cp Ch1_An2DL/kaggle.json ~/.kaggle/
#!chmod 600 ~/.kaggle/kaggle.json

# Download competition data
#!kaggle competitions download -c an2dl2526c1
#!unzip -q an2dl2526c1.zip -d Ch1_An2DL/

# Change to working directory
%cd /kaggle/input/ch-an2dl/pytorch/default/1/Ch1_An2DL

# Verify files are present
import os
print("\n‚úÖ Setup complete! Files in directory:")
for file in ['pirate_pain_train.csv', 'pirate_pain_test.csv', 'pirate_pain_train_labels.csv']:
    if os.path.exists(file):
        size_mb = os.path.getsize(file) / (1024 * 1024)
        print(f"   ‚úì {file} ({size_mb:.2f} MB)")
    else:
        print(f"   ‚úó {file} NOT FOUND!")


/kaggle/input/ch-an2dl/pytorch/default/1/Ch1_An2DL

‚úÖ Setup complete! Files in directory:
   ‚úì pirate_pain_train.csv (60.78 MB)
   ‚úì pirate_pain_test.csv (125.24 MB)
   ‚úì pirate_pain_train_labels.csv (0.01 MB)


In [13]:

# ============================================================================
# CELL 2: IMPORTS & SEED SETUP
# ============================================================================
import os
import random
import warnings
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset, WeightedRandomSampler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score, classification_report, confusion_matrix
import optuna
from optuna.pruners import MedianPruner
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from collections import Counter
from tqdm import tqdm
import pickle

# Set seed for reproducibility
SEED = 42
os.environ['PYTHONHASHSEED'] = str(SEED)
os.environ['MPLCONFIGDIR'] = os.getcwd() + '/configs/'
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=Warning)

np.random.seed(SEED)
random.seed(SEED)
torch.manual_seed(SEED)

if torch.cuda.is_available():
    device = torch.device("cuda")
    torch.cuda.manual_seed_all(SEED)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
else:
    device = torch.device("cpu")

print(f"üêç Python packages:")
print(f"   PyTorch: {torch.__version__}")
print(f"   Optuna: {optuna.__version__}")
print(f"   Pandas: {pd.__version__}")
print(f"   NumPy: {np.__version__}")
print(f"\nüñ•  Device: {device}")
print(f"   Random seed: {SEED}")


üêç Python packages:
   PyTorch: 2.6.0+cu124
   Optuna: 4.5.0
   Pandas: 2.2.3
   NumPy: 1.26.4

üñ•  Device: cuda
   Random seed: 42


In [14]:

# ============================================================================
# CELL 3: PREPROCESSING FUNCTIONS (INLINE)
# ============================================================================

def add_time_features(df_train, df_test):
    """
    Add time-based features implementing November 12 clue.
    Creates 4 new features from 'time' column:
    - time_normalized: position in sequence [0.0, 1.0]
    - time_sin, time_cos: cyclical encoding
    - time_position: categorical [0=early, 1=mid, 2=late]
    """
    print("\n‚è∞ Adding time-based features...")
    print("=" * 60)

    for df, name in [(df_train, 'train'), (df_test, 'test')]:
        # Convert time to numeric if needed
        if df['time'].dtype == 'object':
            df['time'] = pd.to_datetime(df['time'])
            df['time'] = (df['time'] - df['time'].min()).dt.total_seconds()

        # Feature 1: Normalized time (position in sequence: 0.0 to 1.0)
        df['time_normalized'] = df.groupby('sample_index')['time'].transform(
            lambda x: x / x.max() if x.max() > 0 else 0
        )

        # Analyze sequence lengths for cyclical period
        lengths = df.groupby('sample_index')['time'].max()
        avg_length = lengths.mean()

        # Feature 2 & 3: Cyclical encoding (captures periodic patterns)
        period = max(50, avg_length / 3)  # ~3 cycles per sequence
        df['time_sin'] = np.sin(2 * np.pi * df['time'] / period)
        df['time_cos'] = np.cos(2 * np.pi * df['time'] / period)

        # Feature 4: Time position categories (early/mid/late)
        def categorize_time_position(group):
            normalized = group / group.max() if group.max() > 0 else 0
            return pd.cut(normalized, bins=[0, 0.33, 0.66, 1.0],
                         labels=[0, 1, 2], include_lowest=True).astype(int)

        df['time_position'] = df.groupby('sample_index')['time'].transform(categorize_time_position)

        print(f"‚úÖ {name.capitalize()} set: Added 4 time features")
        print(f"   - Avg sequence length: {avg_length:.1f} timesteps")
        print(f"   - Cyclical period: {period:.1f} timesteps")

    # Show distribution
    print(f"\nüìä Time position distribution:")
    for label, value in [('Early', 0), ('Mid', 1), ('Late', 2)]:
        count = (df_train['time_position'] == value).sum()
        pct = (count / len(df_train)) * 100
        print(f"   {label}: {count:,} ({pct:.1f}%)")

    return df_train, df_test


def add_prosthetics_feature(df, df_test):
    """Create binary prosthetics feature"""
    print("\nü¶æ Creating 'has_prosthetics' feature...")
    print("=" * 60)

    df['has_prosthetics'] = (df['n_legs'] != 'two').astype(int)
    df_test['has_prosthetics'] = (df_test['n_legs'] != 'two').astype(int)

    # Show distribution
    print(f"\nTraining set:")
    train_dist = df['has_prosthetics'].value_counts().sort_index()
    for value, count in train_dist.items():
        label = "Natural" if value == 0 else "Prosthetics"
        pct = (count / len(df)) * 100
        print(f"  {value} ({label:12s}): {count:6,} samples ({pct:.2f}%)")

    print(f"\nTest set:")
    test_dist = df_test['has_prosthetics'].value_counts().sort_index()
    for value, count in test_dist.items():
        label = "Natural" if value == 0 else "Prosthetics"
        pct = (count / len(df_test)) * 100
        print(f"  {value} ({label:12s}): {count:6,} samples ({pct:.2f}%)")

    # Drop original columns
    cols_to_drop = ['n_legs', 'n_hands', 'n_eyes',
                    'n_legs_encoded', 'n_hands_encoded', 'n_eyes_encoded']
    df = df.drop(columns=[col for col in cols_to_drop if col in df.columns])
    df_test = df_test.drop(columns=[col for col in cols_to_drop if col in df_test.columns])

    print("\n‚úÖ Feature created successfully!")
    return df, df_test


def scale_joint_columns(df, scaler=None):
    """Apply Min-Max normalization to joint columns"""
    print("\nüìè Applying Min-Max normalization...")
    print("=" * 60)

    # All possible joint columns (0-29)
    all_joint_cols = [f"joint_{str(i).zfill(2)}" for i in range(30)]

    # Filter only existing columns (handles joint_11, joint_30 already dropped)
    joint_cols = [c for c in all_joint_cols if c in df.columns]

    print(f"   Found {len(joint_cols)} joint columns to scale")
    print(f"   (Expected 28 after dropping joint_11 and joint_30)")

    # Ensure float32 type
    for col in joint_cols:
        df[col] = df[col].astype(np.float32)

    if scaler is None:
        scaler = MinMaxScaler()
        df[joint_cols] = scaler.fit_transform(df[joint_cols])
        print(f"‚úÖ Scaler fitted on training data")
        print(f"   Min (first 5): {scaler.data_min_[:5]}")
        print(f"   Max (first 5): {scaler.data_max_[:5]}")
    else:
        df[joint_cols] = scaler.transform(df[joint_cols])
        print(f"‚úÖ Scaler applied to test/validation data")

    return df, scaler


def apply_target_weighting(target):
    """Map labels to integers and show distribution"""
    print("\n‚öñ  Processing target labels...")
    print("=" * 60)

    label_mapping = {'no_pain': 0, 'low_pain': 1, 'high_pain': 2}

    # Show original distribution
    print("Original label distribution:")
    for label in target['label'].unique():
        count = len(target[target['label'] == label])
        pct = (count / len(target)) * 100
        print(f"  {label}: {count} ({pct:.2f}%)")

    target['label'] = target['label'].map(label_mapping)

    print(f"\n‚úÖ Labels mapped: {label_mapping}")
    return target


def train_val_split(df, target, val_ratio=0.2):
    """Split data by unique users"""
    print(f"\n‚úÇ  Train/validation split ({int((1-val_ratio)*100)}/{int(val_ratio*100)})...")
    print("=" * 60)

    unique_users = df['sample_index'].unique()
    random.seed(SEED)
    random.shuffle(unique_users)

    num_val_users = int(len(unique_users) * val_ratio)
    val_users = unique_users[:num_val_users]
    train_users = unique_users[num_val_users:]

    train_df = df[df['sample_index'].isin(train_users)].reset_index(drop=True)
    val_df = df[df['sample_index'].isin(val_users)].reset_index(drop=True)
    train_target = target[target['sample_index'].isin(train_users)].reset_index(drop=True)
    val_target = target[target['sample_index'].isin(val_users)].reset_index(drop=True)

    print(f"‚úÖ Training users: {len(train_users)}, Validation users: {len(val_users)}")
    print(f"   Training samples: {len(train_df):,}, Validation samples: {len(val_df):,}")

    # Show class distribution
    print(f"\nTraining label distribution:")
    for label in sorted(train_target['label'].unique()):
        count = (train_target['label'] == label).sum()
        pct = (count / len(train_target)) * 100
        print(f"   Class {label}: {count} ({pct:.1f}%)")

    print(f"\nValidation label distribution:")
    for label in sorted(val_target['label'].unique()):
        count = (val_target['label'] == label).sum()
        pct = (count / len(val_target)) * 100
        print(f"   Class {label}: {count} ({pct:.1f}%)")

    return train_df, val_df, train_target, val_target


print("‚úÖ Preprocessing functions defined")


‚úÖ Preprocessing functions defined


In [15]:
# ============================================================================
# CELL 4: WINDOW BUILDING FUNCTION (INLINE)
# ============================================================================

LABEL_MAP = {"no_pain": 0, "low_pain": 1, "high_pain": 2}

def build_windows(df, targets=None, window_size=110, stride=22, feature="3d"):
    """
    Build sliding windows from time-series data.

    Args:
        df: DataFrame with sample_index and feature columns
        targets: DataFrame with labels (None for test set)
        window_size: Size of each window
        stride: Step size between windows
        feature: "3d" for (samples, timesteps, features) shape

    Returns:
        X: numpy array of windows
        y: numpy array of labels (or None)
        sample_mapping: list of (sample_index, window_idx) tuples
    """
    print(f"\nü™ü Building windows (size={window_size}, stride={stride})...")
    print("=" * 60)

    # Detect feature columns (exclude metadata)
    metadata_cols = ['sample_index', 'time', 'Unnamed: 0', 'index']
    data_cols = [c for c in df.columns if c not in metadata_cols]

    print(f"   Feature columns: {len(data_cols)}")
    print(f"   First 5: {data_cols[:5]}")
    print(f"   Last 5: {data_cols[-5:]}")

    X_list = []
    y_list = []
    sample_mapping = []

    unique_samples = sorted(df['sample_index'].unique())

    for sample_idx in unique_samples:
        sample_data = df[df['sample_index'] == sample_idx][data_cols].values

        if len(sample_data) == 0:
            print(f"‚ö†  Warning: Sample {sample_idx} has no data, skipping...")
            continue

        # Get label if available
        if targets is not None:
            label_row = targets[targets['sample_index'] == sample_idx]
            if len(label_row) == 0:
                print(f"‚ö†  Warning: Sample {sample_idx} has no label, skipping...")
                continue

            label_value = label_row['label'].values[0]
            label = LABEL_MAP.get(label_value, label_value) if isinstance(label_value, str) else int(label_value)
        else:
            label = None

        # Sliding window
        for start in range(0, len(sample_data) - window_size + 1, stride):
            window = sample_data[start:start + window_size]

            if window.shape[0] != window_size:
                continue

            X_list.append(window)
            if label is not None:
                y_list.append(label)
            sample_mapping.append((sample_idx, len(X_list) - 1))

    X = np.array(X_list, dtype=np.float32)
    y = np.array(y_list, dtype=np.int64) if y_list else None

    print(f"\n‚úÖ Window creation complete:")
    print(f"   Total windows: {len(X):,}")
    print(f"   From {len(unique_samples)} samples")
    print(f"   Window shape: {X.shape}")

    if y is not None:
        print(f"   Label shape: {y.shape}")
        print(f"   Label distribution: {np.bincount(y)}")

    return X, y, sample_mapping


print("‚úÖ Window building function defined")


‚úÖ Window building function defined


In [16]:
# ============================================================================
# CELL 5: BILSTM MODEL (INLINE)
# ============================================================================

class BiLSTM(nn.Module):
    def __init__(self, input_size, num_classes, hidden_size=128, num_layers=2, dropout_rate=0.3):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
            dropout=dropout_rate if num_layers > 1 else 0.0
        )
        self.attention_weights = nn.Linear(hidden_size * 2, 1)
        self.classifier = nn.Sequential(
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_size * 2, hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_size, num_classes)
        )

    def attention(self, lstm_output):
        attn_scores = self.attention_weights(lstm_output)
        attn_weights = torch.softmax(attn_scores, dim=1)
        context = torch.sum(attn_weights * lstm_output, dim=1)
        return context

    def forward(self, x):
        lstm_output, _ = self.lstm(x)
        context = self.attention(lstm_output)
        return self.classifier(context)


print("‚úÖ BiLSTM model class defined")


‚úÖ BiLSTM model class defined


In [17]:



# ============================================================================
# CELL 6: TRAINING FUNCTIONS (INLINE)
# ============================================================================

def train_one_epoch(model, train_loader, criterion, optimizer, scaler, device, l1_lambda=0):
    """Train for one epoch"""
    model.train()
    total_loss = 0
    all_preds = []
    all_targets = []

    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)

        optimizer.zero_grad()

        with torch.amp.autocast(device_type=device.type, enabled=(device.type == 'cuda')):
            outputs = model(inputs)
            loss = criterion(outputs, targets)

            if l1_lambda > 0:
                l1_norm = sum(p.abs().sum() for p in model.parameters())
                loss = loss + l1_lambda * l1_norm

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()
        preds = outputs.argmax(dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_targets.extend(targets.cpu().numpy())

    avg_loss = total_loss / len(train_loader)
    f1 = f1_score(all_targets, all_preds, average='macro')

    return avg_loss, f1


def validate_one_epoch(model, val_loader, criterion, device):
    """Validate for one epoch"""
    model.eval()
    total_loss = 0
    all_preds = []
    all_targets = []

    with torch.no_grad():
        for inputs, targets in val_loader:
            inputs, targets = inputs.to(device), targets.to(device)

            with torch.amp.autocast(device_type=device.type, enabled=(device.type == 'cuda')):
                outputs = model(inputs)
                loss = criterion(outputs, targets)

            total_loss += loss.item()
            preds = outputs.argmax(dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_targets.extend(targets.cpu().numpy())

    avg_loss = total_loss / len(val_loader)
    f1 = f1_score(all_targets, all_preds, average='macro')

    return avg_loss, f1


def fit(model, train_loader, val_loader, epochs, criterion, optimizer, scaler, device,
        patience=30, l1_lambda=0, verbose=10):
    """Training loop with early stopping"""

    history = {
        'train_loss': [], 'train_f1': [],
        'val_loss': [], 'val_f1': []
    }

    best_val_f1 = 0
    patience_counter = 0

    for epoch in range(epochs):
        train_loss, train_f1 = train_one_epoch(model, train_loader, criterion, optimizer, scaler, device, l1_lambda)
        val_loss, val_f1 = validate_one_epoch(model, val_loader, criterion, device)

        history['train_loss'].append(train_loss)
        history['train_f1'].append(train_f1)
        history['val_loss'].append(val_loss)
        history['val_f1'].append(val_f1)

        if verbose > 0 and (epoch + 1) % verbose == 0:
            print(f"Epoch {epoch+1:3d}/{epochs} | "
                  f"Train Loss: {train_loss:.4f} F1: {train_f1:.4f} | "
                  f"Val Loss: {val_loss:.4f} F1: {val_f1:.4f}")

        # Early stopping
        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            patience_counter = 0
        else:
            patience_counter += 1
            if patience > 0 and patience_counter >= patience:
                print(f"‚èπ  Early stopping at epoch {epoch+1} (patience={patience})")
                break

    return model, history


def make_loader(ds, batch_size, shuffle, drop_last, sampler=None):
    """Create DataLoader with optimal settings"""
    cpu_cores = os.cpu_count() or 2
    num_workers = max(2, min(4, cpu_cores))

    return DataLoader(
        ds,
        batch_size=batch_size,
        shuffle=shuffle if sampler is None else False,
        sampler=sampler,
        drop_last=drop_last,
        num_workers=num_workers,
        pin_memory=True,
        pin_memory_device="cuda" if torch.cuda.is_available() else "",
        prefetch_factor=4,
    )


print("‚úÖ Training functions defined")


‚úÖ Training functions defined


In [18]:

# ============================================================================
# CELL 7: DATA LOADING & PREPROCESSING
# ============================================================================

print("\n" + "="*80)
print("üì¶ LOADING AND PREPROCESSING DATA")
print("="*80)

# Load data
print("\n1Ô∏è‚É£  Loading CSV files...")
df = pd.read_csv("pirate_pain_train.csv")
df_test = pd.read_csv("pirate_pain_test.csv")
target = pd.read_csv("pirate_pain_train_labels.csv")

print(f"   Training data: {df.shape}")
print(f"   Test data: {df_test.shape}")
print(f"   Target data: {target.shape}")

# Drop problematic joints FIRST
print("\n2Ô∏è‚É£  Dropping problematic joints (joint_11, joint_30)...")
df = df.drop(columns=['joint_30', 'joint_11'], errors='ignore')
df_test = df_test.drop(columns=['joint_30', 'joint_11'], errors='ignore')
print(f"   Training data after drop: {df.shape}")
print(f"   Test data after drop: {df_test.shape}")

# Add time features BEFORE dropping time column
print("\n3Ô∏è‚É£  Adding time-based features...")
df, df_test = add_time_features(df, df_test)

# Add prosthetics feature
print("\n4Ô∏è‚É£  Adding prosthetics feature...")
df, df_test = add_prosthetics_feature(df, df_test)

# Drop time column AFTER extracting features
print("\n5Ô∏è‚É£  Dropping 'time' column...")
df = df.drop(columns='time', errors='ignore')
df_test = df_test.drop(columns='time', errors='ignore')
print(f"   Training data: {df.shape}")
print(f"   Test data: {df_test.shape}")

# Scale joint columns
print("\n6Ô∏è‚É£  Scaling joint columns...")
df, scaler = scale_joint_columns(df, scaler=None)
df_test, _ = scale_joint_columns(df_test, scaler=scaler)

# Process targets
print("\n7Ô∏è‚É£  Processing target labels...")
target = apply_target_weighting(target)

# Train/Val split
print("\n8Ô∏è‚É£  Splitting train/validation...")
train_df, val_df, train_target, val_target = train_val_split(df, target, val_ratio=0.2)

# FINAL CHECK: Verify no 'time' column exists
print("\n9Ô∏è‚É£  Final column check...")
print(f"   Training columns ({len(train_df.columns)} total): {list(train_df.columns)[:10]}...")
print(f"   Validation columns ({len(val_df.columns)} total): {list(val_df.columns)[:10]}...")
print(f"   Test columns ({len(df_test.columns)} total): {list(df_test.columns)[:10]}...")

# Verify 'time' is not in columns
assert 'time' not in train_df.columns, "‚ùå ERROR: 'time' still in train_df!"
assert 'time' not in val_df.columns, "‚ùå ERROR: 'time' still in val_df!"
assert 'time' not in df_test.columns, "‚ùå ERROR: 'time' still in df_test!"

print("\n‚úÖ Data preprocessing complete!")
print(f"   Training set: {len(train_df):,} samples ({len(train_df['sample_index'].unique())} users)")
print(f"   Validation set: {len(val_df):,} samples ({len(val_df['sample_index'].unique())} users)")
print(f"   Test set: {len(df_test):,} samples ({len(df_test['sample_index'].unique())} users)")



üì¶ LOADING AND PREPROCESSING DATA

1Ô∏è‚É£  Loading CSV files...
   Training data: (105760, 40)
   Test data: (211840, 40)
   Target data: (661, 2)

2Ô∏è‚É£  Dropping problematic joints (joint_11, joint_30)...
   Training data after drop: (105760, 38)
   Test data after drop: (211840, 38)

3Ô∏è‚É£  Adding time-based features...

‚è∞ Adding time-based features...
‚úÖ Train set: Added 4 time features
   - Avg sequence length: 159.0 timesteps
   - Cyclical period: 53.0 timesteps
‚úÖ Test set: Added 4 time features
   - Avg sequence length: 159.0 timesteps
   - Cyclical period: 53.0 timesteps

üìä Time position distribution:
   Early: 35,033 (33.1%)
   Mid: 34,372 (32.5%)
   Late: 36,355 (34.4%)

4Ô∏è‚É£  Adding prosthetics feature...

ü¶æ Creating 'has_prosthetics' feature...

Training set:
  0 (Natural     ): 104,800 samples (99.09%)
  1 (Prosthetics ):    960 samples (0.91%)

Test set:
  0 (Natural     ): 209,760 samples (99.02%)
  1 (Prosthetics ):  2,080 samples (0.98%)

‚úÖ Feat

In [19]:

# ============================================================================
# CELL 8: K-FOLD SETUP
# ============================================================================

print("\n" + "="*80)
print("üìä SETTING UP K-FOLD CROSS-VALIDATION")
print("="*80)

K_FOLDS = 5
WINDOW_SIZE = 110
STRIDE = 22

# Build windows from training data
X_train, y_train, _ = build_windows(train_df, train_target, WINDOW_SIZE, STRIDE, feature="3d")
X_val, y_val, _ = build_windows(val_df, val_target, WINDOW_SIZE, STRIDE, feature="3d")

print(f"\nüìê Data shapes:")
print(f"   Training: X={X_train.shape}, y={y_train.shape}")
print(f"   Validation: X={X_val.shape}, y={y_val.shape}")

# Class distribution
class_counts = np.bincount(y_train.astype(int))
print(f"\nüìä Training class distribution:")
for cls, count in enumerate(class_counts):
    print(f"   Class {cls}: {count:,} samples ({count/len(y_train)*100:.1f}%)")

# Store metadata
input_shape = X_train.shape
num_classes = len(np.unique(y_train))

print(f"\n‚úÖ K-Fold setup complete")
print(f"   K={K_FOLDS} folds")
print(f"   Input features: {input_shape[-1]}")
print(f"   Number of classes: {num_classes}")

# Create K-Fold splits (fixed, will be reused for all Optuna trials)
kfold = KFold(n_splits=K_FOLDS, shuffle=True, random_state=SEED)
fold_indices = list(kfold.split(X_train))

print(f"   Fold splits created: {len(fold_indices)} folds ready")



üìä SETTING UP K-FOLD CROSS-VALIDATION

ü™ü Building windows (size=110, stride=22)...
   Feature columns: 38
   First 5: ['pain_survey_1', 'pain_survey_2', 'pain_survey_3', 'pain_survey_4', 'joint_00']
   Last 5: ['time_normalized', 'time_sin', 'time_cos', 'time_position', 'has_prosthetics']

‚úÖ Window creation complete:
   Total windows: 1,587
   From 529 samples
   Window shape: (1587, 110, 38)
   Label shape: (1587,)
   Label distribution: [1251  204  132]

ü™ü Building windows (size=110, stride=22)...
   Feature columns: 38
   First 5: ['pain_survey_1', 'pain_survey_2', 'pain_survey_3', 'pain_survey_4', 'joint_00']
   Last 5: ['time_normalized', 'time_sin', 'time_cos', 'time_position', 'has_prosthetics']

‚úÖ Window creation complete:
   Total windows: 396
   From 132 samples
   Window shape: (396, 110, 38)
   Label shape: (396,)
   Label distribution: [282  78  36]

üìê Data shapes:
   Training: X=(1587, 110, 38), y=(1587,)
   Validation: X=(396, 110, 38), y=(396,)

üìä Tra

In [20]:

# ============================================================================
# CELL 9: OPTUNA OBJECTIVE FUNCTION
# ============================================================================

def optuna_objective(trial):
    """
    Optuna objective function.
    Trains model on K-Fold and returns average validation F1.
    """

    # ========================================================================
    # SUGGEST HYPERPARAMETERS
    # ========================================================================
    hidden_size = trial.suggest_categorical('hidden_size', [64, 128, 256, 512])
    num_layers = trial.suggest_int('num_layers', 1, 3)
    dropout_rate = trial.suggest_float('dropout_rate', 0.2, 0.5, step=0.1)
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-2)
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128])
    l1_lambda = trial.suggest_categorical('l1_lambda', [0, 0.001, 0.01])
    l2_lambda = trial.suggest_categorical('l2_lambda', [0, 1e-5, 1e-4, 1e-3])

    # ========================================================================
    # K-FOLD TRAINING
    # ========================================================================
    fold_scores = []

    for fold_idx, (train_idx, val_idx) in enumerate(fold_indices):
        # Split data for this fold
        X_fold_train, X_fold_val = X_train[train_idx], X_train[val_idx]
        y_fold_train, y_fold_val = y_train[train_idx], y_train[val_idx]

        # Create datasets
        train_ds = TensorDataset(torch.from_numpy(X_fold_train).float(), torch.from_numpy(y_fold_train).long())
        val_ds = TensorDataset(torch.from_numpy(X_fold_val).float(), torch.from_numpy(y_fold_val).long())

        # Weighted sampling for class imbalance
        fold_class_counts = np.bincount(y_fold_train.astype(int))
        class_weights_sampling = 1.0 / fold_class_counts
        class_weights_sampling = class_weights_sampling / np.sum(class_weights_sampling)
        sample_weights = class_weights_sampling[y_fold_train.astype(int)]
        sample_weights = torch.from_numpy(sample_weights).float()

        sampler = WeightedRandomSampler(
            weights=sample_weights,
            num_samples=len(sample_weights),
            replacement=True
        )

        # Create data loaders
        train_loader = make_loader(train_ds, batch_size=batch_size, shuffle=False, drop_last=True, sampler=sampler)
        val_loader = make_loader(val_ds, batch_size=batch_size, shuffle=False, drop_last=False)

        # Create model
        model = BiLSTM(
            input_size=input_shape[-1],
            num_classes=num_classes,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout_rate=dropout_rate
        ).to(device)

        # Loss & Optimizer
        fold_class_weights_loss = len(y_fold_train) / (len(fold_class_counts) * fold_class_counts)
        fold_class_weights_loss = torch.tensor(fold_class_weights_loss, dtype=torch.float32).to(device)
        criterion = nn.CrossEntropyLoss(weight=fold_class_weights_loss)

        optimizer = torch.optim.AdamW(
            model.parameters(),
            lr=learning_rate,
            weight_decay=l2_lambda
        )

        scaler = torch.amp.GradScaler(enabled=(device.type == 'cuda'))

        # Train
        _, history = fit(
            model=model,
            train_loader=train_loader,
            val_loader=val_loader,
            epochs=30,
            criterion=criterion,
            optimizer=optimizer,
            scaler=scaler,
            device=device,
            patience=50,
            l1_lambda=l1_lambda,
            verbose=0  # Silent during Optuna
        )

        # Get best F1 for this fold
        best_f1 = max(history['val_f1'])
        fold_scores.append(best_f1)

        # Report intermediate value for pruning
        trial.report(best_f1, fold_idx)

        # Prune if performing poorly
        if trial.should_prune():
            raise optuna.TrialPruned()

    # Return average F1 across all folds
    avg_f1 = np.mean(fold_scores)
    return avg_f1


print("‚úÖ Optuna objective function defined")


‚úÖ Optuna objective function defined


In [21]:

# ============================================================================
# CELL 10: RUN OPTUNA OPTIMIZATION
# ============================================================================

print("\n" + "="*80)
print("üîç STARTING OPTUNA HYPERPARAMETER OPTIMIZATION")
print("="*80)

# Optuna configuration
N_TRIALS = 50
TIMEOUT = 6 * 3600  # 6 hours

# Create pruner
pruner = MedianPruner(
    n_startup_trials=5,
    n_warmup_steps=30,
    interval_steps=10
)

# Create study
study = optuna.create_study(
    direction='maximize',
    pruner=pruner,
    study_name='bilstm_kfold_optimization'
)

print(f"\n‚öô  Configuration:")
print(f"   Trials: {N_TRIALS}")
print(f"   Timeout: {TIMEOUT/3600:.1f} hours")
print(f"   K-Folds: {K_FOLDS}")
print(f"   Epochs per trial: 200 (with patience=30)")
print(f"   Pruning: Enabled (MedianPruner)")

print(f"\nüöÄ Starting optimization...")
print(f"   This will take approximately 5-8 hours")
print("="*80)

# Run optimization
study.optimize(
    optuna_objective,
    n_trials=N_TRIALS,
    timeout=TIMEOUT,
    show_progress_bar=True
)

print("\n" + "="*80)
print("‚úÖ OPTUNA OPTIMIZATION COMPLETE!")
print("="*80)

# Best trial
best_trial = study.best_trial
print(f"\nüèÜ Best Trial:")
print(f"   Trial number: {best_trial.number}")
print(f"   Best F1 score: {best_trial.value:.4f}")
print(f"\nüéØ Best Hyperparameters:")
for key, value in best_trial.params.items():
    print(f"   {key}: {value}")

# Save study
with open('optuna_study_bilstm.pkl', 'wb') as f:
    pickle.dump(study, f)
print(f"\nüíæ Study saved to 'optuna_study_bilstm.pkl'")


[I 2025-11-17 00:50:36,877] A new study created in memory with name: bilstm_kfold_optimization



üîç STARTING OPTUNA HYPERPARAMETER OPTIMIZATION

‚öô  Configuration:
   Trials: 50
   Timeout: 6.0 hours
   K-Folds: 5
   Epochs per trial: 200 (with patience=30)
   Pruning: Enabled (MedianPruner)

üöÄ Starting optimization...
   This will take approximately 5-8 hours


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-11-17 00:51:31,005] Trial 0 finished with value: 0.07209058292124977 and parameters: {'hidden_size': 128, 'num_layers': 1, 'dropout_rate': 0.4, 'learning_rate': 0.00021175049189129955, 'batch_size': 128, 'l1_lambda': 0.01, 'l2_lambda': 0}. Best is trial 0 with value: 0.07209058292124977.
[I 2025-11-17 00:55:03,768] Trial 1 finished with value: 0.05114730463002202 and parameters: {'hidden_size': 256, 'num_layers': 3, 'dropout_rate': 0.30000000000000004, 'learning_rate': 0.0002882158121158541, 'batch_size': 32, 'l1_lambda': 0.001, 'l2_lambda': 0}. Best is trial 0 with value: 0.07209058292124977.
[I 2025-11-17 00:56:05,870] Trial 2 finished with value: 0.05114730463002202 and parameters: {'hidden_size': 64, 'num_layers': 1, 'dropout_rate': 0.4, 'learning_rate': 0.006025500418073251, 'batch_size': 64, 'l1_lambda': 0.001, 'l2_lambda': 0.001}. Best is trial 0 with value: 0.07209058292124977.
[I 2025-11-17 01:00:21,811] Trial 3 finished with value: 0.5055405845835693 and parameters: {

OSError: [Errno 30] Read-only file system: 'optuna_study_bilstm.pkl'

In [1]:

# ============================================================================
# CELL 11: OPTUNA RESULTS & VISUALIZATION
# ============================================================================

print("\n" + "="*80)
print("üìä OPTUNA RESULTS ANALYSIS")
print("="*80)

# Trials DataFrame
trials_df = study.trials_dataframe()
trials_df = trials_df.sort_values('value', ascending=False)

print(f"\nüèÜ Top 10 Trials:")
print(trials_df[['number', 'value', 'params_hidden_size', 'params_num_layers',
                 'params_dropout_rate', 'params_learning_rate', 'params_batch_size']].head(10))

# Save results
trials_df.to_csv('optuna_results_bilstm.csv', index=False)
print(f"\nüíæ Results saved to 'optuna_results_bilstm.csv'")

# Visualizations
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# 1. Optimization history
ax = axes[0, 0]
ax.plot([t.value for t in study.trials if t.value is not None], marker='o', alpha=0.6)
ax.set_xlabel('Trial')
ax.set_ylabel('F1 Score')
ax.set_title('Optimization History')
ax.grid(alpha=0.3)

# 2. Hyperparameter importance
try:
    importance = optuna.importance.get_param_importances(study)
    ax = axes[0, 1]
    params = list(importance.keys())
    values = list(importance.values())
    ax.barh(params, values)
    ax.set_xlabel('Importance')
    ax.set_title('Hyperparameter Importance')
    ax.grid(alpha=0.3)
except:
    axes[0, 1].text(0.5, 0.5, 'Not enough trials\nfor importance analysis',
                    ha='center', va='center')

# 3. Learning rate vs F1
ax = axes[1, 0]
lrs = [t.params['learning_rate'] for t in study.trials if t.value is not None]
f1s = [t.value for t in study.trials if t.value is not None]
ax.scatter(lrs, f1s, alpha=0.6)
ax.set_xscale('log')
ax.set_xlabel('Learning Rate')
ax.set_ylabel('F1 Score')
ax.set_title('Learning Rate vs F1')
ax.grid(alpha=0.3)

# 4. Hidden size vs F1
ax = axes[1, 1]
hidden_sizes = [t.params['hidden_size'] for t in study.trials if t.value is not None]
ax.scatter(hidden_sizes, f1s, alpha=0.6)
ax.set_xlabel('Hidden Size')
ax.set_ylabel('F1 Score')
ax.set_title('Hidden Size vs F1')
ax.grid(alpha=0.3)

plt.tight_layout()
plt.savefig('optuna_analysis_bilstm.png', dpi=150, bbox_inches='tight')
plt.show()

print("\n‚úÖ Visualizations saved to 'optuna_analysis_bilstm.png'")



üìä OPTUNA RESULTS ANALYSIS


NameError: name 'study' is not defined

In [None]:

# ============================================================================
# CELL 12: FINAL TRAINING WITH BEST HYPERPARAMETERS
# ============================================================================

print("\n" + "="*80)
print("üéØ FINAL TRAINING WITH BEST HYPERPARAMETERS")
print("="*80)

# Extract best hyperparameters
best_params = best_trial.params
print(f"\n‚öô  Using hyperparameters:")
for key, value in best_params.items():
    print(f"   {key}: {value}")

# Create datasets
train_ds = TensorDataset(torch.from_numpy(X_train).float(), torch.from_numpy(y_train).long())
val_ds = TensorDataset(torch.from_numpy(X_val).float(), torch.from_numpy(y_val).long())

# Weighted sampling
train_class_counts = np.bincount(y_train.astype(int))
class_weights_sampling = 1.0 / train_class_counts
class_weights_sampling = class_weights_sampling / np.sum(class_weights_sampling)
sample_weights = class_weights_sampling[y_train.astype(int)]
sample_weights = torch.from_numpy(sample_weights).float()

sampler = WeightedRandomSampler(
    weights=sample_weights,
    num_samples=len(sample_weights),
    replacement=True
)

# Create data loaders
train_loader = make_loader(train_ds, batch_size=best_params['batch_size'], shuffle=False, drop_last=True, sampler=sampler)
val_loader = make_loader(val_ds, batch_size=best_params['batch_size'], shuffle=False, drop_last=False)

# Create best model
best_model = BiLSTM(
    input_size=input_shape[-1],
    num_classes=num_classes,
    hidden_size=best_params['hidden_size'],
    num_layers=best_params['num_layers'],
    dropout_rate=best_params['dropout_rate']
).to(device)

# Loss & Optimizer
class_weights_loss = len(y_train) / (len(train_class_counts) * train_class_counts)
class_weights_loss = torch.tensor(class_weights_loss, dtype=torch.float32).to(device)
criterion = nn.CrossEntropyLoss(weight=class_weights_loss)

optimizer = torch.optim.AdamW(
    best_model.parameters(),
    lr=best_params['learning_rate'],
    weight_decay=best_params['l2_lambda']
)

scaler = torch.amp.GradScaler(enabled=(device.type == 'cuda'))

print(f"\nüöÄ Starting final training (500 epochs, patience=50)...")
print("="*80)

# Train
_, history = fit(
    model=best_model,
    train_loader=train_loader,
    val_loader=val_loader,
    epochs=500,
    criterion=criterion,
    optimizer=optimizer,
    scaler=scaler,
    device=device,
    patience=50,
    l1_lambda=best_params['l1_lambda'],
    verbose=10
)

# Save model
torch.save(best_model.state_dict(), 'best_bilstm_model.pt')
print(f"\nüíæ Model saved to 'best_bilstm_model.pt'")

# Plot history
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

ax1.plot(history['train_loss'], label='Training', alpha=0.3, linestyle='--')
ax1.plot(history['val_loss'], label='Validation', alpha=0.9)
ax1.set_title('Loss')
ax1.legend()
ax1.grid(alpha=0.3)

ax2.plot(history['train_f1'], label='Training', alpha=0.3, linestyle='--')
ax2.plot(history['val_f1'], label='Validation', alpha=0.9)
ax2.set_title('F1 Score')
ax2.legend()
ax2.grid(alpha=0.3)

plt.tight_layout()
plt.savefig('final_training_history_bilstm.png', dpi=150)
plt.show()

print(f"\n‚úÖ Final training complete!")
print(f"   Best Val F1: {max(history['val_f1']):.4f}")
print(f"   Final Val F1: {history['val_f1'][-1]:.4f}")


In [None]:

# ============================================================================
# CELL 13: CONFUSION MATRIX & CLASSIFICATION REPORT
# ============================================================================

print("\n" + "="*80)
print("üìà VALIDATION SET EVALUATION")
print("="*80)

# Get predictions
best_model.eval()
val_preds = []
val_true = []

with torch.no_grad():
    for inputs, targets in val_loader:
        inputs = inputs.to(device)
        outputs = best_model(inputs)
        preds = outputs.argmax(dim=1)
        val_preds.extend(preds.cpu().numpy())
        val_true.extend(targets.cpu().numpy())

# Classification report
print("\nüìä Classification Report:")
print("="*80)
print(classification_report(
    val_true, val_preds,
    target_names=['no_pain', 'low_pain', 'high_pain'],
    digits=4
))

# Confusion matrix
cm = confusion_matrix(val_true, val_preds)
class_labels = ['no_pain', 'low_pain', 'high_pain']

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_labels, yticklabels=class_labels)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix - Best BiLSTM Model')
plt.savefig('confusion_matrix_bilstm.png', dpi=150, bbox_inches='tight')
plt.show()

print("‚úÖ Confusion matrix saved to 'confusion_matrix_bilstm.png'")


In [None]:

# ============================================================================
# CELL 14: INFERENCE & SUBMISSION
# ============================================================================

print("\n" + "="*80)
print("üîÆ INFERENCE ON TEST SET")
print("="*80)

# Build test windows
X_test, _, _ = build_windows(df_test, None, WINDOW_SIZE, STRIDE, feature="3d")
test_loader = make_loader(
    TensorDataset(torch.from_numpy(X_test).float()),
    batch_size=32,
    shuffle=False,
    drop_last=False
)

# Generate predictions for all windows
all_window_preds = []
best_model.eval()

with torch.no_grad():
    for xb in test_loader:
        xb = xb[0].to(device)
        outputs = best_model(xb)
        preds = outputs.argmax(dim=1)
        all_window_preds.extend(preds.cpu().numpy())

print(f"\nüìä Generated {len(all_window_preds)} window predictions")

# Aggregate predictions per pirate (majority voting)
num_test_samples = len(df_test['sample_index'].unique())
windows_per_sample = len(all_window_preds) // num_test_samples

print(f"   Test samples: {num_test_samples}")
print(f"   Windows per sample: {windows_per_sample}")

label_mapping = {0: 'no_pain', 1: 'low_pain', 2: 'high_pain'}
final_predictions = []

for sample_idx in range(num_test_samples):
    start_idx = sample_idx * windows_per_sample
    end_idx = start_idx + windows_per_sample
    window_preds = all_window_preds[start_idx:end_idx]

    # Majority voting
    most_common = Counter(window_preds).most_common(1)[0][0]
    final_predictions.append(label_mapping[most_common])

print(f"\n‚úÖ Aggregated to {len(final_predictions)} final predictions (one per pirate)")

# Create submission CSV
predictions_df = pd.DataFrame({
    'sample_index': np.arange(num_test_samples),
    'label': final_predictions
})

timestamp = datetime.now().strftime("%Y%m%d_%H%M")
filename = f'predictions_bilstm_optuna_{timestamp}.csv'
predictions_df.to_csv(filename, index=False)

print(f"\nüíæ Predictions saved to: {filename}")
print(f"   Total predictions: {len(final_predictions)}")
print(f"\nüìä Distribution:")
for label in ['no_pain', 'low_pain', 'high_pain']:
    count = final_predictions.count(label)
    pct = (count / len(final_predictions)) * 100
    print(f"   {label:10s}: {count:5d} ({pct:5.2f}%)")

print("\n" + "="*80)
print("‚úÖ ALL DONE! üéâ")
print("="*80)
print("\nGenerated files:")
print("  üìÑ optuna_study_bilstm.pkl")
print("  üìÑ optuna_results_bilstm.csv")
print("  üìÑ best_bilstm_model.pt")
print("  üìä optuna_analysis_bilstm.png")
print("  üìä final_training_history_bilstm.png")
print("  üìä confusion_matrix_bilstm.png")
print(f"  üìÑ {filename}")