# Solution 3: Feature Engineering + RNN

**Hypothesis**: Raw sensor sequences lack information. Add engineered statistical features per window.

**Approach**:
- Add per-window statistics: mean, std, min, max, range for each feature
- Include pain survey features (currently unused!)
- Use enhanced features with GRU
- Direct oversampling for balance

In [1]:
# Set seed for reproducibility
# One SEED value is shared by Python's `random`, NumPy and PyTorch below.
SEED = 42

import os
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# setting it here presumably only affects child processes -- confirm.
os.environ['PYTHONHASHSEED'] = str(SEED)
# MPLCONFIGDIR is pointed at ./configs/ under the current working directory.
# NOTE(review): Matplotlib is not imported in the visible cells -- confirm this is still needed.
os.environ['MPLCONFIGDIR'] = os.getcwd() + '/configs/'

import warnings
# Silence warnings. `Warning` is the base class of every warning category, so
# the second filter also covers the FutureWarning filter above it.
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=Warning)

import random
import numpy as np
# Seed the NumPy and Python global random generators.
np.random.seed(SEED)
random.seed(SEED)

import torch
# Seed PyTorch before any model or DataLoader is created.
torch.manual_seed(SEED)
from torch import nn
from torch.utils.data import DataLoader, TensorDataset

# Prefer the GPU when one is available; seed every CUDA device in that case.
if torch.cuda.is_available():
    device = torch.device("cuda")
    torch.cuda.manual_seed_all(SEED)
else:
    device = torch.device("cpu")

import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report

# Record the runtime environment in the cell output.
print(f"PyTorch version: {torch.__version__}")
print(f"Device: {device}")

PyTorch version: 2.9.0
Device: cpu


In [2]:
# Load the raw sensor table and the per-sample label table
df = pd.read_csv("pirate_pain_train.csv")
target = pd.read_csv("pirate_pain_train_labels.csv")

# Collect every column whose name mentions the pain survey, then preview them
pain_survey_cols = list(filter(lambda name: 'pain_survey' in name, df.columns))
print("Pain survey columns found: {}".format(pain_survey_cols))
print("Sample pain survey values:")
print(df.loc[:, pain_survey_cols].head())

Pain survey columns found: ['pain_survey_1', 'pain_survey_2', 'pain_survey_3', 'pain_survey_4']
Sample pain survey values:
   pain_survey_1  pain_survey_2  pain_survey_3  pain_survey_4
0              2              0              2              1
1              2              2              2              2
2              2              0              2              2
3              2              2              2              2
4              2              2              2              2


In [3]:
# Preprocess base features
# Encode the body-part count columns as integer category codes.
number_cols = ['n_legs', 'n_hands', 'n_eyes']
for col in number_cols:
    df[col] = df[col].astype('category').cat.codes

# Cast all joint channels to float32 in one call, then scale them to [0, 1].
joint_cols = ["joint_" + str(i).zfill(2) for i in range(31)]
df = df.astype({col: np.float32 for col in joint_cols})

# NOTE(review): the scaler is fitted on every row, i.e. before the
# train/validation split performed later in the notebook, so validation
# statistics leak into the scaling. Fitting on the training samples only
# requires moving the split ahead of this cell.
minmax_scaler = MinMaxScaler()
df[joint_cols] = minmax_scaler.fit_transform(df[joint_cols])

# Map labels
label_mapping = {'no_pain': 0, 'low_pain': 1, 'high_pain': 2}
# Values that are already mapped pass through unchanged, so re-running this
# cell no longer turns every label into NaN.
target['label'] = target['label'].map(lambda value: label_mapping.get(value, value))
unexpected_labels = set(target['label'].unique()) - set(label_mapping.values())
if unexpected_labels:
    # Fail loudly instead of letting NaN/unknown labels reach np.bincount later.
    raise ValueError(f"Unexpected label values: {sorted(unexpected_labels, key=str)}")
target['label'] = target['label'].astype(int)

# Base features: number cols + joint cols
base_features = number_cols + joint_cols
print(f"Base features: {len(base_features)}")

Base features: 34


In [None]:
# Build sequences WITH FEATURE ENGINEERING
# Window length in timesteps, and the step between consecutive window starts.
WINDOW_SIZE, STRIDE = 300, 50

def compute_window_stats(window_data):
    """Summarise one window with per-feature statistics.

    ``window_data`` is reduced along axis 0 and the results are joined into a
    single 1-D array laid out as ``[mean | std | min | max | range]``, where
    range is ``max - min``.
    """
    lowest = np.min(window_data, axis=0)
    highest = np.max(window_data, axis=0)
    return np.concatenate([
        np.mean(window_data, axis=0),
        np.std(window_data, axis=0),
        lowest,
        highest,
        highest - lowest,
    ])

def build_sequences_with_features(df, target_df, window=300, stride=50):
    """Cut each sample into fixed-length windows augmented with summary features.

    For every ``sample_index`` in ``df`` the ``base_features`` rows are
    zero-padded at the end up to the next multiple of ``window`` and sliced
    into windows of ``window`` rows taken every ``stride`` rows. Each window
    is widened with its ``compute_window_stats`` vector and the sample's
    first-row ``pain_survey_cols`` values, repeated on every timestep.

    Note that the statistics are computed over the padded window, so padding
    zeros take part in the mean/std/min/max of a window that contains them.

    Args:
        df: Frame with a ``sample_index`` column plus the ``base_features``
            and ``pain_survey_cols`` columns.
        target_df: Frame with ``sample_index`` and ``label`` columns; the
            first row per sample supplies its label.
        window: Number of timesteps per window (must be positive).
        stride: Step between consecutive window starts (must be positive).

    Returns:
        ``(X, y)`` where ``X`` has shape
        ``(n_windows, window, 6 * len(base_features) + len(pain_survey_cols))``
        and ``y`` holds one label per window.

    Raises:
        ValueError: If ``window`` or ``stride`` is not positive.
        KeyError: If a sample in ``df`` has no row in ``target_df``.
    """
    if window <= 0 or stride <= 0:
        # A non-positive stride would never advance the window start.
        raise ValueError("window and stride must be positive")

    n_base = len(base_features)
    # One label lookup table instead of filtering target_df for every sample.
    label_by_sample = (
        target_df.drop_duplicates('sample_index')
        .set_index('sample_index')['label']
    )

    dataset = []
    labels = []

    # sort=False visits the samples in order of first appearance, the same
    # order df['sample_index'].unique() would give.
    for sample_id, sample_df in df.groupby('sample_index', sort=False):
        values = sample_df[base_features].values
        label = label_by_sample.loc[sample_id]

        # Get pain survey values (constant per sample)
        pain_surveys = sample_df[pain_survey_cols].iloc[0].values.astype(np.float32)

        # Pad only up to the next multiple of `window`. The previous
        # `window - len % window` appended a whole extra all-zero window when
        # the length was already a multiple.
        padding_len = (-len(values)) % window
        if padding_len:
            padding = np.zeros((padding_len, n_base), dtype='float32')
            values = np.concatenate((values, padding))

        for start in range(0, len(values) - window + 1, stride):
            window_data = values[start:start + window]

            # Per-window statistics followed by the per-sample survey answers
            enhanced_features = np.concatenate(
                [compute_window_stats(window_data), pain_surveys]
            )

            # Repeat the summary vector on every timestep so it can be joined
            # to the raw window along the feature axis.
            enhanced_features_repeated = np.tile(enhanced_features, (window, 1))
            full_sequence = np.concatenate(
                [window_data, enhanced_features_repeated], axis=1
            )

            dataset.append(full_sequence)
            labels.append(label)

    return np.array(dataset), np.array(labels)

# Announce what the sequence builder produces
print(
    "Building sequences with enhanced features...\n"
    "Features: raw sequence + window statistics + pain surveys"
)

Building sequences with enhanced features...
Features: raw sequence + window statistics + pain surveys


In [5]:
# Split and build sequences
# Split by sample id so that windows cut from one sample never end up on both
# sides of the train/validation boundary.
unique_users = df['sample_index'].unique()

# train_test_split does its own seeded shuffle and returns the sample ids
# themselves (not positions), so no manual shuffle or re-indexing is needed.
train_users, val_users = train_test_split(unique_users, test_size=0.2, random_state=SEED)
assert not set(train_users) & set(val_users), "train/validation samples overlap"

# Select rows by id. Indexing unique_users with the ids only worked by
# accident when the ids happened to be exactly 0..N-1.
df_train = df[df['sample_index'].isin(train_users)]
df_val = df[df['sample_index'].isin(val_users)]

X_train, y_train = build_sequences_with_features(df_train, target, WINDOW_SIZE, STRIDE)
X_val, y_val = build_sequences_with_features(df_val, target, WINDOW_SIZE, STRIDE)

print(f"\nEnhanced training sequences: {X_train.shape}")
print(f"  Sequence length: {X_train.shape[1]} timesteps")
print(f"  Features per timestep: {X_train.shape[2]}")
# The builder tiles stats + surveys on every timestep; the old note claimed
# they were only in the first timestep.
print("  (Every timestep holds raw data + window stats + surveys)")

print(f"\nValidation sequences: {X_val.shape}")

# Per-class window counts, reused by the balancing cell
train_counts = np.bincount(y_train.astype(int))
print(f"\nClass distribution:")
for cls, count in enumerate(train_counts):
    print(f"  Class {cls}: {count} ({count/len(y_train)*100:.1f}%)")

ValueError: all the input array dimensions except for the concatenation axis must match exactly, but along dimension 1, the array at index 0 has size 174 and the array at index 1 has size 34

In [None]:
# Balance dataset
# Oversample every class up to the size of the largest class. The previous
# version assumed class 0 was the majority and duplicated whole copies
# (ceil), which overshot the target and divided by zero for an empty class.
target_count = train_counts.max()

X_train_balanced = []
y_train_balanced = []

for cls, cls_count in enumerate(train_counts):
    if cls_count == 0:
        # Nothing to duplicate for a class that never occurs in training.
        continue
    cls_indices = np.where(y_train == cls)[0]
    # np.resize repeats the indices cyclically until there are exactly
    # target_count of them, so every present class ends up the same size.
    resampled = np.resize(cls_indices, target_count)
    X_train_balanced.append(X_train[resampled])
    y_train_balanced.append(y_train[resampled])

X_train_balanced = np.concatenate(X_train_balanced, axis=0)
y_train_balanced = np.concatenate(y_train_balanced, axis=0)

balanced_counts = np.bincount(y_train_balanced.astype(int))
print(f"Balanced dataset: {len(y_train_balanced)} samples")
for cls, count in enumerate(balanced_counts):
    print(f"  Class {cls}: {count} ({count/len(y_train_balanced)*100:.1f}%)")

In [None]:
# GRU Model (handles variable feature dimension)
class RecurrentClassifier(nn.Module):
    """GRU sequence classifier.

    The final hidden state of the last GRU layer is passed through a linear
    layer to produce one logit per class.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes, dropout_rate=0.2):
        super().__init__()

        self.num_layers = num_layers
        self.hidden_size = hidden_size

        # Dropout is only requested for stacked GRUs; a single layer gets 0.
        if num_layers > 1:
            gru_dropout = dropout_rate
        else:
            gru_dropout = 0

        self.rnn = nn.GRU(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=gru_dropout,
        )

        self.classifier = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Map a batch-first input sequence to class logits."""
        final_hidden = self.rnn(x)[1]
        # Index -1 picks the last layer's final hidden state.
        return self.classifier(final_hidden[-1])

# Initialize model
# The input width is read from the data so it follows the engineered feature count.
model = RecurrentClassifier(
    input_size=X_train.shape[-1],  # Enhanced feature dimension
    hidden_size=128,
    num_layers=2,
    num_classes=3,
    dropout_rate=0.2,
)
model = model.to(device)

print("Model created with enhanced features")
print("Input features: {}".format(X_train.shape[-1]))
print("Parameters: {:,}".format(sum(param.numel() for param in model.parameters())))

In [None]:
# Create dataloaders
BATCH_SIZE = 16

# Features are stored as float32 tensors and labels as int64 tensors.
train_ds = TensorDataset(
    torch.from_numpy(X_train_balanced).to(torch.float32),
    torch.from_numpy(y_train_balanced).to(torch.int64),
)
val_ds = TensorDataset(
    torch.from_numpy(X_val).to(torch.float32),
    torch.from_numpy(y_val).to(torch.int64),
)

# Only the training loader shuffles; validation keeps its order.
train_loader = DataLoader(train_ds, BATCH_SIZE, shuffle=True, num_workers=0)
val_loader = DataLoader(val_ds, BATCH_SIZE, shuffle=False, num_workers=0)

print("DataLoaders ready")

In [None]:
# Training
# Loss, optimizer and gradient scaler; scaling is enabled only on CUDA.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(
    params=model.parameters(),
    lr=1e-3,
    weight_decay=1e-4,
)
scaler = torch.amp.GradScaler(enabled=device.type == 'cuda')

def train_epoch(model, loader, criterion, optimizer, scaler, device):
    """Run one optimisation pass over ``loader``.

    Returns ``(avg_loss, f1)``: the loss averaged over ``len(loader.dataset)``
    samples and the weighted F1 of the predictions made during the pass.
    """
    use_amp = device.type == 'cuda'
    model.train()

    running_loss = 0
    predicted, expected = [], []

    for batch_x, batch_y in loader:
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)
        optimizer.zero_grad()

        # Mixed precision is only switched on for CUDA devices.
        with torch.amp.autocast(device_type=device.type, enabled=use_amp):
            logits = model(batch_x)
            batch_loss = criterion(logits, batch_y)

        scaler.scale(batch_loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # Weight the batch loss by its size so the epoch average is per sample.
        running_loss += batch_loss.item() * batch_x.size(0)
        predicted.extend(logits.argmax(1).cpu().numpy())
        expected.extend(batch_y.cpu().numpy())

    mean_loss = running_loss / len(loader.dataset)
    weighted_f1 = f1_score(expected, predicted, average='weighted', zero_division=0)
    return mean_loss, weighted_f1

def val_epoch(model, loader, criterion, device):
    """Evaluate ``model`` on ``loader`` without tracking gradients.

    Returns ``(avg_loss, f1, preds, labels)``: the loss averaged over
    ``len(loader.dataset)`` samples, the weighted F1, and the collected
    predictions and ground-truth labels as lists.
    """
    use_amp = device.type == 'cuda'
    model.eval()

    running_loss = 0
    predicted, expected = [], []

    with torch.no_grad():
        for batch_x, batch_y in loader:
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)

            # Mixed precision is only switched on for CUDA devices.
            with torch.amp.autocast(device_type=device.type, enabled=use_amp):
                logits = model(batch_x)
                batch_loss = criterion(logits, batch_y)

            # Weight the batch loss by its size so the average is per sample.
            running_loss += batch_loss.item() * batch_x.size(0)
            predicted.extend(logits.argmax(1).cpu().numpy())
            expected.extend(batch_y.cpu().numpy())

    mean_loss = running_loss / len(loader.dataset)
    weighted_f1 = f1_score(expected, predicted, average='weighted', zero_division=0)
    return mean_loss, weighted_f1, predicted, expected

print("=" * 70)
print("Training with Feature Engineering")
print("=" * 70)

EPOCHS = 30
# Start below any reachable F1 so the first epoch always writes a checkpoint.
# With 0 and a strict '>' no file was saved when validation F1 stayed at 0,
# and the later torch.load would fail.
best_f1 = -1.0

# torch.save does not create missing directories.
os.makedirs('models', exist_ok=True)

for epoch in range(1, EPOCHS + 1):
    train_loss, train_f1 = train_epoch(model, train_loader, criterion, optimizer, scaler, device)
    val_loss, val_f1, val_preds, val_labels = val_epoch(model, val_loader, criterion, device)

    # Keep only the weights of the best epoch by validation F1.
    if val_f1 > best_f1:
        best_f1 = val_f1
        torch.save(model.state_dict(), 'models/feature_eng_best.pt')

    # Log the first epoch and then every fifth one.
    if epoch % 5 == 0 or epoch == 1:
        print(f"Epoch {epoch:3d}/{EPOCHS} | Train: Loss={train_loss:.4f}, F1={train_f1:.4f} | Val: Loss={val_loss:.4f}, F1={val_f1:.4f}")

print("\n" + "=" * 70)
print(f"Best validation F1: {best_f1:.4f}")
print("=" * 70)

In [None]:
# Final evaluation
# Reload the best checkpoint onto the active device before scoring.
model.load_state_dict(torch.load('models/feature_eng_best.pt', map_location=device))
_, val_f1, val_preds, val_labels = val_epoch(model, val_loader, criterion, device)

print("\n📊 FINAL RESULTS (Feature Engineering):")
print(f"Validation F1: {val_f1:.4f}")

print("\n📋 Per-class metrics:")
# labels= pins the report to all three classes. Without it scikit-learn
# raises when fewer classes appear than there are target_names.
print(classification_report(
    val_labels,
    val_preds,
    labels=[0, 1, 2],
    target_names=['no_pain', 'low_pain', 'high_pain'],
    digits=4,
    zero_division=0,
))

unique_preds, counts = np.unique(val_preds, return_counts=True)
print("\n🎯 Prediction distribution:")
for cls, count in zip(unique_preds, counts):
    print(f"  Class {cls}: {count} predictions ({count/len(val_preds)*100:.1f}%)")

# Summarise how many distinct classes the model actually predicts.
if len(unique_preds) >= 3:
    print("\n✅ SUCCESS: Model predicts ALL 3 classes!")
    print("Feature engineering helped!")
elif len(unique_preds) == 2:
    print("\n⚠️  PARTIAL: Predicts 2 out of 3 classes")
else:
    print("\n❌ FAILED: Still stuck on 1 class")