## ⚙️ **Libraries Import**

In [None]:
# Set seed for reproducibility
SEED = 42

# Import necessary libraries
import os

# Set environment variables before importing modules.
# NOTE: PYTHONHASHSEED is read at interpreter start-up, so setting it here only
# affects child processes spawned from this kernel, not the kernel itself.
os.environ['PYTHONHASHSEED'] = str(SEED)
os.environ['MPLCONFIGDIR'] = os.getcwd() + '/configs/'

# Suppress warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=Warning)

# Import necessary modules
import random
import numpy as np

# Set seeds for random number generators in NumPy and Python
np.random.seed(SEED)
random.seed(SEED)

# Import PyTorch
import torch
torch.manual_seed(SEED)
from torch import nn
from torch.utils.data import DataLoader

# cuDNN autotuning (benchmark=True) can pick different kernels from run to run,
# which undermines the seeding above. Prefer deterministic kernels and disable
# the autotuner; both flags are harmless no-ops when no GPU is present.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Select the compute device and seed every CUDA generator when a GPU is available
if torch.cuda.is_available():
    device = torch.device("cuda")
    torch.cuda.manual_seed_all(SEED)
else:
    device = torch.device("cpu")

print(f"PyTorch version: {torch.__version__}")
print(f"Device: {device}")

# Import other libraries
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score

## ⏳ **Data Loading**

In [None]:
# Read the raw train/test tables and discard the 'joint_30' column from each
df = pd.read_csv("pirate_pain_train.csv").drop(columns=['joint_30'])
df_test = pd.read_csv("pirate_pain_test.csv").drop(columns=['joint_30'])

print("Training data shape:", df.shape)

Training data shape: (105760, 39)


In [None]:
# Create binary 'has_prosthetics' feature (0 = all natural, 1 = has prosthetics)
print("Creating consolidated feature: 'has_prosthetics'")
print("=" * 60)

# Raw body-part columns that are consolidated into the single binary flag
body_part_cols = ['n_legs', 'n_hands', 'n_eyes']


def _add_has_prosthetics(frame):
    """Return `frame` with a binary 'has_prosthetics' column.

    A row is flagged 1 when any available body-part column differs from 'two'
    (legs, hands and eyes, as stated by the mapping printed below) and 0
    otherwise. If the raw columns were already dropped by a previous run of
    this cell, the existing 'has_prosthetics' column is kept unchanged so the
    cell can be re-run safely.

    Raises:
        KeyError: if neither the raw body-part columns nor 'has_prosthetics'
            are present in `frame`.
    """
    available = [col for col in body_part_cols if col in frame.columns]
    if not available:
        if 'has_prosthetics' in frame.columns:
            return frame
        raise KeyError(f"None of {body_part_cols} found; cannot build 'has_prosthetics'")
    flag = (frame[available] != 'two').any(axis=1).astype(int)
    return frame.assign(has_prosthetics=flag)


def _print_has_prosthetics_distribution(title, frame):
    """Print count and percentage of each 'has_prosthetics' value; return the counts."""
    print(f"\n{title}:")
    dist = frame['has_prosthetics'].value_counts().sort_index()
    for value, count in dist.items():
        label = "Natural" if value == 0 else "Prosthetics"
        pct = (count / len(frame)) * 100
        print(f"  {value} ({label:12s}): {count:6,} samples ({pct:.2f}%)")
    return dist


# Create the new feature
df = _add_has_prosthetics(df)
df_test = _add_has_prosthetics(df_test)

# Show the mapping
print("\nMapping:")
print("  has_prosthetics = 0 ‚Üí All natural body parts (two legs, two hands, two eyes)")
print("  has_prosthetics = 1 ‚Üí Has prosthetics (peg leg, hook hand, eye patch)")

# Show distribution
print("\n" + "=" * 60)
print("Distribution of new feature:")
print("=" * 60)
train_dist = _print_has_prosthetics_distribution("Training set", df)
test_dist = _print_has_prosthetics_distribution("Test set", df_test)


# Columns to drop
cols_to_drop = ['n_legs', 'n_hands', 'n_eyes',
                'n_legs_encoded', 'n_hands_encoded', 'n_eyes_encoded']

# Drop from both train and test (columns that are absent are simply ignored)
df = df.drop(columns=cols_to_drop, errors='ignore')
df_test = df_test.drop(columns=cols_to_drop, errors='ignore')

print("\nFeature created successfully!")

Creating consolidated feature: 'has_prosthetics'

Mapping:
  has_prosthetics = 0 ‚Üí All natural body parts (two legs, two hands, two eyes)
  has_prosthetics = 1 ‚Üí Has prosthetics (peg leg, hook hand, eye patch)

Distribution of new feature:

Training set:
  0 (Natural     ): 104,800 samples (99.09%)
  1 (Prosthetics ):    960 samples (0.91%)

Test set:
  0 (Natural     ): 209,760 samples (99.02%)
  1 (Prosthetics ):  2,080 samples (0.98%)

Feature created successfully!


In [None]:
from sklearn.preprocessing import MinMaxScaler

# Names of the joint channels to normalize: joint_00 ... joint_29
joint_cols = [f"joint_{i:02d}" for i in range(30)]

# Cast every joint channel to 32-bit floats before scaling
df[joint_cols] = df[joint_cols].astype(np.float32)

# Fit the Min-Max scaler on the joint columns of `df` and overwrite them
# with their scaled values
minmax_scaler = MinMaxScaler()
df[joint_cols] = minmax_scaler.fit_transform(df[joint_cols])

# Model input features: the prosthetics flag followed by the joint channels
data_cols = ['has_prosthetics', *joint_cols]

display(df.head())

Unnamed: 0,sample_index,time,pain_survey_1,pain_survey_2,pain_survey_3,pain_survey_4,joint_00,joint_01,joint_02,joint_03,...,joint_21,joint_22,joint_23,joint_24,joint_25,joint_26,joint_27,joint_28,joint_29,has_prosthetics
0,0,0,2,0,2,1,0.777507,0.738252,0.779512,0.804419,...,2.426544e-06,1.374706e-06,1.5e-05,0.0003162813,4e-06,0.014214,0.011376,0.018978,0.020291,0
1,0,1,2,2,2,2,0.806256,0.765147,0.761153,0.838021,...,2.757563e-07,4.02652e-07,2.2e-05,9.8286e-07,0.0,0.010748,0.0,0.009473,0.010006,0
2,0,2,2,0,2,2,0.767592,0.721439,0.772834,0.777832,...,1.063529e-07,1.440847e-08,5e-06,6.626013e-05,3e-06,0.013097,0.00683,0.017065,0.016856,0
3,0,3,2,2,2,2,0.66622,0.810416,0.763971,0.785928,...,6.981461e-06,3.06558e-07,7e-06,1.199337e-06,0.0,0.009505,0.006274,0.020264,0.017981,0
4,0,4,2,2,2,2,0.774297,0.773366,0.772162,0.767017,...,3.076737e-06,1.723862e-08,6e-06,1.307199e-06,7e-06,0.004216,0.002132,0.023389,0.018477,0


In [None]:
# Save the fitted scaler for later use on test data
import pickle

# Serialise the scaler fitted on the training data to 'minmax_scaler.pkl'
with open('minmax_scaler.pkl', mode='wb') as scaler_file:
    pickle.dump(minmax_scaler, scaler_file)

print("‚úÖ Scaler saved successfully!")
# Report the first five per-feature minima and maxima seen during fitting
for bound_name, bound_values in (("Min", minmax_scaler.data_min_), ("Max", minmax_scaler.data_max_)):
    print(f"Scaler learned from training data - {bound_name}: {bound_values[:5]}")

‚úÖ Scaler saved successfully!
Scaler learned from training data - Min: [0.         0.         0.00101504 0.00540321 0.        ]
Scaler learned from training data - Max: [1.407968  1.3346131 1.3060458 1.2547286 1.3592042]


In [None]:
# Load the training labels table and preview its first rows
labels_csv = "pirate_pain_train_labels.csv"
target = pd.read_csv(labels_csv)
target.head()

Unnamed: 0,sample_index,label
0,0,no_pain
1,1,no_pain
2,2,low_pain
3,3,no_pain
4,4,no_pain


In [None]:
# Define Weights
# Inverse-frequency class weights (total samples / samples in class).
# The weights MUST be listed in class-index order (no_pain=0, low_pain=1,
# high_pain=2) so that WEIGHTS[i] belongs to integer label i. Iterating over
# np.unique() would instead yield alphabetical order (high, low, no) and attach
# each weight to the wrong class.
CLASS_ORDER = ['no_pain', 'low_pain', 'high_pain']
label_counts = target['label'].value_counts()

WEIGHTS = []
for class_idx, label in enumerate(CLASS_ORDER):
    # Labels may still be strings, or already integer-encoded if the mapping
    # cell was executed before this one; accept either representation.
    count = int(label_counts.get(label, label_counts.get(class_idx, 0)))
    if count == 0:
        raise ValueError(f"No samples found for class '{label}'; cannot compute its weight")
    print(f"Label: {label}, Count: {count}")
    WEIGHTS.append(len(target) / count)
WEIGHTS = torch.tensor(WEIGHTS, dtype=torch.float32, device=device)

Label: high_pain, Count: 56
Label: low_pain, Count: 94
Label: no_pain, Count: 511


In [None]:
# Define a mapping of pain indexes to integer labels
label_mapping = {
    'no_pain': 0,
    'low_pain': 1,
    'high_pain': 2
}

# Map pain indexes to integers. Values that are not keys of the mapping
# (e.g. labels already converted by a previous run of this cell) are left
# untouched, so re-running the cell no longer turns every label into NaN.
target['label'] = target['label'].map(lambda value: label_mapping.get(value, value))

## 🔄 **Data Preprocessing**

In [None]:
# Get unique user IDs and shuffle them
unique_users = df['sample_index'].unique()
random.seed(SEED) # Ensure reproducibility of shuffling
random.shuffle(unique_users)

input_shape = df.shape
# Count the distinct classes in the 'label' column only. np.unique(target) on
# the whole DataFrame would also count every distinct sample_index value and
# therefore report far more than the real number of classes.
num_classes = target['label'].nunique()

print(f"Input shape: {input_shape}")

Input shape: (105760, 37)


In [None]:
# Define a function to build sequences from the dataset
def build_sequences(df, window=200, stride=200, feature_cols=None, labels_df=None):
    """Cut each sample's rows into fixed-length windows and attach its label.

    For every distinct 'sample_index' in `df`, the feature rows are zero-padded
    at the end up to the next multiple of `window` (no padding is added when the
    length is already a multiple) and then sliced into windows of `window` rows
    taken every `stride` rows.

    Args:
        df: DataFrame containing 'sample_index' and the feature columns.
        window: number of rows per window.
        stride: step between window starts; `window` must be divisible by it.
        feature_cols: feature column names; defaults to the module-level
            `data_cols` when omitted.
        labels_df: DataFrame with 'sample_index' and 'label' columns; defaults
            to the module-level `target` when omitted.

    Returns:
        (dataset, labels): numpy arrays holding the windows and, for each
        window, the label of the sample it was cut from.

    Raises:
        IndexError: if a sample_index has no row in the labels table.
    """
    # Sanity check to ensure the window is divisible by the stride
    assert window % stride == 0

    cols = data_cols if feature_cols is None else feature_cols
    labels_table = target if labels_df is None else labels_df

    # Initialise lists to store sequences and their corresponding labels
    dataset = []
    labels = []

    # Iterate over unique IDs in the DataFrame
    for sample_id in df['sample_index'].unique():
        # Extract sensor data for the current ID
        temp = df[df['sample_index'] == sample_id][cols].values

        # Retrieve the label for the current ID
        label = labels_table[labels_table['sample_index'] == sample_id]['label'].values[0]

        # Rows missing to reach a full window. The outer modulo makes this 0
        # (instead of `window`) when the length is already a multiple, so no
        # spurious all-zero window is emitted.
        padding_len = (window - len(temp) % window) % window
        if padding_len:
            padding = np.zeros((padding_len, len(cols)), dtype='float32')
            temp = np.concatenate((temp, padding))

        # Build feature windows and associate them with labels
        for idx in range(0, len(temp) - window + 1, stride):
            dataset.append(temp[idx:idx + window])
            labels.append(label)

    # Convert lists to numpy arrays for further processing
    dataset = np.array(dataset)
    labels = np.array(labels)

    return dataset, labels

def build_test_sequences(df, window=200, stride=200, feature_cols=None):
    """Cut each sample's rows into fixed-length windows (no labels).

    For every distinct 'sample_index' in `df`, the feature rows are zero-padded
    at the end up to the next multiple of `window` (no padding is added when the
    length is already a multiple) and then sliced into windows of `window` rows
    taken every `stride` rows.

    Args:
        df: DataFrame containing 'sample_index' and the feature columns.
        window: number of rows per window.
        stride: step between window starts; `window` must be divisible by it.
        feature_cols: feature column names; defaults to the module-level
            `data_cols` when omitted.

    Returns:
        numpy array holding all windows, in order of first appearance of each
        sample_index.
    """
    # Sanity check to ensure the window is divisible by the stride
    assert window % stride == 0

    cols = data_cols if feature_cols is None else feature_cols

    # Initialise the list that collects the windows
    dataset = []

    # Iterate over unique IDs in the DataFrame
    for sample_id in df['sample_index'].unique():
        # Extract sensor data for the current ID
        temp = df[df['sample_index'] == sample_id][cols].values

        # Rows missing to reach a full window. The outer modulo makes this 0
        # (instead of `window`) when the length is already a multiple, so no
        # spurious all-zero window is emitted.
        padding_len = (window - len(temp) % window) % window
        if padding_len:
            padding = np.zeros((padding_len, len(cols)), dtype='float32')
            temp = np.concatenate((temp, padding))

        # Build feature windows
        for idx in range(0, len(temp) - window + 1, stride):
            dataset.append(temp[idx:idx + window])

    # Convert lists to numpy arrays for further processing
    dataset = np.array(dataset)

    return dataset

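**Nota / Note:** i joint che risultano diversi tra train e test sono quelli da 13 a 17 e da 19 a 25.
The joints that differ between train and test are 13 to 17 and 19 to 25.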

## 🔬 Valutazione strategie di sequenziamento (windowing) / Sequencing strategy evaluation

Questa sezione aggiunge funzioni per confrontare diversi schemi di finestratura (window/stride/labeling/padding) usando GroupKFold e macro-F1.  
This section adds functions to compare different windowing schemes (window/stride/labeling/padding) with GroupKFold and macro-F1.

> Nota/Note: le celle **non** vengono eseguite automaticamente. Esegui in ordine dall'alto verso il basso dopo aver caricato `df_train` e (se necessario) `target`.

In [None]:
# --- Imports & setup for the evaluation harness ---
import numpy as np
import pandas as pd

from sklearn.model_selection import GroupKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# Seed NumPy's global RNG so that harness runs are repeatable
np.random.seed(42)

# String pain label -> integer class id (applied when labels arrive as strings)
LABEL_MAP = dict(no_pain=0, low_pain=1, high_pain=2)

def _detect_joint_cols(df):
    """Return the column names of `df` that start with 'joint_', sorted ascending."""
    joint_names = filter(lambda name: name.startswith("joint_"), df.columns)
    return sorted(joint_names)

def _get_data_cols(df):
    """Return the sorted 'joint_*' columns of `df`.

    Raises:
        ValueError: if `df` has no column whose name starts with 'joint_'.
    """
    joint_names = _detect_joint_cols(df)
    if len(joint_names) == 0:
        raise ValueError("Nessuna colonna 'joint_*' trovata in df. / No 'joint_*' columns found in df.")
    return joint_names

# Load the labels table only when no `target` variable exists yet
if "target" not in globals():
    try:
        target = pd.read_csv("pirate_pain_train_labels.csv")
    except FileNotFoundError:
        print("Avviso/Warning: 'target' non definito e file 'pirate_pain_train_labels.csv' non trovato.")
    else:
        # Convert string labels to integers; values missing from LABEL_MAP are kept as-is
        has_string_labels = "label" in target.columns and target["label"].dtype == object
        if has_string_labels:
            target["label"] = target["label"].map(lambda raw: LABEL_MAP.get(raw, raw))

In [None]:
# --- Window builder ---
def build_windows(
    df: pd.DataFrame,
    target: pd.DataFrame,
    window: int = 300,
    stride: int = 75,
    labeling: str = "id",       # currently supports: 'id'
    padding: str = "zero",      # 'zero' or 'drop_last'
    feature: str = "flatten",   # 'flatten' (simple baseline)
    data_cols: list | None = None,
):
    """
    Costruisce finestre scorrevoli a partire da df e restituisce (X, y, groups).
    Builds sliding windows from df and returns (X, y, groups).
    """
    if data_cols is None:
        data_cols = _get_data_cols(df)
    X, y, groups = [], [], []
    for sid in df["sample_index"].unique():
        temp = df[df["sample_index"] == sid][data_cols].values
        # get label for this id
        lab_arr = target[target["sample_index"] == sid]["label"].values
        if len(lab_arr) == 0:
            # if missing label, skip this id
            continue
        lab = lab_arr[0]
        if isinstance(lab, str):
            lab = LABEL_MAP.get(lab, lab)
        # padding computation
        pad = (window - (len(temp) % window)) % window
        if padding == "zero" and pad:
            temp = np.concatenate([temp, np.zeros((pad, temp.shape[1]), dtype=temp.dtype)], axis=0)
        L = len(temp)
        start = 0
        while start + window <= L:
            seg = temp[start:start + window]
            if feature == "flatten":
                feat = seg.reshape(-1)
            else:
                feat = seg.reshape(-1)  # default fallback
            X.append(feat)
            y.append(lab)
            groups.append(sid)
            start += stride
    if not X:
        raise ValueError("Nessuna finestra generata: controlla window/stride e la presenza di colonne joint_.")
    return np.asarray(X), np.asarray(y), np.asarray(groups)

In [None]:
# --- Strategy evaluator ---
def eval_strategy(df: pd.DataFrame, target: pd.DataFrame, params: dict, n_splits: int = 5):
    """Score one windowing configuration with grouped cross-validation.

    Windows are built with build_windows(df, target, **params). For each
    GroupKFold split (grouped by the `groups` array returned by build_windows),
    a class-balanced LogisticRegression is fitted on the training windows and
    the macro-averaged F1 is computed on the held-out windows.

    Returns:
        numpy array with one macro-F1 score per fold.
    """
    X, y, groups = build_windows(df, target, **params)
    splitter = GroupKFold(n_splits=n_splits)

    def _fold_score(train_idx, test_idx):
        # Fit a fresh classifier on this fold and score its held-out predictions
        model = LogisticRegression(max_iter=1000, class_weight="balanced", n_jobs=None)
        model.fit(X[train_idx], y[train_idx])
        return f1_score(y[test_idx], model.predict(X[test_idx]), average="macro")

    fold_scores = [_fold_score(train_idx, test_idx) for train_idx, test_idx in splitter.split(X, y, groups)]
    return np.array(fold_scores)

In [None]:
# --- Example grid & runner (not executed automatically) ---
# This cell needs `df_train` to be defined beforehand.
try:
    _ = df_train
except NameError:
    print("Definisci/Load 'df_train' prima di eseguire questa cella. / Please define 'df_train' first.")
else:
    # Three windowing configurations: (window, stride, padding)
    grid = [
        {"window": win, "stride": step, "labeling": "id", "padding": pad_mode}
        for win, step, pad_mode in [(128, 32, "zero"), (256, 64, "zero"), (300, 75, "drop_last")]
    ]

    # Keyed by the printed form of the parameters; holds either
    # (mean, std, fold_scores) or an error message for that configuration
    results = {}
    for config in grid:
        key = str(config)
        try:
            fold_scores = eval_strategy(df_train, target, config, n_splits=5)
            results[key] = (fold_scores.mean(), fold_scores.std(), fold_scores.tolist())
        except Exception as err:
            results[key] = f"Errore/Error: {err}"

    print("Risultati / Results (mean, std, fold_scores):")
    for key, outcome in results.items():
        print(key, "->", outcome)

Risultati / Results (mean, std, fold_scores):
{'window': 128, 'stride': 32, 'labeling': 'id', 'padding': 'zero'} -> (np.float64(0.535554626658966), np.float64(0.030318517367946034), [0.5575568481135713, 0.4784556964586573, 0.5640875456957871, 0.5360074084401983, 0.5416656345866163])
{'window': 256, 'stride': 64, 'labeling': 'id', 'padding': 'zero'} -> (np.float64(0.5744914366114311), np.float64(0.05140432771327308), [0.6315323565323566, 0.528096416254311, 0.6086213303604607, 0.4990065786568944, 0.6052005012531328])
{'window': 300, 'stride': 75, 'labeling': 'id', 'padding': 'drop_last'} -> Errore/Error: Nessuna finestra generata: controlla window/stride e la presenza di colonne joint_.


{'window': 256, 'stride': 64, 'labeling': 'id', 'padding': 'zero'} -> (np.float64(0.5744914366114311), np.float64(0.05140432771327308), [0.6315323565323566, 0.528096416254311, 0.6086213303604607, 0.4990065786568944, 0.6052005012531328])

Questo è il migliore

## 🧭 2D Grid Sweep: `window` × `stride` (step 10, 10→600) — due conti separati per `padding`

Questa cella valuta **tutte** le combinazioni di `window` e `stride` nell'intervallo 10..600 (passo 10),
eseguendo **due conti separati**: `padding="zero"` e `padding="drop_last"`.

> Eseguila **dopo** aver definito `df_train`, `target`, `build_windows` ed `eval_strategy`.
> I risultati vengono salvati come CSV: `grid_window_stride_results_padding_zero.csv`, `grid_window_stride_results_padding_drop_last.csv`.

In [None]:
# ===============================
# 2D GRID SWEEP: window & stride
# ===============================
import numpy as np
import pandas as pd

# ---- Grid configuration ----
WINDOWS = [10 * k for k in range(1, 61)]   # 10..600 in steps of 10 (0 excluded because it is invalid)
STRIDES = [10 * k for k in range(1, 61)]
PADDING_RUNS = ["zero", "drop_last"]  # two separate runs, one per padding mode
N_SPLITS_MAX = 5                      # upper bound on the number of GroupKFold folds

# To force the feature set, assign DATA_COLS (otherwise build_windows auto-detects the columns)
# DATA_COLS = ['has_prosthetics'] + [c for c in df_train.columns if c.startswith('joint_')]
DATA_COLS = None

def _effective_splits(df, n_splits_max=N_SPLITS_MAX):
    """Return the fold count to use: the number of distinct 'sample_index'
    values in `df`, capped at `n_splits_max` and never lower than 2."""
    group_count = int(df["sample_index"].nunique())
    capped = n_splits_max if n_splits_max < group_count else group_count
    return capped if capped > 2 else 2

def run_grid(df, target, padding="zero", data_cols=DATA_COLS, n_splits_max=N_SPLITS_MAX):
    """Evaluate every (window, stride) pair of WINDOWS x STRIDES with the given padding.

    Each pair is scored through eval_strategy; a pair that raises is recorded
    with NaN scores and the error text instead of aborting the sweep.

    Returns:
        DataFrame with one row per pair: window, stride, padding, mean_macroF1,
        std, plus an 'error' entry on rows whose evaluation failed.

    Raises:
        RuntimeError: if eval_strategy is not defined in the module namespace.
    """
    if "eval_strategy" not in globals():
        raise RuntimeError("Serve eval_strategy()/build_windows(). Esegui prima le celle dell'harness.")
    n_splits = _effective_splits(df, n_splits_max)
    records = []
    for w, s in ((w, s) for w in WINDOWS for s in STRIDES):
        record = {"window": w, "stride": s, "padding": padding}
        params = {"window": w, "stride": s, "labeling": "id", "padding": padding}
        # Forward an explicit feature list only when one was requested
        if data_cols is not None:
            params["data_cols"] = data_cols
        try:
            scores = eval_strategy(df, target, params, n_splits=n_splits)
            record.update({"mean_macroF1": float(scores.mean()), "std": float(scores.std())})
        except Exception as err:
            record.update({"mean_macroF1": np.nan, "std": np.nan, "error": str(err)})
        records.append(record)
    return pd.DataFrame(records)

# ---- Pre-flight check ----
# Names that must exist in the notebook namespace before the sweep can run
needed = ["df_train", "target", "build_windows", "eval_strategy"]
missing = [n for n in needed if n not in globals()]
if missing:
    print("‚ö†Ô∏è Mancano variabili/funzioni:", missing)
    print("Esegui le celle che definiscono df_train/target/build_windows/eval_strategy e riprova.")
else:
    # ---- Run the two separate sweeps (one per padding mode) ----
    all_results = []
    for pad in PADDING_RUNS:
        print(f"\n>>> Running 2D grid with padding = {pad}")
        df_res = run_grid(df_train, target, padding=pad, data_cols=DATA_COLS, n_splits_max=N_SPLITS_MAX)
        all_results.append(df_res)
        # Persist the full table for this padding mode before summarising it
        out_csv = f"grid_window_stride_results_padding_{pad}.csv"
        df_res.to_csv(out_csv, index=False)
        # Show the 15 best combinations
        try:
            top = (df_res.dropna(subset=["mean_macroF1"])  # noqa
                          .sort_values("mean_macroF1", ascending=False)
                          .head(15))
            print("Top-15:")
            print(top.to_string(index=False))
        except Exception as exc:
            # The summary is best-effort, but report why it failed instead of
            # hiding the problem; the CSV above has already been written.
            print(f"Could not display Top-15 for padding = {pad}: {exc}")

    # ---- Merge both runs and show the overall best ----
    results = pd.concat(all_results, ignore_index=True)
    print("\n=== BEST OVERALL (top 20 su entrambi i padding) ===")
    print(results.dropna(subset=["mean_macroF1"])  # noqa
                 .sort_values("mean_macroF1", ascending=False)
                 .head(20)
                 .to_string(index=False))


>>> Running 2D grid with padding = zero


KeyboardInterrupt: 