## ‚öôÔ∏è **Libraries Import**

In [1]:
# Set seed for reproducibility
SEED = 42

# Import necessary libraries
import os

# Set environment variables before importing modules
os.environ['PYTHONHASHSEED'] = str(SEED)
os.environ['MPLCONFIGDIR'] = os.getcwd() + '/configs/'

# Suppress warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=Warning)

# Import necessary modules
import random
import numpy as np

# Set seeds for random number generators in NumPy and Python
np.random.seed(SEED)
random.seed(SEED)

# Import PyTorch
import torch
torch.manual_seed(SEED)
from torch import nn
from torch.utils.data import DataLoader

if torch.cuda.is_available():
    device = torch.device("cuda")
    torch.cuda.manual_seed_all(SEED)
    torch.backends.cudnn.benchmark = True
else:
    device = torch.device("cpu")

print(f"PyTorch version: {torch.__version__}")
print(f"Device: {device}")

# Import other libraries
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score

PyTorch version: 2.8.0+cpu
Device: cpu


## ‚è≥ **Data Loading**

In [2]:
df = pd.read_csv("pirate_pain_train.csv")
df_test = pd.read_csv("pirate_pain_test.csv")
df = df.drop(columns=['joint_30'])
df_test = df_test.drop(columns=['joint_30'])

print("Training data shape:", df.shape)

Training data shape: (105760, 39)


In [3]:
# Create binary 'has_prosthetics' feature (0 = all natural, 1 = has prosthetics)
print("Creating consolidated feature: 'has_prosthetics'")
print("=" * 60)

# Create the new feature
df['has_prosthetics'] = (df['n_legs'] != 'two').astype(int)
df_test['has_prosthetics'] = (df_test['n_legs'] != 'two').astype(int)

# Show the mapping
print("\nMapping:")
print("  has_prosthetics = 0 ‚Üí All natural body parts (two legs, two hands, two eyes)")
print("  has_prosthetics = 1 ‚Üí Has prosthetics (peg leg, hook hand, eye patch)")

# Show distribution
print("\n" + "=" * 60)
print("Distribution of new feature:")
print("=" * 60)
print("\nTraining set:")
train_dist = df['has_prosthetics'].value_counts().sort_index()
for value, count in train_dist.items():
    label = "Natural" if value == 0 else "Prosthetics"
    pct = (count / len(df)) * 100
    print(f"  {value} ({label:12s}): {count:6,} samples ({pct:.2f}%)")

print("\nTest set:")
test_dist = df_test['has_prosthetics'].value_counts().sort_index()
for value, count in test_dist.items():
    label = "Natural" if value == 0 else "Prosthetics"
    pct = (count / len(df_test)) * 100
    print(f"  {value} ({label:12s}): {count:6,} samples ({pct:.2f}%)")


# Columns to drop
cols_to_drop = ['n_legs', 'n_hands', 'n_eyes', 
                'n_legs_encoded', 'n_hands_encoded', 'n_eyes_encoded']

# Drop from both train and test
df = df.drop(columns=[col for col in cols_to_drop if col in df.columns])
df_test = df_test.drop(columns=[col for col in cols_to_drop if col in df_test.columns])

print("\nFeature created successfully!")

Creating consolidated feature: 'has_prosthetics'

Mapping:
  has_prosthetics = 0 ‚Üí All natural body parts (two legs, two hands, two eyes)
  has_prosthetics = 1 ‚Üí Has prosthetics (peg leg, hook hand, eye patch)

Distribution of new feature:

Training set:
  0 (Natural     ): 104,800 samples (99.09%)
  1 (Prosthetics ):    960 samples (0.91%)

Test set:
  0 (Natural     ): 209,760 samples (99.02%)
  1 (Prosthetics ):  2,080 samples (0.98%)

Feature created successfully!


In [4]:
from sklearn.preprocessing import MinMaxScaler

# List of joint columns to normalize
joint_cols = ["joint_" + str(i).zfill(2) for i in range(30)]

for col in joint_cols:
  df[col] = df[col].astype(np.float32)

# Initialize the MinMaxScaler
minmax_scaler = MinMaxScaler()

# Apply Min-Max normalization to the joint columns
df[joint_cols] = minmax_scaler.fit_transform(df[joint_cols])

data_cols = ['has_prosthetics'] + joint_cols

display(df.head())

Unnamed: 0,sample_index,time,pain_survey_1,pain_survey_2,pain_survey_3,pain_survey_4,joint_00,joint_01,joint_02,joint_03,...,joint_21,joint_22,joint_23,joint_24,joint_25,joint_26,joint_27,joint_28,joint_29,has_prosthetics
0,0,0,2,0,2,1,0.777507,0.738252,0.779512,0.804419,...,2.426544e-06,1.374706e-06,1.5e-05,0.0003162813,4e-06,0.014214,0.011376,0.018978,0.020291,0
1,0,1,2,2,2,2,0.806256,0.765147,0.761153,0.838021,...,2.757563e-07,4.02652e-07,2.2e-05,9.8286e-07,0.0,0.010748,0.0,0.009473,0.010006,0
2,0,2,2,0,2,2,0.767592,0.721439,0.772834,0.777832,...,1.063529e-07,1.440847e-08,5e-06,6.626013e-05,3e-06,0.013097,0.00683,0.017065,0.016856,0
3,0,3,2,2,2,2,0.66622,0.810416,0.763971,0.785928,...,6.981461e-06,3.06558e-07,7e-06,1.199337e-06,0.0,0.009505,0.006274,0.020264,0.017981,0
4,0,4,2,2,2,2,0.774297,0.773366,0.772162,0.767017,...,3.076737e-06,1.723862e-08,6e-06,1.307199e-06,7e-06,0.004216,0.002132,0.023389,0.018477,0


In [5]:
# Save the fitted scaler for later use on test data
import pickle

# Save the scaler that was fitted on training data
with open('minmax_scaler.pkl', 'wb') as f:
    pickle.dump(minmax_scaler, f)

print("‚úÖ Scaler saved successfully!")
print(f"Scaler learned from training data - Min: {minmax_scaler.data_min_[:5]}")
print(f"Scaler learned from training data - Max: {minmax_scaler.data_max_[:5]}")

‚úÖ Scaler saved successfully!
Scaler learned from training data - Min: [0.         0.         0.00101504 0.00540321 0.        ]
Scaler learned from training data - Max: [1.407968  1.3346131 1.3060458 1.2547286 1.3592042]


In [6]:
target = pd.read_csv("pirate_pain_train_labels.csv")
target.head()

Unnamed: 0,sample_index,label
0,0,no_pain
1,1,no_pain
2,2,low_pain
3,3,no_pain
4,4,no_pain


In [7]:
# Define Weights
WEIGHTS = []
for label in np.unique(target['label']):
    print(f"Label: {label}, Count: {len(target[target['label'] == label])}")
    WEIGHTS.append(len(target) / len(target[target['label'] == label]))
WEIGHTS = torch.Tensor(WEIGHTS).to(device)

Label: high_pain, Count: 56
Label: low_pain, Count: 94
Label: no_pain, Count: 511


In [8]:
# Define a mapping of pain indexes to integer labels
label_mapping = {
    'no_pain': 0,
    'low_pain': 1,
    'high_pain': 2
}

# Map pain indexes to integers
target['label'] = target['label'].map(label_mapping)

## üîÑ **Data Preprocessing**

In [9]:
# Get unique user IDs and shuffle them
unique_users = df['sample_index'].unique()
random.seed(SEED) # Ensure reproducibility of shuffling
random.shuffle(unique_users)

input_shape = df.shape
num_classes = len(np.unique(target))

print(f"Input shape: {input_shape}")

Input shape: (105760, 37)


In [10]:
# # Define a function to build sequences from the dataset
# def build_sequences(df, window=200, stride=200):
#     # Sanity check to ensure the window is divisible by the stride
#     assert window % stride == 0

#     # Initialise lists to store sequences and their corresponding labels
#     dataset = []
#     labels = []

#     # Iterate over unique IDs in the DataFrame
#     for id in df['sample_index'].unique():
#         # Extract sensor data for the current ID
#         temp = df[df['sample_index'] == id][data_cols].values

#         # Retrieve the activity label for the current ID
#         label = target[target['sample_index'] == id]['label'].values[0]

#         # Calculate padding length to ensure full windows
#         padding_len = window - len(temp) % window

#         # Create zero padding and concatenate with the data
#         padding = np.zeros((padding_len, len(data_cols)), dtype='float32')
#         temp = np.concatenate((temp, padding))

#         # Build feature windows and associate them with labels
#         idx = 0
#         while idx + window <= len(temp):
#             dataset.append(temp[idx:idx + window])
#             labels.append(label)
#             idx += stride

#     # Convert lists to numpy arrays for further processing
#     dataset = np.array(dataset)
#     labels = np.array(labels)

#     return dataset, labels

# def build_test_sequences(df, window=200, stride=200):
#     # Sanity check to ensure the window is divisible by the stride
#     assert window % stride == 0

#     # Initialise lists to store sequences and their corresponding labels
#     dataset = []

#     # Iterate over unique IDs in the DataFrame
#     for id in df['sample_index'].unique():
#         # Extract sensor data for the current ID
#         temp = df[df['sample_index'] == id][data_cols].values

#         # Calculate padding length to ensure full windows
#         padding_len = window - len(temp) % window

#         # Create zero padding and concatenate with the data
#         padding = np.zeros((padding_len, len(data_cols)), dtype='float32')
#         temp = np.concatenate((temp, padding))

#         # Build feature windows
#         idx = 0
#         while idx + window <= len(temp):
#             dataset.append(temp[idx:idx + window])
#             idx += stride

#     # Convert lists to numpy arrays for further processing
#     dataset = np.array(dataset)

#     return dataset

In [11]:
from sklearn.model_selection import train_test_split

# Split the unique users into train and validation sets (80/20 split)
train_users, val_users = train_test_split(unique_users, test_size=0.2, random_state=SEED)

# Create train and validation dataframes
df_train = df[df['sample_index'].isin(train_users)]
df_val = df[df['sample_index'].isin(val_users)]

# Create train and validation labels
target_train = target[target['sample_index'].isin(train_users)]
target_val = target[target['sample_index'].isin(val_users)]

# Save the splits to CSV files
df_train.to_csv('data/X_train.csv', index=False)
df_val.to_csv('data/X_val.csv', index=False)
target_train.to_csv('data/Y_train.csv', index=False)
target_val.to_csv('data/Y_val.csv', index=False)

joint diversi tra train e test
da 13 a 17 da 19 a 25

# Optional: Run grid sweep to find out best window/stride combination

In [12]:
# from sequencing_grid import run_sequencing_grid
# grid_results = run_sequencing_grid(df, target, data_cols=data_cols, n_splits_max=5, windows=[100, 200, 300], strides=[50, 100, 150])