# Pre-Processing Pipeline for Character Trajectories Dataset  

## Overview

This notebook contains the code for importing the raw CharacterTrajectories dataset and preparing it for use with a torchcde, LSTM, and RNN implementation.

In [1]:
import os
from sktime.datasets import load_from_tsfile_to_dataframe
import numpy as np
import torch

# Load the data
train_X, train_y = load_from_tsfile_to_dataframe('../../data/raw//char_traj/CharacterTrajectories_TRAIN.ts')
test_X, test_y = load_from_tsfile_to_dataframe('../../data/raw/char_traj/CharacterTrajectories_TEST.ts')

# Convert to NumPy arrays
train_X = train_X.to_numpy()
test_X = test_X.to_numpy()

# Concatenate the data to form a single dataset
X = np.concatenate((train_X, test_X), axis=0)   # (batch, channel)
y = np.concatenate((train_y, test_y), axis=0)

# Change the labels to start from 0 and be integers
classes = np.unique(y)
y = np.array([np.where(classes == yi)[0][0] for yi in y])

In [2]:
def _pad(channel, maxlen):
    channel = torch.tensor(channel)
    mean_value = torch.mean(channel)
    out = torch.full((maxlen,), mean_value, dtype=channel.dtype)
    out[:channel.size(0)] = channel
    return out

# Pad all data to same size with zeros. To be corrected for torchcde
lengths = torch.tensor([len(Xi[0]) for Xi in X])
maxlen = lengths.max()

X = torch.stack([torch.stack([_pad(channel, maxlen) for channel in batch], dim=0) for batch in X], dim=0)
X = X.transpose(-1, -2)

In [3]:
# Normalize the data
mean = X.mean(dim=1, keepdim=True)
std = X.std(dim=1, keepdim=True)
X = (X - mean) / std

# Replace near zero values with zero
X[torch.abs(X) < 1e-5] = 0

In [4]:
# Add time as a channel
times = [np.linspace(0, lengths[i] - 1, lengths[i]) for i in range(len(X))]
for i, time in enumerate(times):
    padding = np.zeros(maxlen - len(time))
    times[i] = np.concatenate((time, padding))

times = torch.tensor(times)

X = torch.cat((times.unsqueeze(-1), X), dim=-1)

  times = torch.tensor(times)


In [5]:
def create_labels(X, lengths):
    '''
    Create labels for the data. The data is the first
    2/3 of each sequence and the labels are the last 1/3.
    Sequences are padded with zeros to be the same length.

    Parameters
    ----------
    X : torch.Tensor
        The data of shape (batch, seq_length, channels).
    lengths : torch.Tensor
        The lengths of the sequences.

    Returns
    -------
    data : torch.Tensor
        The data of shape (batch, seq_length, channels).
    labels : torch.Tensor
        The labels of shape (batch, seq_length, channels).
    '''
    data = torch.zeros(X.size(0), X.size(1) * 2 // 3, X.size(2))
    labels = torch.zeros(X.size(0), X.size(1) * 1 // 3 + 1, X.size(2))

    for i in range(X.size(0)):
        # Pad datapoint with zeros
        datapoint = X[i, :lengths[i] * 2 // 3]
        padding = torch.zeros(X.size(1) * 2 // 3 - len(datapoint), X.size(2))
        datapoint = torch.cat((datapoint, padding))

        # Pad label with zeros
        label = X[i, lengths[i] * 2 // 3:lengths[i]]
        padding = torch.zeros(X.size(1) * 1 // 3 - len(label) + 1, X.size(2))
        label = torch.cat((label, padding))
        
        data[i] = datapoint
        labels[i] = label

    return data, labels

data, labels = create_labels(X, lengths)

In [6]:
# Split the data into a 80-20 train-test split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42, stratify=y)

# Save the data
path = '../../data/processed/CharacterTrajectories/regression/'
os.makedirs(path, exist_ok=True)
torch.save(X_train, os.path.join(path, 'X_train.pt'))
torch.save(y_train, os.path.join(path, 'y_train.pt'))
torch.save(X_test, os.path.join(path, 'X_test.pt'))
torch.save(y_test, os.path.join(path, 'y_test.pt'))

In [7]:
def insert_random_missingness(X, y, missing_rate):
    '''
    Insert random missingness into the data.
    Only the first 120 sequence elements will be affected (avg length of 120).

    Parameters
    ----------
    X : np.array
        The input data. Shape (n_samples, sequence_length, n_features)
    y : np.array
        The target data. Shape (n_samples,)
    missing_rate : float
        The rate of missingness to insert in sequences of the samples. 
        Each sample in X will have this proportion of its sequence (each feature)
        set to NaN. 
    '''
    n_samples, _, _ = X.shape

    # Generate a mask of missingness 
    mask = np.random.rand(n_samples, 120) < missing_rate

    print(mask.shape)

    # Set the missing values to NaN
    X[mask] = np.nan

insert_random_missingness(X, y, 0.1)

(2858, 120)


IndexError: The shape of the mask [2858, 120] at index 1 does not match the shape of the indexed tensor [2858, 182, 4] at index 1