# Pre-Processing Pipeline for Character Trajectories Dataset  

## Overview

This notebook contains the code for importing the raw CharacterTrajectories dataset and preparing it for use with a torchcde, LSTM, and RNN implementation.

In [68]:
import os
from sktime.datasets import load_from_tsfile_to_dataframe
import numpy as np
import torch

# Load the data
train_X, _ = load_from_tsfile_to_dataframe('../../data/raw//char_traj/CharacterTrajectories_TRAIN.ts')
test_X, _ = load_from_tsfile_to_dataframe('../../data/raw/char_traj/CharacterTrajectories_TEST.ts')

# Convert to NumPy arrays
train_X = train_X.to_numpy()
test_X = test_X.to_numpy()

# Concatenate the data to form a single dataset
X = np.concatenate((train_X, test_X), axis=0)   # (batch, channel)

In [69]:
def _pad(channel, maxlen):
    channel = torch.tensor(channel)
    mean_value = torch.mean(channel)
    out = torch.full((maxlen,), mean_value, dtype=channel.dtype)
    out[:channel.size(0)] = channel
    return out

# Pad all data to same size with zeros. To be corrected for torchcde
lengths = torch.tensor([len(Xi[0]) for Xi in X])
maxlen = lengths.max()

X = torch.stack([torch.stack([_pad(channel, maxlen) for channel in batch], dim=0) for batch in X], dim=0)
X = X.transpose(-1, -2)

In [70]:
# Normalize the data
mean = X.mean(dim=1, keepdim=True)
std = X.std(dim=1, keepdim=True)
X = (X - mean) / std

# Replace near zero values with zero
X[torch.abs(X) < 1e-5] = 0

In [71]:
# Add time as a channel
times = [np.linspace(0, lengths[i] - 1, lengths[i]) for i in range(len(X))]
for i, time in enumerate(times):
    padding = np.zeros(maxlen - len(time))
    times[i] = np.concatenate((time, padding))

times = torch.tensor(times)

X = torch.cat((times.unsqueeze(-1), X), dim=-1)

In [72]:
print(X[0])

tensor([[ 0.0000e+00, -6.0273e-01,  6.1764e-03,  1.5548e+00],
        [ 1.0000e+00, -6.8498e-01,  2.5516e-02,  1.9182e+00],
        [ 2.0000e+00, -7.6415e-01,  2.7903e-02,  2.1482e+00],
        [ 3.0000e+00, -8.3531e-01,  1.6717e-02,  2.2173e+00],
        [ 4.0000e+00, -9.0040e-01, -2.9051e-04,  2.1530e+00],
        [ 5.0000e+00, -9.6552e-01, -1.7984e-02,  2.0091e+00],
        [ 6.0000e+00, -1.0365e+00, -3.6704e-02,  1.8351e+00],
        [ 7.0000e+00, -1.1162e+00, -6.0383e-02,  1.6671e+00],
        [ 8.0000e+00, -1.2042e+00, -9.3796e-02,  1.5287e+00],
        [ 9.0000e+00, -1.2988e+00, -1.4115e-01,  1.4239e+00],
        [ 1.0000e+01, -1.3981e+00, -2.0527e-01,  1.3334e+00],
        [ 1.1000e+01, -1.5006e+00, -2.8706e-01,  1.2327e+00],
        [ 1.2000e+01, -1.6057e+00, -3.8626e-01,  1.1181e+00],
        [ 1.3000e+01, -1.7124e+00, -5.0288e-01,  1.0051e+00],
        [ 1.4000e+01, -1.8189e+00, -6.3724e-01,  9.0842e-01],
        [ 1.5000e+01, -1.9206e+00, -7.8799e-01,  8.3640e-01],
        

In [37]:
def create_labels(X, lengths):
    '''
    Create labels for the data. The data is the first
    2/3 of each sequence and the labels are the last 1/3.
    Sequences are padded with zeros to be the same length.

    Parameters
    ----------
    X : torch.Tensor
        The data of shape (batch, seq_length, channels).
    lengths : torch.Tensor
        The lengths of the sequences.

    Returns
    -------
    data : torch.Tensor
        The data of shape (batch, seq_length, channels).
    labels : torch.Tensor
        The labels of shape (batch, seq_length, channels).
    '''
    data = torch.zeros(X.size(0), X.size(1), X.size(2))
    labels = torch.zeros(X.size(0), X.size(1), X.size(2))

    for i in range(X.size(0)):
        # Pad datapoint to size 182 with zeros
        datapoint = X[i, :lengths[i] * 2 // 3]
        padding = torch.zeros(X.size(1) - len(datapoint), X.size(2))
        datapoint = torch.cat((datapoint, padding))

        # Pad label to size 182 with zeros
        label = X[i, lengths[i] * 2 // 3:]
        padding = torch.zeros(X.size(1) - len(label), X.size(2))
        label = torch.cat((label, padding))
        
        data[i] = datapoint
        labels[i] = label

    return data, labels

data, labels = create_labels(X, lengths)

In [428]:
# Split the data into a 80-20 train-test split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Save the data
path = '../../data/processed/CharacterTrajectories/classification'
os.makedirs(path, exist_ok=True)
torch.save(X_train, os.path.join(path, 'X_train.pt'))
torch.save(y_train, os.path.join(path, 'y_train.pt'))
torch.save(X_test, os.path.join(path, 'X_test.pt'))
torch.save(y_test, os.path.join(path, 'y_test.pt'))

In [373]:
def insert_random_missingness(X, y, missing_rate):
    '''
    Insert random missingness into the data.
    Only the first 120 sequence elements will be affected (avg length of 120).

    Parameters
    ----------
    X : np.array
        The input data. Shape (n_samples, sequence_length, n_features)
    y : np.array
        The target data. Shape (n_samples,)
    missing_rate : float
        The rate of missingness to insert in sequences of the samples. 
        Each sample in X will have this proportion of its sequence (each feature)
        set to NaN. 
    '''
    n_samples, _, _ = X.shape

    # Generate a mask of missingness 
    mask = np.random.rand(n_samples, 120) < missing_rate

    print(mask.shape)

    # Set the missing values to NaN
    X[mask] = np.nan

insert_random_missingness(X, y, 0.1)

(2858, 120)


IndexError: The shape of the mask [2858, 120] at index 1 does not match the shape of the indexed tensor [2858, 182, 4] at index 1