# Load, inspect, and preprocess data (both classification & forecasting)

# Classification

## Initial Setup

In [22]:
import os
import torch
from torch.utils.data import TensorDataset, DataLoader

## Data Loading

Loading .pt files from data/classification/epilepsy and data/classification/sleepEEG

In [20]:
def load_pt(dataset_dir):
    '''
    Loads train, validation and test sets form the data/classification/ files
    Args: 
        dataset_dir (str): Path to dataset directory
    Returns: 
        dict: dictionary with keys 'train', 'val' and 'test', each mapping to 'data' and 'label' dictionary
    '''
    # print("Current Working Directory:", os.getcwd())
    data_dict = {}
    for split_file in ['train', 'val', 'test']:
        file_path = os.path.join(dataset_dir, f"{split_file}.pt")
        print("File Path Found: ", file_path)
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"{file_path} not found!")
        data_dict[split_file] = torch.load(file_path)
    return data_dict

epilepsy_dir = "../data/classification/epilepsy/"
sleepeeg_dir = "../data/classification/sleepEEG/"

def load_classification_dataset(epilepsy_dir, sleepeeg_dir):
    print("\nLoading the Epilepsy data set from: ", epilepsy_dir)
    epilepsy_dir = load_pt(epilepsy_dir)
    print("\nLoading the SleepEEG data set from: ", sleepeeg_dir)
    sleepeeg_dir = load_pt(sleepeeg_dir)
    print("\n")

load_classification_dataset(epilepsy_dir, sleepeeg_dir)


Loading the Epilepsy data set from:  ../data/classification/epilepsy/
File Path Found:  ../data/classification/epilepsy/train.pt
File Path Found:  ../data/classification/epilepsy/val.pt
File Path Found:  ../data/classification/epilepsy/test.pt

Loading the SleepEEG data set from:  ../data/classification/sleepEEG/
File Path Found:  ../data/classification/sleepEEG/train.pt
File Path Found:  ../data/classification/sleepEEG/val.pt
File Path Found:  ../data/classification/sleepEEG/test.pt




  data_dict[split_file] = torch.load(file_path)


### Processing Steps

1. Normalization

In [40]:
def normalize_data(X):
    '''
    Normalizes each sample in a batch of time serise data
    Args:
        X (torch.Tensor): Input tensor of shape (num_samples, sequence_length, channels)
    Returns:
        torch.Tensor: Normalized data tensor of same shape
    
    '''
    print("Normalizing the Data...")
    mean = X.mean(dim = 1, keepdim = True)
    std = X.std(dim = 1, keepdim = True)

    X_normalized = (X - mean) / (std + 1e-8)

    print(f"Data Normalized!:\n {X_normalized}, \nSample:" )
    print(X_normalized[0])

    return X_normalized

X = torch.randn(100, 10, 3)                               # num_samples=5, sequence_length=10, channels=3)
normalized_X = normalize_data(X)

Normalizing the Data...
Data Normalized!:
 tensor([[[-0.1485, -0.9279,  0.3021],
         [ 1.5872, -0.5958, -0.5852],
         [-0.5903,  0.8131,  0.4004],
         ...,
         [-0.4318,  0.9880,  1.0550],
         [ 0.8186,  0.0105,  0.2576],
         [-0.9088,  0.2840, -1.3327]],

        [[-1.5043, -1.3305, -0.1704],
         [-0.4109,  0.3547, -0.9528],
         [-0.4673,  2.0046, -0.7337],
         ...,
         [ 1.4402, -0.3421, -0.8243],
         [ 0.1561,  0.9028,  1.1769],
         [ 0.8439, -0.7454,  0.9178]],

        [[ 0.6569,  1.8598, -0.8483],
         [ 0.6120, -0.1908,  0.8278],
         [-0.2943,  1.0893,  0.9352],
         ...,
         [-0.3682, -1.8264, -0.6037],
         [-1.5373, -0.1024, -0.9526],
         [-0.3752, -0.3387, -1.0702]],

        ...,

        [[ 0.5969,  0.3656,  0.7539],
         [ 1.2546,  1.4022, -0.9900],
         [ 1.3200, -0.0459,  1.2897],
         ...,
         [-0.9968,  0.3001, -1.9298],
         [ 0.6790,  1.3201,  0.2767],
       

In [36]:
def dataloader(X, y, batch_size = 32, shuffle = True):
    '''
    Creates a DataLoader from input data and labels to help train on smaller batches
    Args: 
        X(torch.Tensor): Data tensor of shape (num_samples, sequence_length, channels)
        y (torch.Tensor): Label rensor of shape (num_samples)
        batch_size (int): Batch size
        shuffle (bool): To shuffle the data
    
    '''
    print("Creating DataLoader...")
    assert X.size(0) == y.size(0), f"Size mismatch! X has {X.size(0)} samples, y has {y.size(0)} labels"
    dataset = TensorDataset(X, y)
    loader = DataLoader(dataset, batch_size = batch_size, shuffle = shuffle)
    print(f"Data Loader Created with {len(loader)} batches.")
    return loader
y = torch.randint(0, 3, (100,))
loader = dataloader(X, y, batch_size = 16)

Creating DataLoader...
Data Loader Created with 7 batches.


# Forecasting