# Dataset Preprocessor Validation

This notebook verifies that the modified preprocessing pipeline with PyTorch Dataset support creates the correct data structures and handles batching properly with seed support.

## Key Features to Verify:
- Time series data shape `(R, l, N)` where R=sequences, l=length, N=variables
- Seed-based reproducible shuffling
- Proper PyTorch Dataset implementation
- Efficient DataLoader batching
- Dynamic seed changing


In [1]:
import sys
import os
import numpy as np
import torch
from pathlib import Path

# Make the repository importable so that `src.*` resolves in the imports below.
# NOTE(review): this relies on the kernel's working directory being one level
# below the repository root (e.g. a notebooks/ folder) - confirm for your setup.
project_root = Path().resolve().parent

# Guard the append so that re-running this cell does not keep adding duplicate
# entries to sys.path.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

print("Project root added to sys.path:", project_root)

from src.utils.preprocessing_utils import (
    TimeSeriesDataset, 
    create_dataloaders,
    preprocess_data
)

from src.utils.configs_utils import get_dataset_cfgs

print("All imports successful!")


Project root added to sys.path: C:\Users\14165\Downloads\Unified-benchmark-for-SDGFTS-main
All imports successful!


## Example 1: Basic Preprocessing with Seed Support

### Testing: 
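- AAPL: `data/raw/AAPL/AAPL.csv` (the ticker and path reported by the recorded run below; the dataset is selected by `get_dataset_cfgs()`)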

In [2]:
def print_data_shapes(train_data, valid_data, shape_labels):
    """Print the shapes of the train/valid splits with their axis labels.

    Args:
        train_data: Object exposing a ``.shape`` attribute (training split).
        valid_data: Object exposing a ``.shape`` attribute (validation split).
        shape_labels: Mapping that must contain ``'train'`` and ``'valid'``
            (text printed after each shape). The optional keys
            ``'sequence_length'``, ``'num_channels'`` and ``'total_samples'``
            each add one extra line when present.
    """
    print("Data shapes:")

    # Mandatory rows: one per split, each followed by its axis label.
    split_rows = (
        ("  Train shape:", train_data, 'train'),
        ("  Valid shape:", valid_data, 'valid'),
    )
    for prefix, split, label_key in split_rows:
        print(f"{prefix} {split.shape} {shape_labels[label_key]}")

    # Optional rows: emitted only for the keys the caller supplied.
    optional_rows = (
        ('sequence_length', "  Sequence length (l):"),
        ('num_channels', "  Number of channels (N):"),
        ('total_samples', "  Total time-series samples (R):"),
    )
    for label_key, prefix in optional_rows:
        if label_key in shape_labels:
            print(f"{prefix} {shape_labels[label_key]}")

def print_channel_statistics(train_data, valid_data, num_channels):
    """Print the min/max range and mean of every channel for both splits.

    Args:
        train_data: Array-like with ``ndim`` and slicing support; the channel
            is the last axis of a 3-D input, otherwise the second axis.
        valid_data: Same layout rules as ``train_data`` (the two splits may
            have different dimensionality).
        num_channels: Number of leading channels to report on.
    """
    def summarize(data, channel):
        # 3-D inputs are sliced on their last axis; anything else is treated
        # as 2-D with channels on the second axis.
        values = data[:, :, channel] if data.ndim == 3 else data[:, channel]
        return values.min(), values.max(), values.mean()

    print("\nLog return statistics:")
    for channel in range(num_channels):
        train_min, train_max, train_mean = summarize(train_data, channel)
        valid_min, valid_max, valid_mean = summarize(valid_data, channel)

        print(f"  Channel {channel}:")
        print(f"    Train log-return range: [{train_min:.4f}, {train_max:.4f}]")
        print(f"    Valid log-return range: [{valid_min:.4f}, {valid_max:.4f}]")
        print(f"    Train log-return mean: {train_mean:.4f}")
        print(f"    Valid log-return mean: {valid_mean:.4f}")

def test_non_parametric_preprocessing(train_data, valid_data):
    """
    Run Example 1.1: Preprocessing for Non-Parametric models
    """
    if train_data is not None and valid_data is not None:
        print(f"\nPreprocessing successful!")
        shape_labels = {
            'train': "(R_train, l, N)",
            'valid': "(R_valid, l, N)",
            'sequence_length': train_data.shape[1],
            'num_channels': train_data.shape[2],
            'total_samples': train_data.shape[0] + valid_data.shape[0],
        }
        print_data_shapes(train_data, valid_data, shape_labels)
        print_channel_statistics(train_data, valid_data, train_data.shape[2])
    else:
        print("Preprocessing failed...")

def test_parametric_preprocessing(train_data, valid_data):
    """
    Run Example 1.2: Preprocessing for Parametric models
    """
    if train_data is not None and valid_data is not None:
        print(f"\nPreprocessing successful!")
        shape_labels = {
            'train': "(l, N)",
            'valid': "(l, N)",
            'sequence_length': train_data.shape[0],
            'num_channels': train_data.shape[1],
        }
        print_data_shapes(train_data, valid_data, shape_labels)
        print_channel_statistics(train_data, valid_data, train_data.shape[1])
    else:
        print("Preprocessing failed...")

In [3]:
# Example 1 driver: fetch both dataset configurations, preprocess each one,
# and print a shape/statistics report for the resulting train/valid splits.
print(
    "=" * 60,
    "EXAMPLE 1: Preprocessing for both parametric and non-parametric models",
    "=" * 60,
    sep="\n",
)

nonparametric_dataset_cfgs, parametric_dataset_cfgs = get_dataset_cfgs()

# --- Non-parametric pipeline: reported as 3-D (R, l, N) data ---
print(f"Configuration Dataset: {nonparametric_dataset_cfgs}")
print("\nStarting preprocessing for non-parametric...")
# The third value returned by preprocess_data is not used in this notebook.
train_data_np, valid_data_np, _ = preprocess_data(nonparametric_dataset_cfgs)
test_non_parametric_preprocessing(train_data_np, valid_data_np)

# --- Parametric pipeline: reported as 2-D (l, N) data ---
print(f"Configuration Dataset: {parametric_dataset_cfgs}")
print("\nStarting preprocessing for parametric...")
train_data_para, valid_data_para, _ = preprocess_data(parametric_dataset_cfgs)
test_parametric_preprocessing(train_data_para, valid_data_para)

EXAMPLE 1: Preprocessing for both parametric and non-parametric models
Configuration Dataset: {'ticker': 'AAPL', 'original_data_path': 'C:\\Users\\14165\\Downloads\\Unified-benchmark-for-SDGFTS-main\\data\\raw\\AAPL\\AAPL.csv', 'valid_ratio': 0.1, 'test_ratio': 0.1, 'do_transformation': True, 'seed': 42}

Starting preprocessing for non-parametric...
Preprocessing data for AAPL

Preprocessing successful!
Data shapes:
  Train shape: (9035, 13, 4) (R_train, l, N)
  Valid shape: (1129, 13, 4) (R_valid, l, N)
  Sequence length (l): 13
  Number of channels (N): 4
  Total time-series samples (R): 10164

Log return statistics:
  Channel 0:
    Train log-return range: [-0.5593, 0.2362]
    Valid log-return range: [-0.0962, 0.0928]
    Train log-return mean: 0.0008
    Valid log-return mean: 0.0013
  Channel 1:
    Train log-return range: [-0.6182, 0.3275]
    Valid log-return range: [-0.1018, 0.0999]
    Train log-return mean: 0.0008
    Valid log-return mean: 0.0014
  Channel 2:
    Train log-

## Example 2: PyTorch Dataset and DataLoader Creation

Now let's create PyTorch datasets and dataloaders to verify proper batching and seed support.


In [4]:
# Example 2 driver: wrap the non-parametric arrays in TimeSeriesDataset
# objects, build DataLoaders from the same arrays, and inspect a few batches.
print(
    "\n" + "=" * 60,
    "EXAMPLE 2: PyTorch Dataset and DataLoader Creation",
    "=" * 60,
    sep="\n",
)

print("Creating TimeSeriesDataset objects...")
train_dataset = TimeSeriesDataset(train_data_np, seed=42)
valid_dataset = TimeSeriesDataset(valid_data_np, seed=42)

print("Created datasets:")
print(f"  Train dataset length: {len(train_dataset)}")
print(f"  Valid dataset length: {len(valid_dataset)}")
print(f"  Sample shape: {train_dataset[0].shape}")
print(f"  Sample dtype: {train_dataset[0].dtype}")

# Confirm that indexing the dataset yields a torch.Tensor.
sample = train_dataset[0]
print(
    f"  Sample {'is' if isinstance(sample, torch.Tensor) else 'is not'} "
    f"PyTorch tensor: {type(sample)}"
)

print("\nCreating DataLoaders...")
batch_size = 32
train_loader, valid_loader = create_dataloaders(
    train_data_np,
    valid_data_np,
    batch_size=batch_size,
    train_seed=42,
    valid_seed=42,
    num_workers=0,
    pin_memory=False,
)

print("Created dataloaders:")
print(f"  Train batches: {len(train_loader)}")
print(f"  Valid batches: {len(valid_loader)}")
print(f"  Batch size: {batch_size}")

print("\nBatch Information:")
for i, batch in enumerate(train_loader):
    print(f"Batch {i+1}: shape {batch.shape}, dtype {batch.dtype}")

    # Everything after channel 0 is reported as a "feature" channel.
    # NOTE(review): the messages below call channel 0 a time channel, while
    # the Example 1 output reports log-return statistics for channel 0 -
    # confirm which description is right.
    feature_channels = batch[:, :, 1:]
    if feature_channels.shape[-1] == 0:
        print("  No feature channels (only time present)")
    else:
        ranges = [
            f"[{feature_channels[:, :, ch].min():.4f}, {feature_channels[:, :, ch].max():.4f}]"
            for ch in range(feature_channels.shape[-1])
        ]
        print(f"  Per-feature value range (excluding time channel): {ranges}")

    # Only the first three batches are shown; the rest are summarized.
    if i >= 2:
        print(f"... and {len(train_loader) - 3} more batches")
        break

# A fresh iterator is used here, so this is the first batch of a new pass.
first_batch = next(iter(train_loader))
expected_shape = (batch_size, train_data_np.shape[1], train_data_np.shape[2])
if first_batch.shape != expected_shape:
    print(f"\nBatch shape mismatch: {first_batch.shape} != {expected_shape}")
else:
    print(f"\nBatch shapes are correct: {first_batch.shape} == {expected_shape}")



EXAMPLE 2: PyTorch Dataset and DataLoader Creation
Creating TimeSeriesDataset objects...
Created datasets:
  Train dataset length: 9035
  Valid dataset length: 1129
  Sample shape: torch.Size([13, 4])
  Sample dtype: torch.float32
  Sample is PyTorch tensor: <class 'torch.Tensor'>

Creating DataLoaders...
Created dataloaders:
  Train batches: 283
  Valid batches: 36
  Batch size: 32

Batch Information:
Batch 1: shape torch.Size([32, 13, 4]), dtype torch.float32
  Per-feature value range (excluding time channel): ['[-0.1569, 0.1049]', '[-0.1686, 0.1523]', '[-0.1630, 0.1111]']
Batch 2: shape torch.Size([32, 13, 4]), dtype torch.float32
  Per-feature value range (excluding time channel): ['[-0.0864, 0.1217]', '[-0.1084, 0.1221]', '[-0.1073, 0.0952]']
Batch 3: shape torch.Size([32, 13, 4]), dtype torch.float32
  Per-feature value range (excluding time channel): ['[-0.0880, 0.0771]', '[-0.1667, 0.0830]', '[-0.0881, 0.0942]']
... and 280 more batches

Batch shapes are correct: torch.Size([3

## Example 3: Reproducible Training with Seed Control

Let's verify that identical seeds reproduce the same shuffling order, that different seeds produce different orders, and that changing a dataset's seed reshuffles it.


In [5]:
# Example 3 driver: check that shuffling is reproducible for equal seeds,
# differs across seeds, and changes when a dataset is re-seeded.
print("\n" + "=" * 60)
print("EXAMPLE 3: Reproducible Training with Seed Control")
print("=" * 60)

print("Testing reproducibility with same seeds...")
dataset1 = TimeSeriesDataset(train_data_np, seed=42)
dataset2 = TimeSeriesDataset(train_data_np, seed=42)

indices1 = dataset1.get_original_indices()
indices2 = dataset2.get_original_indices()

# Compare the complete orderings: a matching 10-element prefix would not prove
# that the whole shuffle is identical. Only the prefix is printed for brevity.
print(f"Datasets with same seed produce identical order: {indices1 == indices2}")
print(f"  First 10 indices (dataset1): {indices1[:10]}")
print(f"  First 10 indices (dataset2): {indices2[:10]}")

print("\nTesting different seeds produce different orders...")
dataset3 = TimeSeriesDataset(train_data_np, seed=123)
indices3 = dataset3.get_original_indices()

print(f"Datasets with different seeds produce different order: {indices1 != indices3}")
print(f"  First 10 indices (seed=42): {indices1[:10]}")
print(f"  First 10 indices (seed=123): {indices3[:10]}")

print("\nTesting dynamic seed changing...")
# Snapshot the full order before re-seeding. The copy guards against
# get_original_indices() handing back a list that set_seed() might reshuffle
# in place (not visible from this notebook, hence the defensive copy).
order_before_reseed = list(dataset1.get_original_indices())
original_indices = order_before_reseed[:10]
dataset1.set_seed(999)
order_after_reseed = list(dataset1.get_original_indices())
new_indices = order_after_reseed[:10]

print(f"Seed change produces different order: {order_before_reseed != order_after_reseed}")
print(f"  Original (seed=42): {original_indices}")
print(f"  New (seed=999):     {new_indices}")



EXAMPLE 3: Reproducible Training with Seed Control
Testing reproducibility with same seeds...
Datasets with same seed produce identical order: True
  First 10 indices (dataset1): [3522, 2086, 6549, 1241, 5426, 7986, 2035, 2005, 5946, 1966]
  First 10 indices (dataset2): [3522, 2086, 6549, 1241, 5426, 7986, 2035, 2005, 5946, 1966]

Testing different seeds produce different orders...
Datasets with different seeds produce different order: True
  First 10 indices (seed=42): [3522, 2086, 6549, 1241, 5426, 7986, 2035, 2005, 5946, 1966]
  First 10 indices (seed=123): [1060, 2114, 1677, 1225, 6199, 8633, 6496, 2011, 3036, 1781]

Testing dynamic seed changing...
Seed change produces different order: True
  Original (seed=42): [3522, 2086, 6549, 1241, 5426, 7986, 2035, 2005, 5946, 1966]
  New (seed=999):     [8913, 5413, 2948, 3659, 5201, 5262, 5362, 1535, 8635, 3475]
