# Dataset Preprocessor Validation

This notebook verifies that the modified preprocessing pipeline, with PyTorch Dataset support, creates the correct data structures and that seeded batching behaves as expected.

## Key Features to Verify:
- Time series data shape `(R, l)`, where `R` is the number of sequences and `l` is the sequence length
- Seed-based reproducible shuffling
- Proper PyTorch Dataset implementation
- Efficient DataLoader batching
- Dynamic seed changing


In [1]:
import sys
import os
import numpy as np
import torch
from pathlib import Path

# The project root is taken to be the parent of the current working directory.
# NOTE(review): this assumes the notebook is launched from a first-level
# subfolder of the repository — confirm against how the notebook is run.
project_root = Path().resolve().parent

# Only register the path once so that re-running this cell does not keep
# appending duplicate entries to sys.path.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

print("Project root added to sys.path:", project_root)

from src.utils.preprocessing_utils import (
    TimeSeriesDataset, 
    create_dataloaders,
    preprocess_data
)

from src.utils.configs_utils import get_dataset_cfgs

print("All imports successful!")


Project root added to sys.path: C:\Users\14165\Downloads\Unified-benchmark-for-SDGFTS-main
All imports successful!


## Example 1: Basic Preprocessing with Seed Support

### Testing: 
- AAPL: `data/raw/AAPL/AAPL.csv` (the dataset returned by `get_dataset_cfgs()` in the recorded run below)

In [2]:
def print_data_shapes(train_data, valid_data, real_data, shape_labels):
    """
    Print the shape of each data split next to its human-readable label.

    Args:
        train_data: Object exposing a ``.shape`` attribute (training split).
        valid_data: Object exposing a ``.shape`` attribute (validation split).
        real_data: Object exposing a ``.shape`` attribute (real split).
        shape_labels (dict): Must contain the keys ``'train'``, ``'valid'`` and
            ``'real'``. The keys ``'sequence_length'``, ``'num_channels'`` and
            ``'total_samples'`` are optional; each one present adds a line.
    """
    print("Data shapes:")

    # Mandatory rows: one per split, printed in train / valid / real order.
    split_rows = (
        ("Train", "train", train_data),
        ("Valid", "valid", valid_data),
        ("Real", "real", real_data),
    )
    for title, label_key, split in split_rows:
        print(f"  {title} shape: {split.shape} {shape_labels[label_key]}")

    # Optional rows: printed only when the caller supplied the matching key.
    optional_rows = (
        ("sequence_length", "Sequence length (l)"),
        ("num_channels", "Number of channels (N)"),
        ("total_samples", "Total time-series samples (R)"),
    )
    for label_key, caption in optional_rows:
        if label_key in shape_labels:
            print(f"  {caption}: {shape_labels[label_key]}")

def print_channel_statistics(train_data, valid_data, real_data):
    """
    Print the min, max, and mean of the train, validation, and real splits.

    Each statistic is obtained by calling ``.min()``, ``.max()`` and ``.mean()``
    with no arguments, so it is a single value over the whole array rather
    than one value per channel.

    Args:
        train_data (np.ndarray or torch.Tensor): Training data array.
        valid_data (np.ndarray or torch.Tensor): Validation data array.
        real_data (np.ndarray or torch.Tensor): Real (original/ground truth) data array.

    Returns:
        None. Output goes to stdout: a header, the three value ranges, then
        the three means, all formatted to four decimal places.
    """
    print("\nLog return statistics:")

    # Compute every statistic before printing any row, in train / valid / real
    # order, so a failing split stops the report right after the header.
    summaries = [
        (title, split.min(), split.max(), split.mean())
        for title, split in (
            ("Train", train_data),
            ("Valid", valid_data),
            ("Real", real_data),
        )
    ]

    # All ranges are listed first, followed by all means.
    for title, lowest, highest, _ in summaries:
        print(f"  {title} range: [{lowest:.4f}, {highest:.4f}]")
    for title, _, _, average in summaries:
        print(f"  {title} mean: {average:.4f}")

def test_non_parametric_preprocessing(train_data, valid_data, real_data):
    """
    Example 1.1: report on the preprocessing output for non-parametric models.

    Asserts that none of the three splits is None, then prints their shapes
    (labelled as ``(R, l)``) followed by the min/max/mean summary.

    Args:
        train_data: Training split; indexed as ``shape[0]`` and ``shape[1]``.
        valid_data: Validation split; indexed as ``shape[0]``.
        real_data: Real split; indexed as ``shape[0]``.
    """
    splits = (train_data, valid_data, real_data)
    assert all(split is not None for split in splits), "Preprocessing 1.1 failed: train_data, valid_data, or real_data is None"
    print("\nPreprocessing successful!")

    # The second axis of the training split is reported as the sequence length;
    # the first axes of all three splits are summed for the total sample count.
    shape_labels = dict(
        train="(R_train, l)",
        valid="(R_valid, l)",
        real="(R_real, l)",
        sequence_length=train_data.shape[1],
        total_samples=sum(split.shape[0] for split in splits),
    )

    print_data_shapes(train_data, valid_data, real_data, shape_labels)
    print_channel_statistics(train_data, valid_data, real_data)

def test_parametric_preprocessing(train_data, valid_data, real_data):
    """
    Example 1.2: report on the preprocessing output for parametric models.

    Asserts that none of the three splits is None, then prints their shapes
    (labelled as ``(R)``) followed by the min/max/mean summary.

    Args:
        train_data: Training split; only ``shape[0]`` is read.
        valid_data: Validation split; only ``shape[0]`` is read.
        real_data: Real split; only ``shape[0]`` is read.
    """
    splits = (train_data, valid_data, real_data)
    assert all(split is not None for split in splits), "Preprocessing 1.2 failed: train_data or valid_data is None or real_data is None"
    print("\nPreprocessing successful!")

    # NOTE(review): the first axis of the training split is reported as the
    # sequence length and is also counted towards the total samples — confirm
    # that this double use is intended.
    shape_labels = dict(
        train="(R_train)",
        valid="(R_valid)",
        real="(R_real)",
        sequence_length=train_data.shape[0],
        total_samples=sum(split.shape[0] for split in splits),
    )

    print_data_shapes(train_data, valid_data, real_data, shape_labels)
    print_channel_statistics(train_data, valid_data, real_data)

In [3]:
# Banner for Example 1 (three lines emitted by a single print call).
print(
    "=" * 60,
    "EXAMPLE 1: Preprocessing for both parametric and non-parametric models",
    "=" * 60,
    sep="\n",
)

# get_dataset_cfgs() is unpacked into two configs: non-parametric first,
# parametric second.
nonparametric_dataset_cfgs, parametric_dataset_cfgs = get_dataset_cfgs()

# --- Non-parametric run: preprocess, then report shapes and statistics ---
print("Configuration Dataset: {}".format(nonparametric_dataset_cfgs))
print("\nStarting preprocessing for non-parametric...")
train_data_np, valid_data_np, real_data_np = preprocess_data(nonparametric_dataset_cfgs)
test_non_parametric_preprocessing(train_data_np, valid_data_np, real_data_np)

# --- Parametric run: same steps with the parametric config ---
print("Configuration Dataset: {}".format(parametric_dataset_cfgs))
print("\nStarting preprocessing for parametric...")
train_data_para, valid_data_para, real_data_para = preprocess_data(parametric_dataset_cfgs)
test_parametric_preprocessing(train_data_para, valid_data_para, real_data_para)

EXAMPLE 1: Preprocessing for both parametric and non-parametric models
Configuration Dataset: {'ticker': 'AAPL', 'original_data_path': 'C:\\Users\\14165\\Downloads\\Unified-benchmark-for-SDGFTS-main\\data\\raw\\AAPL\\AAPL.csv', 'valid_ratio': 0.1, 'test_ratio': 0.1}

Starting preprocessing for non-parametric...
Preprocessing data for AAPL
Data shape in Find Length: (11306,)

Preprocessing successful!
Data shapes:
  Train shape: (9036, 12) (R_train, l)
  Valid shape: (1129, 12) (R_valid, l)
  Real shape: (1130, 12) (R_real, l)
  Sequence length (l): 12
  Total time-series samples (R): 11295

Log return statistics:
  Train range: [-0.7312, 0.2869]
  Valid range: [-0.1377, 0.1132]
  Real range: [-0.0970, 0.1426]
  Train mean: 0.0006
  Valid mean: 0.0013
  Real mean: 0.0006
Configuration Dataset: {'ticker': 'AAPL', 'original_data_path': 'C:\\Users\\14165\\Downloads\\Unified-benchmark-for-SDGFTS-main\\data\\raw\\AAPL\\AAPL.csv', 'valid_ratio': 0.1, 'test_ratio': 0.1, 'is_parametric': True}


## Example 2: PyTorch Dataset and DataLoader Creation

Now let's create PyTorch datasets and dataloaders to verify proper batching and seed support.


In [4]:
# Example 2 driver: build datasets and dataloaders from the non-parametric
# splits produced in Example 1 and print basic information about them.
print("\n" + "=" * 60)
print("EXAMPLE 2: PyTorch Dataset and DataLoader Creation")
print("=" * 60)

# Wrap each split in a TimeSeriesDataset, all constructed with seed 42.
print("Creating TimeSeriesDataset objects...")
train_dataset = TimeSeriesDataset(train_data_np, seed=42)
valid_dataset = TimeSeriesDataset(valid_data_np, seed=42)
real_dataset = TimeSeriesDataset(real_data_np, seed=42)

# Report dataset sizes and the shape/dtype of a single indexed sample.
print(f"Created datasets:")
print(f"  Train dataset length: {len(train_dataset)}")
print(f"  Valid dataset length: {len(valid_dataset)}")
print(f"  Real dataset length: {len(real_dataset)}")
print(f"  Sample shape: {train_dataset[0].shape}")
print(f"  Sample dtype: {train_dataset[0].dtype}")

# The loaders are built from the raw arrays, not from the dataset objects
# created above.
print(f"\nCreating DataLoaders...")
batch_size = 32
train_loader, valid_loader, real_loader = create_dataloaders(
    train_data_np, valid_data_np, real_data_np,
    batch_size=batch_size,
    train_seed=42,
    valid_seed=42,
    test_seed=42,
    num_workers=0,
    pin_memory=False
)

print(f"Created dataloaders:")
print(f"  Train batches: {len(train_loader)}")
print(f"  Valid batches: {len(valid_loader)}")
print(f"  Real batches: {len(real_loader)}")
print(f"  Batch size: {batch_size}")

# Preview only the first three training batches, then stop iterating.
print(f"\nBatch Information:")
for i, batch in enumerate(train_loader):
    print(f"Train Batch {i+1}: shape {batch.shape}, dtype {batch.dtype}")
    batch_min = batch.min()
    batch_max = batch.max()
    print(f"  Value range: [{batch_min:.4f}, {batch_max:.4f}]")
    # i is zero-based, so i >= 2 means three batches have been shown.
    if i >= 2:
        print(f"... and {len(train_loader) - 3} more batches")
        break

# Pull a first batch from a fresh iterator and compare its shape against
# (batch_size, second axis of the training array).
# NOTE(review): this presumes the training split holds at least batch_size
# samples so that the first batch is full; the outcome is printed, not asserted.
first_batch = next(iter(train_loader))
expected_shape = (batch_size, train_data_np.shape[1])
if first_batch.shape == expected_shape:
    print(f"\nBatch shapes are correct: {first_batch.shape} == {expected_shape}")
else:
    print(f"\nBatch shape mismatch: {first_batch.shape} != {expected_shape}")



EXAMPLE 2: PyTorch Dataset and DataLoader Creation
Creating TimeSeriesDataset objects...
Created datasets:
  Train dataset length: 9036
  Valid dataset length: 1129
  Real dataset length: 1130
  Sample shape: torch.Size([12])
  Sample dtype: torch.float32

Creating DataLoaders...
Created dataloaders:
  Train batches: 283
  Valid batches: 36
  Real batches: 36
  Batch size: 32

Batch Information:
Train Batch 1: shape torch.Size([32, 12]), dtype torch.float32
  Value range: [-0.1022, 0.0693]
Train Batch 2: shape torch.Size([32, 12]), dtype torch.float32
  Value range: [-0.1121, 0.1735]
Train Batch 3: shape torch.Size([32, 12]), dtype torch.float32
  Value range: [-0.7312, 0.1061]
... and 280 more batches

Batch shapes are correct: torch.Size([32, 12]) == (32, 12)


## Example 3: Reproducible Training with Seed Control

Let's verify that the same seed reproduces the same shuffling order and that different seeds produce different orders.


In [6]:
# Example 3 driver: check that shuffling is reproducible for equal seeds,
# differs for different seeds, and changes when the seed is reset.
print("\n" + "=" * 60)
print("EXAMPLE 3: Reproducible Training with Seed Control")
print("=" * 60)

# --- Same seed: the two datasets must yield the same order ---
print("Testing reproducibility with same seeds...")
dataset1 = TimeSeriesDataset(train_data_np, seed=42, shuffle=True)
dataset2 = TimeSeriesDataset(train_data_np, seed=42, shuffle=True)

indices1 = dataset1.get_original_indices()
indices2 = dataset2.get_original_indices()

# Compare the complete index sequences: a matching 10-element prefix does not
# establish that the whole order is identical. list(...) makes the comparison
# yield a single bool for any sequence-like return value.
same_order = list(indices1) == list(indices2)
print(f"Datasets with same seed produce identical order: {same_order}")
print(f"  First 10 indices (dataset1): {indices1[:10]}")
print(f"  First 10 indices (dataset2): {indices2[:10]}")

# --- Different seed: a differing prefix is enough to show a different order ---
print(f"\nTesting different seeds produce different orders...")
dataset3 = TimeSeriesDataset(train_data_np, seed=123, shuffle=True)
indices3 = dataset3.get_original_indices()

print(f"Datasets with different seeds produce different order: {indices1[:10] != indices3[:10]}")
print(f"  First 10 indices (seed=42): {indices1[:10]}")
print(f"  First 10 indices (seed=123): {indices3[:10]}")

# --- Dynamic seed change on an existing dataset ---
print(f"\nTesting dynamic seed changing...")
# The prefix is sliced out before set_seed() is called so the "before" values
# are held separately from whatever get_original_indices() returns afterwards.
original_indices = dataset1.get_original_indices()[:10]
dataset1.set_seed(999)
new_indices = dataset1.get_original_indices()[:10]

print(f"Seed change produces different order: {original_indices != new_indices}")
print(f"  Original (seed=42): {original_indices}")
print(f"  New (seed=999):     {new_indices}")



EXAMPLE 3: Reproducible Training with Seed Control
Testing reproducibility with same seeds...
Datasets with same seed produce identical order: True
  First 10 indices (dataset1): [3922, 7105, 4376, 6682, 8214, 332, 5597, 8026, 2794, 6729]
  First 10 indices (dataset2): [3922, 7105, 4376, 6682, 8214, 332, 5597, 8026, 2794, 6729]

Testing different seeds produce different orders...
Datasets with different seeds produce different order: True
  First 10 indices (seed=42): [3922, 7105, 4376, 6682, 8214, 332, 5597, 8026, 2794, 6729]
  First 10 indices (seed=123): [1060, 2114, 1677, 1225, 8060, 8634, 6497, 2011, 3036, 1781]

Testing dynamic seed changing...
Seed change produces different order: True
  Original (seed=42): [3922, 7105, 4376, 6682, 8214, 332, 5597, 8026, 2794, 6729]
  New (seed=999):     [2170, 8215, 89, 3664, 9012, 5357, 6561, 4875, 5216, 3840]
