# Dataset Preprocessor Verification

This notebook verifies that the modified preprocessing pipeline with PyTorch Dataset support creates the correct data structures and handles batching properly with seed support.

## Key Features to Verify:
- Time series data shape `(R, l, N)` where R=sequences, l=length, N=variables
- Seed-based reproducible shuffling
- Proper PyTorch Dataset implementation
- Efficient DataLoader batching
- Dynamic seed changing


In [1]:
import sys
import os
import numpy as np
import torch
from pathlib import Path

project_root = Path().resolve().parents[0]
sys.path.append(str(project_root))

print("Project root added to sys.path:", project_root)

from src.preprocessing.preprocessing import preprocess_data

from src.preprocessing.transformers import (
    TimeSeriesDataset, 
    create_dataloaders
)

print("All imports successful!")


Project root added to sys.path: C:\Users\14165\Downloads\Unified-benchmark-for-SDGFTS-main
All imports successful!


## Example 1: Basic Preprocessing with Seed Support

### Testing: 
- GOOG: `data/GOOG/GOOG.csv`

In [2]:
def print_data_shapes(train_data, valid_data, shape_labels):
    print(f"Data shapes:")
    print(f"  Train shape: {train_data.shape} {shape_labels['train']}")
    print(f"  Valid shape: {valid_data.shape} {shape_labels['valid']}")
    if 'sequence_length' in shape_labels:
        print(f"  Sequence length (l): {shape_labels['sequence_length']}")
    if 'num_channels' in shape_labels:
        print(f"  Number of channels (N): {shape_labels['num_channels']}")
    if 'total_samples' in shape_labels:
        print(f"  Total time-series samples (R): {shape_labels['total_samples']}")

def print_channel_statistics(train_data, valid_data, num_channels, is_normalized=False):
    print(f"\nData statistics:")
    for channel in range(num_channels):
        # Defensive: handle both 2D and 3D data
        if train_data.ndim == 3:
            train_min = train_data[:, :, channel].min()
            train_max = train_data[:, :, channel].max()
            train_mean = train_data[:, :, channel].mean()
        else:
            train_min = train_data[:, channel].min()
            train_max = train_data[:, channel].max()
            train_mean = train_data[:, channel].mean()

        if valid_data.ndim == 3:
            valid_min = valid_data[:, :, channel].min()
            valid_max = valid_data[:, :, channel].max()
            valid_mean = valid_data[:, :, channel].mean()
        else:
            valid_min = valid_data[:, channel].min()
            valid_max = valid_data[:, channel].max()
            valid_mean = valid_data[:, channel].mean()

        print(f"  Channel {channel}:")
        print(f"    Train data range: [{train_min:.4f}, {train_max:.4f}]")
        print(f"    Valid data range: [{valid_min:.4f}, {valid_max:.4f}]")
        print(f"    Train data mean: {train_mean:.4f}")
        print(f"    Valid data mean: {valid_mean:.4f}")

        # Optional: normalization check
        if is_normalized:
            if train_min >= 0 and train_max <= 1:
                print(f"  Normalization successful: data in range [0, 1]")
            else:
                print(f"  Normalization issue: data outside [0, 1] range")

def test_non_parametric_preprocessing(train_data, valid_data):
    """
    Run Example 1.1: Preprocessing for Non-Parametric models
    """
    if train_data is not None and valid_data is not None:
        print(f"\nPreprocessing successful!")
        shape_labels = {
            'train': "(R_train, l, N)",
            'valid': "(R_valid, l, N)",
            'sequence_length': train_data.shape[1],
            'num_channels': train_data.shape[2],
            'total_samples': train_data.shape[0] + valid_data.shape[0],
        }
        print_data_shapes(train_data, valid_data, shape_labels)
        print_channel_statistics(train_data, valid_data, train_data.shape[2], is_normalized=True)
    else:
        print("Preprocessing failed...")

def test_parametric_preprocessing(train_data, valid_data):
    """
    Run Example 1.2: Preprocessing for Parametric models
    """
    if train_data is not None and valid_data is not None:
        print(f"\nPreprocessing successful!")
        shape_labels = {
            'train': "(l, N)",
            'valid': "(l, N)",
            'sequence_length': train_data.shape[0],
            'num_channels': train_data.shape[1],
        }
        print_data_shapes(train_data, valid_data, shape_labels)
        print_channel_statistics(train_data, valid_data, train_data.shape[1], is_normalized=False)
    else:
        print("Preprocessing failed...")

In [None]:
print("=" * 60)
print("EXAMPLE 1.1: Preprocessing for Non-Parametric models")
print("=" * 60)

config_non_parametric = {
    'original_data_path': str(project_root / 'data' / 'raw' / 'GOOG' / 'GOOG.csv'),
    'valid_ratio': 0.1,
    'do_normalization': True,
    'seed': 42
}

print(f"Configuration GOOG Dataset: {config_non_parametric}")
print("\nStarting preprocessing for non-parametric...")
train_data_goog, valid_data_goog = preprocess_data(config_non_parametric)

test_non_parametric_preprocessing(train_data_goog[:,:,1:], valid_data_goog[:,:,1:])

EXAMPLE 1.1: Preprocessing for Non-Parametric models
Configuration GOOG Dataset: {'original_data_path': 'C:\\Users\\14165\\Downloads\\Unified-benchmark-for-SDGFTS-main\\data\\raw\\GOOG\\GOOG.csv', 'valid_ratio': 0.1, 'do_normalization': True, 'seed': 42}

Starting preprocessing for non-parametric...
Data preprocessing with settings:{'original_data_path': 'C:\\Users\\14165\\Downloads\\Unified-benchmark-for-SDGFTS-main\\data\\raw\\GOOG\\GOOG.csv', 'valid_ratio': 0.1, 'do_normalization': True, 'seed': 42}
Data shape: (1132, 125, 6)
Preprocessing done.


Preprocessing successful!
Data shapes:
  Train shape: (1018, 125, 6) (R_train, l, N)
  Valid shape: (114, 125, 6) (R_valid, l, N)
  Sequence length (l): 125
  Number of channels (N): 6
  Total time-series samples (R): 1132

Data statistics:
  Channel 0:
    Train data range: [1602475200.0000, 1760068800.0000]
    Valid data range: [1603944000.0000, 1759464000.0000]
    Train data mean: 1681146116.4165
    Valid data mean: 1681576207.8316
 

  df['Date'] = df['Date'].view('int64') // 10**9


In [None]:
print("=" * 60)
print("EXAMPLE 1.2: Preprocessing for Parametric models")
print("=" * 60)

config_parametric = {
    'original_data_path': str(project_root / 'data' / 'raw' / 'GOOG' / 'GOOG.csv'),
    'valid_ratio': 0.1,
    'is_parametric': True

}

print(f"Configuration GOOG Dataset: {config_parametric}")
print("\nStarting preprocessing for parametric...")
train_data_para, valid_data_para = preprocess_data(config_parametric)

test_parametric_preprocessing(train_data_para[:,:,1:], valid_data_para[:,:,1:])

EXAMPLE 1.2: Preprocessing for Parametric models
Configuration GOOG Dataset: {'original_data_path': 'C:\\Users\\14165\\Downloads\\Unified-benchmark-for-SDGFTS-main\\data\\raw\\GOOG\\GOOG.csv', 'valid_ratio': 0.1, 'is_parametric': True}

Starting preprocessing for parametric...
Data preprocessing with settings:{'original_data_path': 'C:\\Users\\14165\\Downloads\\Unified-benchmark-for-SDGFTS-main\\data\\raw\\GOOG\\GOOG.csv', 'valid_ratio': 0.1, 'is_parametric': True}
Data shape: (1256, 6)

Preprocessing successful!
Data shapes:
  Train shape: torch.Size([1130, 6]) (l, N)
  Valid shape: torch.Size([126, 6]) (l, N)
  Sequence length (l): 1130
  Number of channels (N): 6

Data statistics:
  Channel 0:
    Train data range: [1602475200.0000, 1744257600.0000]
    Valid data range: [1744344000.0000, 1760068800.0000]
    Train data mean: 1673267148.3186
    Valid data mean: 1752351085.7143
  Channel 1:
    Train data range: [76.1180, 204.5000]
    Valid data range: [150.9650, 254.7800]
    Trai

  df['Date'] = df['Date'].view('int64') // 10**9


## Example 2: PyTorch Dataset and DataLoader Creation

Now let's create PyTorch datasets and dataloaders to verify proper batching and seed support.


In [7]:
print("\n" + "=" * 60)
print("EXAMPLE 2: PyTorch Dataset and DataLoader Creation")
print("=" * 60)

print("Creating TimeSeriesDataset objects...")
train_dataset = TimeSeriesDataset(train_data_goog, seed=42)
valid_dataset = TimeSeriesDataset(valid_data_goog, seed=123)

print(f"Created datasets:")
print(f"  Train dataset length: {len(train_dataset)}")
print(f"  Valid dataset length: {len(valid_dataset)}")
print(f"  Sample shape: {train_dataset[0].shape}")
print(f"  Sample dtype: {train_dataset[0].dtype}")

sample = train_dataset[0]
if isinstance(sample, torch.Tensor):
    print(f"  Sample is PyTorch tensor: {type(sample)}")
else:
    print(f"  Sample is not PyTorch tensor: {type(sample)}")

print(f"\nCreating DataLoaders...")
batch_size = 32
train_loader, valid_loader = create_dataloaders(
    train_data_goog, valid_data_goog,
    batch_size=batch_size,
    train_seed=42,
    valid_seed=123,
    num_workers=0,
    pin_memory=False
)

print(f"Created dataloaders:")
print(f"  Train batches: {len(train_loader)}")
print(f"  Valid batches: {len(valid_loader)}")
print(f"  Batch size: {batch_size}")

print(f"\nBatch Information:")
for i, batch in enumerate(train_loader):
    print(f"Batch {i+1}: shape {batch.shape}, dtype {batch.dtype}")
    feature_channels = batch[:, :, 1:]
    if feature_channels.shape[-1] > 0:
        ranges = []
        for ch in range(feature_channels.shape[-1]):
            ch_min = feature_channels[:, :, ch].min()
            ch_max = feature_channels[:, :, ch].max()
            ranges.append(f"[{ch_min:.4f}, {ch_max:.4f}]")
        print(f"  Per-feature value range (excluding time channel): {ranges}")
    else:
        print("  No feature channels (only time present)")
    if i >= 2:
        print(f"... and {len(train_loader) - 3} more batches")
        break

first_batch = next(iter(train_loader))
expected_shape = (batch_size, train_data_goog.shape[1], train_data_goog.shape[2])
if first_batch.shape == expected_shape:
    print(f"\nBatch shapes are correct: {first_batch.shape} == {expected_shape}")
else:
    print(f"\nBatch shape mismatch: {first_batch.shape} != {expected_shape}")



EXAMPLE 2: PyTorch Dataset and DataLoader Creation
Creating TimeSeriesDataset objects...
Created datasets:
  Train dataset length: 1018
  Valid dataset length: 114
  Sample shape: torch.Size([125, 6])
  Sample dtype: torch.float32
  Sample is PyTorch tensor: <class 'torch.Tensor'>

Creating DataLoaders...
Created dataloaders:
  Train batches: 32
  Valid batches: 4
  Batch size: 32

Batch Information:
Batch 1: shape torch.Size([32, 125, 6]), dtype torch.float32
  Per-feature value range (excluding time channel): ['[0.0000, 1.0000]', '[0.0000, 1.0000]', '[0.0000, 1.0000]', '[0.0000, 1.0000]', '[0.0000, 1.0000]']
Batch 2: shape torch.Size([32, 125, 6]), dtype torch.float32
  Per-feature value range (excluding time channel): ['[0.0000, 1.0000]', '[0.0000, 1.0000]', '[0.0000, 1.0000]', '[0.0000, 1.0000]', '[0.0000, 1.0000]']
Batch 3: shape torch.Size([32, 125, 6]), dtype torch.float32
  Per-feature value range (excluding time channel): ['[0.0000, 1.0000]', '[0.0000, 1.0000]', '[0.0000, 1.0

## Example 3: Reproducible Training with Seed Control

Let's verify that seeds produce reproducible and different shuffling patterns.


In [8]:
print("\n" + "=" * 60)
print("EXAMPLE 3: Reproducible Training with Seed Control")
print("=" * 60)

print("Testing reproducibility with same seeds...")
dataset1 = TimeSeriesDataset(train_data_goog, seed=42)
dataset2 = TimeSeriesDataset(train_data_goog, seed=42)

indices1 = dataset1.get_original_indices()
indices2 = dataset2.get_original_indices()

print(f"Datasets with same seed produce identical order: {indices1[:10] == indices2[:10]}")
print(f"  First 10 indices (dataset1): {indices1[:10]}")
print(f"  First 10 indices (dataset2): {indices2[:10]}")

print(f"\nTesting different seeds produce different orders...")
dataset3 = TimeSeriesDataset(train_data_goog, seed=123)
indices3 = dataset3.get_original_indices()

print(f"Datasets with different seeds produce different order: {indices1[:10] != indices3[:10]}")
print(f"  First 10 indices (seed=42): {indices1[:10]}")
print(f"  First 10 indices (seed=123): {indices3[:10]}")

print(f"\nTesting dynamic seed changing...")
original_indices = dataset1.get_original_indices()[:10]
dataset1.set_seed(999)
new_indices = dataset1.get_original_indices()[:10]

print(f"Seed change produces different order: {original_indices != new_indices}")
print(f"  Original (seed=42): {original_indices}")
print(f"  New (seed=999):     {new_indices}")



EXAMPLE 3: Reproducible Training with Seed Control
Testing reproducibility with same seeds...
Datasets with same seed produce identical order: True
  First 10 indices (dataset1): [272, 859, 927, 365, 1014, 290, 790, 211, 946, 894]
  First 10 indices (dataset2): [272, 859, 927, 365, 1014, 290, 790, 211, 946, 894]

Testing different seeds produce different orders...
Datasets with different seeds produce different order: True
  First 10 indices (seed=42): [272, 859, 927, 365, 1014, 290, 790, 211, 946, 894]
  First 10 indices (seed=123): [936, 244, 526, 469, 573, 712, 847, 257, 635, 672]

Testing dynamic seed changing...
Seed change produces different order: True
  Original (seed=42): [272, 859, 927, 365, 1014, 290, 790, 211, 946, 894]
  New (seed=999):     [700, 859, 964, 373, 390, 946, 156, 416, 352, 107]
