# Dataset Preprocessor Validation

This notebook verifies that the modified preprocessing pipeline, which adds PyTorch `Dataset` support, produces the correct data structures and batches them correctly with seed-controlled shuffling.

## Key Features to Verify:
- Time series data shape `(R, l)`, where `R` is the number of sequences and `l` is the sequence length
- Seed-based reproducible shuffling
- Proper PyTorch Dataset implementation
- Efficient DataLoader batching
- Dynamic seed changing


In [1]:
import sys
import torch
from pathlib import Path

# Make the directory one level above the current working directory importable,
# so the `src.*` imports below resolve.
# NOTE(review): assumes the kernel is started from a sub-folder directly under
# the repository root (e.g. `notebooks/`) — confirm.
project_root = Path().resolve().parents[0]
# Guard the insertion so re-running this cell does not accumulate duplicate
# entries in sys.path.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

print("Project root added to sys.path:", project_root)

from src.utils.preprocessing_utils import (
    TimeSeriesDataset, 
    LogReturnTransformation,
    create_dataloaders,
    preprocess_data
)

from src.utils.configs_utils import get_dataset_cfgs

print("All imports successful!")


Project root added to sys.path: /Users/eddisonpham/Projects/Unified-benchmark-for-SDGFTS
All imports successful!


## Example 1: Basic Preprocessing with Seed Support

### Testing: 
- AAPL: `data/raw/AAPL/AAPL.csv`

In [2]:
def print_data_shapes(train_data, valid_data, real_data, shape_labels):
    """
    Print the `.shape` of each split followed by optional summary lines.

    Args:
        train_data, valid_data, real_data: Objects exposing a `.shape` attribute.
        shape_labels: Dict that must contain the keys 'train', 'valid' and 'real'
            (text printed after each shape). The keys 'sequence_length',
            'num_channels' and 'total_samples' are optional; each one that is
            present adds one extra line.
    """
    print("Data shapes:")

    # Mandatory lines: one per split, always in train / valid / real order.
    for heading, split, label_key in (
        ("Train", train_data, 'train'),
        ("Valid", valid_data, 'valid'),
        ("Real", real_data, 'real'),
    ):
        print(f"  {heading} shape: {split.shape} {shape_labels[label_key]}")

    # Optional lines: printed only for the keys the caller supplied.
    for label_key, caption in (
        ('sequence_length', "Sequence length (l)"),
        ('num_channels', "Number of channels (N)"),
        ('total_samples', "Total time-series samples (R)"),
    ):
        if label_key in shape_labels:
            print(f"  {caption}: {shape_labels[label_key]}")

def print_channel_statistics(train_data, valid_data, real_data):
    """
    Print the minimum, maximum and mean of the train, validation and real data.

    Each argument must provide `.min()`, `.max()` and `.mean()` returning values
    that accept the `.4f` format specifier.
    """
    print("\nLog return statistics:")

    # Compute every statistic before printing any range line, so a failure on a
    # later split cannot leave a partially printed table.
    summaries = [
        (heading, split.min(), split.max(), split.mean())
        for heading, split in (
            ("Train", train_data),
            ("Valid", valid_data),
            ("Real", real_data),
        )
    ]

    for heading, lowest, highest, average in summaries:
        print(f"  {heading} range: [{lowest:.4f}, {highest:.4f}] | mean: {average:.4f}")

def test_non_parametric_preprocessing(train_data, valid_data, real_data, train_init, valid_init, real_init):
    """
    Test non-parametric preprocessing with initial value verification.

    Prints the shapes and summary statistics of the three splits, then checks,
    for the first sequence of each split (train, valid and real), that inverting
    the log-return transformation yields the stored initial value as its first
    element.

    Args:
        train_data, valid_data, real_data: Per-split data indexed as
            (sequence, time); `shape[0]` is reported as the sample count and
            `shape[1]` as the sequence length.
        train_init, valid_init, real_init: Per-split initial values, indexed by
            sequence.

    Raises:
        AssertionError: If any input is None, or if a reconstructed first value
            does not match the corresponding initial value.
    """
    assert train_data is not None and valid_data is not None and real_data is not None, \
        "Preprocessing failed: train, valid, or real data is None"
    assert train_init is not None and valid_init is not None and real_init is not None, \
        "Initial values are None"

    print(f"\nNon-Parametric Preprocessing successful!")

    shape_labels = {
        'train': "(R_train, l)",
        'valid': "(R_valid, l)",
        'real': "(R_real, l)",
        'sequence_length': train_data.shape[1],
        'total_samples': train_data.shape[0] + valid_data.shape[0] + real_data.shape[0],
    }
    print_data_shapes(train_data, valid_data, real_data, shape_labels)
    print_channel_statistics(train_data, valid_data, real_data)

    scaler = LogReturnTransformation()

    # Check every split, including the real split, which was previously accepted
    # and None-checked but never verified.
    for split_name, split_data, split_init in (
        ("Train", train_data, train_init),
        ("Valid", valid_data, valid_init),
        ("Real", real_data, real_init),
    ):
        reconstructed = scaler.inverse_transform(split_data[0], split_init[0])
        assert torch.isclose(reconstructed[0], split_init[0]), \
            f"{split_name} initial value reconstruction failed"

    print("Initial value unit tests passed for non-parametric dataset.")

def test_dataset_shuffling_preserves_initial_values(data, initial_values, seed=42):
    """
    Test that shuffling in TimeSeriesDataset preserves initial value positions correctly.

    For up to the first 10 positions of a shuffled dataset, the original index is
    looked up via `get_original_indices()`. The sample is compared against the
    unshuffled dataset at that index, and the initial value is compared against
    `initial_values` at that index.

    Args:
        data: Array of shape (R, l)
        initial_values: Array of shape (R,)
        seed: Random seed for shuffling

    Raises:
        AssertionError: If a sample or initial value does not match.
    """
    dataset_shuffled = TimeSeriesDataset(data, shuffle=True, seed=seed, initial_values=initial_values)
    dataset_unshuffled = TimeSeriesDataset(data, shuffle=False, seed=seed, initial_values=initial_values)

    indices = dataset_shuffled.get_original_indices()
    # Computed once; used for both the loop bound and the summary line.
    num_checked = min(10, len(dataset_shuffled))

    for i in range(num_checked):
        sample_shuffled, init_shuffled = dataset_shuffled[i]
        actual_idx = indices[i]

        # Only the sample is taken from the unshuffled dataset. The expected
        # initial value comes straight from the raw input, so the check does not
        # depend on the dataset's own bookkeeping.
        sample_original, _ = dataset_unshuffled[actual_idx]
        init_original = initial_values[actual_idx]

        assert torch.allclose(sample_shuffled, sample_original), \
            f"Sample mismatch at position {i}: shuffled index {actual_idx}"
        assert torch.isclose(init_shuffled, init_original), \
            f"Initial value mismatch at position {i}: shuffled index {actual_idx}"

    print(f"  ✓ Shuffling preserves initial value positions (tested {num_checked} samples)")

def test_dataloader_batching_preserves_initial_values(data, initial_values, batch_size=32, seed=42):
    """
    Check that batches from the train DataLoader keep each sample paired with
    its own initial value.

    The first three batches are inspected. Row `i` of batch `b` is treated as
    dataset position `b * batch_size + i`, mapped back to its original index via
    the loader dataset's `get_original_indices()`, and compared with an
    unshuffled TimeSeriesDataset built from the same inputs.

    Args:
        data: Array of shape (R, l)
        initial_values: Array of shape (R,)
        batch_size: Batch size for DataLoader
        seed: Random seed for shuffling

    Raises:
        AssertionError: If a batch row or its initial value does not match.
    """
    loader, _valid_loader, _test_loader = create_dataloaders(
        data, data, data,
        batch_size=batch_size,
        train_seed=seed,
        train_initial=initial_values,
        valid_initial=initial_values,
        test_initial=initial_values
    )

    reference = TimeSeriesDataset(data, shuffle=False, initial_values=initial_values)
    order = loader.dataset.get_original_indices()

    batches_seen = 0
    rows_checked = 0

    for batch_no, batch in enumerate(loader):
        if batch_no >= 3:
            break

        sequences, initials = batch
        batch_offset = batch_no * batch_size

        for row in range(sequences.shape[0]):
            position = batch_offset + row
            # Stop if the computed position runs past the end of the data.
            if position >= len(data):
                break

            source_idx = order[position]
            expected_sample, expected_init = reference[source_idx]

            assert torch.allclose(sequences[row], expected_sample), \
                f"Batch {batch_no}, sample {row}: data mismatch (position {position} -> original idx {source_idx})"
            assert torch.isclose(initials[row], expected_init), \
                f"Batch {batch_no}, sample {row}: initial value mismatch (position {position} -> original idx {source_idx})"

            rows_checked += 1

        batches_seen += 1

    print(f"  ✓ DataLoader batching preserves initial value positions (tested {batches_seen} batches, {rows_checked} samples)")

def test_dataloader_reconstruction_with_initial_values(data, initial_values, batch_size=32, seed=42):
    """
    Check that prices can be rebuilt from DataLoader batches.

    For up to five rows in each of the first two train batches, the row is
    inverted with `LogReturnTransformation.inverse_transform` using the initial
    value delivered in the same batch. The first reconstructed element must be
    close to that initial value.

    Args:
        data: Log returns array of shape (R, l)
        initial_values: Initial prices array of shape (R,)
        batch_size: Batch size for DataLoader
        seed: Random seed for shuffling

    Raises:
        AssertionError: If a reconstructed first value differs from its initial value.
    """
    loader, _valid_loader, _test_loader = create_dataloaders(
        data, data, data,
        batch_size=batch_size,
        train_seed=seed,
        train_initial=initial_values,
        valid_initial=initial_values,
        test_initial=initial_values
    )

    transform = LogReturnTransformation()
    batches_seen = 0

    for batch_no, batch in enumerate(loader):
        if batch_no >= 2:
            break

        returns_batch, initial_batch = batch
        rows_to_check = min(5, returns_batch.shape[0])

        for row in range(rows_to_check):
            row_returns, start_price = returns_batch[row], initial_batch[row]
            prices = transform.inverse_transform(row_returns, start_price)

            assert torch.isclose(prices[0], start_price), \
                f"Batch {batch_no}, sample {row}: reconstruction failed"

        batches_seen += 1

    print(f"  ✓ Price reconstruction from DataLoader batches works correctly (tested {batches_seen} batches)")

def test_parametric_preprocessing(train_data, valid_data, real_data, train_init, valid_init, real_init):
    """
    Test parametric preprocessing with initial value verification.

    Prints the shapes and summary statistics of the three splits, then checks,
    for each split (train, valid and real), that inverting the log-return
    transformation of the whole split yields the stored initial value as its
    first element.

    Args:
        train_data, valid_data, real_data: Per-split data; each split is passed
            to `inverse_transform` as a whole, and `shape[0]` is the reported size.
        train_init, valid_init, real_init: One initial value per split.

    Raises:
        AssertionError: If any input is None, or if a reconstructed first value
            does not match the corresponding initial value.
    """
    assert train_data is not None and valid_data is not None and real_data is not None, \
        "Preprocessing failed: train, valid, or real data is None"
    assert train_init is not None and valid_init is not None and real_init is not None, \
        "Initial values are None"

    print(f"\nParametric Preprocessing successful!")

    shape_labels = {
        'train': "(R_train,)",
        'valid': "(R_valid,)",
        'real': "(R_real,)",
        # The train split's first dimension is what gets reported as the
        # sequence length here.
        'sequence_length': train_data.shape[0],
        'total_samples': train_data.shape[0] + valid_data.shape[0] + real_data.shape[0],
    }
    print_data_shapes(train_data, valid_data, real_data, shape_labels)
    print_channel_statistics(train_data, valid_data, real_data)

    scaler = LogReturnTransformation()

    # Check every split, including the real split, which was previously accepted
    # and None-checked but never verified.
    for split_name, split_data, split_init in (
        ("Train", train_data, train_init),
        ("Valid", valid_data, valid_init),
        ("Real", real_data, real_init),
    ):
        reconstructed = scaler.inverse_transform(split_data, split_init)
        assert torch.isclose(reconstructed[0], split_init), \
            f"{split_name} initial value reconstruction failed"

    print("Initial value unit tests passed for parametric dataset.")

In [3]:
# Example 1 driver: run preprocessing for both dataset configurations and apply
# the validation helpers defined in the previous cell.
print("=" * 60)
print("EXAMPLE 1: Preprocessing for both parametric and non-parametric models")
print("=" * 60)

# get_dataset_cfgs() is unpacked into two configs: non-parametric first, parametric second.
nonparametric_dataset_cfgs, parametric_dataset_cfgs = get_dataset_cfgs()

# --- Non-parametric pipeline ---
# Note: the `_np` suffix on the variables below stands for "non-parametric".
print(f"Configuration Dataset: {nonparametric_dataset_cfgs}")
print("\nStarting preprocessing for non-parametric...")
# preprocess_data is unpacked into six values: train/valid/real data, then their initial values.
train_data_np, valid_data_np, real_data_np, train_init_np, valid_init_np, real_init_np = preprocess_data(nonparametric_dataset_cfgs)

test_non_parametric_preprocessing(train_data_np, valid_data_np, real_data_np, train_init_np, valid_init_np, real_init_np)

print(f"\n{'='*60}")
print("Testing TimeSeriesDataset shuffling and DataLoader batching with initial values")
print(f"{'='*60}")

# The three checks below use the non-parametric train split only.
print("\nTest 1: Dataset shuffling preserves initial value positions")
test_dataset_shuffling_preserves_initial_values(train_data_np, train_init_np, seed=42)

print("\nTest 2: DataLoader batching preserves initial value positions")
test_dataloader_batching_preserves_initial_values(train_data_np, train_init_np, batch_size=32, seed=42)

print("\nTest 3: Price reconstruction from DataLoader batches")
test_dataloader_reconstruction_with_initial_values(train_data_np, train_init_np, batch_size=32, seed=42)

# --- Parametric pipeline ---
print(f"\nConfiguration Dataset: {parametric_dataset_cfgs}")
print("\nStarting preprocessing for parametric...")
train_data_para, valid_data_para, real_data_para, train_init_para, valid_init_para, real_init_para = preprocess_data(parametric_dataset_cfgs)

test_parametric_preprocessing(train_data_para, valid_data_para, real_data_para, train_init_para, valid_init_para, real_init_para)

EXAMPLE 1: Preprocessing for both parametric and non-parametric models
Configuration Dataset: {'ticker': 'AAPL', 'original_data_path': '/Users/eddisonpham/Projects/Unified-benchmark-for-SDGFTS/data/raw/AAPL/AAPL.csv', 'valid_ratio': 0.1, 'test_ratio': 0.1}

Starting preprocessing for non-parametric...
Preprocessing data for AAPL
Desired time series sample length (lag with max PACF >0): 103
PACF at that lag: 0.040741497942971425

Non-Parametric Preprocessing successful!
Data shapes:
  Train shape: torch.Size([8975, 103]) (R_train, l)
  Valid shape: torch.Size([1122, 103]) (R_valid, l)
  Real shape: torch.Size([1122, 103]) (R_real, l)
  Sequence length (l): 103
  Total time-series samples (R): 11219

Log return statistics:
  Train range: [-0.7312, 0.2869] | mean: 0.0006
  Valid range: [-0.1377, 0.1132] | mean: 0.0014
  Real range: [-0.0970, 0.1426] | mean: 0.0005
Initial value unit tests passed for non-parametric dataset.

Testing TimeSeriesDataset shuffling and DataLoader batching with 

## Example 2: PyTorch Dataset and DataLoader Creation

Now let's create PyTorch datasets and dataloaders to verify proper batching and seed support.


In [4]:
# Example 2 driver: build datasets and dataloaders from the non-parametric
# splits produced in Example 1 and print their basic properties.
print("\n" + "=" * 60)
print("EXAMPLE 2: PyTorch Dataset and DataLoader Creation")
print("=" * 60)

print("Creating TimeSeriesDataset objects...")
# NOTE(review): the initial values are passed positionally here, whereas the
# helper functions pass them as `initial_values=` — confirm the second
# positional parameter of TimeSeriesDataset is `initial_values`.
train_dataset = TimeSeriesDataset(train_data_np, train_init_np, seed=42)
valid_dataset = TimeSeriesDataset(valid_data_np, valid_init_np, seed=42)
real_dataset = TimeSeriesDataset(real_data_np, real_init_np, seed=42)

print(f"Type of train_dataset: {type(train_dataset)}")


print(f"Created datasets:")
print(f"  Train dataset length: {len(train_dataset)}")
print(f"  Valid dataset length: {len(valid_dataset)}")
print(f"  Real dataset length: {len(real_dataset)}")
# Each dataset item is indexed as a pair: [0] is the sample, [1] is its initial value.
print(f"  Sample shape: {train_dataset[0][0].shape}")
print(f"  Sample dtype: {train_dataset[0][0].dtype}")
print(f"  Sample initial: {train_dataset[0][1]}")

print(f"\nCreating DataLoaders...")
batch_size = 32
# The same seed is used for all three splits; loading runs in the main process
# (num_workers=0) without pinned memory.
train_loader, valid_loader, real_loader = create_dataloaders(
    train_data_np, valid_data_np, real_data_np,
    batch_size=batch_size,
    train_seed=42,
    valid_seed=42,
    test_seed=42,
    num_workers=0,
    pin_memory=False,
    train_initial=train_init_np,
    valid_initial=valid_init_np,
    test_initial=real_init_np
)

print(f"Created dataloaders:")
print(f"  Train batches: {len(train_loader)}")
print(f"  Valid batches: {len(valid_loader)}")
print(f"  Real batches: {len(real_loader)}")
print(f"  Batch size: {batch_size}")

# Show details for the first three train batches only, then summarise the rest.
print(f"\nBatch Information:")  
for i, (batch, initial) in enumerate(train_loader):
    print(f"Train Batch {i+1}: shape {batch.shape}, dtype {batch.dtype}")
    batch_min = batch.min()
    batch_max = batch.max()
    print(f"  Value range: [{batch_min:.4f}, {batch_max:.4f}]")
    print(f"  Initial values shape: {initial.shape}, dtype: {initial.dtype}")
    if i >= 2:
        print(f"... and {len(train_loader) - 3} more batches")
        break

# Shape check on the first batch of a fresh iterator: (batch_size, sequence length).
# A mismatch is only reported via print; nothing is raised.
first_batch, _ = next(iter(train_loader))
expected_shape = (batch_size, train_data_np.shape[1])
if first_batch.shape == expected_shape:
    print(f"\nBatch shapes are correct: {first_batch.shape} == {expected_shape}")
else:
    print(f"\nBatch shape mismatch: {first_batch.shape} != {expected_shape}")


EXAMPLE 2: PyTorch Dataset and DataLoader Creation
Creating TimeSeriesDataset objects...
Type of train_dataset: <class 'src.utils.preprocessing_utils.TimeSeriesDataset'>
Created datasets:
  Train dataset length: 8975
  Valid dataset length: 1122
  Real dataset length: 1122
  Sample shape: torch.Size([103])
  Sample dtype: torch.float32
  Sample initial: 0.1283479928970337

Creating DataLoaders...
Created dataloaders:
  Train batches: 281
  Valid batches: 36
  Real batches: 36
  Batch size: 32

Batch Information:
Train Batch 1: shape torch.Size([32, 103]), dtype torch.float32
  Value range: [-0.2624, 0.2127]
  Initial values shape: torch.Size([32]), dtype: torch.float64
Train Batch 2: shape torch.Size([32, 103]), dtype torch.float32
  Value range: [-0.7312, 0.1735]
  Initial values shape: torch.Size([32]), dtype: torch.float64
Train Batch 3: shape torch.Size([32, 103]), dtype torch.float32
  Value range: [-0.1390, 0.2127]
  Initial values shape: torch.Size([32]), dtype: torch.float64
.

## Example 3: Reproducible Training with Seed Control

Let's verify that the same seed reproduces the same shuffling order and that different seeds produce different orders.


In [5]:
# Example 3 driver: compare shuffle orders across seeds using the non-parametric
# train split. Results are printed, not asserted.
print("\n" + "=" * 60)
print("EXAMPLE 3: Reproducible Training with Seed Control")
print("=" * 60)

# Two independently constructed datasets with the same seed.
print("Testing reproducibility with same seeds...")
dataset1 = TimeSeriesDataset(train_data_np, train_init_np, seed=42, shuffle=True)
dataset2 = TimeSeriesDataset(train_data_np, train_init_np, seed=42, shuffle=True)

indices1 = dataset1.get_original_indices()
indices2 = dataset2.get_original_indices()

# Only the first 10 indices are compared and shown.
print(f"Datasets with same seed produce identical order: {indices1[:10] == indices2[:10]}")
print(f"  First 10 indices (dataset1): {indices1[:10]}")
print(f"  First 10 indices (dataset2): {indices2[:10]}")

# A third dataset with a different seed.
print(f"\nTesting different seeds produce different orders...")
dataset3 = TimeSeriesDataset(train_data_np, train_init_np, seed=123, shuffle=True)
indices3 = dataset3.get_original_indices()

print(f"Datasets with different seeds produce different order: {indices1[:10] != indices3[:10]}")
print(f"  First 10 indices (seed=42): {indices1[:10]}")
print(f"  First 10 indices (seed=123): {indices3[:10]}")

# Change the seed on an existing dataset. The first slice is taken before
# set_seed(999) so it records the order from the original seed.
print(f"\nTesting dynamic seed changing...")    
original_indices = dataset1.get_original_indices()[:10]
dataset1.set_seed(999)
new_indices = dataset1.get_original_indices()[:10]

print(f"Seed change produces different order: {original_indices != new_indices}")
print(f"  Original (seed=42): {original_indices}")
print(f"  New (seed=999):     {new_indices}")



EXAMPLE 3: Reproducible Training with Seed Control
Testing reproducibility with same seeds...
Datasets with same seed produce identical order: True
  First 10 indices (dataset1): [5060, 6335, 3381, 3940, 970, 2747, 7252, 1548, 7168, 6424]
  First 10 indices (dataset2): [5060, 6335, 3381, 3940, 970, 2747, 7252, 1548, 7168, 6424]

Testing different seeds produce different orders...
Datasets with different seeds produce different order: True
  First 10 indices (seed=42): [5060, 6335, 3381, 3940, 970, 2747, 7252, 1548, 7168, 6424]
  First 10 indices (seed=123): [3478, 8063, 8893, 3825, 2071, 7416, 7652, 4861, 853, 2011]

Testing dynamic seed changing...
Seed change produces different order: True
  Original (seed=42): [5060, 6335, 3381, 3940, 970, 2747, 7252, 1548, 7168, 6424]
  New (seed=999):     [2037, 8148, 1176, 8367, 2838, 5121, 2555, 5939, 976, 6585]
