# Model Validation Notebook

This notebook provides a comprehensive validation of all implemented time series generative models (parametric and non-parametric). It covers the entire pipeline from data preprocessing to model training and synthetic data generation, ensuring that each model functions as expected and produces output in the desired format `(R, l, N)`, i.e. (number of generated samples, sequence length, number of channels).

## Table of Contents:
1.  [Setup and Imports](#Setup-and-Imports)
2.  [Data Preprocessing](#Data-Preprocessing)
3.  [Parametric Model Validation](#Parametric-Model-Validation)
    *   [Geometric Brownian Motion](#Geometric-Brownian-Motion)
    *   [Ornstein-Uhlenbeck Process](#Ornstein-Uhlenbeck-Process)
4.  [Non-Parametric Model Validation](#Non-Parametric-Model-Validation)
    *   [Vanilla GAN](#Vanilla-GAN)
    *   [Wasserstein GAN](#Wasserstein-GAN)



In [1]:
import sys
import os
import numpy as np
import torch
from pathlib import Path

# Repository root = parent of the current working directory.
# NOTE: this relies on the notebook being launched from the `notebooks/` folder,
# which sits directly under the project root.
project_root = Path().resolve().parents[0]

# Register the root only once, so re-running this cell does not keep appending
# duplicate entries to sys.path.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

print(f"Project root added to sys.path: {project_root}")

# Project-local imports. These come after project_root has been appended to
# sys.path above (presumably the `src` package lives under that root — confirm).

# Preprocessing helpers. Only preprocess_data and create_dataloaders are called
# in the cells visible in this notebook.
from src.preprocessing.preprocessing import (
    preprocess_data, 
    load_preprocessed_data,
    create_dataset_from_preprocessed,
)
from src.preprocessing.transformers import (
    TimeSeriesDataset,
    create_dataloaders
)

# Model base classes (not referenced directly in the visible cells) and the
# two parametric models validated below.
from src.models.base.base_model import BaseGenerativeModel, ParametricModel, DeepLearningModel
from src.models.parametric.gbm import GeometricBrownianMotion
from src.models.parametric.ou_process import OrnsteinUhlenbeckProcess

# GAN-based (non-parametric) models validated below.
from src.models.non_parametric.vanilla_gan import VanillaGAN
from src.models.non_parametric.wasserstein_gan import WassersteinGAN

# Reached only if every import above succeeded.
print("All necessary modules imported successfully!")

Project root added to sys.path: C:\Users\14165\Downloads\Unified-benchmark-for-SDGFTS-main
All necessary modules imported successfully!


## Data Preprocessing

This section demonstrates how to preprocess a sample dataset (`GOOG.csv`) using the provided utilities and create PyTorch `DataLoader` objects. The data is prepared twice: a normalized version, which is used below to train the GAN models, and an unnormalized version, which is used to fit the parametric models.



In [2]:
# Folder under the current working directory that receives the preprocessed test files.
TEST_OUTPUT_FOLDER = str(Path().resolve().joinpath('test_data_output'))

In [3]:
# --- Normalized Data ---
# Settings passed to preprocess_data. Key order is kept as-is because the whole
# dict is echoed in the log line below.
config_goog_norm = dict(
    original_data_path=str(project_root.joinpath('data', 'raw', 'GOOG', 'GOOG.csv')),
    output_ori_path=TEST_OUTPUT_FOLDER,
    dataset_name='test_normalized',
    valid_ratio=0.1,
    do_normalization=True,
    seed=42,
)

print(f"Preprocessing normalized data with config: {config_goog_norm}")

# Produces the train / validation arrays for the normalized variant.
train_data_norm_np, valid_data_norm_np = preprocess_data(config_goog_norm)

# batch_size is module-level on purpose: the unnormalized cell reuses it.
batch_size = 32
loader_options = dict(
    batch_size=batch_size,
    train_seed=42,
    valid_seed=123,
    num_workers=0,
    pin_memory=False,
)
train_loader_norm, valid_loader_norm = create_dataloaders(
    train_data_norm_np, valid_data_norm_np, **loader_options
)

# Summary of array shapes and batch counts for both splits.
print(
    f"\n[Normalized] Train data shape: {train_data_norm_np.shape}",
    f"[Normalized] Valid data shape: {valid_data_norm_np.shape}",
    f"[Normalized] Number of training batches: {len(train_loader_norm)}",
    f"[Normalized] Number of validation batches: {len(valid_loader_norm)}",
    sep="\n",
)

# The training array is 3-D; its last two axes define the dimensions every
# model below is instantiated with.
num_samples_real, length, num_channels = train_data_norm_np.shape
print(f"\n[Normalized] Inferred model output dimensions: length={length}, num_channels={num_channels}")



Preprocessing normalized data with config: {'original_data_path': 'C:\\Users\\14165\\Downloads\\Unified-benchmark-for-SDGFTS-main\\data\\raw\\GOOG\\GOOG.csv', 'output_ori_path': 'C:\\Users\\14165\\Downloads\\Unified-benchmark-for-SDGFTS-main\\notebooks\\test_data_output', 'dataset_name': 'test_normalized', 'valid_ratio': 0.1, 'do_normalization': True, 'seed': 42}
Data preprocessing with settings:{'original_data_path': 'C:\\Users\\14165\\Downloads\\Unified-benchmark-for-SDGFTS-main\\data\\raw\\GOOG\\GOOG.csv', 'output_ori_path': 'C:\\Users\\14165\\Downloads\\Unified-benchmark-for-SDGFTS-main\\notebooks\\test_data_output', 'dataset_name': 'test_normalized', 'valid_ratio': 0.1, 'do_normalization': True, 'seed': 42}
Data shape: (1132, 125, 5)


Preprocessing done. Preprocessed files saved to C:\Users\14165\Downloads\Unified-benchmark-for-SDGFTS-main\notebooks\test_data_output\test_normalized.


[Normalized] Train data shape: (1018, 125, 5)
[Normalized] Valid data shape: (114, 125, 5)
[Normalized] Number of training batches: 32
[Normalized] Number of validation batches: 4

[Normalized] Inferred model output dimensions: length=125, num_channels=5


In [4]:
# --- Unnormalized Data ---
# Same source file and split settings as the normalized run; only the
# normalization flag and the output dataset name differ.
config_goog_unnorm = {
    'original_data_path': str(project_root / 'data' / 'raw' / 'GOOG' / 'GOOG.csv'),
    'output_ori_path': TEST_OUTPUT_FOLDER,
    'dataset_name': 'test_unnormalized',
    'valid_ratio': 0.1,
    'do_normalization': False,
    'seed': 42
}

print(f"\nPreprocessing unnormalized data with config: {config_goog_unnorm}")

train_data_unnorm_np, valid_data_unnorm_np = preprocess_data(config_goog_unnorm)

# `length` and `num_channels` were inferred from the normalized arrays, but the
# parametric models are built with them and fitted on the unnormalized loaders.
# Fail early if the two variants ever disagree on the per-sample shape.
assert train_data_unnorm_np.shape[1:] == (length, num_channels), \
    f"Unnormalized data shape mismatch. Expected (*, {length}, {num_channels}), got {train_data_unnorm_np.shape}"

# `batch_size` is reused from the normalized-data cell so both variants are batched alike.
train_loader_unnorm, valid_loader_unnorm = create_dataloaders(
    train_data_unnorm_np, valid_data_unnorm_np,
    batch_size=batch_size,
    train_seed=42,
    valid_seed=123,
    num_workers=0,
    pin_memory=False
)

print(f"\n[Unnormalized] Train data shape: {train_data_unnorm_np.shape}")
print(f"[Unnormalized] Valid data shape: {valid_data_unnorm_np.shape}")
print(f"[Unnormalized] Number of training batches: {len(train_loader_unnorm)}")
print(f"[Unnormalized] Number of validation batches: {len(valid_loader_unnorm)}")



Preprocessing unnormalized data with config: {'original_data_path': 'C:\\Users\\14165\\Downloads\\Unified-benchmark-for-SDGFTS-main\\data\\raw\\GOOG\\GOOG.csv', 'output_ori_path': 'C:\\Users\\14165\\Downloads\\Unified-benchmark-for-SDGFTS-main\\notebooks\\test_data_output', 'dataset_name': 'test_unnormalized', 'valid_ratio': 0.1, 'do_normalization': False, 'seed': 42}
Data preprocessing with settings:{'original_data_path': 'C:\\Users\\14165\\Downloads\\Unified-benchmark-for-SDGFTS-main\\data\\raw\\GOOG\\GOOG.csv', 'output_ori_path': 'C:\\Users\\14165\\Downloads\\Unified-benchmark-for-SDGFTS-main\\notebooks\\test_data_output', 'dataset_name': 'test_unnormalized', 'valid_ratio': 0.1, 'do_normalization': False, 'seed': 42}
Data shape: (1132, 125, 5)


Preprocessing done. Preprocessed files saved to C:\Users\14165\Downloads\Unified-benchmark-for-SDGFTS-main\notebooks\test_data_output\test_unnormalized.


[Unnormalized] Train data shape: (1018, 125, 5)
[Unnormalized] Valid data shape: (114, 125, 5)
[Unnormalized] Number of training batches: 32
[Unnormalized] Number of validation batches: 4


## Parametric Model Validation

This section validates the functionality of each parametric time series generative model. For each model, we will:
1.  Instantiate the model with the sequence length and number of channels inferred during preprocessing.
2.  Fit the model on the unnormalized training data.
3.  Generate new synthetic time series samples.
4.  Verify the shape of the generated data and report its basic statistics (min, max, mean).



### Geometric Brownian Motion



In [5]:
print("\n" + "=" * 50)
print("Validating Geometric Brownian Motion (GBM)")
print("=" * 50)

# Seed the global RNGs so a fresh "Restart & Run All" reproduces the samples below.
# NOTE(review): assumes the model draws from the NumPy and/or PyTorch global
# generators — confirm against the GBM implementation.
np.random.seed(42)
torch.manual_seed(42)

# Instantiate GBM model
gbm_model = GeometricBrownianMotion(length=length, num_channels=num_channels)
print(f"GBM Model instantiated: {gbm_model}")

# Fit the model (parametric models are fitted on the unnormalized data)
print("Fitting GBM model...")
gbm_model.fit(train_loader_unnorm)
print(f"GBM model parameters after fitting: mu={gbm_model.mu.data}, sigma={gbm_model.sigma.data}")

# Generate samples
num_generated_samples = 100
gbm_generated_data = gbm_model.generate(num_generated_samples)
print(f"Generated GBM data shape: {gbm_generated_data.shape}")

# Validation checks: expected (R, l, N) layout first, then numerical sanity.
assert gbm_generated_data.shape == (num_generated_samples, length, num_channels), \
    f"GBM: Generated data shape mismatch. Expected ({num_generated_samples}, {length}, {num_channels}), got {gbm_generated_data.shape}"
print("GBM: Generated data shape is correct.")

# A correct shape says nothing about the values; reject NaN / Inf explicitly.
assert torch.isfinite(gbm_generated_data).all().item(), \
    "GBM: Generated data contains NaN or Inf values."
print("GBM: Generated data contains only finite values.")

print(f"GBM: Generated data min: {gbm_generated_data.min():.4f}, max: {gbm_generated_data.max():.4f}, mean: {gbm_generated_data.mean():.4f}")




Validating Geometric Brownian Motion (GBM)
GBM Model instantiated: <src.models.parametric.gbm.GeometricBrownianMotion object at 0x00000288D170DBB0>
Fitting GBM model...
GBM model parameters after fitting: mu=tensor([-3.7506e-06, -3.6300e-06, -3.7506e-06, -3.6532e-06,  5.7218e-07]), sigma=tensor([0.0349, 0.0338, 0.0343, 0.0350, 0.3321])
Generated GBM data shape: torch.Size([100, 125, 5])
GBM: Generated data shape is correct.
GBM: Generated data min: 0.4420, max: 2.2903, mean: 0.9966


### Ornstein-Uhlenbeck Process



In [6]:
print("\n" + "=" * 50)
print("Validating Ornstein-Uhlenbeck (O-U) Process")
print("=" * 50)

# Seed the global RNGs so a fresh "Restart & Run All" reproduces the samples below.
# NOTE(review): assumes the model draws from the NumPy and/or PyTorch global
# generators — confirm against the O-U implementation.
np.random.seed(42)
torch.manual_seed(42)

# Instantiate O-U model
ou_model = OrnsteinUhlenbeckProcess(length=length, num_channels=num_channels)
print(f"O-U Model instantiated: {ou_model}")

# Fit the model (parametric models are fitted on the unnormalized data)
print("Fitting O-U model...")
ou_model.fit(train_loader_unnorm)
# NOTE(review): a previously recorded run of this cell printed theta=10 and sigma=1
# for every channel, which looks like default or clamped values rather than fitted
# ones — confirm that the fit on unnormalized data behaves as intended.
print(f"O-U model parameters after fitting: theta={ou_model.theta.data}, mu={ou_model.mu.data}, sigma={ou_model.sigma.data}")

# Generate samples
num_generated_samples = 100
ou_generated_data = ou_model.generate(num_generated_samples)
print(f"Generated O-U data shape: {ou_generated_data.shape}")

# Validation checks: expected (R, l, N) layout first, then numerical sanity.
assert ou_generated_data.shape == (num_generated_samples, length, num_channels), \
    f"O-U: Generated data shape mismatch. Expected ({num_generated_samples}, {length}, {num_channels}), got {ou_generated_data.shape}"
print("O-U: Generated data shape is correct.")

# A correct shape says nothing about the values; reject NaN / Inf explicitly.
assert torch.isfinite(ou_generated_data).all().item(), \
    "O-U: Generated data contains NaN or Inf values."
print("O-U: Generated data contains only finite values.")

print(f"O-U: Generated data min: {ou_generated_data.min():.4f}, max: {ou_generated_data.max():.4f}, mean: {ou_generated_data.mean():.4f}")




Validating Ornstein-Uhlenbeck (O-U) Process
O-U Model instantiated: <src.models.parametric.ou_process.OrnsteinUhlenbeckProcess object at 0x00000288D198E060>
Fitting O-U model...
O-U model parameters after fitting: theta=tensor([10., 10., 10., 10., 10.]), mu=tensor([1.3685e+02, 1.3843e+02, 1.3541e+02, 1.3693e+02, 2.4590e+07]), sigma=tensor([1., 1., 1., 1., 1.])
Generated O-U data shape: torch.Size([100, 125, 5])
O-U: Generated data shape is correct.
O-U: Generated data min: 134.5321, max: 24590500.0000, mean: 4918209.5000


## Non-Parametric Model Validation

This section validates the functionality of each non-parametric (GAN-based) time series generative model. For each model, we will:
1.  Instantiate the model with appropriate parameters.
2.  Train the model using the preprocessed training data.
3.  Generate new synthetic time series samples.
4.  Verify the shape and basic statistics of the generated data.

Note: GAN training can be unstable, and a short validation run like this one does not guarantee convergence. The goal here is only to check that the code executes and that the output has the expected format.



### Vanilla GAN



In [9]:
print("\n" + "=" * 50)
print("Validating Vanilla GAN")
print("=" * 50)

# Seed before instantiation: the networks' weights are randomly initialised when
# the model is constructed, so seeding later would not make the run reproducible.
np.random.seed(42)
torch.manual_seed(42)

# Instantiate Vanilla GAN model
vanilla_gan_model = VanillaGAN(length=length, num_channels=num_channels, latent_dim=64, hidden_dim=128, lr=0.0002)
print(f"Vanilla GAN Model instantiated: {vanilla_gan_model}")

# Fit the model (GANs are trained on the normalized data)
# A small number of epochs keeps this validation quick; adjust as needed.
vanilla_gan_num_epochs = 50
print("Fitting Vanilla GAN model (this may take a while)...")
vanilla_gan_model.fit(train_loader_norm, num_epochs=vanilla_gan_num_epochs)
print("Vanilla GAN model fitting complete.")

# Generate samples
num_generated_samples = 100
vanilla_gan_generated_data = vanilla_gan_model.generate(num_generated_samples)
print(f"Generated Vanilla GAN data shape: {vanilla_gan_generated_data.shape}")

# Validation checks: expected (R, l, N) layout first, then numerical sanity.
assert vanilla_gan_generated_data.shape == (num_generated_samples, length, num_channels), \
    f"Vanilla GAN: Generated data shape mismatch. Expected ({num_generated_samples}, {length}, {num_channels}), got {vanilla_gan_generated_data.shape}"
print("Vanilla GAN: Generated data shape is correct.")

# Unstable GAN training can diverge to NaN / Inf while still producing the right shape.
assert torch.isfinite(vanilla_gan_generated_data).all().item(), \
    "Vanilla GAN: Generated data contains NaN or Inf values."
print("Vanilla GAN: Generated data contains only finite values.")

print(f"Vanilla GAN: Generated data min: {vanilla_gan_generated_data.min():.4f}, max: {vanilla_gan_generated_data.max():.4f}, mean: {vanilla_gan_generated_data.mean():.4f}")




Validating Vanilla GAN
Vanilla GAN Model instantiated: VanillaGAN(
  (generator): Generator(
    (model): Sequential(
      (0): Linear(in_features=64, out_features=128, bias=True)
      (1): LeakyReLU(negative_slope=0.2)
      (2): Linear(in_features=128, out_features=256, bias=True)
      (3): LeakyReLU(negative_slope=0.2)
      (4): Linear(in_features=256, out_features=625, bias=True)
    )
  )
  (discriminator): Discriminator(
    (model): Sequential(
      (0): Linear(in_features=625, out_features=256, bias=True)
      (1): LeakyReLU(negative_slope=0.2)
      (2): Linear(in_features=256, out_features=128, bias=True)
      (3): LeakyReLU(negative_slope=0.2)
      (4): Linear(in_features=128, out_features=1, bias=True)
      (5): Sigmoid()
    )
  )
  (bce_loss): BCELoss()
)
Fitting Vanilla GAN model (this may take a while)...
Vanilla GAN model fitting complete.
Generated Vanilla GAN data shape: torch.Size([100, 125, 5])
Vanilla GAN: Generated data shape is correct.
Vanilla GAN: Ge

### Wasserstein GAN



In [10]:
print("\n" + "=" * 50)
print("Validating Wasserstein GAN")
print("=" * 50)

# Seed before instantiation: the networks' weights are randomly initialised when
# the model is constructed, so seeding later would not make the run reproducible.
np.random.seed(42)
torch.manual_seed(42)

# Instantiate Wasserstein GAN model
wasserstein_gan_model = WassersteinGAN(length=length, num_channels=num_channels, latent_dim=64, hidden_dim=128, lr=0.00005, n_critic=5, clip_value=0.01)
print(f"Wasserstein GAN Model instantiated: {wasserstein_gan_model}")

# Fit the model (GANs are trained on the normalized data)
# A small number of epochs keeps this validation quick; adjust as needed.
wasserstein_gan_num_epochs = 50
print("Fitting Wasserstein GAN model (this may take a while)...")
wasserstein_gan_model.fit(train_loader_norm, num_epochs=wasserstein_gan_num_epochs)
print("Wasserstein GAN model fitting complete.")

# Generate samples
num_generated_samples = 100
wasserstein_gan_generated_data = wasserstein_gan_model.generate(num_generated_samples)
print(f"Generated Wasserstein GAN data shape: {wasserstein_gan_generated_data.shape}")

# Validation checks: expected (R, l, N) layout first, then numerical sanity.
assert wasserstein_gan_generated_data.shape == (num_generated_samples, length, num_channels), \
    f"Wasserstein GAN: Generated data shape mismatch. Expected ({num_generated_samples}, {length}, {num_channels}), got {wasserstein_gan_generated_data.shape}"
print("Wasserstein GAN: Generated data shape is correct.")

# Unstable GAN training can diverge to NaN / Inf while still producing the right shape.
assert torch.isfinite(wasserstein_gan_generated_data).all().item(), \
    "Wasserstein GAN: Generated data contains NaN or Inf values."
print("Wasserstein GAN: Generated data contains only finite values.")

print(f"Wasserstein GAN: Generated data min: {wasserstein_gan_generated_data.min():.4f}, max: {wasserstein_gan_generated_data.max():.4f}, mean: {wasserstein_gan_generated_data.mean():.4f}")




Validating Wasserstein GAN
Wasserstein GAN Model instantiated: WassersteinGAN(
  (generator): Generator(
    (model): Sequential(
      (0): Linear(in_features=64, out_features=128, bias=True)
      (1): LeakyReLU(negative_slope=0.2)
      (2): Linear(in_features=128, out_features=256, bias=True)
      (3): LeakyReLU(negative_slope=0.2)
      (4): Linear(in_features=256, out_features=625, bias=True)
    )
  )
  (critic): Critic(
    (model): Sequential(
      (0): Linear(in_features=625, out_features=256, bias=True)
      (1): LeakyReLU(negative_slope=0.2)
      (2): Linear(in_features=256, out_features=128, bias=True)
      (3): LeakyReLU(negative_slope=0.2)
      (4): Linear(in_features=128, out_features=1, bias=True)
    )
  )
)
Fitting Wasserstein GAN model (this may take a while)...
Wasserstein GAN model fitting complete.
Generated Wasserstein GAN data shape: torch.Size([100, 125, 5])
Wasserstein GAN: Generated data shape is correct.
Wasserstein GAN: Generated data min: -0.2017, 