In [None]:

import sys
import numpy as np
import torch
import mlflow
from pathlib import Path
import time
import json
from datetime import datetime
from typing import Dict, Any
import yaml
from abc import ABC, abstractmethod
import inspect

# Take the parent of the current working directory as the project root and put
# it on sys.path so the `src.*` imports below can be resolved.
# NOTE(review): this assumes the kernel's working directory sits exactly one
# level below the repository root — confirm for other launch locations.
project_root = Path().resolve().parents[0]
sys.path.append(str(project_root))

from src.models.base.base_model import ParametricModel, DeepLearningModel
from src.models.parametric.gbm import GeometricBrownianMotion
from src.models.parametric.ou_process import OUProcess
from src.models.parametric.merton_jump_diffusion import MertonJumpDiffusion
from src.models.parametric.garch11 import GARCH11
from src.models.parametric.de_jump_diffusion import DoubleExponentialJumpDiffusion
from src.models.non_parametric.block_bootstrap import BlockBootstrap
from src.models.non_parametric.time_gan import TimeGAN
from src.models.non_parametric.quant_gan import QuantGAN
from src.models.non_parametric.time_vae import TimeVAE
from src.models.non_parametric.sig_wgan import SigWGAN

from src.utils.display_utils import show_with_start_divider, show_with_end_divider
from src.utils.preprocessing_utils import create_dataloaders, preprocess_data, LogReturnTransformation
from src.utils.configs_utils import get_dataset_cfgs
from src.utils.evaluation_classes_utils import (
    TaxonomyEvaluator,
    DiversityEvaluator,
    FidelityEvaluator,
    RuntimeEvaluator,
    StylizedFactsEvaluator,
    VisualAssessmentEvaluator
)
import pandas as pd

In [2]:
class UnifiedEvaluator:
    """
    Unified evaluator that sets up an MLFlow experiment and evaluates generative models.

    Each model is fitted, timed, sampled and scored inside its own MLFlow run;
    metrics are additionally written as JSON under
    ``<project_root>/results/evaluation_<timestamp>/``.
    """

    def __init__(
        self,
        experiment_name: str,
        parametric_dataset_cfgs: Dict[str, Any]
    ):
        """
        Initialize the evaluator with MLFlow experiment.

        Args:
            experiment_name (str): Name of the MLFlow experiment
            parametric_dataset_cfgs (Dict[str, Any]): Configuration for parametric dataset
        """
        self.parametric_dataset_cfgs = parametric_dataset_cfgs

        self.experiment_name = experiment_name
        mlflow.set_experiment(experiment_name)

        self.results = {}
        self.timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        self.results_dir = project_root / "results" / f"evaluation_{self.timestamp}"
        self.results_dir.mkdir(parents=True, exist_ok=True)

    @staticmethod
    def _to_numpy(data) -> np.ndarray:
        """Return ``data`` as a NumPy array, detaching it first if it is a torch object."""
        # The type-name check avoids a hard dependency on torch being importable here.
        if "torch" in str(type(data)):
            return data.detach().cpu().numpy()
        return np.asarray(data)

    @staticmethod
    def _sample_real_windows(real_data: np.ndarray, window_length: int, max_windows: int, seed=None) -> np.ndarray:
        """
        Cut a 2D series into overlapping windows and return a random subset of them.

        Args:
            real_data: Array of shape (length, num_channels).
            window_length: Number of time steps per window.
            max_windows: Upper bound on the number of windows returned.
            seed: Seed for the window selection; ``None`` gives a non-deterministic draw.

        Returns:
            Array of shape (k, window_length, num_channels) with
            ``k = min(length - window_length + 1, max_windows)``.

        Raises:
            ValueError: If ``real_data`` is not 2D or ``window_length`` does not fit.
        """
        if real_data.ndim != 2:
            raise ValueError(f"real_data must be 2D, got shape {real_data.shape}")
        length, num_channels = real_data.shape
        if window_length < 1 or window_length > length:
            raise ValueError(
                f"window_length must be in [1, {length}], got {window_length}"
            )

        # The view has shape (num_windows, 1, window_length, num_channels). Index the
        # singleton axis explicitly: squeeze() would also drop the channel axis for
        # single-channel data and the window axis when only one window exists.
        windows = np.lib.stride_tricks.sliding_window_view(
            real_data, (window_length, num_channels)
        )[:, 0]

        # Pick the indices first so that only the selected windows are copied.
        rng = np.random.default_rng(seed)
        chosen = rng.permutation(windows.shape[0])[:max_windows]
        return windows[chosen]

    @staticmethod
    def _log_metric(metric_name: str, metric_score, num_channels: int) -> None:
        """
        Log one evaluator result to the active MLFlow run.

        Array-like scores are logged as ``<name>_mean`` / ``<name>_std`` and, when
        they are 1D with one entry per channel, also as ``<name>_ch<i>``. Scalar
        scores (Python or NumPy) are logged under ``metric_name``. Anything else
        (e.g. strings, dicts, empty or non-numeric arrays) is skipped.
        """
        if isinstance(metric_score, (np.ndarray, list, tuple)):
            values = np.asarray(metric_score)
            # Only bool/int/uint/float arrays can be reduced to a float metric.
            if values.size == 0 or values.dtype.kind not in "biuf":
                return
            if values.ndim == 1 and values.shape[0] == num_channels:  # Per-channel vector
                for ch, val in enumerate(values):
                    mlflow.log_metric(f"{metric_name}_ch{ch}", float(val))
            mlflow.log_metric(f"{metric_name}_mean", float(np.mean(values)))
            mlflow.log_metric(f"{metric_name}_std", float(np.std(values)))
        elif isinstance(metric_score, (int, float, np.integer, np.floating)):
            # np.float32 / np.int64 are not subclasses of float / int, so they
            # must be listed explicitly or they would be dropped silently.
            mlflow.log_metric(metric_name, float(metric_score))

    def evaluate_model(
        self,
        model,
        model_name: str,
        real_data: np.ndarray,
        train_data,
        generation_kwargs: Dict[str, Any] = None
    ) -> Dict[str, Any]:
        """
        Fit, sample from and evaluate a single generative model inside one MLFlow run.

        Args:
            model: The generative model to evaluate; must expose ``fit`` and ``generate``
            model_name: Name of the model for logging
            real_data: Real data for comparison. A 2D (length, channels) input is cut
                into windows matching the generated sequence length.
            train_data: Training data passed to ``model.fit``
            generation_kwargs: Optional kwargs for model.generate(). ``num_samples``
                (default 500) bounds the number of real windows; ``seed``, if present,
                makes the window selection reproducible.

        Returns:
            Dictionary containing the runtime results and all evaluation metrics
        """
        show_with_start_divider(f"Evaluating {model_name}")
        generation_kwargs = generation_kwargs or {}
        num_samples = generation_kwargs.get('num_samples', 500)

        with mlflow.start_run(run_name=f"{model_name}_{self.timestamp}"):
            mlflow.log_param("model_name", model_name)

            print(f"Training {model_name}...")
            model.fit(train_data)

            print(f"\nGenerating {num_samples} samples...")
            runtime_evaluator = RuntimeEvaluator(
                generate_func=model.generate,
                generation_kwargs=generation_kwargs
            )
            runtime_results = runtime_evaluator.evaluate()
            runtime_key = f"generation_time_{num_samples}_samples"
            mlflow.log_metric(runtime_key, runtime_results[runtime_key])

            # Keep the runtime metrics in the saved/returned results instead of
            # collecting them in a dict that is thrown away.
            all_results: Dict[str, Any] = dict(runtime_results)

            generated_data = self._to_numpy(model.generate(**generation_kwargs))
            real_data = self._to_numpy(real_data)

            if real_data.ndim == 2:
                real_data = self._sample_real_windows(
                    real_data,
                    window_length=generated_data.shape[1],
                    max_windows=num_samples,
                    seed=generation_kwargs.get('seed'),
                )

            print(f"Generated data shape: {generated_data.shape}")
            print(f"Real data shape: {real_data.shape}")

            model_dir = self.results_dir / model_name
            model_dir.mkdir(parents=True, exist_ok=True)

            # Perform evaluation on all channels at once
            # Combined evaluators for all channels, no loop over individual channels
            evaluators = [
                FidelityEvaluator(real_data, generated_data),
                DiversityEvaluator(real_data, generated_data),
                StylizedFactsEvaluator(real_data, generated_data),
                VisualAssessmentEvaluator(real_data, generated_data, model_dir, self.timestamp)
            ]

            num_channels = real_data.shape[-1]
            for evaluator in evaluators:
                print(f"Computing {evaluator.__class__.__name__}...")
                # Only the visual evaluator takes the model name.
                if isinstance(evaluator, VisualAssessmentEvaluator):
                    results = evaluator.evaluate(model_name)
                else:
                    results = evaluator.evaluate()
                if results is None:
                    continue
                all_results.update(results)
                for metric_name, metric_score in results.items():
                    self._log_metric(metric_name, metric_score, num_channels)

            metrics_path = model_dir / "metrics.json"
            with open(metrics_path, 'w') as f:
                json.dump(all_results, f, indent=2, default=str)

            mlflow.log_artifact(str(metrics_path))
            print(f"Evaluation completed for {model_name} (results saved at {metrics_path}).")

            return all_results

    def run_complete_evaluation(
        self,
        num_samples: int = 500,
        seed: int = 42,
        seq_length: int = 13,
        block_size: int = 13
    ) -> Dict[str, Any]:
        """
        Run the complete evaluation on all configured models.

        Args:
            num_samples: Number of samples to generate per model
            seed: Seed forwarded to ``model.generate`` and to real-window sampling
            seq_length: Sequence length requested from ``model.generate``
            block_size: Block size used for the BlockBootstrap model

        Returns:
            Dictionary containing results for all models, keyed by model name
        """
        show_with_start_divider("Starting Complete Evaluation Pipeline")
        train_data_para, valid_data_para, test_data_para = preprocess_data(self.parametric_dataset_cfgs)
        length, num_channels = train_data_para.shape
        print(f"  - Parametric data shape: {train_data_para.shape}")

        models = {
            "GBM": GeometricBrownianMotion(length=length, num_channels=num_channels),
            "OU_Process": OUProcess(length=length, num_channels=num_channels),
            "MJD": MertonJumpDiffusion(length=length, num_channels=num_channels),
            "GARCH11": GARCH11(length=length, num_channels=num_channels),
            "DEJD": DoubleExponentialJumpDiffusion(length=length, num_channels=num_channels),
            "BlockBootstrap": BlockBootstrap(block_size=block_size),
        }

        # Forward the caller's seed; it used to be ignored in favour of a literal 42.
        generation_kwargs = {'num_samples': num_samples, 'seq_length': seq_length, 'seed': seed}

        all_results = {}
        for model_name, model in models.items():
            all_results[model_name] = self.evaluate_model(
                model=model,
                model_name=model_name,
                real_data=valid_data_para,
                train_data=train_data_para,
                generation_kwargs=generation_kwargs
            )

        results_file = self.results_dir / "complete_evaluation.json"
        with open(results_file, 'w') as f:
            json.dump(all_results, f, indent=2, default=str)

        # log_artifact with no active run makes MLFlow start one implicitly and
        # leave it open, which breaks the next start_run() in this kernel. Use an
        # explicit, properly closed summary run instead.
        with mlflow.start_run(run_name=f"complete_evaluation_{self.timestamp}"):
            mlflow.log_artifact(str(results_file))

        show_with_end_divider("EVALUATION COMPLETE")
        print(f"Results saved to: {results_file}")
        print(f"MLFlow experiment: {self.experiment_name}")

        return all_results

def main():
    """Build a UnifiedEvaluator from the parametric dataset config and run the full pipeline."""
    # get_dataset_cfgs() yields two configs; only the second (parametric) one is used here.
    _unused_cfgs, parametric_cfgs = get_dataset_cfgs()

    pipeline = UnifiedEvaluator(
        experiment_name="TimeSeries_Generation_Comprehensive_Evaluation",
        parametric_dataset_cfgs=parametric_cfgs,
    )
    pipeline.run_complete_evaluation(num_samples=500)

In [3]:
# Entry point: start the evaluation pipeline when this cell/module is executed
# as `__main__`.
if __name__ == "__main__":
    main()

Starting Complete Evaluation Pipeline
Preprocessing data for AAPL
  - Parametric data shape: torch.Size([9044, 4])
Evaluating GBM
Training GBM...

Generating 500 samples...
Generated data shape: (500, 13, 4)
Real data shape: (500, 13, 4)
Computing FidelityEvaluator...
Computing DiversityEvaluator...
Computing StylizedFactsEvaluator...
Computing VisualAssessmentEvaluator...
[t-SNE] Computing 121 nearest neighbors...
[t-SNE] Indexed 1000 samples in 0.007s...
[t-SNE] Computed neighbors for 1000 samples in 0.098s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1000
[t-SNE] Mean sigma: 0.016523




[t-SNE] KL divergence after 100 iterations with early exaggeration: 63.460220
[t-SNE] KL divergence after 300 iterations: 1.956183
Evaluation completed for GBM (results saved at C:\Users\14165\Downloads\Unified-benchmark-for-SDGFTS-main\results\evaluation_20251028_202025\GBM\metrics.json).
Evaluating OU_Process
Training OU_Process...

Generating 500 samples...
Generated data shape: (500, 13, 4)
Real data shape: (500, 13, 4)
Computing FidelityEvaluator...
Computing DiversityEvaluator...
Computing StylizedFactsEvaluator...
Computing VisualAssessmentEvaluator...
[t-SNE] Computing 121 nearest neighbors...
[t-SNE] Indexed 1000 samples in 0.003s...
[t-SNE] Computed neighbors for 1000 samples in 0.097s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1000
[t-SNE] Mean sigma: 0.016440




[t-SNE] KL divergence after 50 iterations with early exaggeration: 63.404530
[t-SNE] KL divergence after 300 iterations: 1.962538
Evaluation completed for OU_Process (results saved at C:\Users\14165\Downloads\Unified-benchmark-for-SDGFTS-main\results\evaluation_20251028_202025\OU_Process\metrics.json).
Evaluating MJD
Training MJD...

Generating 500 samples...
Generated data shape: (500, 13, 4)
Real data shape: (500, 13, 4)
Computing FidelityEvaluator...
Computing DiversityEvaluator...
Computing StylizedFactsEvaluator...
Computing VisualAssessmentEvaluator...
[t-SNE] Computing 121 nearest neighbors...
[t-SNE] Indexed 1000 samples in 0.003s...
[t-SNE] Computed neighbors for 1000 samples in 0.095s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1000
[t-SNE] Mean sigma: 0.016020




[t-SNE] KL divergence after 50 iterations with early exaggeration: 63.473103
[t-SNE] KL divergence after 300 iterations: 1.954886
Evaluation completed for MJD (results saved at C:\Users\14165\Downloads\Unified-benchmark-for-SDGFTS-main\results\evaluation_20251028_202025\MJD\metrics.json).
Evaluating GARCH11
Training GARCH11...

Generating 500 samples...
Generated data shape: (500, 13, 4)
Real data shape: (500, 13, 4)
Computing FidelityEvaluator...
Computing DiversityEvaluator...
Computing StylizedFactsEvaluator...
Computing VisualAssessmentEvaluator...
[t-SNE] Computing 121 nearest neighbors...
[t-SNE] Indexed 1000 samples in 0.003s...
[t-SNE] Computed neighbors for 1000 samples in 0.095s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1000
[t-SNE] Mean sigma: 0.012429




[t-SNE] KL divergence after 50 iterations with early exaggeration: 63.968811
[t-SNE] KL divergence after 300 iterations: 1.995350
Evaluation completed for GARCH11 (results saved at C:\Users\14165\Downloads\Unified-benchmark-for-SDGFTS-main\results\evaluation_20251028_202025\GARCH11\metrics.json).
Evaluating DEJD
Training DEJD...

Generating 500 samples...


  log_returns = torch.tensor(log_returns, device=self.device)


Generated data shape: (500, 13, 4)
Real data shape: (500, 13, 4)
Computing FidelityEvaluator...
Computing DiversityEvaluator...
Computing StylizedFactsEvaluator...
Computing VisualAssessmentEvaluator...
[t-SNE] Computing 121 nearest neighbors...
[t-SNE] Indexed 1000 samples in 0.003s...
[t-SNE] Computed neighbors for 1000 samples in 0.096s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1000
[t-SNE] Mean sigma: 0.014801




[t-SNE] KL divergence after 50 iterations with early exaggeration: 63.592873
[t-SNE] KL divergence after 300 iterations: 1.853360
Evaluation completed for DEJD (results saved at C:\Users\14165\Downloads\Unified-benchmark-for-SDGFTS-main\results\evaluation_20251028_202025\DEJD\metrics.json).
Evaluating BlockBootstrap
Training BlockBootstrap...

Generating 500 samples...
Generated data shape: (500, 13, 4)
Real data shape: (500, 13, 4)
Computing FidelityEvaluator...
Computing DiversityEvaluator...
Computing StylizedFactsEvaluator...
Computing VisualAssessmentEvaluator...
[t-SNE] Computing 121 nearest neighbors...
[t-SNE] Indexed 1000 samples in 0.003s...
[t-SNE] Computed neighbors for 1000 samples in 0.097s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1000
[t-SNE] Mean sigma: 0.015564




[t-SNE] KL divergence after 50 iterations with early exaggeration: 63.278942
[t-SNE] KL divergence after 300 iterations: 1.943496
Evaluation completed for BlockBootstrap (results saved at C:\Users\14165\Downloads\Unified-benchmark-for-SDGFTS-main\results\evaluation_20251028_202025\BlockBootstrap\metrics.json).
EVALUATION COMPLETE

Results saved to: C:\Users\14165\Downloads\Unified-benchmark-for-SDGFTS-main\results\evaluation_20251028_202025\complete_evaluation.json
MLFlow experiment: TimeSeries_Generation_Comprehensive_Evaluation
