In [1]:

import sys
import numpy as np
import torch
import mlflow
from pathlib import Path
import time
import json
from datetime import datetime
from typing import Dict, Any

# Dynamically set the project root based on the script's location
project_root = Path().resolve().parents[0]
sys.path.append(str(project_root))

from src.preprocessing.preprocessing import preprocess_data

from src.models.base.base_model import ParametricModel, DeepLearningModel
from src.models.parametric.gbm import GeometricBrownianMotion
from src.models.parametric.ou_process import OrnsteinUhlenbeckProcess
from src.models.parametric.merton_jump_diffusion import MertonJumpDiffusion
from src.models.parametric.garch11 import GARCH11
from src.models.parametric.de_jump_diffusion import DoubleExponentialJumpDiffusion
from src.models.non_parametric.time_gan import TimeGAN

from src.taxonomies.diversity import calculate_icd
from src.taxonomies.efficiency import measure_runtime
from src.taxonomies.fidelity import (
    calculate_mdd, calculate_md, calculate_sdd, calculate_sd, calculate_kd, calculate_acd, visualize_tsne, visualize_distribution
)
from src.taxonomies.stylized_facts import (
    heavy_tails, autocorr_raw, volatility_clustering, long_memory_abs, non_stationarity
)

from src.utils.display_utils import show_with_start_divider, show_with_end_divider
from src.utils.transformations_utils import create_dataloaders



In [4]:

class UnifiedEvaluator:
    """
    Unified evaluator for time series generative models using MLFlow for experiment tracking.
    """
    
    def __init__(self, experiment_name: str = "TimeSeries_Generation_Evaluation"):
        """
        Initialize the evaluator with MLFlow experiment.
        
        Args:
            experiment_name (str): Name of the MLFlow experiment
        """
        self.experiment_name = experiment_name
        mlflow.set_experiment(experiment_name)
        
        # Results storage
        self.results = {}
        self.results_dir = Path("results/evaluation_results") # Updated path
        self.results_dir.mkdir(exist_ok=True)
        
        # Create timestamp for this evaluation run
        self.timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    def evaluate_parametric(self, 
                      model: ParametricModel, 
                      model_name: str,
                      real_data: np.ndarray,
                      train_data,
                      num_generated_samples: int = 500) -> Dict[str, Any]:
        """
        Evaluate a single parametric model across all metrics.
        Args:
            model: The parametric generative model to evaluate
            model_name: Name of the model for logging
            real_data: Real data for comparison (shape: R, L, N)
            train_loader: Not used for parametric models
            num_generated_samples: Number of samples to generate for evaluation
        Returns:
            Dictionary containing all evaluation metrics
        """
        with mlflow.start_run(run_name=f"{model_name}_{self.timestamp}"):
            # Log model name only
            mlflow.log_param("model_name", model_name)
            
            evaluation_results = {}
            
            print(f"Training {model_name}...")
            train_start = time.time()
            model.fit(train_data)
            train_time = time.time() - train_start
            mlflow.log_metric("training_time", train_time)
            evaluation_results["training_time"] = train_time
            
            print(f"Generating {num_generated_samples} samples...")
            gen_time = measure_runtime(model.generate, num_generated_samples)
            mlflow.log_metric("generation_time_500_samples", gen_time)
            evaluation_results["generation_time_500_samples"] = gen_time

            generated_data = model.generate(num_generated_samples, linear_timestamps=True, output_length=125, seed=42)
            print(f"Generated data shape: {generated_data.shape}")
            
            # 3. Diversity Metrics
            print("Computing diversity metrics...")
            diversity_results = self._evaluate_diversity(generated_data)
            evaluation_results.update(diversity_results)
            
            # 4. Fidelity Metrics
            print("Computing fidelity metrics...")
            fidelity_results = self._evaluate_fidelity(real_data, generated_data)
            evaluation_results.update(fidelity_results)
            
            # 5. Stylized Facts (for financial data)
            print("Computing stylized facts...")
            stylized_results = self._evaluate_stylized_facts(real_data, generated_data)
            evaluation_results.update(stylized_results)
            
            # 6. Visual Assessments
            print("Creating visual assessments...")
            self._create_visual_assessments(real_data, generated_data, model_name)
            
            # Log all metrics to MLFlow and save to results dictionary
            for metric_name, value in evaluation_results.items():
                if isinstance(value, (int, float)):
                    mlflow.log_metric(metric_name, value)
                elif isinstance(value, np.ndarray):
                    mlflow.log_metric(f"{metric_name}_mean", float(np.mean(value)))
                    mlflow.log_metric(f"{metric_name}_std", float(np.std(value)))
                # Save all metrics to results dictionary
                self.results[model_name] = evaluation_results

            # Save evaluation results to JSON
            results_path = self.results_dir / f"metrics_{model_name}_{self.timestamp}.json"
            with open(results_path, 'w') as f:
                json.dump(evaluation_results, f, indent=2, default=str)
            mlflow.log_artifact(str(results_path))
            
            # Save model
            model_path = self.results_dir / f"{model_name}_{self.timestamp}"
            model.save_model(str(model_path))
            mlflow.log_artifacts(str(model_path))
            
            # Save synthetic data
            synthetic_path = self.results_dir / f"synthetic_{model_name}_{self.timestamp}.npy"
            np.save(synthetic_path, generated_data)
            mlflow.log_artifact(str(synthetic_path))
            
            print(f"Evaluation completed for {model_name}")
            return evaluation_results
            
    def evaluate_nonparametric(self, 
                      model: DeepLearningModel, 
                      model_name: str,
                      real_data: np.ndarray,
                      train_loader,
                      num_generated_samples: int = 500) -> Dict[str, Any]:
        """
        Evaluate a single model across all metrics.
        
        Args:
            model: The generative model to evaluate
            model_name: Name of the model for logging
            real_data: Real data for comparison (shape: R, L, N)
            train_loader: Training data loader for model fitting
            num_generated_samples: Number of samples to generate for evaluation
            
        Returns:
            Dictionary containing all evaluation metrics
        """
        show_with_start_divider(f"Evaluating {model_name}")
        
        # Start MLFlow run
        with mlflow.start_run(run_name=f"{model_name}_{self.timestamp}"):
            # Log model name only
            mlflow.log_param("model_name", model_name)
            
            evaluation_results = {}
            
            # 1. Train the model
            print(f"Training {model_name}...")
            train_start = time.time()
            model.fit(train_loader)
            train_time = time.time() - train_start
            mlflow.log_metric("training_time", train_time)
            evaluation_results["training_time"] = train_time
            
            # 2. Generate synthetic data
            print(f"Generating {num_generated_samples} samples...")
            # Efficiency metrics: Measure generation time
            gen_time = measure_runtime(model.generate, num_generated_samples)
            mlflow.log_metric("generation_time_500_samples", gen_time)
            evaluation_results["generation_time_500_samples"] = gen_time

            # Actually generate the synthetic data
            generated_data = model.generate(num_generated_samples)
            
            # Convert to numpy if needed
            if torch.is_tensor(generated_data):
                generated_data = generated_data.detach().cpu().numpy()
            
            # 3. Diversity Metrics
            print("Computing diversity metrics...")
            diversity_results = self._evaluate_diversity(generated_data)
            evaluation_results.update(diversity_results)
            
            # 4. Fidelity Metrics
            print("Computing fidelity metrics...")
            fidelity_results = self._evaluate_fidelity(real_data, generated_data)
            evaluation_results.update(fidelity_results)
            
            # 5. Stylized Facts (for financial data)
            print("Computing stylized facts...")
            stylized_results = self._evaluate_stylized_facts(real_data, generated_data)
            evaluation_results.update(stylized_results)
            
            # 6. Visual Assessments
            print("Creating visual assessments...")
            self._create_visual_assessments(real_data, generated_data, model_name)
            
            # Log all metrics to MLFlow and save to results dictionary
            for metric_name, value in evaluation_results.items():
                if isinstance(value, (int, float)):
                    mlflow.log_metric(metric_name, value)
                elif isinstance(value, np.ndarray):
                    mlflow.log_metric(f"{metric_name}_mean", float(np.mean(value)))
                    mlflow.log_metric(f"{metric_name}_std", float(np.std(value)))
                # Save all metrics to results dictionary
                self.results[model_name] = evaluation_results

            # Save evaluation results to JSON
            results_path = self.results_dir / f"metrics_{model_name}_{self.timestamp}.json"
            with open(results_path, 'w') as f:
                json.dump(evaluation_results, f, indent=2, default=str)
            mlflow.log_artifact(str(results_path))
            
            # Save model
            model_path = self.results_dir / f"{model_name}_{self.timestamp}"
            model.save_model(str(model_path))
            mlflow.log_artifacts(str(model_path))
            
            # Save synthetic data
            synthetic_path = self.results_dir / f"synthetic_{model_name}_{self.timestamp}.npy"
            np.save(synthetic_path, generated_data)
            mlflow.log_artifact(str(synthetic_path))
            
            print(f"Evaluation completed for {model_name}")
            return evaluation_results
    
    def _evaluate_diversity(self, synthetic_data: np.ndarray) -> Dict[str, float]:
        """Evaluate diversity metrics."""
        results = {}
        
        # Intra-Class Distance with Euclidean metric
        icd_euclidean = calculate_icd(synthetic_data, metric="euclidean")
        results["icd_euclidean"] = icd_euclidean
        
        # Intra-Class Distance with DTW metric
        icd_dtw = calculate_icd(synthetic_data, metric="dtw")
        results["icd_dtw"] = icd_dtw
        
        return results
    
    def _evaluate_fidelity(self, real_data: np.ndarray, synthetic_data: np.ndarray) -> Dict[str, float]:
        """Evaluate fidelity metrics."""
        results = {}
        
        # Feature-based metrics
        results["mdd"] = calculate_mdd(real_data, synthetic_data)
        results["md"] = calculate_md(real_data, synthetic_data)
        results["sdd"] = calculate_sdd(real_data, synthetic_data)
        results["sd"] = calculate_sd(real_data, synthetic_data)
        results["kd"] = calculate_kd(real_data, synthetic_data)
        results["acd"] = calculate_acd(real_data, synthetic_data)
        
        return results
    
    def _evaluate_stylized_facts(self, real_data: np.ndarray, synthetic_data: np.ndarray) -> Dict[str, Any]:
        """Evaluate stylized facts for financial data."""
        results = {}
        
        try:
            # Heavy tails (excess kurtosis)
            heavy_tails_real = heavy_tails(real_data)
            heavy_tails_synth = heavy_tails(synthetic_data)
            results["heavy_tails_real"] = heavy_tails_real.tolist()
            results["heavy_tails_synth"] = heavy_tails_synth.tolist()
            results["heavy_tails_diff"] = np.abs(heavy_tails_real - heavy_tails_synth).tolist()
            
            # Autocorrelation of raw returns
            autocorr_real = autocorr_raw(real_data)
            autocorr_synth = autocorr_raw(synthetic_data)
            results["autocorr_raw_real"] = autocorr_real.tolist()
            results["autocorr_raw_synth"] = autocorr_synth.tolist()
            results["autocorr_raw_diff"] = np.abs(autocorr_real - autocorr_synth).tolist()
            
            # Volatility clustering
            vol_clust_real = volatility_clustering(real_data)
            vol_clust_synth = volatility_clustering(synthetic_data)
            results["volatility_clustering_real"] = vol_clust_real.tolist()
            results["volatility_clustering_synth"] = vol_clust_synth.tolist()
            results["volatility_clustering_diff"] = np.abs(vol_clust_real - vol_clust_synth).tolist()
            
            # Long memory in absolute returns
            long_mem_real = long_memory_abs(real_data)
            long_mem_synth = long_memory_abs(synthetic_data)
            results["long_memory_real"] = long_mem_real.tolist()
            results["long_memory_synth"] = long_mem_synth.tolist()
            results["long_memory_diff"] = np.abs(long_mem_real - long_mem_synth).tolist()
            
            # Non-stationarity
            nonstat_real = non_stationarity(real_data)
            nonstat_synth = non_stationarity(synthetic_data)
            results["non_stationarity_real"] = nonstat_real.tolist()
            results["non_stationarity_synth"] = nonstat_synth.tolist()
            results["non_stationarity_diff"] = np.abs(nonstat_real - nonstat_synth).tolist()
            
        except Exception as e:
            print(f"Warning: Stylized facts evaluation failed: {e}")
            results["stylized_facts_error"] = str(e)
        
        return results
    
    def _create_visual_assessments(self, real_data: np.ndarray, synthetic_data: np.ndarray, model_name: str):
        """Create visual assessment plots."""
        try:
            # Create model-specific results directory
            model_results_dir = self.results_dir / f"visualizations_{model_name}_{self.timestamp}"
            model_results_dir.mkdir(exist_ok=True)
            
            # t-SNE visualization
            visualize_tsne(real_data, synthetic_data, str(model_results_dir), model_name)
            
            # Distribution visualization
            visualize_distribution(real_data, synthetic_data, str(model_results_dir), model_name)
            
            # Log visualizations to MLFlow
            mlflow.log_artifacts(str(model_results_dir))
            
        except Exception as e:
            print(f"Warning: Visual assessment failed: {e}")
    
    def run_complete_evaluation(self, 
                              nonpara_cfg: Dict[str, Any],
                              para_cfg: Dict[str, Any],
                              models_config: Dict[str, Any],
                              num_samples: int = 500) -> Dict[str, Any]:
        """
        Run complete evaluation on all models with 500 generated samples per model.
        
        Args:
            dataset_config: Configuration for data preprocessing
            models_config: Configuration for models
            num_samples: Number of samples to generate for evaluation
            
        Returns:
            Dictionary containing results for all models
        """
        show_with_start_divider("Starting Complete Evaluation Pipeline")
        
        # Separate preprocessing for parametric and non-parametric models
        print("Preprocessing data for non-parametric models...")
        train_data_np, valid_data_np = preprocess_data(nonpara_cfg)
        train_data_para, valid_data_para = preprocess_data(para_cfg)
        
        # Create data loaders for non-parametric models
        batch_size = 32
        train_loader, valid_loader = create_dataloaders(
            train_data_np, valid_data_np,
            batch_size=batch_size,
            train_seed=42,
            valid_seed=123,
            num_workers=0,
            pin_memory=False
        )
        
        # Get data dimensions
        num_samples_real, length, num_channels = train_data_np.shape
        print(f"Non-parametric data shape: {train_data_np.shape}")
        print(f"Parametric data shape: {train_data_para.shape}")
        
        # Initialize models
        models = {}
        
        # Parametric models
        models["GBM"] = GeometricBrownianMotion(length=length, num_channels=num_channels)
        models["OU_Process"] = OrnsteinUhlenbeckProcess(length=length, num_channels=num_channels)
        models["Merton_Jump_Diffusion"] = MertonJumpDiffusion(length=length, num_channels=num_channels)
        models["GARCH11"] = GARCH11(length=length, num_channels=num_channels)
        models["Double_Exponential_Jump_Diffusion"] = DoubleExponentialJumpDiffusion(length=length, num_channels=num_channels)

        # Non-parametric models
        models["TimeGAN"] = TimeGAN(
            l=length,
            N=num_channels,
            latent_dim=models_config.get("latent_dim", 64),
            hidden_dim=models_config.get("hidden_dim", 128),
            lr=models_config.get("lr", 0.0002)
        )
        
        # Evaluate each model
        all_results = {}
        for model_name, model in models.items():
            try:
                if isinstance(model, ParametricModel):
                    results = self.evaluate_parametric(
                        model=model,
                        model_name=model_name,
                        real_data=valid_data_para,  # Parametric data
                        train_data=train_data_para,  # Training data for parametric models
                        num_generated_samples=num_samples
                    )
                else:
                    results = self.evaluate_nonparametric(
                        model=model,
                        model_name=model_name,
                        real_data=valid_data_np,  # Non-parametric data
                        train_loader=train_loader,  # DataLoader for non-parametric models
                        num_generated_samples=num_samples
                    )
                all_results[model_name] = results
            except Exception as e:
                print(f"Error evaluating {model_name}: {e}")
                all_results[model_name] = {"error": str(e)}
        
        # Save comprehensive results for all models
        results_file = self.results_dir / f"complete_evaluation_{self.timestamp}.json"
        with open(results_file, 'w') as f:
            json.dump(self.results, f, indent=2, default=str)
        mlflow.log_artifact(str(results_file))
        
        show_with_end_divider("EVALUATION COMPLETE")
        print(f"Results saved to: {results_file}")
        print(f"MLFlow experiment: {self.experiment_name}")
        
        return all_results
def main():
    """Main function to run the evaluation pipeline."""
    nonpara_ds_cfg = {
        'original_data_path': str(project_root / 'data' / 'raw' / 'GOOG' / 'GOOG.csv'),
        'valid_ratio': 0.2,
        'do_normalization': True,
        'seed': 42
    }
    
    para_ds_cfg = {
        'original_data_path': str(project_root / 'data' / 'raw' / 'GOOG' / 'GOOG.csv'),
        'is_parametric': True,
        'seed': 42
    }
    
    # Configuration for models
    TimeGAN_cfg = {
        'latent_dim': 64,
        'hidden_dim': 128,
        'lr': 0.0002,
        'n_critic': 5,
        'clip_value': 0.01
    }
    
    # Initialize evaluator
    evaluator = UnifiedEvaluator(experiment_name="TimeSeries_Generation_Comprehensive_Evaluation")
    
    # Run complete evaluation
    results = evaluator.run_complete_evaluation(
        para_cfg=para_ds_cfg,
        nonpara_cfg=nonpara_ds_cfg,
        models_config=TimeGAN_cfg,
        num_samples=500
    )
    
    # Print summary
    show_with_start_divider("EVALUATION SUMMARY") # Using utility function
    
    for model_name, model_results in results.items():
        print(f"\n{model_name}:")
        if "error" in model_results:
            print(f"  Error: {model_results['error']}")
        else:
            print(f"  Training Time: {model_results.get('training_time', 'N/A'):.2f}s")
            print(f"  Generation Time (500 samples): {model_results.get('generation_time_500_samples', 'N/A'):.4f}s")
            print(f"  MDD: {model_results.get('mdd', 'N/A'):.4f}")
            print(f"  MD: {model_results.get('md', 'N/A'):.4f}")
            print(f"  SDD: {model_results.get('sdd', 'N/A'):.4f}")
            print(f"  ICD (Euclidean): {model_results.get('icd_euclidean', 'N/A'):.4f}")
            print(f"  ICD (DTW): {model_results.get('icd_dtw', 'N/A'):.4f}")

In [5]:
if __name__ == "__main__":
    main()


Starting Complete Evaluation Pipeline
Preprocessing data for non-parametric models...
Data preprocessing with settings:{'original_data_path': 'C:\\Users\\14165\\SWE_Work\\Unified-benchmark-for-SDGFTS\\data\\raw\\GOOG\\GOOG.csv', 'valid_ratio': 0.2, 'do_normalization': True, 'seed': 42}
Data shape: (1131, 125, 5)
Preprocessing done.

Data preprocessing with settings:{'original_data_path': 'C:\\Users\\14165\\SWE_Work\\Unified-benchmark-for-SDGFTS\\data\\raw\\GOOG\\GOOG.csv', 'is_parametric': True, 'seed': 42}
Data shape: (1255, 5)
Non-parametric data shape: (904, 125, 5)
Parametric data shape: torch.Size([1129, 5])


  df['Date'] = df['Date'].view('int64') // 10**9
  df['Date'] = df['Date'].view('int64') // 10**9


Training GBM...
Generating 500 samples...
Generated data shape: torch.Size([500, 125, 5])
Computing diversity metrics...
Error evaluating GBM: 'Tensor' object has no attribute 'astype'
Training OU_Process...
Generating 500 samples...
Generated data shape: torch.Size([500, 125, 5])
Computing diversity metrics...


KeyboardInterrupt: 