# Benchmark

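> Benchmark specifications, a training context for accessing dataset subsets, and a runner that trains and evaluates a model on a benchmark.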

In [None]:
#| default_exp benchmark

In [None]:
#| export
import itertools
import os
import random
import time
from pathlib import Path
from typing import List, Optional, Callable, Dict, Any, Iterator, Tuple

import h5py
import numpy as np

import identibench.metrics
from identibench.utils import get_default_data_root,_load_sequences_from_files

In [None]:
#| hide
import shutil
import time # For testing modification times
from fastcore.test import test_eq, test_ne# Import nbdev testing functions
from identibench.utils import _dummy_dataset_loader

## Benchmark Specifications

In [None]:
#| exporti
# Forward declaration needed for type hint in BenchmarkSpec.test_func
class BenchmarkSpec:
    """Empty placeholder so annotations written before the real class can name `BenchmarkSpec`."""

In [None]:
#| exporti
def _test_simulation(specs, model):
    """
    Score `model` on every `*.hdf5` file in the dataset's `test` directory.

    For each sequence produced by `_load_sequences_from_files`, the model is called on the
    input array and the result is compared to the output array with `specs.metric_func`.

    Args:
        specs: Object providing `dataset_path`, `u_cols`, `y_cols`, `x_cols`, `metric_func` and `name`.
        model: Callable taking the input array of one sequence and returning a prediction.

    Returns:
        Dict with the single key `'metric_score'`: the mean of the per-sequence scores,
        or NaN when no sequence produced a score.

    Raises:
        RuntimeError: If the `test` directory contains no `*.hdf5` files.
    """
    test_root = specs.dataset_path / 'test'
    hdf5_paths = sorted(test_root.glob('*.hdf5'))  # sorted for a deterministic file order
    if len(hdf5_paths) == 0:
        raise RuntimeError(f"No test files found in {test_root}")

    sequences = _load_sequences_from_files(hdf5_paths, specs.u_cols, specs.y_cols, specs.x_cols)
    scores = []
    for inputs, targets, _states in sequences:
        predictions = model(inputs)
        scores.append(specs.metric_func(targets, predictions))

    if scores:
        # .item() converts the numpy scalar into a plain Python number
        return {'metric_score': np.mean(scores).item()}

    print(f"Warning: No valid scores calculated for benchmark {specs.name}.")
    return {'metric_score': np.nan}

In [None]:
#| exporti
def _test_prediction(specs: BenchmarkSpec, model: Callable):
    """
    Evaluate `model` on the `*.hdf5` files of the dataset's `test` directory and average the metric.

    NOTE(review): this performs the same steps as `_test_simulation`; `init_window`,
    `pred_horizon` and `pred_step` of the spec are not read here.

    Args:
        specs: Benchmark specification supplying the dataset path, column names, metric and name.
        model: Callable invoked with the input array of each test sequence.

    Returns:
        `{'metric_score': value}` where value is the mean per-sequence score, or NaN if
        no score was computed.

    Raises:
        RuntimeError: If no `*.hdf5` file exists in the `test` directory.
    """
    test_dir = specs.dataset_path / 'test'
    files = list(test_dir.glob('*.hdf5'))
    files.sort()  # deterministic evaluation order
    if not files:
        raise RuntimeError(f"No test files found in {test_dir}")

    per_sequence_scores = []
    loader = _load_sequences_from_files(files, specs.u_cols, specs.y_cols, specs.x_cols)
    for u_seq, y_seq, _ in loader:
        y_hat = model(u_seq)
        per_sequence_scores.append(specs.metric_func(y_seq, y_hat))

    if per_sequence_scores:
        mean_score = np.mean(per_sequence_scores).item()  # plain Python scalar
    else:
        mean_score = np.nan
        print(f"Warning: No valid scores calculated for benchmark {specs.name}.")
    return {'metric_score': mean_score}

In [None]:
#| export
class BenchmarkSpec:
    """
    Describes one standardized benchmark task on a dataset.

    Stores the dataset identifier, the signal column names, the evaluation metric,
    the test callables and optional windowing / prediction settings. The spec only
    holds these values; how they are used is up to the benchmark runner or the
    supplied test functions.
    """
    # Explicit __init__ for nbdev documentation compatibility
    def __init__(self, 
                 name: str, # Unique name of this benchmark task (e.g., 'silverbox_sim_rmse').
                 dataset_id: str, # Identifier of the raw dataset (e.g., 'silverbox'); used as the subdirectory name under the data root.
                 u_cols: List[str], # Column names of the input signals (u).
                 y_cols: List[str], # Column names of the output signals (y).
                 metric_func: Callable[[np.ndarray, np.ndarray], float], # Primary evaluation metric, called as `func(y_true, y_pred)`.
                 x_cols: Optional[List[str]] = None, # Optional column names of the state signals (x).
                 sampling_time: Optional[float] = None, # Optional constant sampling time of the dataset in seconds.
                 download_func: Optional[Callable[[Path, bool], None]] = None, # Downloads/prepares the raw dataset, called as `func(save_path, force_download)`.
                 test_sim_func: Optional[Callable[[BenchmarkSpec, Callable[[np.ndarray], np.ndarray]], Dict[str, Any]]] = _test_simulation, # Simulation test routine, called as `func(spec, model)`, returns a dict of results.
                 test_pred_func: Optional[Callable[[BenchmarkSpec, Callable[[np.ndarray], np.ndarray]], Dict[str, Any]]] = _test_prediction, # Prediction test routine with the same call signature as `test_sim_func`.
                 init_window: Optional[int] = None, # Number of initial steps that may be used for model initialization.
                 pred_horizon: Optional[int] = None, # The 'k' of k-step ahead prediction, for benchmark functions that perform prediction.
                 pred_step: int = 1, # Step size for k-step ahead prediction, for benchmark functions that perform prediction.
                 data_root_func: Callable[[], Path] = get_default_data_root # Returns the root directory under which datasets are stored.
                ):
        # Identification and data layout
        self.name = name
        self.dataset_id = dataset_id
        self.u_cols = u_cols
        self.y_cols = y_cols
        self.metric_func = metric_func
        self.x_cols = x_cols
        self.sampling_time = sampling_time
        # Callables for obtaining and testing the data
        self.download_func = download_func
        self.test_sim_func = test_sim_func
        self.test_pred_func = test_pred_func
        # Windowing / prediction settings
        self.init_window = init_window
        self.pred_horizon = pred_horizon
        self.pred_step = pred_step
        self.data_root_func = data_root_func

    @property
    def data_root(self) -> Path:
        """Data root directory, obtained by calling `data_root_func` on every access."""
        root = self.data_root_func()
        return root

    @property
    def dataset_path(self) -> Path:
        """Directory of this dataset: `data_root / dataset_id`."""
        root = self.data_root
        return root / self.dataset_id

    def ensure_dataset_exists(self, force_download: bool = False) -> None:
        """
        Make sure the dataset directory is present, invoking `download_func` when needed.

        Without a `download_func` only warnings are printed. Otherwise the download
        function is called when the dataset directory is missing or when
        `force_download` is set. Any exception from the download function is reported
        and re-raised.

        Args:
            force_download: If True, run the download function even if the directory exists.
        """
        target_dir = self.dataset_path
        fetch = self.download_func

        if fetch is None:
            print(f"Warning: No download function specified for benchmark '{self.name}'. Cannot ensure data exists at {target_dir}")
            # Still report a missing directory, even though nothing can be downloaded
            if not target_dir.is_dir():
                print(f"Warning: Dataset directory {target_dir} not found.")
            return

        dataset_exists = target_dir.is_dir()
        if dataset_exists and not force_download:
            # Nothing to do: data already present and no refresh requested
            return

        print(f"Dataset for '{self.name}' {'not found' if not dataset_exists else 'download forced'}. Preparing dataset at {target_dir}...")
        self.data_root.mkdir(parents=True, exist_ok=True)
        try:
            fetch(target_dir, force_download)
            print(f"Dataset '{self.name}' prepared successfully.")
        except Exception as e:
            print(f"Error preparing dataset '{self.name}': {e}")
            raise

In [None]:
# Setup shared for BenchmarkSpec Tests
# Temporary directory (relative to the working directory) used as data root by the tests below.
_test_data_dir_spec = Path('./_temp_identibench_data_spec_test')
shutil.rmtree(_test_data_dir_spec, ignore_errors=True) # Clean before tests
# Passed as `data_root_func` so the specs under test resolve paths inside the temporary directory.
def _get_test_data_root_spec(): return _test_data_dir_spec

In [None]:
# Test: BenchmarkSpec basic initialization and defaults
# Only the required arguments plus download/data-root functions are given;
# the optional windowing parameters must keep their declared defaults.
_spec_default = BenchmarkSpec(
    name='_spec_default', dataset_id='_dummy_default',
    u_cols=['u0'], y_cols=['y0'], metric_func=identibench.metrics.rmse, 
    download_func=_dummy_dataset_loader, 
    data_root_func=_get_test_data_root_spec
)
test_eq(_spec_default.init_window, None)
test_eq(_spec_default.pred_horizon, None)
test_eq(_spec_default.pred_step, 1)
test_eq(_spec_default.name, '_spec_default') 

In [None]:
# Test: BenchmarkSpec initialization with prediction-related parameters
# Explicit values for init_window / pred_horizon / pred_step must be stored unchanged.
_spec_pred_params = BenchmarkSpec(
    name='_spec_pred_params', dataset_id='_dummy_pred_params',
    u_cols=['u0'], y_cols=['y0'], metric_func=identibench.metrics.rmse, 
    download_func=_dummy_dataset_loader, 
    init_window=20, pred_horizon=5, pred_step=2,
    data_root_func=_get_test_data_root_spec
)
test_eq(_spec_pred_params.init_window, 20)
test_eq(_spec_pred_params.pred_horizon, 5)
test_eq(_spec_pred_params.pred_step, 2)

In [None]:
# Test: BenchmarkSpec ensure_dataset_exists - first call (creation)
# The data root was removed during setup, so this call has to run the download function.
_spec_ensure = BenchmarkSpec(
    name='_spec_ensure', dataset_id='_dummy_ensure',
    u_cols=['u0'], y_cols=['y0'], metric_func=identibench.metrics.rmse, 
    download_func=_dummy_dataset_loader, 
    data_root_func=_get_test_data_root_spec
)
_spec_ensure.ensure_dataset_exists()
# Kept at module level: the following cells reuse this path.
_dataset_path_ensure = _spec_ensure.dataset_path
test_eq(_dataset_path_ensure.is_dir(), True)
# The dummy loader is expected to have written at least this training file.
test_eq((_dataset_path_ensure / 'train' / 'train_0.hdf5').is_file(), True)

Dataset for '_spec_ensure' not found. Preparing dataset at _temp_identibench_data_spec_test/_dummy_ensure...
Dataset '_spec_ensure' prepared successfully.


In [None]:
# Test: BenchmarkSpec ensure_dataset_exists - second call (skip)
# The dataset directory now exists, so the call must not touch the files:
# the modification time of a known file has to stay the same.
_mtime_before_skip = (_dataset_path_ensure / 'train' / 'train_0.hdf5').stat().st_mtime
# Short pause so that a rewrite would presumably show up as a different mtime.
time.sleep(0.1) 
_spec_ensure.ensure_dataset_exists() 
_mtime_after_skip = (_dataset_path_ensure / 'train' / 'train_0.hdf5').stat().st_mtime
test_eq(_mtime_before_skip, _mtime_after_skip)

In [None]:
# Test: BenchmarkSpec ensure_dataset_exists - third call (force_download=True)
# Forcing the download must run the download function again, which is detected
# here through a changed modification time of a known file.
_mtime_before_force = (_dataset_path_ensure / 'train' / 'train_0.hdf5').stat().st_mtime
# Short pause so the rewritten file presumably gets a distinguishable mtime.
time.sleep(0.1) 
_spec_ensure.ensure_dataset_exists(force_download=True) 
_mtime_after_force = (_dataset_path_ensure / 'train' / 'train_0.hdf5').stat().st_mtime
test_ne(_mtime_before_force, _mtime_after_force)

Dataset for '_spec_ensure' download forced. Preparing dataset at _temp_identibench_data_spec_test/_dummy_ensure...
Dataset '_spec_ensure' prepared successfully.


## Training Context

In [None]:
#| export
class TrainingContext:
    """
    Bundle handed to the user's model-building function.

    Carries the benchmark specification, the hyperparameter dictionary and the seed, and
    offers accessors for the training / validation sequences stored as `*.hdf5` files in
    the dataset's subset directories. Sequences are returned as provided by
    `_load_sequences_from_files`; any windowing or batching is left to the caller.
    """
    # Explicit __init__ for nbdev documentation compatibility
    def __init__(self, 
                 spec: BenchmarkSpec, # Benchmark specification this context belongs to.
                 hyperparameters: Dict[str, Any], # User-supplied dictionary of model and training hyperparameters.
                 seed: Optional[int] = None # Optional random seed; only stored here.
                ):
        self.spec = spec
        self.hyperparameters = hyperparameters
        self.seed = seed

    # --- Data Access Methods ---

    def _get_file_paths(self, subset: str) -> List[Path]:
        """Return the sorted `*.hdf5` files of `subset`, or an empty list if the directory is missing."""
        subset_dir = self.spec.dataset_path / subset
        return sorted(subset_dir.glob('*.hdf5')) if subset_dir.is_dir() else []

    def _get_sequences_from_subset(self, subset: str
                                  ) -> Iterator[Tuple[np.ndarray, np.ndarray, Optional[np.ndarray]]]:
        """Return an iterator over the sequences of `subset`; warns and returns an empty iterator if no files exist."""
        paths = self._get_file_paths(subset)
        if paths:
            return _load_sequences_from_files(
                file_paths=paths,
                u_cols=self.spec.u_cols,
                y_cols=self.spec.y_cols,
                x_cols=self.spec.x_cols,
            )

        # The warning is printed immediately, not when the iterator is consumed.
        print(f"Warning: No HDF5 files found in {self.spec.dataset_path / subset}. Returning empty iterator.")
        return iter([])

    def get_train_sequences(self) -> Iterator[Tuple[np.ndarray, np.ndarray, Optional[np.ndarray]]]:
        """Iterator over the (u, y, x) tuples of the 'train' subset."""
        return self._get_sequences_from_subset('train')

    def get_valid_sequences(self) -> Iterator[Tuple[np.ndarray, np.ndarray, Optional[np.ndarray]]]:
        """Iterator over the (u, y, x) tuples of the 'valid' subset."""
        return self._get_sequences_from_subset('valid')

    def get_train_valid_sequences(self) -> Iterator[Tuple[np.ndarray, np.ndarray, Optional[np.ndarray]]]:
        """
        Iterator over the (u, y, x) tuples for combined training and validation data.

        If the 'train_valid' subset directory contains files, only those are used.
        Otherwise the 'train' sequences are followed by the 'valid' sequences.
        """
        combined_files = self._get_file_paths('train_valid')
        if not combined_files:
            # Fallback: concatenate the two separate subsets, train first.
            return itertools.chain(
                self._get_sequences_from_subset('train'),
                self._get_sequences_from_subset('valid'),
            )

        return _load_sequences_from_files(
            file_paths=combined_files,
            u_cols=self.spec.u_cols,
            y_cols=self.spec.y_cols,
            x_cols=self.spec.x_cols,
        )

In [None]:
# Setup shared for TrainingContext Tests
# Two separate temporary data roots: one for a dataset without a 'train_valid'
# directory and one for a dataset with it.
_test_data_dir_ctx = Path('./_temp_identibench_data_ctx_test')
_test_data_dir_ctx_tv = Path('./_temp_identibench_data_ctx_tv_test') 
# Remove leftovers from earlier runs so the datasets are created fresh.
shutil.rmtree(_test_data_dir_ctx, ignore_errors=True)
shutil.rmtree(_test_data_dir_ctx_tv, ignore_errors=True)
# Data-root providers handed to the specs below via `data_root_func`.
def _get_test_data_root_ctx(): return _test_data_dir_ctx
def _get_test_data_root_ctx_tv(): return _test_data_dir_ctx_tv

In [None]:
# Create base dummy data (no train_valid dir) 
# The lambda adapts the `(path, force)` download signature to the dummy loader's keyword arguments.
_dummy_spec_ctx_base = BenchmarkSpec(
    name='_dummy_ctx_base', dataset_id='_dummy_dataset_ctx_base',
    u_cols=['u0', 'u1'], y_cols=['y0'],metric_func=identibench.metrics.rmse, 
    download_func=lambda p, f: _dummy_dataset_loader(p, force_download=f, create_train_valid_dir=False),
    data_root_func=_get_test_data_root_ctx, init_window=10
)
# Generates the dataset on disk; later test cells read from it.
_dummy_spec_ctx_base.ensure_dataset_exists()

Dataset for '_dummy_ctx_base' not found. Preparing dataset at _temp_identibench_data_ctx_test/_dummy_dataset_ctx_base...
Dataset '_dummy_ctx_base' prepared successfully.


In [None]:
# Create dummy data WITH train_valid dir 
# Same spec as the base variant except for the data root and `create_train_valid_dir=True`.
_dummy_spec_ctx_tv = BenchmarkSpec(
    name='_dummy_ctx_tv', dataset_id='_dummy_dataset_ctx_tv',
    u_cols=['u0', 'u1'], y_cols=['y0'],metric_func=identibench.metrics.rmse, 
    download_func=lambda p, f: _dummy_dataset_loader(p, force_download=f, create_train_valid_dir=True),
    data_root_func=_get_test_data_root_ctx_tv, init_window=10
)
# Generates the dataset on disk; later test cells read from it.
_dummy_spec_ctx_tv.ensure_dataset_exists()

Dataset for '_dummy_ctx_tv' not found. Preparing dataset at _temp_identibench_data_ctx_tv_test/_dummy_dataset_ctx_tv...
Dataset '_dummy_ctx_tv' prepared successfully.


In [None]:
#| hide
# Shared constants for tests
# Expected values for the assertions below; presumably they mirror what
# `_dummy_dataset_loader` generates -- verify against that helper.
_seq_len_ctx = 50 
_n_files_train_valid_ctx = 2 
_n_files_tv_dir_ctx = 1 
_hyperparams_ctx = {'lr': 0.01, 'hidden': 64}
_seed_ctx = 42

# %% ../nbs/benchmark.ipynb 16
# Test: TrainingContext initialization
# The constructor must store its three arguments unchanged.
_ctx = TrainingContext(spec=_dummy_spec_ctx_base, hyperparameters=_hyperparams_ctx, seed=_seed_ctx)
test_eq(_ctx.spec, _dummy_spec_ctx_base)
test_eq(_ctx.hyperparameters, _hyperparams_ctx)
test_eq(_ctx.seed, _seed_ctx)

# %% ../nbs/benchmark.ipynb 17
# Test: TrainingContext get_train_sequences
_ctx = TrainingContext(spec=_dummy_spec_ctx_base, hyperparameters=_hyperparams_ctx, seed=_seed_ctx)
# Materialize the iterator so the number of sequences can be counted.
_train_sequences = list(_ctx.get_train_sequences())
test_eq(len(_train_sequences), _n_files_train_valid_ctx) 
_u_train, _y_train, _x_train = _train_sequences[0]
# Arrays are expected as (sequence length, number of columns).
test_eq(_u_train.shape, (_seq_len_ctx, len(_dummy_spec_ctx_base.u_cols))) 
test_eq(_y_train.shape, (_seq_len_ctx, len(_dummy_spec_ctx_base.y_cols)))
# The spec defines no x_cols, so no state array is expected.
test_eq(_x_train, None)
test_eq(_u_train.dtype, np.float32)

# %% ../nbs/benchmark.ipynb 18
# Test: TrainingContext get_valid_sequences
_ctx = TrainingContext(spec=_dummy_spec_ctx_base, hyperparameters=_hyperparams_ctx, seed=_seed_ctx)
_valid_sequences = list(_ctx.get_valid_sequences())
test_eq(len(_valid_sequences), _n_files_train_valid_ctx)
_u_valid, _y_valid, _x_valid = _valid_sequences[0]
test_eq(_u_valid.shape, (_seq_len_ctx, len(_dummy_spec_ctx_base.u_cols)))

# %% ../nbs/benchmark.ipynb 19
# Test: TrainingContext get_train_valid_sequences - fallback (no train_valid dir)
_ctx_tv_fallback = TrainingContext(spec=_dummy_spec_ctx_base, hyperparameters=_hyperparams_ctx, seed=_seed_ctx)
_tv_sequences_fallback = list(_ctx_tv_fallback.get_train_valid_sequences())
# Fallback chains 'train' and 'valid', so both file counts add up.
test_eq(len(_tv_sequences_fallback), _n_files_train_valid_ctx + _n_files_train_valid_ctx)
# Index 0 is the first 'train' sequence ...
_u_tv_fb_train, _y_tv_fb_train, _ = _tv_sequences_fallback[0] 
test_eq(_u_tv_fb_train.shape[0], _seq_len_ctx)
# ... and the first 'valid' sequence follows after all 'train' sequences.
_u_tv_fb_valid, _y_tv_fb_valid, _ = _tv_sequences_fallback[_n_files_train_valid_ctx] 
test_eq(_u_tv_fb_valid.shape[0], _seq_len_ctx)

# %% ../nbs/benchmark.ipynb 20
# Test: TrainingContext get_train_valid_sequences - direct (train_valid dir exists)
_ctx_tv_direct = TrainingContext(spec=_dummy_spec_ctx_tv, hyperparameters=_hyperparams_ctx, seed=_seed_ctx)
_tv_sequences_direct = list(_ctx_tv_direct.get_train_valid_sequences())
# Only the files of the 'train_valid' directory are expected here.
test_eq(len(_tv_sequences_direct), _n_files_tv_dir_ctx)
_u_tv_direct, _y_tv_direct, _ = _tv_sequences_direct[0]
test_eq(_u_tv_direct.shape[0], _seq_len_ctx)

## Benchmark Runtime

In [None]:
#| export
def run_benchmark(spec, build_model, hyperparameters=None, seed=None):
    """
    Build a model for `spec` and evaluate it with the spec's simulation test function.

    Steps: make sure the dataset exists, create a `TrainingContext`, call
    `build_model(context)` (timed), then call `spec.test_sim_func(spec, model)` (timed)
    and merge its result dict into the returned results.

    Args:
        spec: Benchmark specification; must provide `name`, `dataset_id`,
            `ensure_dataset_exists()` and `test_sim_func`.
        build_model: Callable receiving the `TrainingContext` and returning the model
            that is passed on to the test function.
        hyperparameters: Dictionary forwarded to the context and stored in the results.
            Defaults to a new empty dict for every call.
        seed: Seed stored in the context and the results. If None, a random value in
            `[0, 2**32 - 1]` is drawn. The seed is only recorded and forwarded here;
            applying it to any random number generator is left to `build_model`.

    Returns:
        Dict with 'benchmark_name', 'dataset_id', 'hyperparameters', 'seed',
        'training_time_seconds', 'test_time_seconds' plus every key returned by the
        test function.

    Raises:
        RuntimeError: If `build_model` returns None.
    """
    # A fresh dict per call: a `{}` default would be one shared object that also ends
    # up in every returned results dict.
    if hyperparameters is None:
        hyperparameters = {}

    if seed is None:
        seed = random.randint(0, 2**32 - 1)

    results = {
        'benchmark_name': spec.name,
        'dataset_id': spec.dataset_id,
        'hyperparameters': hyperparameters,
        'seed': seed,
        'training_time_seconds': np.nan,
        'test_time_seconds': np.nan,
    }

    spec.ensure_dataset_exists()

    context = TrainingContext(spec=spec, hyperparameters=hyperparameters, seed=seed)

    # time.monotonic() is not affected by system clock adjustments
    train_start_time = time.monotonic()
    model = build_model(context)
    results['training_time_seconds'] = time.monotonic() - train_start_time

    if model is None:
        raise RuntimeError(f"build_model for {spec.name} did not return a model.")

    test_start_time = time.monotonic()
    test_results = spec.test_sim_func(spec, model)
    results['test_time_seconds'] = time.monotonic() - test_start_time

    # Keys returned by the test function are merged into (and may override) the base entries.
    results.update(test_results)

    return results

In [None]:
# Define the BenchmarkSpec matching the dummy data columns
# No `data_root_func` is given, so the default data root of the library is used.
example_spec = BenchmarkSpec(
    name="DummyLoaderExampleBench",
    dataset_id="dummy_core_data_v1", 
    download_func=_dummy_dataset_loader,
    u_cols=['u0', 'u1'],
    y_cols=['y0'], 
    metric_func=identibench.metrics.rmse
)

In [None]:
#| hide
# Define a very simple build_model function for the example
def simple_build_model(context):
    """
    Minimal example of a model-building function: returns a model that predicts zeros.

    Prints the benchmark name and seed of `context`, then returns a callable that maps
    an input array to a zero array with one row per input row and one column per
    output column of the spec.
    """
    print(f"Building model with spec: {context.spec.name}, seed: {context.seed}")

    def dummy_model(u_test):
        # Output width follows the number of output columns in the spec.
        n_outputs = len(context.spec.y_cols)
        n_steps = u_test.shape[0]
        return np.zeros(shape=(n_steps, n_outputs))

    return dummy_model

In [None]:
# Example usage of run_benchmark
hyperparams = {'learning_rate': 0.01, 'epochs': 5} # Example hyperparameters

# No seed is passed, so run_benchmark draws a random one (it shows up in the results).
results = run_benchmark(
    spec=example_spec, 
    build_model=simple_build_model,
    hyperparameters=hyperparams
)

print("\nBenchmark Results:")
print(results)

Building model with spec: DummyLoaderExampleBench, seed: 3475062729

Benchmark Results:
{'benchmark_name': 'DummyLoaderExampleBench', 'dataset_id': 'dummy_core_data_v1', 'hyperparameters': {'learning_rate': 0.01, 'epochs': 5}, 'seed': 3475062729, 'training_time_seconds': 0.00010208300955127925, 'test_time_seconds': 0.0012477500131353736, 'metric_score': 0.5632057005902633}


In [None]:
def custom_test_logic(spec, model):
    """
    Example of a custom test function: summarizes the maximum absolute error per test sequence.

    For every sequence loaded from the `*.hdf5` files in the dataset's `test` directory,
    the largest absolute difference between target and prediction is computed.

    Returns:
        Dict with 'avg_max_abs_error' (mean) and 'median_max_abs_error' (median) of the
        per-sequence maxima; both are NaN if no sequence was evaluated.
    """
    test_files = sorted((spec.dataset_path / 'test').glob('*.hdf5'))
    sequences = _load_sequences_from_files(test_files, spec.u_cols, spec.y_cols, spec.x_cols)
    worst_errors = [np.max(np.abs(y_true - model(u_in))) for u_in, y_true, _ in sequences]

    if not worst_errors:
        return {'avg_max_abs_error': np.nan, 'median_max_abs_error': np.nan}
    return {
        'avg_max_abs_error': np.mean(worst_errors),
        'median_max_abs_error': np.median(worst_errors),
    }

In [None]:
# Spec that replaces the default simulation test with `custom_test_logic`.
spec_with_custom_test = BenchmarkSpec(
    name="CustomTestExampleBench",
    dataset_id="dummy_core_data_v1", # Same dataset ID as before
    download_func=_dummy_dataset_loader, 
    u_cols=['u0', 'u1'], 
    y_cols=['y0'],      
    test_sim_func=custom_test_logic,
    # Still required by the constructor, although `custom_test_logic` does not call it.
    metric_func=identibench.metrics.rmse
)

In [None]:
# Run benchmark using the spec with the custom test function
hyperparams = {'model_type': 'dummy_v2'} 

results_custom_test = run_benchmark(
    spec=spec_with_custom_test, 
    build_model=simple_build_model,
    hyperparameters=hyperparams
)

print("\nBenchmark Results (Custom Test Example):")
print(results_custom_test)

# Note: The result dictionary now contains 'avg_max_abs_error' and 'median_max_abs_error' instead of 'metric_score'

Building model with spec: CustomTestExampleBench, seed: 2049681544

Benchmark Results (Custom Test Example):
{'benchmark_name': 'CustomTestExampleBench', 'dataset_id': 'dummy_core_data_v1', 'hyperparameters': {'model_type': 'dummy_v2'}, 'seed': 2049681544, 'training_time_seconds': 0.00013220800610724837, 'test_time_seconds': 0.0012791250046575442, 'avg_max_abs_error': np.float64(0.9878208935260773), 'median_max_abs_error': np.float64(0.9878208935260773)}


In [None]:
#| hide
# Export the notebook cells marked for export into the library module.
import nbdev; nbdev.nbdev_export()