# M-Competition Evaluation

This notebook evaluates ADAM and ES models on M1 and M3 competition datasets.

In [2]:
import logging
import multiprocessing
import time
import warnings
from concurrent.futures import ProcessPoolExecutor, as_completed

import numpy as np
import pandas as pd

warnings.filterwarnings('ignore')

from mcomp import M1, M3, load_m1, load_m3
from smooth import ADAM, ES

## Error Metrics

In [3]:
def RMSSE(holdout, forecast, actuals):
    """
    Root Mean Squared Scaled Error.

    The mean squared forecast error over the holdout is divided by the mean
    squared first difference of the in-sample data; the square root of that
    ratio is returned.

    Parameters
    ----------
    holdout : array-like
        Actual holdout values
    forecast : array-like
        Forecasted values
    actuals : array-like
        In-sample actual values (for scaling)

    Returns
    -------
    float
        RMSSE value, or NaN when the scaling term is exactly zero
    """
    holdout, forecast, actuals = (
        np.asarray(values) for values in (holdout, forecast, actuals)
    )

    squared_errors = (holdout - forecast) ** 2
    squared_steps = np.diff(actuals) ** 2

    mean_sq_error = np.mean(squared_errors)
    mean_sq_step = np.mean(squared_steps)

    # A zero denominator (e.g. a constant in-sample series) cannot be scaled by.
    return np.nan if mean_sq_step == 0 else np.sqrt(mean_sq_error / mean_sq_step)

def SAME(holdout, forecast, actuals):
    """
    Scaled Absolute Mean Error.

    The absolute value of the mean forecast error over the holdout (so
    positive and negative errors cancel before the absolute value is taken)
    is divided by the mean absolute first difference of the in-sample data.

    Parameters
    ----------
    holdout : array-like
        Actual holdout values
    forecast : array-like
        Forecasted values
    actuals : array-like
        In-sample actual values (for scaling)

    Returns
    -------
    float
        SAME value, or NaN when the scaling term is exactly zero
    """
    holdout, forecast, actuals = (
        np.asarray(values) for values in (holdout, forecast, actuals)
    )

    mean_error = np.mean(holdout - forecast)
    abs_mean_error = np.abs(mean_error)
    mean_abs_step = np.mean(np.abs(np.diff(actuals)))

    # A zero denominator (e.g. a constant in-sample series) cannot be scaled by.
    return np.nan if mean_abs_step == 0 else abs_mean_error / mean_abs_step

## Load Datasets

In [4]:
# Load both competition collections
m1 = load_m1()
m3 = load_m3()

# Flatten them into one list: every M1 series first, then every M3 series
datasets = [collection[idx] for collection in (m1, m3) for idx in collection.keys()]

print(f"Total series: {len(datasets)}")
print(f"M1: {len(m1)} series")
print(f"M3: {len(m3)} series")

Loaded M1 dataset: 1001 series
Loaded M3 dataset: 3003 series
Total series: 4004
M1: 1001 series
M3: 3003 series


## Define Methods

In [5]:
# Method names: every model family paired with every initialisation label.
# Resulting order: "ADAM ETS Back", "ADAM ETS Opt", "ADAM ETS Two",
#                  "ES Back", "ES Opt", "ES Two"
methods_names = [
    f"{family} {initialisation}"
    for family in ("ADAM ETS", "ES")
    for initialisation in ("Back", "Opt", "Two")
]

# Sizes used later to allocate and index the results array
methods_number = len(methods_names)
dataset_length = len(datasets)

print(f"Methods: {methods_number}")
print(f"Datasets: {dataset_length}")

Methods: 6
Datasets: 4004


## Evaluation Functions

In [6]:
def evaluate_single_series(series, method_name):
    """
    Fit one method to one series and score its holdout forecast.

    The model family and initialisation are chosen from substrings of
    ``method_name``: "ADAM" selects ``ADAM`` (anything else selects ``ES``);
    "Back", "Opt" and "Two" select ``initial="backcasting"``, ``"optimal"``
    and ``"two-stage"`` respectively, with backcasting as the fallback.
    A series whose ``period`` is greater than 1 is fitted with model "ZXZ"
    and lags ``[1, period]``; otherwise "ZXN" with lags ``[1]`` is used.

    Parameters
    ----------
    series : MCompSeries
        Series to evaluate. The attributes read here are ``x`` (training
        data), ``xx`` (holdout), ``h`` (forecast horizon) and ``period``.
    method_name : str
        Name of the method to use

    Returns
    -------
    tuple
        (RMSSE, SAME, time_elapsed). ``time_elapsed`` is in seconds and
        covers model construction, fitting and forecasting, not the metric
        calculation. If any step raises, all three entries are NaN and the
        cause is logged at DEBUG level.
    """
    try:
        # perf_counter is monotonic, so the measured duration cannot be
        # distorted by wall-clock adjustments the way time.time() can.
        start_time = time.perf_counter()

        # Determine lags and model based on period
        period = series.period
        if period > 1:
            lags = [1, period]
            model_str = "ZXZ"  # Auto-select including seasonality
        else:
            lags = [1]
            model_str = "ZXN"  # Auto-select without seasonality for non-seasonal data

        # Select model class based on method
        if "ADAM" in method_name:
            model_class = ADAM
        else:
            model_class = ES

        # Select the initialisation scheme based on method
        if "Back" in method_name:
            initial = "backcasting"
        elif "Opt" in method_name:
            initial = "optimal"
        elif "Two" in method_name:
            initial = "two-stage"
        else:
            initial = "backcasting"

        # Create and fit model
        model = model_class(model=model_str, lags=lags, initial=initial)
        model.fit(series.x)

        # Generate forecasts
        forecasts = model.predict(h=series.h)
        forecast_values = forecasts['mean'].values

        time_elapsed = time.perf_counter() - start_time

        # Calculate both accuracy metrics against the holdout
        rmsse = RMSSE(series.xx, forecast_values, series.x)
        same = SAME(series.xx, forecast_values, series.x)

        return (rmsse, same, time_elapsed)

    except Exception:
        # Best-effort by design: one failing series must not abort a whole
        # evaluation run. The reason is kept available for debugging instead
        # of being discarded silently.
        logging.getLogger(__name__).debug(
            "Evaluation of %r failed for series %r",
            method_name, getattr(series, 'sn', None), exc_info=True,
        )
        return (np.nan, np.nan, np.nan)


def evaluate_method_sequential(datasets, method_name, verbose=True):
    """
    Run one method over every series, one after another.

    Parameters
    ----------
    datasets : list
        List of MCompSeries
    method_name : str
        Name of the method
    verbose : bool
        Whether to print a progress line after every 100th series

    Returns
    -------
    tuple
        Three arrays of length ``len(datasets)``:
        (RMSSE values, SAME values, time values)
    """
    n = len(datasets)
    # One NaN-filled array per metric; each slot is overwritten as its series is scored.
    rmsse_values, same_values, time_values = (np.full(n, np.nan) for _ in range(3))

    for count, series in enumerate(datasets, start=1):
        if verbose and count % 100 == 0:
            print(f"  {method_name}: {count}/{n}")

        slot = count - 1
        rmsse_values[slot], same_values[slot], time_values[slot] = (
            evaluate_single_series(series, method_name)
        )

    return rmsse_values, same_values, time_values

## Run Evaluation

This may take a while depending on the number of series.

In [7]:
# Smoke test: run a couple of methods on a handful of series before the full run
test_datasets = datasets[:10]

print("Testing on first 10 series...")
for method in methods_names[:2]:  # Test first 2 methods
    rmsse_vals, same_vals, time_vals = evaluate_method_sequential(test_datasets, method, verbose=False)
    # NaN-aware means, so a failed series does not poison the averages
    mean_rmsse, mean_same, mean_time = (
        np.nanmean(values) for values in (rmsse_vals, same_vals, time_vals)
    )
    print(f"{method}: Mean RMSSE = {mean_rmsse:.4f}, SAME = {mean_same:.4f}, Time = {mean_time:.3f}s")

Testing on first 10 series...
ADAM ETS Back: Mean RMSSE = 5.9556, SAME = 6.4621, Time = 0.058s
ADAM ETS Opt: Mean RMSSE = 6.4098, SAME = 6.9332, Time = 0.123s


In [8]:
# Pre-allocate the results array, filled with NaN
# Axes: (methods, datasets, metrics) where metrics = [RMSSE, SAME, Time]
test_results = np.empty((methods_number, dataset_length, 3))
test_results.fill(np.nan)

print(f"Results array shape: {test_results.shape}")
print(f"Methods: {methods_names}")

Results array shape: (6, 4004, 3)
Methods: ['ADAM ETS Back', 'ADAM ETS Opt', 'ADAM ETS Two', 'ES Back', 'ES Opt', 'ES Two']


In [9]:
# Run full evaluation sequentially (alternative to parallel)
# Skip this cell if using the parallel evaluation defined in the cells below

# for j, method_name in enumerate(methods_names):
#     print(f"\nEvaluating {method_name} ({j+1}/{methods_number})...")
#     start = time.time()
#     
#     rmsse_values, same_values, time_values = evaluate_method_sequential(datasets, method_name)
#     
#     test_results[j, :, 0] = rmsse_values
#     test_results[j, :, 1] = same_values
#     test_results[j, :, 2] = time_values
#     
#     total_time = time.time() - start
#     print(f"  Completed in {total_time:.1f}s")
#     print(f"  Mean RMSSE: {np.nanmean(rmsse_values):.4f}")
#     print(f"  Mean SAME: {np.nanmean(same_values):.4f}")
#     print(f"  Mean Time per series: {np.nanmean(time_values):.3f}s")

## Parallel Evaluation

Run evaluation using all CPU cores for faster processing.

In [10]:
def _evaluate_task(args):
    """
    Worker function for parallel evaluation.
    Must be defined at module level for pickling.
    
    Parameters
    ----------
    args : tuple
        (series_idx, series_data, method_name) where series_data is a dict
        containing the series attributes needed for evaluation
    
    Returns
    -------
    tuple
        (series_idx, method_name, rmsse, same, time_elapsed)
    """
    import numpy as np
    import time
    from smooth import ADAM, ES
    
    series_idx, series_data, method_name = args
    
    try:
        start_time = time.time()
        
        # Reconstruct series data
        x = series_data['x']
        xx = series_data['xx']
        h = series_data['h']
        period = series_data['period']
        
        # Determine lags and model based on period
        if period > 1:
            lags = [1, period]
            model_str = "ZXZ"
        else:
            lags = [1]
            model_str = "ZXN"
        
        # Select model class based on method
        if "ADAM" in method_name:
            model_class = ADAM
        else:
            model_class = ES
        
        if "Back" in method_name:
            initial = "backcasting"
        elif "Opt" in method_name:
            initial = "optimal"
        elif "Two" in method_name:
            initial = "two-stage"
        else:
            initial = "backcasting"
        
        # Create and fit model
        model = model_class(model=model_str, lags=lags, initial=initial)
        model.fit(x)
        
        # Generate forecasts
        forecasts = model.predict(h=h)
        forecast_values = forecasts['mean'].values
        
        time_elapsed = time.time() - start_time
        
        # Calculate metrics
        holdout = np.asarray(xx)
        actuals = np.asarray(x)
        
        # RMSSE
        mse = np.mean((holdout - forecast_values) ** 2)
        scale = np.mean(np.diff(actuals) ** 2)
        rmsse = np.sqrt(mse / scale) if scale != 0 else np.nan
        
        # SAME
        ame = np.abs(np.mean(holdout - forecast_values))
        scale_same = np.mean(np.abs(np.diff(actuals)))
        same = ame / scale_same if scale_same != 0 else np.nan
        
        return (series_idx, method_name, rmsse, same, time_elapsed)
    
    except Exception as e:
        return (series_idx, method_name, np.nan, np.nan, np.nan)


def evaluate_parallel(datasets, methods_names, n_workers=None):
    """
    Evaluate all methods on all datasets in parallel.

    One task is submitted to a process pool per (method, series) pair and
    results are stored as they complete. The position of each result is
    tracked by index at submission time, so repeated entries in
    ``methods_names`` each get their own row.

    Parameters
    ----------
    datasets : list
        List of MCompSeries objects. The attributes read here are ``x``,
        ``xx``, ``h`` and ``period``.
    methods_names : list
        List of method names to evaluate
    n_workers : int, optional
        Number of parallel workers. Defaults to all CPU cores.

    Returns
    -------
    np.ndarray
        Results array of shape (n_methods, n_datasets, 3) containing
        [RMSSE, SAME, time] for each method-dataset combination. Entries
        stay NaN where a task reported a failure. When there is nothing to
        evaluate the (empty) array is returned without starting a pool.
    """
    if n_workers is None:
        n_workers = multiprocessing.cpu_count()

    n_methods = len(methods_names)
    n_datasets = len(datasets)
    n_tasks = n_methods * n_datasets

    # Initialize results array
    results = np.full((n_methods, n_datasets, 3), np.nan)

    # Convert each series to a picklable dict once; the same payload is
    # reused for every method instead of being rebuilt per method.
    payloads = [
        {
            'x': np.asarray(series.x),
            'xx': np.asarray(series.xx),
            'h': series.h,
            'period': series.period
        }
        for series in datasets
    ]

    print(f"Starting parallel evaluation with {n_workers} workers...")
    print(f"Total tasks: {n_tasks} ({n_methods} methods × {n_datasets} series)")

    # Nothing to do: return early rather than dividing by zero in the
    # per-task average below.
    if n_tasks == 0:
        return results

    start_time = time.perf_counter()
    completed = 0

    with ProcessPoolExecutor(max_workers=n_workers) as executor:
        # Map every future to the (method, series) cell it fills.
        futures = {}
        for method_idx, method_name in enumerate(methods_names):
            for series_idx, series_data in enumerate(payloads):
                future = executor.submit(
                    _evaluate_task, (series_idx, series_data, method_name)
                )
                futures[future] = (method_idx, series_idx)

        for future in as_completed(futures):
            method_idx, series_idx = futures[future]
            # Only the metrics are taken from the worker's reply; the
            # position comes from the bookkeeping above.
            _, _, rmsse, same, elapsed = future.result()

            # Store results
            results[method_idx, series_idx, :] = (rmsse, same, elapsed)

            completed += 1
            if completed % 1000 == 0:
                elapsed_total = time.perf_counter() - start_time
                if elapsed_total > 0:
                    rate = completed / elapsed_total
                    remaining = (n_tasks - completed) / rate
                    print(f"  Progress: {completed}/{n_tasks} ({100*completed/n_tasks:.1f}%) - "
                          f"ETA: {remaining/60:.1f} min")

    total_time = time.perf_counter() - start_time
    print(f"\nCompleted in {total_time/60:.1f} minutes ({total_time:.1f}s)")
    print(f"Average time per task: {total_time/n_tasks*1000:.1f}ms")

    return results

In [11]:
# Full evaluation on every CPU core; much quicker than the sequential loop

print(f"Available CPU cores: {multiprocessing.cpu_count()}")

# Fill the (method, series, metric) results array
test_results = evaluate_parallel(datasets, methods_names)

# One summary line per method: NaN-aware means plus the number of failed series
print("\nPer-method summary:")
for j, method in enumerate(methods_names):
    method_slice = test_results[j]
    rmsse_mean, same_mean, time_mean = (
        np.nanmean(method_slice[:, metric]) for metric in range(3)
    )
    # A series counts as failed when its RMSSE is NaN
    failed = np.isnan(method_slice[:, 0]).sum()
    print(f"  {method}: RMSSE={rmsse_mean:.4f}, SAME={same_mean:.4f}, "
          f"Time={time_mean:.3f}s, Failed={failed}")

# Keep the raw array on disk so the run does not have to be repeated
np.save('2026-01-18-Mcomp-test.npy', test_results)

Available CPU cores: 32
Starting parallel evaluation with 32 workers...
Total tasks: 24024 (6 methods × 4004 series)
  Progress: 1000/24024 (4.2%) - ETA: 9.3 min
  Progress: 2000/24024 (8.3%) - ETA: 6.3 min
  Progress: 3000/24024 (12.5%) - ETA: 7.4 min
  Progress: 4000/24024 (16.7%) - ETA: 8.0 min
  Progress: 5000/24024 (20.8%) - ETA: 10.4 min
  Progress: 6000/24024 (25.0%) - ETA: 9.4 min
  Progress: 7000/24024 (29.1%) - ETA: 10.9 min
  Progress: 8000/24024 (33.3%) - ETA: 12.0 min
  Progress: 9000/24024 (37.5%) - ETA: 11.9 min
  Progress: 10000/24024 (41.6%) - ETA: 10.5 min
  Progress: 11000/24024 (45.8%) - ETA: 10.5 min
  Progress: 12000/24024 (50.0%) - ETA: 10.4 min
  Progress: 13000/24024 (54.1%) - ETA: 9.2 min
  Progress: 14000/24024 (58.3%) - ETA: 7.9 min
  Progress: 15000/24024 (62.4%) - ETA: 6.9 min
  Progress: 16000/24024 (66.6%) - ETA: 6.0 min
  Progress: 17000/24024 (70.8%) - ETA: 5.4 min
  Progress: 18000/24024 (74.9%) - ETA: 4.5 min
  Progress: 19000/24024 (79.1%) - ETA: 3.

## Results Summary

In [None]:
# Create summary DataFrame: one row per method, one column per statistic.
# Each spec is (column label, reducer, metric index) with metric indices
# 0 = RMSSE, 1 = SAME, 2 = time.
column_specs = [
    ('Mean RMSSE', np.nanmean, 0),
    ('Median RMSSE', np.nanmedian, 0),
    ('Maximum RMSSE', np.nanmax, 0),
    ('Mean SAME', np.nanmean, 1),
    ('Median SAME', np.nanmedian, 1),
    ('Mean Time (s)', np.nanmean, 2),
    ('Total Time (s)', np.nansum, 2),
    ('Failed', lambda values: np.sum(np.isnan(values)), 0),
]

summary_columns = {'Method': methods_names}
for column_label, reducer, metric_idx in column_specs:
    summary_columns[column_label] = [
        reducer(test_results[j, :, metric_idx]) for j in range(methods_number)
    ]
summary = pd.DataFrame(summary_columns)

# Indices of the series for which the first method produced no RMSSE
print(np.where(np.isnan(test_results[0, :, 0])))

print("\n" + "="*60)
print("EVALUATION RESULTS")
print("="*60)
print(summary.to_string(index=False))

(array([253, 254, 255, 256, 257, 516, 517, 518, 549, 551, 552, 604, 605,
       606, 607, 608, 609, 610, 611, 612, 613, 614, 627, 628, 629, 630,
       840, 841, 855, 858, 868, 873, 973, 974, 975, 976, 986]),)

EVALUATION RESULTS
       Method  Mean RMSSE  Median RMSSE  Maximum RMSSE  Mean SAME  Median SAME  Mean Time (s)  Total Time (s)  Failed
ADAM ETS Back    2.091757      1.242270      50.258736   2.110042     1.084838       0.770531     3056.697307      37
 ADAM ETS Opt    2.082516      1.268069      51.616184   2.097454     1.102463       2.122107     8418.397695      37
 ADAM ETS Two    2.082516      1.268069      51.616184   2.097454     1.102463       2.136846     8476.869047      37
      ES Back    2.091125      1.245573      50.258736   2.111166     1.079962       0.772810     3065.736076      37
       ES Opt    2.085444      1.265223      51.616184   2.101748     1.098140       2.073597     8225.957457      37
       ES Two    2.085444      1.265223      51.616184   2.101

In [None]:
# Results by series type
series_types = [s.type for s in datasets]
# Sorted so the report sections come out in the same order on every run;
# iterating a bare set of strings can change order between kernel sessions.
unique_types = sorted(set(series_types))

print("\n" + "="*60)
print("RESULTS BY SERIES TYPE")
print("="*60)

for stype in unique_types:
    # Boolean selector over the series axis of test_results
    mask = np.array([series_type == stype for series_type in series_types])
    print(f"\n{stype.upper()} ({np.sum(mask)} series):")

    for j, method in enumerate(methods_names):
        rmsse_type = test_results[j, mask, 0]
        print(f"  {method}: Mean RMSSE = {np.nanmean(rmsse_type):.4f}")

In [None]:
# Persist the results under file names that share one timestamp
import datetime
import joblib

date_str = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

array_file = f'test_results_{date_str}.npy'
summary_file = f'test_summary_{date_str}.csv'
bundle_file = f'test_results_full_{date_str}.joblib'

# Raw results array
np.save(array_file, test_results)

# Summary table
summary.to_csv(summary_file, index=False)

# Everything together, including per-series metadata, via joblib
dataset_info = [(s.sn, s.type, s.period, len(s.x), s.h) for s in datasets]
results_dict = {
    'test_results': test_results,
    'methods_names': methods_names,
    'dataset_info': dataset_info,
    'summary': summary
}
joblib.dump(results_dict, bundle_file)

print("Results saved:")
print(f"  - {array_file} (raw array)")
print(f"  - {summary_file} (summary table)")
print(f"  - {bundle_file} (complete with metadata)")

## Single Series Example

In [None]:
# Walk through one series in detail
series = M3[2568]
for label, value in (
    ("Series", series),
    ("Training length", len(series.x)),
    ("Test length", len(series.xx)),
    ("Period", series.period),
):
    print(f"{label}: {value}")

# Fit a fixed ETS(MAM) model with optimal initialisation
model = ES(model="MAM", lags=[1, series.period], initial="optimal")
model.fit(series.x)

print("\n" + str(model))

# Forecast over the holdout horizon and line the result up with the actuals
forecasts = model.predict(h=series.h)
point_forecast = forecasts['mean'].values
print("\nForecasts vs Actuals:")
comparison = pd.DataFrame({
    'Forecast': point_forecast,
    'Actual': series.xx,
    'Error': point_forecast - series.xx
})
print(comparison)

# Score the forecast with both metrics
rmsse = RMSSE(series.xx, point_forecast, series.x)
print(f"\nRMSSE: {rmsse:.4f}")

same = SAME(series.xx, point_forecast, series.x)
print(f"\nSAME: {same:.4f}")

Series: MCompSeries(sn='T1167', n=116, h=18, type='monthly')
Training length: 116
Test length: 18
Period: 12

Time elapsed: 0.17 seconds
Model estimated using ES() function: ETS(MAM)
With optimal initialisation
Distribution assumed in the model: Normal
Loss function type: likelihood; Loss function value: 870.2151
Persistence vector g:
 alpha   beta  gamma
0.0484 0.0080 0.6323
Sample size: 116
Number of estimated parameters: 16
Number of degrees of freedom: 100
Information criteria:
      AIC      AICc       BIC      BICc
1772.4303 1777.9252 1816.4877 1829.5481

Forecasts vs Actuals:
        Forecast   Actual        Error
0   11045.606724  11818.9  -773.293276
1    7911.136866   7682.9   228.236866
2    7433.207803   7462.9   -29.692197
3   10480.661190  11368.6  -887.938810
4   10535.226776  11271.6  -736.373224
5    6608.593468   6597.9    10.693468
6    7188.648825   8328.8 -1140.151175
7   14266.373490  13201.7  1064.673490
8    6544.223745   7064.2  -519.976255
9   11923.871768  12