# M-Competition Evaluation

This notebook evaluates ADAM and ES models on M1 and M3 competition datasets.

In [1]:
import numpy as np
import pandas as pd
import time
from concurrent.futures import ProcessPoolExecutor, as_completed
import multiprocessing
import warnings
warnings.filterwarnings('ignore')

from mcomp import M1, M3, load_m1, load_m3
from smooth import ADAM, ES

## Error Metrics

In [None]:
def RMSSE(holdout, forecast, actuals):
    """
    Compute the Root Mean Squared Scaled Error of a forecast.

    The mean squared forecast error over the holdout is divided by the mean
    squared first difference of the in-sample data; the square root of that
    ratio is returned.

    Parameters
    ----------
    holdout : array-like
        Observed values over the forecast horizon.
    forecast : array-like
        Predicted values for the same horizon.
    actuals : array-like
        In-sample observations, used only to build the scaling term.

    Returns
    -------
    float
        The RMSSE, or ``np.nan`` when the scaling term is exactly zero
        (e.g. a constant in-sample series).
    """
    holdout, forecast, actuals = (
        np.asarray(values) for values in (holdout, forecast, actuals)
    )

    errors = holdout - forecast
    squared_error = np.mean(errors ** 2)

    # Scale by the mean squared one-step change of the in-sample data.
    steps = np.diff(actuals)
    denominator = np.mean(steps ** 2)

    # A zero denominator would make the ratio meaningless.
    if denominator == 0:
        return np.nan

    return np.sqrt(squared_error / denominator)

def SAME(holdout, forecast, actuals):
    """
    Compute the Scaled Absolute Mean Error of a forecast.

    The absolute value of the mean forecast error (so positive and negative
    errors can cancel) is divided by the mean absolute first difference of
    the in-sample data.

    Parameters
    ----------
    holdout : array-like
        Observed values over the forecast horizon.
    forecast : array-like
        Predicted values for the same horizon.
    actuals : array-like
        In-sample observations, used only to build the scaling term.

    Returns
    -------
    float
        The SAME value, or ``np.nan`` when the scaling term is exactly zero
        (e.g. a constant in-sample series).
    """
    holdout, forecast, actuals = (
        np.asarray(values) for values in (holdout, forecast, actuals)
    )

    # Average the signed errors first, then take the magnitude.
    mean_error = np.mean(holdout - forecast)
    bias = np.abs(mean_error)

    # Scale by the mean absolute one-step change of the in-sample data.
    denominator = np.mean(np.abs(np.diff(actuals)))

    if denominator == 0:
        return np.nan

    return bias / denominator

## Load Datasets

In [3]:
# Fetch both competition collections.
m1 = load_m1()
m3 = load_m3()

# Flatten them into a single list: every M1 series first, then every M3
# series, each in the key order of its own collection.
datasets = [
    collection[key]
    for collection in (m1, m3)
    for key in collection.keys()
]

print(f"Total series: {len(datasets)}")
print(f"M1: {len(m1)} series")
print(f"M3: {len(m3)} series")

Loaded M1 dataset: 1001 series
Loaded M3 dataset: 3003 series
Total series: 4004
M1: 1001 series
M3: 3003 series


## Define Methods

In [4]:
# Labels of the evaluated configurations. Each label names the model family
# ("ADAM ETS" or "ES") followed by the initialisation scheme
# ("Back", "Opt" or "Two").
methods_names = [
    "ADAM ETS Back", "ADAM ETS Opt", "ADAM ETS Two",
    "ES Back", "ES Opt", "ES Two",
]

# Sizes of the two axes iterated over during the evaluation.
methods_number, dataset_length = len(methods_names), len(datasets)

print(f"Methods: {methods_number}")
print(f"Datasets: {dataset_length}")

Methods: 6
Datasets: 4004


## Evaluation Functions

In [None]:
def evaluate_single_series(series, method_name):
    """
    Fit one method to one series and score its holdout forecast.

    The configuration is decoded from ``method_name``: names containing
    "ADAM" use ``ADAM`` and every other name uses ``ES``; the substrings
    "Back", "Opt" and "Two" (checked in that order) select the
    "backcasting", "optimal" and "two-stage" initialisation, and a name
    with none of them falls back to "backcasting".

    Parameters
    ----------
    series : MCompSeries
        Series to evaluate. The attributes ``period``, ``x`` (data the model
        is fitted on), ``h`` (forecast horizon) and ``xx`` (holdout the
        forecast is scored against) are read.
    method_name : str
        Name of the method to use.

    Returns
    -------
    tuple
        ``(rmsse, same, time_elapsed)``. ``time_elapsed`` is in seconds and
        covers model construction, fitting and forecasting, but not the
        metric computation. If any step raises, ``(nan, nan, nan)`` is
        returned so that callers can always unpack three values and a single
        failing series does not abort a long evaluation run.
    """
    try:
        # perf_counter is monotonic, so the measured duration cannot be
        # distorted by system clock adjustments.
        start_time = time.perf_counter()

        # Seasonal series get an extra lag equal to their period.
        period = series.period
        lags = [1, period] if period > 1 else [1]

        # Model family is decided by the presence of "ADAM" in the name.
        model_class = ADAM if "ADAM" in method_name else ES

        # First matching keyword wins; the order mirrors the method labels.
        initial = "backcasting"
        for keyword, scheme in (
            ("Back", "backcasting"),
            ("Opt", "optimal"),
            ("Two", "two-stage"),
        ):
            if keyword in method_name:
                initial = scheme
                break

        # Create and fit model
        model = model_class(model="ZXZ", lags=lags, initial=initial)
        model.fit(series.x)

        # Generate forecasts; the point forecast is taken from the 'mean'
        # entry of whatever predict() returns.
        forecasts = model.predict(h=series.h)
        forecast_values = forecasts['mean'].values

        time_elapsed = time.perf_counter() - start_time

        # Score the forecast against the holdout, scaled by in-sample data.
        rmsse = RMSSE(series.xx, forecast_values, series.x)
        same = SAME(series.xx, forecast_values, series.x)

        return (rmsse, same, time_elapsed)

    except Exception:
        # Deliberate best-effort: a failed series is recorded as missing.
        # The tuple must have the same arity as the success path, otherwise
        # the caller's three-way unpacking raises ValueError.
        return (np.nan, np.nan, np.nan)


def evaluate_method_sequential(datasets, method_name, verbose=True):
    """
    Run one method over every series, one after another.

    Parameters
    ----------
    datasets : list
        List of MCompSeries to evaluate, in the order the results are stored.
    method_name : str
        Name of the method, passed through to ``evaluate_single_series``.
    verbose : bool
        When true, a progress line is printed after every 100th series.

    Returns
    -------
    tuple
        Three arrays ``(rmsse_values, same_values, time_values)``, each of
        length ``len(datasets)`` and pre-filled with NaN.
    """
    n = len(datasets)
    rmsse_values, same_values, time_values = (
        np.full(n, np.nan) for _ in range(3)
    )

    for count, series in enumerate(datasets, start=1):
        # Progress is reported on the 1-based position of the series.
        if verbose and count % 100 == 0:
            print(f"  {method_name}: {count}/{n}")

        rmsse, same, elapsed = evaluate_single_series(series, method_name)

        slot = count - 1
        rmsse_values[slot] = rmsse
        same_values[slot] = same
        time_values[slot] = elapsed

    return rmsse_values, same_values, time_values

## Run Evaluation

This may take a while depending on the number of series.

In [None]:
# Smoke test: run the first two methods on the first ten series before
# committing to the full (slow) evaluation.
test_datasets = datasets[:10]

print("Testing on first 10 series...")
for method in methods_names[:2]:
    rmsse_vals, same_vals, time_vals = evaluate_method_sequential(
        test_datasets, method, verbose=False
    )
    # NaN-aware means, so failed series do not poison the averages.
    mean_rmsse, mean_same, mean_time = (
        np.nanmean(values) for values in (rmsse_vals, same_vals, time_vals)
    )
    print(f"{method}: Mean RMSSE = {mean_rmsse:.4f}, SAME = {mean_same:.4f}, Time = {mean_time:.3f}s")

In [None]:
# Initialize results array
# Shape: (methods, datasets, metrics) where metrics = [RMSSE, SAME, Time].
# The evaluation loop writes metric indices 0, 1 and 2 and the summary reads
# index 2 for timing, so the last axis needs three slots; a size of two makes
# the first write to index 2 fail with IndexError.
metrics_names = ["RMSSE", "SAME", "Time"]
test_results = np.full((methods_number, dataset_length, len(metrics_names)), np.nan)

print(f"Results array shape: {test_results.shape}")
print(f"Methods: {methods_names}")

In [None]:
# Run the full evaluation: every method over every series.
# This is the expensive step of the notebook and will take a long time.

for j, method_name in enumerate(methods_names):
    print(f"\nEvaluating {method_name} ({j+1}/{methods_number})...")
    start = time.time()

    rmsse_values, same_values, time_values = evaluate_method_sequential(datasets, method_name)

    # Store each metric in its own slot of the last axis, in the order
    # RMSSE (0), SAME (1), time (2).
    for slot, metric_values in enumerate((rmsse_values, same_values, time_values)):
        test_results[j, :, slot] = metric_values

    total_time = time.time() - start
    print(f"  Completed in {total_time:.1f}s")
    print(f"  Mean RMSSE: {np.nanmean(rmsse_values):.4f}")
    print(f"  Mean SAME: {np.nanmean(same_values):.4f}")
    print(f"  Mean Time per series: {np.nanmean(time_values):.3f}s")

## Results Summary

In [None]:
# Build the per-method summary table.
# Each entry of `per_method` is the (datasets, metrics) slice of one method;
# metric columns are read as 0 = RMSSE, 1 = SAME, 2 = time in seconds.
per_method = [test_results[j] for j in range(methods_number)]

summary = pd.DataFrame({
    'Method': methods_names,
    'Mean RMSSE': [np.nanmean(scores[:, 0]) for scores in per_method],
    'Median RMSSE': [np.nanmedian(scores[:, 0]) for scores in per_method],
    'Mean SAME': [np.nanmean(scores[:, 1]) for scores in per_method],
    'Median SAME': [np.nanmedian(scores[:, 1]) for scores in per_method],
    'Mean Time (s)': [np.nanmean(scores[:, 2]) for scores in per_method],
    'Total Time (s)': [np.nansum(scores[:, 2]) for scores in per_method],
    # A series counts as failed when its RMSSE is NaN.
    'Failed': [np.sum(np.isnan(scores[:, 0])) for scores in per_method]
})

banner = "="*60
print("\n" + banner)
print("EVALUATION RESULTS")
print(banner)
print(summary.to_string(index=False))

In [None]:
# Results by series type
series_types = [s.type for s in datasets]
# Sort the distinct types: iterating a bare set of strings yields an order
# that can change between kernel restarts (string hashing is randomised per
# process), which would make this report non-reproducible.
unique_types = sorted(set(series_types))

print("\n" + "="*60)
print("RESULTS BY SERIES TYPE")
print("="*60)

for stype in unique_types:
    # Boolean selector over the dataset axis for the current type; built from
    # the list collected above rather than re-reading every series.
    mask = np.array([t == stype for t in series_types])
    print(f"\n{stype.upper()} ({np.sum(mask)} series):")

    for j, method in enumerate(methods_names):
        # Metric index 0 holds the RMSSE values.
        rmsse_type = test_results[j, mask, 0]
        print(f"  {method}: Mean RMSSE = {np.nanmean(rmsse_type):.4f}")

In [None]:
# Persist the raw results array and the summary table, tagging both file
# names with today's date (YYYYMMDD, local time).
import datetime

date_str = f"{datetime.datetime.now():%Y%m%d}"

np.save(f'test_results_{date_str}.npy', test_results)
summary.to_csv(f'test_summary_{date_str}.csv', index=False)

print(f"Results saved to test_results_{date_str}.npy and test_summary_{date_str}.csv")

## Single Series Example

In [None]:
# Walk through one series in detail: inspect it, fit a model, compare the
# forecast with the holdout, and score it.
series = M3[2568]
print(f"Series: {series}")
print(f"Training length: {len(series.x)}")
print(f"Test length: {len(series.xx)}")
print(f"Period: {series.period}")

# Fit an ES model with a fixed "MAM" specification and optimal initialisation.
# NOTE(review): lags are [1, period] unconditionally here, unlike
# evaluate_single_series, which drops the second lag when period is 1;
# confirm this series is seasonal.
model = ES(model="MAM", lags=[1, series.period], initial="optimal")
model.fit(series.x)

print("\n" + str(model))

# Forecast over the series' own horizon and keep the point forecast.
forecasts = model.predict(h=series.h)
forecast_mean = forecasts['mean'].values

print("\nForecasts vs Actuals:")
comparison = pd.DataFrame({
    'Forecast': forecast_mean,
    'Actual': series.xx,
    'Error': forecast_mean - series.xx
})
print(comparison)

# Score the forecast against the holdout, scaled by the training data.
rmsse = RMSSE(series.xx, forecast_mean, series.x)
print(f"\nRMSSE: {rmsse:.4f}")