## 2.04 · Forecasting metrics overview
This notebook evaluates the StatsForecast backtests and visualizes model performance.

## 1. Import evaluation libraries
Bring in NumPy, pandas, plotting, and utilsforecast helpers plus shared column contracts.

In [4]:
# OS & utilities
from pathlib import Path
import sys
import warnings

# Data manipulation
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

# Evaluation metrics
from utilsforecast.losses import (
    nd as wmape,
    mae,
    rmse,
    bias,
)
from utilsforecast.evaluation import evaluate


# --- Path Configuration (before local imports) ---
# Bootstrap guess only: treats the directory two levels above the current
# working directory as the project root so that `src` can be imported.
# PROJECT_ROOT is re-assigned from find_project_root() further down.
MODULE_DIR = Path().resolve()
PROJECT_ROOT = MODULE_DIR.parent.parent
# Guarded so that re-running this cell does not stack duplicate sys.path entries.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

# --- Local Imports ---
from src import (
    CacheManager,
    ArtifactManager,
    get_notebook_name,
    find_project_root,
)

# --- Settings ---
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices from pandas/utilsforecast — consider narrowing.
warnings.filterwarnings("ignore")
# Plot styling (applied once for the whole notebook).
plt.style.use("seaborn-v0_8-whitegrid")

# --- Paths ---
PROJECT_ROOT = find_project_root()
DATA_DIR = PROJECT_ROOT / "data"
DATA_DIR.mkdir(exist_ok=True)

# --- Managers ---
NB_NAME = get_notebook_name()
cache = CacheManager(PROJECT_ROOT / ".cache" / NB_NAME)
artifacts = ArtifactManager(PROJECT_ROOT / "artifacts")

# `wmape` is utilsforecast's `nd` imported under an alias. Renaming the function
# object makes name-based labels read "wmape"; the plotting cell filters the
# metric column on that value (presumably evaluate() labels rows by __name__).
wmape.__name__ = "wmape"

# --- Data contract: column names shared by the backtest frames ---
ID_COL = "unique_id"
TIME_COL = "ds"
TARGET_COL = "y"
FCD_COL = "cutoff"

data_contract_cols = [ID_COL, TIME_COL, TARGET_COL, FCD_COL]

## 2. Load StatsForecast backtest results
Pull the saved rolling-origin predictions for every model and cutoff to drive evaluation.

In [5]:

# Load the saved backtest predictions through the artifact manager. Downstream
# cells rely on the data-contract columns (unique_id, ds, y, cutoff) being
# present alongside the forecast columns.
backtest = artifacts.load(key="statsforecast_backtest")

✓ Loaded 'statsforecast_backtest' from 02_baselines/
   Shape: 1,585,480 × 39


## 3. Compute aggregate performance metrics
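Evaluate RMSE, MAE, bias, and wMAPE for every model, producing one row per series and metric. MASE is not computed in this notebook: its denominator depends on the training data available at each cutoff, so keeping it accurate would require looping per cutoff.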

In [6]:
# Forecast columns to score: everything outside the data contract, excluding
# prediction-interval bounds (columns whose name contains "lo-" or "hi-").
model_cols = [
    col
    for col in backtest.columns
    if col not in data_contract_cols
    and not any(tag in col for tag in ("lo-", "hi-"))
]

# Score every model column with each metric. In the preview that follows, the
# result is wide-by-model with one row per (unique_id, metric) and no cutoff
# column, so each score pools all cutoffs of a series.
# Infinite scores (e.g. a ratio metric with a zero denominator) are turned into
# NaN so that downstream summaries skip them.
metric_df = evaluate(
    df=backtest,
    metrics=[rmse, mae, bias, wmape],
    models=model_cols,
    id_col=ID_COL,
    time_col=TIME_COL,
    target_col=TARGET_COL,
).replace([np.inf, -np.inf], np.nan)

In [7]:
# Preview the first rows of the evaluation table (one column per model).
metric_df.head()

Unnamed: 0,unique_id,metric,Naive,Croston,MA4,SN52,HW52,AutoTheta,AutoETS
0,FOODS_1_001_CA_1,rmse,2.945009,2.949453,2.996593,4.372114,3.584293,2.894107,2.89678
1,FOODS_1_001_CA_2,rmse,7.244361,4.798632,4.881652,6.238836,5.423531,5.020083,5.025895
2,FOODS_1_001_CA_3,rmse,11.865431,5.885474,6.890455,9.908233,7.661091,6.438623,5.911801
3,FOODS_1_001_CA_4,rmse,2.649383,2.085443,2.260393,2.759599,2.300711,2.090987,2.091906
4,FOODS_1_001_TX_1,rmse,5.276946,4.389487,4.342046,5.4895,5.167372,4.41155,4.389033


## 4. Plot SKU-level wMAPE distributions
Render comparative boxplots and descriptive statistics to highlight dispersion across models.

In [9]:
# Metric to visualise; must match a value in metric_df["metric"].
METRIC = "wmape"
# Human-readable labels for titles/axes; unknown metrics fall back to METRIC.upper().
metric_labels = {
    "wmape": "wMAPE",
    "mae": "Mean Absolute Error",
    "rmse": "Root Mean Squared Error",
    "mase": "Mean Absolute Scaled Error",
    "bias": "Bias",
}
metric_label = metric_labels.get(METRIC, METRIC.upper())

# Keep only the chosen metric and reshape wide -> long:
# one row per (series, model) with the score in a column named after METRIC.
plot_data = metric_df.query("metric == @METRIC").melt(
    id_vars=[ID_COL, "metric"],
    var_name="model",
    value_name=METRIC,
)

n_series = plot_data[ID_COL].nunique()

# Horizontal boxplot per model. Outlier points are hidden because the maxima
# are orders of magnitude above the quartiles and would flatten the boxes.
fig = px.box(
    plot_data,
    x=METRIC,
    y="model",
    color="model",
    color_discrete_sequence=px.colors.qualitative.Set2,
    points=False,  # hide fliers
)

fig.update_layout(
    height=500,
    title=f"{metric_label} Distribution Across Models",
    xaxis_title=metric_label,
    yaxis_title="Model",
    font=dict(size=12, family="IBM Plex Sans, sans-serif"),
    margin=dict(l=80, r=40, t=60, b=40),
    showlegend=False,
    plot_bgcolor="#fafafa",
)

fig.update_xaxes(
    zeroline=False,
    showgrid=True,
    gridwidth=0.5,
    gridcolor="rgba(0,0,0,0.15)",
)

# Sample-size badge anchored to the bottom-right corner of the plotting area.
fig.add_annotation(
    x=0.98,
    y=0.02,
    xref="paper",
    yref="paper",
    text=f"n = {n_series:,} series",
    showarrow=False,
    font=dict(size=11, color="gray"),
    align="right",
    bgcolor="rgba(255,255,255,0.85)",
    bordercolor="gray",
    borderwidth=1,
    borderpad=6,
)

fig.show()

# Descriptive statistics per model. The heading is printed first and the table
# is rendered once (rich display) instead of being emitted twice.
summary = plot_data.groupby("model")[METRIC].describe().round(3)
print(f"\n📊 {METRIC.upper()} Summary Statistics by Model:")
display(summary)


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
AutoETS,30490.0,0.712,0.37,0.103,0.459,0.636,0.879,15.221
AutoTheta,30490.0,0.705,0.349,0.099,0.459,0.635,0.872,12.046
Croston,30490.0,0.771,1.124,0.115,0.458,0.654,0.906,142.101
HW52,30490.0,0.896,1.038,0.097,0.561,0.757,1.024,123.7
MA4,30490.0,0.711,0.336,0.094,0.466,0.647,0.888,5.656
Naive,30490.0,0.792,0.396,0.092,0.533,0.72,0.969,8.111
SN52,30490.0,0.957,0.967,0.108,0.626,0.831,1.06,106.44



📊 WMAPE Summary Statistics by Model:
             count   mean    std    min    25%    50%    75%      max
model                                                                
AutoETS    30490.0  0.712  0.370  0.103  0.459  0.636  0.879   15.221
AutoTheta  30490.0  0.705  0.349  0.099  0.459  0.635  0.872   12.046
Croston    30490.0  0.771  1.124  0.115  0.458  0.654  0.906  142.101
HW52       30490.0  0.896  1.038  0.097  0.561  0.757  1.024  123.700
MA4        30490.0  0.711  0.336  0.094  0.466  0.647  0.888    5.656
Naive      30490.0  0.792  0.396  0.092  0.533  0.720  0.969    8.111
SN52       30490.0  0.957  0.967  0.108  0.626  0.831  1.060  106.440


## 5. Output enriched backtest with error and absolute-error columns

In [20]:
# Reshape wide -> long: one row per (series, timestamp, cutoff, forecast column),
# with the column name stored in "model" and its value in "y_pred".
# NOTE(review): every non-contract column is melted, so prediction-interval
# bounds (the "lo-"/"hi-" columns excluded from model_cols during evaluation)
# also end up as "model" values with their own error rows — confirm that
# downstream consumers expect this, otherwise restrict to point forecasts.
long_backtest = backtest.melt(
    id_vars=data_contract_cols,
    var_name="model",
    value_name="y_pred",
)

# Signed error is forecast minus actual; abs_error is its magnitude.
signed_error = long_backtest["y_pred"] - long_backtest[TARGET_COL]
long_backtest = long_backtest.assign(
    error=signed_error,
    abs_error=signed_error.abs(),
)

# Persist the enriched frame through the artifact manager under its own key.
artifacts.save(
    df=long_backtest,
    key="enriched_backtest",
)

✓ Saved 'enriched_backtest' → 02_baselines/
   Data:   output/enriched_backtest.parquet (899.98 MB, 55,491,800 rows)


PosixPath('/Users/jackrodenberg/Desktop/real-world-forecasting-foundations/artifacts/02_baselines/output/enriched_backtest.parquet')