# 2.04 Forecasting Metrics. (This may be better suited as 2.05)

In [1]:
import pandas as pd 
import numpy as np
import os
import sys 
from pathlib import Path
import matplotlib.pyplot as plt

import tsforge as tsf

import warnings

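* Fetch metadata: load the M5 data with its hierarchy columns, then keep one row of categorical/string attributes per `unique_id`.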

In [2]:
# Set working directory to project root.
# "../.." is resolved against the *current* working directory, so the root is
# computed only once per kernel session: re-running this cell after the chdir
# below would otherwise climb two more levels and create data/ in the wrong place.
if "ROOT_DIR" not in globals():
    ROOT_DIR = Path("../..").resolve()
os.chdir(ROOT_DIR)
# Avoid stacking duplicate entries on sys.path when the cell is re-run.
if str(ROOT_DIR) not in sys.path:
    sys.path.insert(0, str(ROOT_DIR))

# NOTE(review): this silences every warning for the rest of the notebook,
# including pandas deprecation warnings — consider narrowing the filter.
warnings.filterwarnings("ignore")
plt.style.use("seaborn-v0_8-whitegrid")

# Data and output folders live under the project root (paths are relative to
# the working directory set above).
DATA_DIR = Path("data")
DATA_DIR.mkdir(exist_ok=True)

OUTPUT_DIR = DATA_DIR / "output"
OUTPUT_DIR.mkdir(exist_ok=True)

# Load the M5 dataset with a combined `unique_id` and the hierarchy columns.
data = tsf.load_m5(
    data_dir=DATA_DIR,
    create_unique_id=True,
    verbose=False,
    include_hierarchy=True,
)


In [3]:
# Keep only the descriptive (category / object dtype) columns and reduce them
# to a single row per series, giving a lookup table keyed by `unique_id`.
meta_data = (
    data.select_dtypes(include=["category", "object"])
    .drop_duplicates(subset=["unique_id"])
    .reset_index(drop=True)
)

In [4]:
# Backtest results to evaluate. The path is built from ROOT_DIR instead of a
# hard-coded absolute user directory so the notebook runs on any machine.
# NOTE(review): assumes ROOT_DIR is the repository root that contains
# notebooks/module_02_baselines/ — confirm against the project layout.
BACKTEST_PATH = (
    ROOT_DIR / "notebooks" / "module_02_baselines" / "statsforecast_backtest.parquet"
)
df = pd.read_parquet(BACKTEST_PATH)

* SKU-level metric computation using `utilsforecast`, across all SKUs and time steps (the cell below is currently commented out).

In [5]:
# model_cols = [col for col in df.columns if col not in ['unique_id','ds','y','cutoff']]

# metric_df = evaluate(
#     df = df,
#     metrics=[rmse,mae,bias,nd],
#     id_col='unique_id',
#     time_col='ds',
#     target_col='y',
#     models=model_cols
# )

# # inspect mean of the metrics across all skus and all timesteps
# metric_df.groupby("metric")[model_cols].mean()

* Build bespoke metrics from the baseline backtest dataframe; here we can be more creative with custom error measures.

In [6]:
# Reshape the backtest to long format: every non-id column becomes a value of
# `model`, with its forecast in `y_pred`. Then add the granular error columns
# that the bespoke metrics are built from.
# Sign convention: error = actual - forecast, so positive means under-forecast.
e_df = df.melt(
    id_vars=["unique_id", "ds", "cutoff", "y"],
    var_name="model",
    value_name="y_pred",
).assign(
    error=lambda x: x["y"] - x["y_pred"],
    abs_error=lambda x: np.abs(x["error"]),
    sq_error=lambda x: np.square(x["error"]),
)

# Horizon step (1, 2, ...) of each row within its series/model/cutoff window.
# The count is taken in `ds` order rather than in whatever order the rows
# happen to be stored, so an unsorted backtest file cannot silently scramble
# the horizon labels. `assign` aligns the result on the index, so the row
# order of e_df itself is left untouched.
e_df = e_df.assign(
    timestep=(
        e_df[["unique_id", "model", "cutoff", "ds"]]
        .sort_values("ds", kind="stable")
        .groupby(["unique_id", "model", "cutoff"])
        .cumcount()
        .add(1)
    ),
    # Steps 1-4 form the short-horizon bucket; everything later is "5-13".
    horizon_group=lambda x: np.where(x["timestep"] > 4, "5-13", "1-4"),
)

e_df.head()


Unnamed: 0,unique_id,ds,cutoff,y,model,y_pred,error,abs_error,sq_error,timestep,horizon_group
0,FOODS_1_001_CA_1,2015-07-04,2015-06-27,2.0,Naive,2.0,0.0,0.0,0.0,1,1-4
1,FOODS_1_001_CA_1,2015-07-11,2015-06-27,2.0,Naive,2.0,0.0,0.0,0.0,2,1-4
2,FOODS_1_001_CA_1,2015-07-18,2015-06-27,7.0,Naive,2.0,5.0,5.0,25.0,3,1-4
3,FOODS_1_001_CA_1,2015-07-25,2015-06-27,4.0,Naive,2.0,2.0,2.0,4.0,4,1-4
4,FOODS_1_001_CA_1,2015-08-01,2015-06-27,2.0,Naive,2.0,0.0,0.0,0.0,5,5-13


In [7]:
# Attach each series' `item_id` from the metadata lookup. The many-to-one
# validation makes the merge fail if `unique_id` is duplicated in the lookup.
e_df = pd.merge(
    e_df,
    meta_data.loc[:, ["unique_id", "item_id"]],
    how="inner",
    on="unique_id",
    validate="many_to_one",
)

In [8]:
# Sum actuals, forecasts and signed errors over all rows sharing an
# item/date/cutoff/model, broadcast back to every row (transform keeps the
# original index). Absolute and squared errors are derived from the summed
# signed error, i.e. after aggregation rather than before.
sku_lvl = (
    e_df.groupby(["item_id", "ds", "cutoff", "model"])[["y", "y_pred", "error"]]
    .transform("sum")
    .assign(
        abs_error=lambda totals: totals["error"].abs(),
        sq_error=lambda totals: totals["error"] ** 2,
    )
)

sku_lvl.head()

Unnamed: 0,y,y_pred,error,abs_error,sq_error
0,38.0,26.0,12.0,12.0,144.0
1,41.0,26.0,15.0,15.0,225.0
2,38.0,26.0,12.0,12.0,144.0
3,30.0,26.0,4.0,4.0,16.0
4,31.0,26.0,5.0,5.0,25.0


In [9]:

# Copy the item-level totals onto the row-level table under `sku_`-prefixed
# names. `sku_lvl` carries the same index as `e_df`, so the columns line up
# row by row.
sku_total_columns = {
    "sku_error": "error",
    "sku_abs_error": "abs_error",
    "sku_sq_error": "sq_error",
    "sku_y": "y",
    "sku_pred": "y_pred",
}
e_df = e_df.assign(
    **{new_name: sku_lvl[source] for new_name, source in sku_total_columns.items()}
)

In [10]:
# Define COV function... 
def coefficient_of_variation(values, eps=1e-8):
    """Calculate the coefficient of variation (CV = std / |mean|).

    Measures variability relative to the size of the mean; a lower CV means
    less relative variability.

    Parameters
    ----------
    values : pandas.Series or similar
        Object exposing ``.std()`` and ``.mean()``.
    eps : float, default 1e-8
        Floor applied to the denominator so a zero mean does not divide by
        zero.

    Returns
    -------
    float
        ``values.std() / max(|values.mean()|, eps)``.
    """
    # Use the magnitude of the mean: clamping the signed mean would replace
    # every negative mean with `eps` and blow the ratio up by many orders of
    # magnitude (e.g. for a series of negative biases).
    mean_magnitude = np.abs(values.mean())
    return values.std() / np.maximum(mean_magnitude, eps)


In [11]:
# ============================================================================
# STEP 1: Prepare grouping keys
# ============================================================================
# Store the grouping keys as category dtype before the groupby below.
# NOTE(review): intended as a groupby speed-up; the gain has not been measured here.

categorical_columns = ["unique_id", "cutoff", "model", "horizon_group"]
for key_column in categorical_columns:
    e_df[key_column] = e_df[key_column].astype("category")

print(f"✓ Converted {len(categorical_columns)} columns to category dtype")
print(f"  DataFrame shape: {e_df.shape}")


✓ Converted 4 columns to category dtype
  DataFrame shape: (9512880, 17)


* compute jitter as forecast change for the same timestep (i.e., how much the forecast changes from one time step to the next for the same forecast horizon)

* Talking point for the jitter computation: our current CV config doesn't measure the change for the same FTD across FCDs, because the folds are non-overlapping.

In [12]:
# # jitter computation as mean abs jitter 
# jitter_df = (
#     e_df.pivot(
#         index=['unique_id','timestep','model'],
#         columns='cutoff',
#         values=['abs_error','sku_abs_error']
#     )
#     .stack(level=0) # stack index to get jitter for sku-store and sku level
#     .diff(axis=1)
#     .abs()
#     .mean(axis=1)
#     .reset_index(name='mean_abs_jitter')
# )

# jitter_df['level_3'] = jitter_df['level_3'].map({"abs_error":"y_jitter","sku_abs_error":"sku_jitter"})

# # pivot jitter df 
# jitter_df = jitter_df.pivot(
#     index=['unique_id','timestep','model'],
#     columns='level_3',
#     values='mean_abs_jitter'
# ).reset_index()

# # merge jitter computation on id, timestep and model
# e_df = e_df.merge(
#     jitter_df,
#     on=['unique_id','timestep','model'],
#     how='left',
#     validate="m:1"
# )

In [13]:
# ============================================================================
# STEP 3: Aggregate Errors by Model/Horizon/Series/Cutoff
# ============================================================================

groupby_keys = ["model", "horizon_group", "unique_id", "cutoff"]

# Each spec is (output column, source column, reducer).
base_metric_specs = [
    ("mae", "abs_error", "mean"),
    ("sum_ae", "abs_error", "sum"),
    ("sum_demand", "y", "sum"),
    ("bias", "error", "mean"),
    ("mse", "sq_error", "mean"),
    # Disabled while the jitter columns are not computed:
    # ("stability", "y_jitter", "sum"),
]

# Same reducers applied to the item-level (`sku_`) totals.
sku_metric_specs = [
    ("sku_mae", "sku_abs_error", "mean"),
    ("sku_sum_ae", "sku_abs_error", "sum"),
    ("sku_sum_demand", "sku_y", "sum"),
    # Disabled while the jitter columns are not computed:
    # ("sku_stability", "sku_jitter", "sum"),
    ("sku_bias", "sku_error", "mean"),
    ("sku_mse", "sku_sq_error", "mean"),
]

# Named-aggregation mapping: output column -> (source column, reducer).
agg_dict = dict(
    (name, (column, func))
    for name, column, func in base_metric_specs + sku_metric_specs
)

# observed=True restricts the result to key combinations present in the data.
aggregated_errors = (
    e_df.groupby(groupby_keys, as_index=False, sort=False, observed=True)
    .agg(**agg_dict)
)

print("\u2713 Aggregated base- and SKU-level errors")
print(f"  Number of unique combinations: {len(aggregated_errors):,}")
print(f"  Groupby keys: {groupby_keys}")


✓ Aggregated base- and SKU-level errors
  Number of unique combinations: 1,463,520
  Groupby keys: ['model', 'horizon_group', 'unique_id', 'cutoff']


In [16]:
# Preview: the Naive model's MAE (series level and item level) for every
# series / horizon bucket / cutoff.
(
    aggregated_errors.loc[
        aggregated_errors["model"] == "Naive",
        ["unique_id", "horizon_group", "cutoff", "mae", "sku_mae"],
    ]
    .set_index(["unique_id", "horizon_group", "cutoff"])
)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,mae,sku_mae
unique_id,horizon_group,cutoff,Unnamed: 3_level_1,Unnamed: 4_level_1
FOODS_1_001_CA_1,1-4,2015-06-27,1.750000,10.750000
FOODS_1_001_CA_1,5-13,2015-06-27,2.222222,13.111111
FOODS_1_001_CA_1,1-4,2015-09-26,2.250000,6.500000
FOODS_1_001_CA_1,5-13,2015-09-26,2.666667,8.111111
FOODS_1_001_CA_1,1-4,2015-12-26,1.500000,16.000000
...,...,...,...,...
HOUSEHOLD_2_516_WI_3,5-13,2015-09-26,2.222222,5.333333
HOUSEHOLD_2_516_WI_3,1-4,2015-12-26,0.250000,3.250000
HOUSEHOLD_2_516_WI_3,5-13,2015-12-26,0.555556,2.333333
HOUSEHOLD_2_516_WI_3,1-4,2016-03-26,0.000000,2.250000


In [17]:
# ============================================================================
# SCALE MAE BY THE NAIVE FORECAST'S MAE
# ============================================================================
# Note: the `mase` columns hold MAE divided by the Naive model's MAE over the
# same series / horizon bucket / cutoff. A Naive MAE of zero yields inf
# (or NaN for 0/0); those rows are inspected further down.

baseline_keys = ["unique_id", "horizon_group", "cutoff"]

# Step 1: pull out the Naive rows as the benchmark, indexed by the join keys.
naive_mae = (
    aggregated_errors.loc[aggregated_errors["model"] == "Naive"]
    .set_index(baseline_keys)[["mae", "sku_mae"]]
)

print("✓ Extracted Naive MAE baseline")
print(f"  Number of Naive benchmarks: {len(naive_mae):,}")


# Step 2: index every model's rows by the same keys so the benchmark columns
# align on the index, then divide.
aggregated_errors = (
    aggregated_errors.set_index(baseline_keys)
    .assign(
        naive_mae_baseline=naive_mae["mae"],
        sku_mae_baseline=naive_mae["sku_mae"],
        mase=lambda frame: frame["mae"].div(frame["naive_mae_baseline"]),
        sku_mase=lambda frame: frame["sku_mae"].div(frame["sku_mae_baseline"]),
    )
    .reset_index()
)

print("✓ Calculated relative MAE (MAE / Naive MAE)")
print("\nRelative MAE by model:")
print(aggregated_errors.groupby("model")["mase"].agg(["mean", "median", "min", "max"]))

✓ Extracted Naive MAE baseline
  Number of Naive benchmarks: 243,920
✓ Calculated relative MAE (MAE / Naive MAE)

Relative MAE by model:
                  mean    median       min  max
model                                          
CrostonOptimized   inf  0.969634  0.000644  inf
HW52               inf  1.115370  0.004659  inf
MA4                inf  1.000000  0.000000  inf
Naive              1.0  1.000000  1.000000  1.0
SN52               inf  1.200000  0.000000  inf
StructuralTheta    inf  1.000032  0.004362  inf


In [19]:
# ============================================================================
# STEP 4: Calculate Derived Metrics
# ============================================================================

# rmse  : square root of the mean squared error.
# wMAPE : summed absolute error divided by summed demand (inf when the demand
#         total is zero but the error is not).
aggregated_errors = aggregated_errors.assign(
    rmse=np.sqrt(aggregated_errors["mse"]),
    wMAPE=aggregated_errors["sum_ae"].div(aggregated_errors["sum_demand"]),
    sku_wMAPE=aggregated_errors["sku_sum_ae"].div(aggregated_errors["sku_sum_demand"]),
)


In [22]:
# ============================================================================
# FINAL RESULT
# ============================================================================

# `error_table` is another name for the same frame, not a copy.
error_table = aggregated_errors

banner = "=" * 70
print("\n" + banner)
print("ERROR TABLE COMPLETE")
print(banner)
print(f"\nShape: {error_table.shape}")
print("\nColumns:")
for column_name in error_table.columns:
    print(f"  - {column_name}")

# Distribution of the headline series-level metrics across all rows.
summary_columns = ["mae", "rmse", "mase", "wMAPE", "bias"]
print("\nMetrics summary:")
print(error_table[summary_columns].describe())



ERROR TABLE COMPLETE

Shape: (1463520, 21)

Columns:
  - unique_id
  - horizon_group
  - cutoff
  - model
  - mae
  - sum_ae
  - sum_demand
  - bias
  - mse
  - sku_mae
  - sku_sum_ae
  - sku_sum_demand
  - sku_bias
  - sku_mse
  - naive_mae_baseline
  - mase
  - rmse
  - wMAPE
  - sku_mae_baseline
  - sku_mase
  - sku_wMAPE

Metrics summary:
                mae          rmse          mase         wMAPE          bias
count  1.463520e+06  1.463520e+06  1.434982e+06  1.434360e+06  1.463520e+06
mean   5.057615e+00  5.895721e+00           inf           inf  4.369787e-01
std    1.028684e+01  1.151721e+01           NaN           NaN  1.034859e+01
min    0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00 -5.274183e+02
25%    1.257363e+00  1.581139e+00  8.586781e-01  4.285714e-01 -1.203492e+00
50%    2.500000e+00  2.977038e+00  1.000000e+00  6.693226e-01  1.111111e-01
75%    5.000000e+00  5.873670e+00  1.214753e+00  1.000000e+00  1.750000e+00
max    8.263656e+02  8.296942e+02           in

In [27]:
# Rows where either ratio metric is infinite: wMAPE is inf when the summed
# demand is zero, and mase is inf when the Naive MAE baseline is zero.
error_table.loc[error_table[["wMAPE", "mase"]].eq(np.inf).any(axis=1)]

Unnamed: 0,unique_id,horizon_group,cutoff,model,mae,sum_ae,sum_demand,bias,mse,sku_mae,...,sku_sum_demand,sku_bias,sku_mse,naive_mae_baseline,mase,rmse,wMAPE,sku_mae_baseline,sku_mase,sku_wMAPE
98,FOODS_1_002_CA_3,1-4,2015-09-26,Naive,1.000000,4.000000,0.0,-1.000000,1.000000,9.750000,...,123.0,9.750000,127.250000,1.0,1.000000,1.000000,inf,9.750000,1.000000,0.317073
99,FOODS_1_002_CA_3,5-13,2015-09-26,Naive,1.000000,9.000000,0.0,-1.000000,1.000000,4.777778,...,216.0,3.000000,39.222221,1.0,1.000000,1.000000,inf,4.777778,1.000000,0.199074
112,FOODS_1_002_TX_1,1-4,2015-06-27,Naive,1.000000,4.000000,0.0,-1.000000,1.000000,5.000000,...,112.0,3.000000,38.000000,1.0,1.000000,1.000000,inf,5.000000,1.000000,0.178571
226,FOODS_1_003_WI_2,1-4,2015-09-26,Naive,2.000000,8.000000,0.0,-2.000000,4.000000,9.250000,...,131.0,-9.250000,93.250000,2.0,1.000000,2.000000,inf,9.250000,1.000000,0.282443
227,FOODS_1_003_WI_2,5-13,2015-09-26,Naive,2.000000,18.000000,0.0,-2.000000,4.000000,11.222222,...,277.0,-11.222222,154.111115,2.0,1.000000,2.000000,inf,11.222222,1.000000,0.364621
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1463414,HOUSEHOLD_2_515_TX_3,1-4,2016-03-26,StructuralTheta,0.436452,1.745807,0.0,-0.436452,0.190499,1.853559,...,31.0,-1.549362,4.485132,0.0,inf,0.436462,inf,2.250000,0.823804,0.239169
1463424,HOUSEHOLD_2_515_WI_2,1-4,2015-06-27,StructuralTheta,0.484161,1.936642,0.0,-0.484161,0.234427,6.541681,...,16.0,-6.541681,58.742828,0.0,inf,0.484176,inf,4.000000,1.635420,1.635420
1463497,HOUSEHOLD_2_516_WI_1,5-13,2015-06-27,StructuralTheta,0.595327,5.357941,0.0,-0.595327,0.354420,2.113121,...,93.0,-1.297808,6.004831,1.0,0.595327,0.595332,inf,2.333333,0.905623,0.204496
1463506,HOUSEHOLD_2_516_WI_2,1-4,2015-09-26,StructuralTheta,0.238612,0.954449,0.0,-0.238612,0.056936,3.625269,...,37.0,-1.513796,20.327490,0.0,inf,0.238612,inf,5.750000,0.630482,0.391921


In [28]:
# Per-model mean and median of the headline metrics, leaving out rows where
# wMAPE or mase is infinite so the means stay finite.
(
    error_table.loc[error_table["wMAPE"].ne(np.inf) & error_table["mase"].ne(np.inf)]
    .groupby("model")[["mae", "rmse", "wMAPE", "mase"]]
    .agg(["mean", "median"])
    .stack()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,mae,rmse,wMAPE,mase
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CrostonOptimized,mean,4.442025,5.195877,0.956469,1.09777
CrostonOptimized,median,2.25,2.684991,0.555851,0.944273
HW52,mean,5.666718,6.593264,1.11515,1.429038
HW52,median,2.754653,3.308298,0.684064,1.080049
MA4,mean,4.390932,5.158533,0.915407,1.027986
MA4,median,2.222222,2.633914,0.578947,0.98913
Naive,mean,4.835846,5.661499,0.978502,1.0
Naive,median,2.5,3.0,0.666667,1.0
SN52,mean,6.031936,7.126418,1.159529,1.566312
SN52,median,3.0,3.674235,0.777778,1.166667
