## 1. Notebook overview
Summarize bespoke executive metrics by recomputing error slices from the StatsForecast backtests.

In [189]:
#data manipulation 
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import plotly.graph_objects as go


# OS & utilities
from pathlib import Path
import sys
import warnings
from typing import Optional, Tuple


# Evaluation metrics
from utilsforecast.losses import (
    nd as wmape,

)
from utilsforecast.evaluation import evaluate


# --- Path Configuration (before local imports) ---
MODULE_DIR = Path().resolve()
PROJECT_ROOT = MODULE_DIR.parent.parent
sys.path.insert(0, str(PROJECT_ROOT))

# --- Local Imports ---
from src import (
    CacheManager,
    ArtifactManager,
    get_notebook_name,
    find_project_root,
)

# --- Settings ---
warnings.filterwarnings("ignore")
plt.style.use("seaborn-v0_8-whitegrid")

# --- Paths ---
PROJECT_ROOT = find_project_root()
DATA_DIR = PROJECT_ROOT / "data"
DATA_DIR.mkdir(exist_ok=True)

# --- Managers ---
NB_NAME = get_notebook_name()
cache = CacheManager(PROJECT_ROOT / ".cache" / NB_NAME)
artifacts = ArtifactManager(PROJECT_ROOT / "artifacts")


# Plot styling
plt.style.use("seaborn-v0_8-whitegrid")

# set name of function
wmape.__name__ = "wmape"

# data contract
ID_COL = "unique_id"
TIME_COL = "ds"
TARGET_COL = "y"
FCD_COL = "cutoff"

data_contract_cols = [ID_COL, TIME_COL, TARGET_COL, FCD_COL]

## 2. Load StatsForecast backtest and relevant metadata
Pull the saved rolling-origin forecasts that feed the executive metrics.

In [190]:
df = artifacts.load("enriched_backtest")

✓ Loaded 'enriched_backtest' from 02_baselines/
   Shape: 55,491,800 × 8


In [191]:
# filter out our probabilistic predictions 
df = df.loc[~(df['model'].str.contains("-hi")) & ~(df['model'].str.contains("-lo"))]

In [192]:
# add item id as a column, split from unique_id
id_mapping = {id:id.rsplit("_",maxsplit=2)[0] 
for id in df[ID_COL].unique()}

dept_mapping = {id:id.split("_",maxsplit=1)[0] 
for id in df[ID_COL].unique()}

df['item_id'] = df[ID_COL].map(id_mapping)
df['dept_id'] = df[ID_COL].map(dept_mapping)

* prepare for metric generation

In [193]:
short_group_keys = [ID_COL,'item_id',"model","cutoff","horizon_group"]
long_group_keys = ['item_id',"model","cutoff","horizon_group"]

## Error Computation: MAE, WMAPE, BIAS
* Compute normalized error metrics for each baseline by horizon group.

In [194]:
#mask for long horizon
long_horizon_mask = (df['ds'] >= df['cutoff'] + pd.DateOffset(weeks=4))

# get separate dataframes... 
long_horizon_df = df.loc[long_horizon_mask]
short_horizon_df = df.loc[~long_horizon_mask]

In [195]:
# Aggregate long-horizon errors per item/model/cutoff
long_agg_df = (
    long_horizon_df
    .groupby(["item_id", "model", "cutoff"], as_index=False)
    .agg(
        mae=("abs_error", "mean"),
        sum_error=("error", "sum"),
        sum_abs_error=("abs_error", "sum"),
        sum_act=("y", "sum"),
    )
    .assign(
        wmape=lambda df: df["sum_abs_error"] / df["sum_act"],
        bias=lambda df: df["sum_error"] / df["sum_act"],
    )
)

# Bring in the anchor (SN52) WMAPE per item/cutoff and compute beat rate
long_score_df = (
    long_agg_df
    .merge(
        long_agg_df.query("model == 'SN52'")[["item_id", "cutoff", "wmape"]],
        on=["item_id", "cutoff"],
        how="left",
        suffixes=("", "_anchor"),
    )
    .assign(beat_rate=lambda df: df["wmape"] < df["wmape_anchor"])
    .groupby(["item_id", "model"], as_index=False)
    .agg(
        wmape_jitter=("wmape", "std"),
        wmape=("wmape", "mean"),
        bias=("bias", "mean"),
        mae=("mae", "mean"),
        beat_rate=("beat_rate", "mean"),
    )
)

long_score_df.head()

Unnamed: 0,item_id,model,wmape_jitter,wmape,bias,mae,beat_rate
0,FOODS_1_001,AutoETS,0.016464,0.571134,-0.08498,2.343314,1.0
1,FOODS_1_001,AutoTheta,0.040088,0.589645,-0.056855,2.415189,1.0
2,FOODS_1_001,Croston,0.030215,0.568578,-0.068994,2.333146,1.0
3,FOODS_1_001,HW52,0.047247,0.686115,-0.156303,2.799618,1.0
4,FOODS_1_001,MA4,0.06138,0.610655,0.014155,2.4975,1.0


In [196]:
short_agg_df = short_horizon_df.groupby(["unique_id", "model", "cutoff"], as_index=False).agg(
    mae=("abs_error", "mean"),
    sum_error=("error", "sum"),
    sum_abs_error=("abs_error", "sum"),
    sum_act=("y", "sum"),
)


short_agg_df = short_agg_df.assign(
    wmape=short_agg_df["sum_abs_error"] / short_agg_df["sum_act"],
    bias=short_agg_df["sum_error"] / short_agg_df["sum_act"],
)

short_agg_df.head()


Unnamed: 0,unique_id,model,cutoff,mae,sum_error,sum_abs_error,sum_act,wmape,bias
0,FOODS_1_001_CA_1,AutoETS,2015-06-21,2.394969,1.520349,7.184906,11.0,0.653173,0.138214
1,FOODS_1_001_CA_1,AutoETS,2015-09-20,2.693341,-0.205954,8.080023,12.0,0.673335,-0.017163
2,FOODS_1_001_CA_1,AutoETS,2015-12-20,2.904337,8.713012,8.713012,8.0,1.089127,1.089127
3,FOODS_1_001_CA_1,AutoETS,2016-03-20,2.77568,-8.32704,8.32704,25.0,0.333082,-0.333082
4,FOODS_1_001_CA_1,AutoTheta,2015-06-21,2.350363,1.033877,7.05109,11.0,0.641008,0.093989


* when we have zero actuals, wmape and bias are undefined... 

In [197]:
short_agg_df.loc[short_agg_df['wmape'] == np.inf] # notice that when sum_y == 0, wmape == inf! 

Unnamed: 0,unique_id,model,cutoff,mae,sum_error,sum_abs_error,sum_act,wmape,bias
337,FOODS_1_002_CA_3,AutoETS,2015-09-20,0.932960,2.798879,2.798879,0.0,inf,inf
338,FOODS_1_002_CA_3,AutoETS,2015-12-20,0.074152,0.222455,0.222455,0.0,inf,inf
341,FOODS_1_002_CA_3,AutoTheta,2015-09-20,0.931969,2.795906,2.795906,0.0,inf,inf
342,FOODS_1_002_CA_3,AutoTheta,2015-12-20,0.048088,0.144263,0.144263,0.0,inf,inf
345,FOODS_1_002_CA_3,Croston,2015-09-20,1.884060,5.652180,5.652180,0.0,inf,inf
...,...,...,...,...,...,...,...,...,...
853699,HOUSEHOLD_2_516_WI_3,AutoTheta,2016-03-20,0.628026,1.884077,1.884077,0.0,inf,inf
853703,HOUSEHOLD_2_516_WI_3,Croston,2016-03-20,0.747341,2.242023,2.242023,0.0,inf,inf
853707,HOUSEHOLD_2_516_WI_3,HW52,2016-03-20,0.856524,2.569572,2.569572,0.0,inf,inf
853711,HOUSEHOLD_2_516_WI_3,MA4,2016-03-20,0.750000,2.250000,2.250000,0.0,inf,inf


In [199]:


def boxplot_against_anchor(
    df,
    metric: str = "wmape",
    anchor_model: str = "SN52",
    clip_quantiles: Tuple[float, float] = (0.05, 0.95),
    show_points: bool = True,
    show_stats: bool = True,
    figsize: Tuple[int, int] = (900, 600),
    title: Optional[str] = None,
    color_scheme: str = "default",
):
    """
    Create enhanced boxplot comparing models against an anchor.

    Parameters:
    -----------
    df : pd.DataFrame
        DataFrame with columns: model, metric values
    metric : str
        Metric column name to plot
    anchor_model : str
        Model to highlight as anchor/baseline
    clip_quantiles : tuple
        (lower, upper) quantiles for outlier clipping
    show_points : bool
        Whether to overlay individual data points
    show_stats : bool
        Whether to show median values and deltas from anchor
    figsize : tuple
        (width, height) in pixels
    title : str, optional
        Custom title (auto-generated if None)
    color_scheme : str
        'default', 'viridis', or 'red_blue'
    """

    df = df.copy()

    # Create anchor flag
    df["is_anchor"] = df["model"] == anchor_model

    # Clip outliers
    if clip_quantiles:
        lower = df[metric].quantile(clip_quantiles[0])
        upper = df[metric].quantile(clip_quantiles[1])
        df[f"{metric}_clipped"] = df[metric].clip(lower=lower, upper=upper)
        plot_metric = f"{metric}_clipped"
    else:
        plot_metric = metric

    # Calculate statistics for sorting and annotations
    model_stats = df.groupby("model")[plot_metric].agg(["median", "mean", "count"]).reset_index()
    anchor_median = model_stats[model_stats["model"] == anchor_model]["median"].values[0]
    model_stats["delta_from_anchor"] = model_stats["median"] - anchor_median
    model_stats["delta_pct"] = model_stats["delta_from_anchor"] / anchor_median * 100

    # Sort models by median performance (anchor first)
    model_stats = model_stats.sort_values(["model"], key=lambda x: x != anchor_model).sort_values(
        "median"
    )
    model_order = model_stats["model"].tolist()
    df["model"] = pd.Categorical(df["model"], categories=model_order, ordered=True)
    df = df.sort_values("model")

    # Color schemes
    color_schemes = {
        "default": {False: "#4C78A8", True: "#E45756"},
        "viridis": {False: "#440154", True: "#FDE724"},
        "red_blue": {False: "#3182bd", True: "#e6550d"},
    }
    colors = color_schemes.get(color_scheme, color_schemes["default"])

    # Create figure
    fig = go.Figure()

    # Add box plots
    for is_anchor in [False, True]:
        mask = df["is_anchor"] == is_anchor
        if not mask.any():
            continue

        fig.add_trace(
            go.Box(
                y=df.loc[mask, "model"],
                x=df.loc[mask, plot_metric],
                name="Anchor" if is_anchor else "Other Models",
                marker_color=colors[is_anchor],
               # boxmean="sd",  # show mean and std
                orientation="h",
                showlegend=True,
                line=dict(width=2),
                marker=dict(size=4, line=dict(width=1, color="white")) if show_points else None,
                boxpoints="outliers" if show_points else False,
            )
        )

    # Add annotations for deltas
    if show_stats:
        annotations = []
        for _, row in model_stats.iterrows():
            if row["model"] == anchor_model:
                text = f"<b>{row['median']:.3f}</b> (baseline)"
            else:
                delta_sign = "+" if row["delta_from_anchor"] > 0 else ""
                text = f"{row['median']:.3f} ({delta_sign}{row['delta_pct']:.1f}%)"

            annotations.append(
                dict(
                    x=df[df["model"] == row["model"]][plot_metric].max() * 1.02,
                    y=row["model"],
                    text=text,
                    showarrow=False,
                    xanchor="left",
                    font=dict(size=9, color="#333333"),
                )
            )
        fig.update_layout(annotations=annotations)

    # Add vertical line at anchor median
    fig.add_vline(
        x=anchor_median,
        line_dash="dash",
        line_color="rgba(228, 87, 86, 0.3)",
        line_width=2,
        annotation_text=f"Anchor: {anchor_median:.3f}",
        annotation_position="top",
    )

    # Layout
    title_text = title or f"{metric.upper()} Distribution vs Anchor ({anchor_model})"
    fig.update_layout(
        title=dict(text=title_text, x=0.5, xanchor="center", font=dict(size=16, color="#333333")),
        xaxis_title=metric.upper(),
        yaxis_title="Model",
        width=figsize[0],
        height=figsize[1],
        hovermode="closest",
        plot_bgcolor="white",
        paper_bgcolor="white",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        margin=dict(l=100, r=150, t=80, b=60),
    )

    # Grid styling
    fig.update_xaxes(
        showgrid=True,
        gridwidth=1,
        gridcolor="rgba(128, 128, 128, 0.2)",
        zeroline=True,
        zerolinewidth=2,
        zerolinecolor="rgba(128, 128, 128, 0.3)",
    )
    fig.update_yaxes(showgrid=False)

    # Print summary statistics
    if show_stats:
        print(f"\n{'='*60}")
        print(f"SUMMARY: {metric.upper()} vs {anchor_model}")
        print(f"{'='*60}")
        print(f"{'Model':<15} {'Median':>10} {'Delta':>10} {'Delta %':>10} {'N':>8}")
        print(f"{'-'*60}")
        for _, row in model_stats.iterrows():
            delta_sign = "+" if row["delta_from_anchor"] > 0 else ""
            print(
                f"{row['model']:<15} {row['median']:>10.4f} "
                f"{delta_sign}{row['delta_from_anchor']:>9.4f} "
                f"{delta_sign}{row['delta_pct']:>9.1f}% {int(row['count']):>8}"
            )
        print(f"{'='*60}\n")

    return fig


# With more customization:
fig = boxplot_against_anchor(
    long_score_df,
    metric="wmape",
    anchor_model="SN52",
    clip_quantiles=(0.05, 0.95),  # Less aggressive clipping
    show_points=True,
    show_stats=True,
    color_scheme="red_blue",
    figsize=(1000, 700),
)
fig.show()



SUMMARY: WMAPE vs SN52
Model               Median      Delta    Delta %        N
------------------------------------------------------------
AutoTheta           0.6330   -0.1880     -22.9%     3049
AutoETS             0.6389   -0.1821     -22.2%     3049
Croston             0.6416   -0.1794     -21.9%     3049
MA4                 0.6451   -0.1759     -21.4%     3049
Naive               0.7324   -0.0886     -10.8%     3049
HW52                0.7530   -0.0680      -8.3%     3049
SN52                0.8210    0.0000       0.0%     3049



In [204]:
long_score_df.groupby("model").agg(wmape_jitter = ("wmape_jitter", "mean"),beat_rate=("beat_rate", "sum"),wmape=("wmape", "mean"),bias=("bias","mean")).reset_index()

Unnamed: 0,model,wmape_jitter,beat_rate,wmape,bias
0,AutoETS,5.402913,2609.0,inf,
1,AutoTheta,5.488267,2627.25,inf,
2,Croston,6.737795,2727.0,inf,inf
3,HW52,4.600546,2288.75,inf,
4,MA4,5.428597,2575.5,inf,inf
5,Naive,4.973532,2048.25,inf,inf
6,SN52,3.88452,0.0,inf,inf
