09 final forecast

In [None]:
# =========================
# 09_final_forecast.py
# FINAL (po poprawkach):
# - segmentacja automatyczna jak w 08
# - OTHER -> baseline-best (NIE model_C)
# - ML recursive tylko dla A/B/C
# - flatness-fix: płaskie ML -> baseline-best
# - fallback_last dla serii nieobecnych w df_hist (features)
# =========================

import numpy as np
import pandas as pd
from pathlib import Path
import joblib

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 50)

DATA_DIR = Path("data")
FEAT_DIR = DATA_DIR / "features"
BT_DIR   = DATA_DIR / "backtesting"
MODEL_DIR = DATA_DIR / "models"

HIST_PATH = FEAT_DIR / "features_level_a.parquet"          # musi zawierać: demand (TARGET_RAW)
BASELINE_PATH = BT_DIR / "baseline_best_per_series.parquet"  # country, sku, best_baseline
MASTER_RAW = DATA_DIR / "master_raw.parquet"               # musi zawierać: demand_raw

OUT_FUTURE = DATA_DIR / "forecast_future.parquet"
OUT_METRICS = DATA_DIR / "metrics_summary.parquet"         # (opcjonalnie, jeśli będziesz chciała dopisać)

TARGET_RAW = "demand"              # w df_hist
TARGET_RAW_FALLBACK = "demand_raw" # w master_raw
CAT_FEATURES = ["country", "sku"]


schema = joblib.load(MODEL_DIR / "model_schema.pkl")
FEATURE_COLS = schema["feature_cols"]
CAT_FEATURES = schema["cat_features"]
NUM_FEATURES = schema["num_features"]
CAT_LEVELS = schema["cat_levels"]

# -------------------------
# HELPERS
# -------------------------
def wape(y_true, y_pred):
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    denom = np.abs(y_true).sum()
    return np.nan if denom <= 1e-12 else np.abs(y_true - y_pred).sum() / denom


def safe_impute_num(df, num_cols):
    out = df.copy()
    for c in num_cols:
        if c not in out.columns:
            out[c] = np.nan

    grp = out.groupby(["country", "sku"], observed=True)
    for c in num_cols:
        out[c] = pd.to_numeric(out[c], errors="coerce")
        out[c] = out[c].fillna(grp[c].transform("median"))
        out[c] = out[c].fillna(out[c].median())

    return out

def rolling_mean_forecast(y_hist, horizon, window=8):
    y = pd.to_numeric(y_hist, errors="coerce").fillna(0.0)
    if len(y) == 0:
        return np.zeros(horizon)
    w = min(window, len(y))
    mu = float(y.iloc[-w:].mean())
    return np.repeat(mu, horizon)

def seasonal_naive_damped(y_hist, horizon, season=52, alpha=0.6):
    """
    Mieszanka: alpha * seasonal_naive + (1-alpha) * last_value
    Stabilniejsze, ale nie płaskie.
    """
    y = pd.to_numeric(y_hist, errors="coerce").fillna(0.0)
    s = seasonal_naive(y, horizon, season=season)
    last = naive_last(y, horizon)
    return alpha * s + (1 - alpha) * last

def hybrid_other_forecast(y_hist, horizon):
    """
    Szybka reguła dla OTHER / fallback:
    - dużo historii: seasonal_naive_damped + lekki drift
    - średnio: seasonal_naive_damped
    - mało: rolling mean + mini drift
    """
    y = pd.to_numeric(y_hist, errors="coerce").fillna(0.0).reset_index(drop=True)
    n = len(y)

    if n >= 104:
        base = seasonal_naive_damped(y, horizon, season=52, alpha=0.7)
        d = drift(y, horizon)
        # lekki trend (20% driftu)
        return 0.8 * base + 0.2 * d

    if n >= 52:
        return seasonal_naive_damped(y, horizon, season=52, alpha=0.7)

    # krótkie serie
    base = rolling_mean_forecast(y, horizon, window=8)
    d = drift(y, horizon)
    return 0.85 * base + 0.15 * d


BASE_NUM = [
    "n_dc","week_sin","week_cos","zero_share","ADI","CV2",
    "weeks_since_nonzero","is_zero","is_outlier"
]
KEEP_LAGS = {1,2,4,8,13}
KEEP_ROLLS = {4,8,13}

def naive_last(y_hist, horizon):
    last = float(y_hist.iloc[-1]) if len(y_hist) else 0.0
    return np.repeat(last, horizon)

def seasonal_naive(y_hist, horizon, season=52):
    if len(y_hist) < season:
        return naive_last(y_hist, horizon)
    start = len(y_hist) - season
    return y_hist.iloc[start:start + horizon].values[:horizon]

def drift(y_hist, horizon):
    if len(y_hist) < 2:
        return naive_last(y_hist, horizon)
    slope = (float(y_hist.iloc[-1]) - float(y_hist.iloc[0])) / (len(y_hist) - 1)
    return float(y_hist.iloc[-1]) + slope * np.arange(1, horizon + 1)

def make_best_baseline_forecast_2026(raw_df, baseline_df):
    """
    raw_df: master_raw.parquet (country, sku, week_start, demand_raw)
    baseline_df: baseline_best_per_series.parquet (country, sku, best_baseline)
    """
    dates_2026 = pd.date_range("2026-01-05", "2026-12-28", freq="W-MON")
    horizon = len(dates_2026)

    out_rows = []
    baseline_map = baseline_df.set_index(["country","sku"])["best_baseline"].to_dict()

    for (c, sku), g in raw_df.groupby(["country","sku"], observed=True):
        g = g.sort_values("week_start")
        y = pd.to_numeric(g[TARGET_RAW_FALLBACK], errors="coerce").fillna(0.0).reset_index(drop=True)

        model = str(baseline_map.get((c, sku), "naive")).strip().lower()
        if model == "seasonal_naive":
            preds = seasonal_naive(y, horizon, season=52)
        elif model == "drift":
            preds = drift(y, horizon)
        else:
            preds = naive_last(y, horizon)

        sigma = float(np.std(y.values, ddof=0)) if len(y) else 0.0
        k = 1.28

        for d, yp in zip(dates_2026, preds):
            yp = max(0.0, float(yp))
            p10 = max(0.0, yp - k*sigma)
            p90 = max(0.0, yp + k*sigma)

            out_rows.append({
                "country": c,
                "dc_id": "ALL",
                "sku": sku,
                "product_name": None,
                "segment": None,
                "date": d,
                "y_pred": yp,
                "PI10": p10,
                "PI90": p90,
                "model_type": f"baseline_{model}",
            })

    return pd.DataFrame(out_rows)


def fallback_forecast_2026(raw_df, series_df):
    dates_2026 = pd.date_range("2026-01-05", "2026-12-28", freq="W-MON")
    horizon = len(dates_2026)
    out_rows = []

    for _, s in series_df.iterrows():
        c, sku = s["country"], s["sku"]
        g = raw_df[(raw_df["country"].astype(str)==str(c)) & (raw_df["sku"].astype(str)==str(sku))].copy()
        g = g.sort_values("week_start")
        y = pd.to_numeric(g[TARGET_RAW_FALLBACK], errors="coerce").fillna(0.0).reset_index(drop=True)

        preds = hybrid_other_forecast(y, horizon)

        sigma = float(np.std(y.values, ddof=0)) if len(y) else 0.0
        k = 1.28

        for d, yp in zip(dates_2026, preds):
            yp = max(0.0, float(yp))
            p10 = max(0.0, yp - k*sigma)
            p90 = max(0.0, yp + k*sigma)

            out_rows.append({
                "country": c,
                "dc_id": "ALL",
                "sku": sku,
                "product_name": s.get("product_name", None),
                "segment": "FALLBACK",
                "date": d,
                "y_pred": yp,
                "PI10": p10,
                "PI90": p90,
                "model_type": "other_hybrid"
            })
    return pd.DataFrame(out_rows)

def other_forecast_2026_from_raw(raw_df, other_keys_df):
    dates_2026 = pd.date_range("2026-01-05", "2026-12-28", freq="W-MON")
    horizon = len(dates_2026)
    out_rows = []

    for _, r in other_keys_df.iterrows():
        c = str(r["country"])
        sku = str(r["sku"])

        g = raw_df[(raw_df["country"].astype(str)==c) & (raw_df["sku"].astype(str)==sku)].copy()
        g = g.sort_values("week_start")
        y = pd.to_numeric(g[TARGET_RAW_FALLBACK], errors="coerce").fillna(0.0).reset_index(drop=True)

        preds = hybrid_other_forecast(y, horizon)

        sigma = float(np.std(y.values, ddof=0)) if len(y) else 0.0
        k = 1.28

        product_name = None
        if "product_name" in g.columns and len(g):
            product_name = str(g["product_name"].dropna().iloc[-1]) if g["product_name"].notna().any() else None

        for d, yp in zip(dates_2026, preds):
            yp = max(0.0, float(yp))
            p10 = max(0.0, yp - k*sigma)
            p90 = max(0.0, yp + k*sigma)

            out_rows.append({
                "country": c,
                "dc_id": "ALL",
                "sku": sku,
                "product_name": product_name,
                "segment": "OTHER",
                "date": d,
                "y_pred": yp,
                "PI10": p10,
                "PI90": p90,
                "model_type": "other_hybrid"
            })

    return pd.DataFrame(out_rows)


# -------------------------
# LOAD DATA + MODELS
# -------------------------
df_hist = pd.read_parquet(HIST_PATH)
df_base = pd.read_parquet(BASELINE_PATH)

if "baseline_wape" not in df_base.columns and "wape" in df_base.columns:
    df_base = df_base.rename(columns={"wape": "baseline_wape"})

model_A = joblib.load(MODEL_DIR / "lgbm_segment_A.pkl")
model_B = joblib.load(MODEL_DIR / "lgbm_segment_B.pkl")
model_C = joblib.load(MODEL_DIR / "lgbm_segment_C.pkl")
models = {"A": model_A, "B": model_B, "C": model_C}

df_hist["week_start"] = pd.to_datetime(df_hist["week_start"], errors="coerce")
df_hist = df_hist[df_hist["week_start"].notna()].copy()

# (opcjonalne) Twoje mapowanie nazw
df_hist["product_name"] = df_hist["product_name"].replace({
    "*GAZPACHO": "GAZPACHO (17783-000)"
})

missing_num = [c for c in NUM_FEATURES if c not in df_hist.columns]
if missing_num:
    for c in missing_num:
        df_hist[c] = np.nan  # do imputacji później

df_hist[TARGET_RAW] = pd.to_numeric(df_hist[TARGET_RAW], errors="coerce").fillna(0.0).clip(lower=0.0)
df_hist = safe_impute_num(df_hist, NUM_FEATURES)

print("NUM_FEATURES (from schema):", len(NUM_FEATURES))
print("CAT_FEATURES (from schema):", CAT_FEATURES)

print("NUM_FEATURES:", len(NUM_FEATURES))
print("Lagi:", [c for c in NUM_FEATURES if c.startswith("lag_")])
print("Rolle:", [c for c in NUM_FEATURES if c.startswith("roll_")])

# -------------------------
# SEGMENTATION (AUTO like 08)
# -------------------------
profile = (
    df_hist
    .groupby(["product_name", "country"], observed=True)
    .agg(
        mean=(TARGET_RAW, "mean"),
        cv=(TARGET_RAW, lambda s: float(np.nanstd(s) / (np.nanmean(s) + 1e-9))),
    )
    .reset_index()
)

SEG_A_SET = set(profile.loc[(profile["mean"] >= 1500) & (profile["cv"] <= 0.3), "product_name"].astype(str))
SEG_B_SET = set(profile.loc[(profile["mean"] < 1500) & (profile["mean"] >= 200) & (profile["cv"] > 0.3) & (profile["cv"] <= 0.7), "product_name"].astype(str))
SEG_C_SET = set(profile.loc[(profile["mean"] < 200) & (profile["cv"] > 0.7), "product_name"].astype(str))

def assign_segment(product_name: str) -> str:
    p = str(product_name)
    if p in SEG_A_SET: return "A"
    if p in SEG_B_SET: return "B"
    if p in SEG_C_SET: return "C"
    return "OTHER"   # kluczowa zmiana vs Twoje stare 09


# -------------------------
# STATIC NUM FEATURES (bez lagów/rolli/week trig)
# -------------------------
KEEP_LAGS = sorted(list(KEEP_LAGS))
KEEP_ROLLS = sorted(list(KEEP_ROLLS))

def _static_cols(all_num_features):
    stat = []
    for c in all_num_features:
        if c in ("week_sin", "week_cos"):
            continue
        if c.startswith("lag_"):
            continue
        if c.startswith("roll_mean_") or c.startswith("roll_std_"):
            continue
        stat.append(c)
    return stat

STATIC_NUM = _static_cols(NUM_FEATURES)


# -------------------------
# ML RECURSIVE FORECAST (A/B/C only; OTHER excluded)
# -------------------------
def recursive_forecast_2026(
    df_hist: pd.DataFrame,
    models: dict,
    cat_features=("country","sku"),
    target_col="demand",
    num_features=None,
    keep_lags=None,
    keep_rolls=None,
):
    dfh = df_hist.copy()
    dfh["week_start"] = pd.to_datetime(dfh["week_start"], errors="coerce")
    dfh = dfh[dfh["week_start"].notna()].copy()
    dfh[target_col] = pd.to_numeric(dfh[target_col], errors="coerce").fillna(0.0).clip(lower=0.0)

    dates_2026 = pd.date_range("2026-01-05", "2026-12-28", freq="W-MON")
    out_rows = []

    keys = [c for c in ["country","sku"] if c in dfh.columns]

    for (country, sku), g in dfh.groupby(keys, observed=True):
        g = g.sort_values("week_start")
        y_hist = g[target_col].astype(float).tolist()

        last = g.tail(1).iloc[0].to_dict()
        product_name = last.get("product_name", None)
        segment = assign_segment(product_name)

        # KLUCZOWE: OTHER NIE jest forecastowany ML tutaj
        if segment == "OTHER":
            continue

        static_vals = {c: float(last.get(c, np.nan)) for c in STATIC_NUM if num_features and c in num_features}
        for c in STATIC_NUM:
            if num_features and c in num_features:
                if not np.isfinite(static_vals.get(c, np.nan)):
                    static_vals[c] = float(dfh[c].median()) if c in dfh.columns else 0.0

        sigma = float(np.std(np.asarray(y_hist), ddof=0)) if len(y_hist) else 0.0
        k = 1.28

        for d in dates_2026:
            row = {
                "country": country,
                "sku": sku,
                "week_start": d,
                "product_name": product_name,
                "segment": segment,
            }

            for cf in cat_features:
                if cf == "country":
                    row[cf] = country
                elif cf == "sku":
                    row[cf] = sku

            if num_features and "week_sin" in num_features:
                row["week_sin"] = float(np.sin(2*np.pi*int(d.isocalendar().week)/52.0))
            if num_features and "week_cos" in num_features:
                row["week_cos"] = float(np.cos(2*np.pi*int(d.isocalendar().week)/52.0))

            for c, v in static_vals.items():
                row[c] = v

            for l in keep_lags or []:
                col = f"lag_{l}"
                if num_features and col in num_features:
                    row[col] = float(y_hist[-l]) if len(y_hist) >= l else float(np.mean(y_hist) if len(y_hist) else 0.0)

            for w in keep_rolls or []:
                mcol = f"roll_mean_{w}"
                scol = f"roll_std_{w}"
                tail = y_hist[-w:] if len(y_hist) >= w else y_hist[:]
                if num_features and mcol in num_features:
                    row[mcol] = float(np.mean(tail)) if len(tail) else 0.0
                if num_features and scol in num_features:
                    row[scol] = float(np.std(tail, ddof=0)) if len(tail) else 0.0

            if num_features and "is_outlier" in num_features:
                row["is_outlier"] = 0.0

            if num_features and "is_zero" in num_features:
                lag1 = row.get("lag_1", 0.0)
                row["is_zero"] = float(1.0 if lag1 == 0 else 0.0)

            if num_features and "weeks_since_nonzero" in num_features:
                ws = 0
                for v in reversed(y_hist):
                    if v == 0:
                        ws += 1
                    else:
                        break
                row["weeks_since_nonzero"] = float(ws)

            X_row = pd.DataFrame([row])

            for cf in cat_features:
                if cf in X_row.columns:
                    X_row[cf] = X_row[cf].astype("category")

            X_row = safe_impute_num(X_row, num_features)
            X = X_row[list(cat_features) + list(num_features)]

            if segment == "A":
                y_log = float(models["A"].predict(X)[0])
            elif segment == "B":
                y_log = float(models["B"].predict(X)[0])
            else:  # segment == "C"
                y_log = float(models["C"].predict(X)[0])

            y_pred = float(np.expm1(y_log))
            y_pred = max(0.0, y_pred)

            p10 = max(0.0, y_pred - k*sigma)
            p90 = max(0.0, y_pred + k*sigma)

            out_rows.append({
                "country": country,
                "dc_id": "ALL",
                "sku": sku,
                "product_name": product_name,
                "segment": segment,
                "date": d,
                "y_pred": y_pred,
                "PI10": p10,
                "PI90": p90,
                "model_type": "ml_recursive"
            })

            # recursive update
            y_hist.append(y_pred)

    return pd.DataFrame(out_rows)


# -------------------------
# MASTER_RAW + MISSING SERIES
# -------------------------
raw = pd.read_parquet(MASTER_RAW)
raw["week_start"] = pd.to_datetime(raw["week_start"], errors="coerce")
raw = raw[raw["week_start"].notna()].copy()
raw[TARGET_RAW_FALLBACK] = pd.to_numeric(raw[TARGET_RAW_FALLBACK], errors="coerce").fillna(0.0).clip(lower=0.0)

all_series = raw[["country","sku","product_name"]].drop_duplicates()
ml_series = df_hist[["country","sku","product_name"]].drop_duplicates()

missing = all_series.merge(ml_series, on=["country","sku"], how="left", indicator=True)
missing = missing[missing["_merge"]=="left_only"][["country","sku","product_name_x"]].rename(columns={"product_name_x":"product_name"})

print("Series missing in df_hist (features):", missing.shape[0])

df_fallback = fallback_forecast_2026(raw, missing)


# -------------------------
# 1) ML FORECAST (A/B/C only)
# -------------------------
forecast_ml = recursive_forecast_2026(
    df_hist=df_hist,
    models=models,
    cat_features=CAT_FEATURES,
    target_col=TARGET_RAW,
    num_features=NUM_FEATURES,
    keep_lags=KEEP_LAGS,
    keep_rolls=KEEP_ROLLS
)

print("forecast_ml rows:", len(forecast_ml))
print("forecast_ml model_type counts:\n", forecast_ml["model_type"].value_counts().head(10))


# -------------------------
# 2) FLATNESS FIX: płaskie ML -> baseline-best
# -------------------------
if not forecast_ml.empty:
    flat_stats = (
        forecast_ml.groupby(["country","sku"], observed=True)["y_pred"]
        .agg(["min","max","std"]).reset_index()
    )
    FLAT_STD_EPS = 1e-6
    flat_keys = set(map(tuple, flat_stats.loc[flat_stats["std"].fillna(0.0) <= FLAT_STD_EPS, ["country","sku"]].values))
    print("Ile serii ML płaskich:", len(flat_keys), "na", flat_stats.shape[0])

    df_best_baseline_2026 = make_best_baseline_forecast_2026(raw, df_base)

    idx = forecast_ml.set_index(["country","sku"]).index
    ml_ok = forecast_ml[~idx.isin(flat_keys)].copy()
    bb_flat = df_best_baseline_2026[df_best_baseline_2026.set_index(["country","sku"]).index.isin(flat_keys)].copy()

    forecast_ml_fixed = pd.concat([ml_ok, bb_flat], ignore_index=True)
else:
    df_best_baseline_2026 = make_best_baseline_forecast_2026(raw, df_base)
    forecast_ml_fixed = pd.DataFrame()

if not forecast_ml_fixed.empty:
    hist52 = (
        df_hist.sort_values(["country","sku","week_start"])
              .groupby(["country","sku"], observed=True)
              .tail(52)
    )
    hist_stats = (
        hist52.groupby(["country","sku"], observed=True)[TARGET_RAW]
              .mean()
              .reset_index(name="hist_mean_52")
    )

    ml_stats = (
        forecast_ml_fixed.groupby(["country","sku"], observed=True)["y_pred"]
                        .mean()
                        .reset_index(name="fc_mean")
    )

    diag = ml_stats.merge(hist_stats, on=["country","sku"], how="left")
    diag["ratio"] = diag["fc_mean"] / diag["hist_mean_52"].replace(0, np.nan)

    HI = 1.8
    LO = 0.55

    bad = diag.loc[(diag["ratio"] > HI) | (diag["ratio"] < LO), ["country","sku"]]
    bad_keys = set(map(tuple, bad.values))
    print("RATIO_GUARDRAIL bad series:", len(bad_keys))

    if bad_keys:
        df_best_baseline_2026 = make_best_baseline_forecast_2026(raw, df_base)

        idx = forecast_ml_fixed.set_index(["country","sku"]).index
        ml_ok = forecast_ml_fixed[~idx.isin(bad_keys)].copy()

        bb_bad = df_best_baseline_2026[
            df_best_baseline_2026.set_index(["country","sku"]).index.isin(bad_keys)
        ].copy()
        bb_bad["model_type"] = "baseline_guardrail"

        forecast_ml_fixed = pd.concat([ml_ok, bb_bad], ignore_index=True)


# -------------------------
# 3) OTHER: HYBRID forecast (non-flat)
# -------------------------
hist_keys = df_hist[["country","sku","product_name"]].drop_duplicates().copy()
hist_keys["segment"] = hist_keys["product_name"].astype(str).map(assign_segment)

other_keys = hist_keys[hist_keys["segment"] == "OTHER"][["country","sku"]].drop_duplicates()
print("Series OTHER in df_hist:", other_keys.shape[0])

bb_other = pd.DataFrame()
if other_keys.shape[0] > 0:
    bb_other = other_forecast_2026_from_raw(raw, other_keys)

print("bb_other rows:", len(bb_other))


# -------------------------
# 4) FINAL CONCAT: ML_fixed + baseline OTHER + fallback
# -------------------------
forecast_all = pd.concat([forecast_ml_fixed, bb_other, df_fallback], ignore_index=True)

# --- FIX: kanoniczna nazwa GAZPACHO ---
forecast_all["country"] = forecast_all["country"].astype(str)
forecast_all["sku"] = forecast_all["sku"].astype(str)

mask = (
    (forecast_all["country"] == "Spain") &
    (forecast_all["sku"] == "00119-066-001")
)


def canon_name_mode(s: pd.Series):
    s = s.dropna().astype(str)
    if s.empty:
        return None
    return s.value_counts().index[0]

OVERRIDE = {
    ("Spain", "00119-066-001"): "GAZPACHO (17783-000)"
}

# enforce canonical product_name (same rule as master_raw)
dim_names = (
    raw.groupby(["country","sku"], as_index=False)["product_name"]
       .agg(product_name_canon=canon_name_mode)
)
dim_names["product_name_canon"] = dim_names.apply(
    lambda r: OVERRIDE.get((r["country"], r["sku"]), r["product_name_canon"]),
    axis=1
)

forecast_all = forecast_all.drop(columns=["product_name"], errors="ignore")
forecast_all = forecast_all.merge(dim_names, on=["country","sku"], how="left")
forecast_all = forecast_all.rename(columns={"product_name_canon":"product_name"})


key = ["country","dc_id","sku","date"]
forecast_all = forecast_all.drop_duplicates(subset=key, keep="first")
forecast_all = forecast_all.sort_values(["country","dc_id","sku","date"])

forecast_all.to_parquet(OUT_FUTURE, index=False)
print("Saved:", OUT_FUTURE.resolve(), forecast_all.shape)
print("model_type counts:\n", forecast_all["model_type"].value_counts().head(20))

print("Serie w forecast_all:", forecast_all[["country","dc_id","sku"]].drop_duplicates().shape[0])
print("SKU w forecast_all:", forecast_all[["country","sku"]].drop_duplicates().shape[0])
print("Tygodnie 2026:", forecast_all["date"].nunique(), forecast_all["date"].min(), forecast_all["date"].max())


# -------------------------
# QUICK CHECKS (diagnostyka)
# -------------------------
fc = forecast_all.copy()
fc["date"] = pd.to_datetime(fc["date"], errors="coerce")

print("\n[CHECK] Duplicate rows on (country,dc_id,sku,date):", int(fc.duplicated(subset=key).sum()))

counts_csku_date = fc.groupby(["country","sku","date"], observed=True).size()
max_per_date = int(counts_csku_date.max()) if len(counts_csku_date) else 0
print("[CHECK] max rows per (country,sku,date):", max_per_date)

if "PI10" in fc.columns and "PI90" in fc.columns:
    print("[CHECK] PI10 > PI90 rows:", int((fc["PI10"] > fc["PI90"]).sum()))

print("[CHECK] negative y_pred rows:", int((fc["y_pred"] < 0).sum()))
print("[CHECK] share of y_pred == 0:", float((fc["y_pred"] == 0).mean()))

# flatness per series (country,dc_id,sku)
stat = (
    fc.groupby(["country","dc_id","sku"], observed=True)["y_pred"]
    .agg(y_min="min", y_max="max", y_mean="mean", y_std="std")
    .reset_index()
)
flat_by_std = stat["y_std"].fillna(0.0) <= 1e-6
print("[CHECK] flat series by std<=1e-6:", int(flat_by_std.sum()), "/", len(stat))

print("\nDONE.")

NUM_FEATURES (from schema): 18
CAT_FEATURES (from schema): ['country', 'sku']
NUM_FEATURES: 18
Lagi: ['lag_1', 'lag_13', 'lag_2', 'lag_4', 'lag_8']
Rolle: ['roll_mean_13', 'roll_mean_4', 'roll_mean_8', 'roll_std_13', 'roll_std_4', 'roll_std_8']
Series missing in df_hist (features): 6
forecast_ml rows: 468
forecast_ml model_type counts:
 model_type
ml_recursive    468
Name: count, dtype: int64
Ile serii ML płaskich: 0 na 9
Series OTHER in df_hist: 5
bb_other rows: 260
Saved: C:\Users\48573\Desktop\Programy_rozwój\TopYoung100\HAVI\data\forecast_future.parquet (1040, 10)
model_type counts:
 model_type
other_hybrid    572
ml_recursive    468
Name: count, dtype: int64
Serie w forecast_all: 20
SKU w forecast_all: 20
Tygodnie 2026: 52 2026-01-05 00:00:00 2026-12-28 00:00:00

[CHECK] Duplicate rows on (country,dc_id,sku,date): 0
[CHECK] max rows per (country,sku,date): 1
[CHECK] PI10 > PI90 rows: 0
[CHECK] negative y_pred rows: 0
[CHECK] share of y_pred == 0: 0.0
[CHECK] flat series by std<=1e

In [279]:
flat = (forecast_all.groupby(["country","sku","model_type"])["y_pred"]
        .agg(std="std", n="size").reset_index())
flat["is_flat"] = flat["std"].fillna(0.0) <= 1e-6

print(flat[flat["is_flat"]].sort_values(["model_type","country","sku"]))
print("\nFlat ML:", flat[(flat["is_flat"]) & (flat["model_type"]=="ml_recursive")].shape[0])

Empty DataFrame
Columns: [country, sku, model_type, std, n, is_flat]
Index: []

Flat ML: 0


In [280]:
hist52 = (
    df_hist.sort_values(["country","sku","week_start"])
    .groupby(["country","sku"], observed=True)
    .tail(52)
)

hist_stats = (hist52.groupby(["country","sku"], observed=True)[TARGET_RAW]
              .agg(hist_mean_52="mean", hist_last="last")
              .reset_index())

fc_stats = (forecast_all.groupby(["country","sku"], observed=True)["y_pred"]
            .agg(fc_mean="mean").reset_index())

diag = fc_stats.merge(hist_stats, on=["country","sku"], how="left")
diag["ratio"] = diag["fc_mean"] / diag["hist_mean_52"].replace(0, np.nan)

print(diag.sort_values("ratio").head(10))
print(diag.sort_values("ratio", ascending=False).head(10))

     country            sku      fc_mean  hist_mean_52  hist_last     ratio
13     Spain  00023-189-000    27.517121    266.309615     96.936  0.103328
7   Portugal  00041-097-000     7.861929     27.076923     12.000  0.290355
10   Romania  05243-115-000    69.486966    217.512865    181.000  0.319461
6   Portugal  00012-619-000  3320.464326   7668.401923   2910.000  0.433006
8    Romania  00012-432-000   516.344935   1061.134615    638.000  0.486597
17    Sweden  00295-629-000   119.721763    224.615385    259.000  0.533008
9    Romania  00077-010-000   499.392370    632.811538    357.000  0.789164
5     Poland  62170-027-000    41.568539     46.250000     27.000  0.898779
1    Germany  00019-003-003  8200.215488   8682.346154   5571.000  0.944470
2     Poland  02589-489-000  9111.899331   9232.557692   5821.000  0.986931
    country            sku      fc_mean  hist_mean_52  hist_last     ratio
16   Sweden  00011-294-000  3060.407259   2873.807692     2120.0  1.064931
15    Spain  0

In [281]:
import pandas as pd
from pathlib import Path

DATA_DIR = Path("data")

RAW_PATH = DATA_DIR / "master_raw.parquet"
OUT_PATH = DATA_DIR / "master_model_ready.parquet"

raw = pd.read_parquet(RAW_PATH)

# wymagane minimum do dashboardu
raw["week_start"] = pd.to_datetime(raw["week_start"], errors="coerce")
raw = raw[raw["week_start"].notna()].copy()

# demand_raw musi istnieć
if "demand_raw" not in raw.columns:
    raise ValueError("master_raw.parquet musi mieć kolumnę demand_raw")

raw["demand_raw"] = pd.to_numeric(raw["demand_raw"], errors="coerce").fillna(0.0)

# ujednolicenie do formatu dashboardu
out = raw.rename(columns={"week_start": "date"}).copy()

# kolumny, które dashboard lubi mieć (część jest opcjonalna)
if "product_name" not in out.columns:
    out["product_name"] = out["sku"].astype(str)
if "dc_id" not in out.columns:
    out["dc_id"] = "ALL"

# minimalny zestaw + kilka przydatnych
keep = [c for c in ["country", "sku", "product_name", "dc_id", "date", "demand_raw"] if c in out.columns]
out = out[keep].sort_values(["country", "sku", "date"])

out.to_parquet(OUT_PATH, index=False)

print("Saved:", OUT_PATH.resolve())
print("Rows:", len(out), "Series:", out[["country","sku"]].drop_duplicates().shape[0])
print("Date range:", out["date"].min(), "->", out["date"].max())

Saved: C:\Users\48573\Desktop\Programy_rozwój\TopYoung100\HAVI\data\master_model_ready.parquet
Rows: 18966 Series: 20
Date range: 2018-12-31 00:00:00 -> 2025-11-03 00:00:00


In [282]:
from pathlib import Path

DATA_DIR = Path("data")
need = [
    "forecast_future.parquet",
    "master_model_ready.parquet",   # historia
    "forecast_backtest.parquet",    # jeśli chcesz backtest tab
]

for f in need:
    p = DATA_DIR / f
    print(f, "->", "OK" if p.exists() else "BRAK", "|", p.resolve())

forecast_future.parquet -> OK | C:\Users\48573\Desktop\Programy_rozwój\TopYoung100\HAVI\data\forecast_future.parquet
master_model_ready.parquet -> OK | C:\Users\48573\Desktop\Programy_rozwój\TopYoung100\HAVI\data\master_model_ready.parquet
forecast_backtest.parquet -> OK | C:\Users\48573\Desktop\Programy_rozwój\TopYoung100\HAVI\data\forecast_backtest.parquet


In [283]:
# --- DIAGNOSTYKA: które serie są płaskie i jaki mają model_type ---
fc = forecast_all.copy()

flat = (
    fc.groupby(["country","sku","model_type"], observed=True)["y_pred"]
      .agg(std="std", min="min", max="max", n="size")
      .reset_index()
)

flat["is_flat"] = flat["std"].fillna(0.0) <= 1e-6

print("Flat series count:", flat["is_flat"].sum(), "/", len(flat))
display(flat.sort_values(["is_flat","std"], ascending=[False, True]).head(30))

Flat series count: 0 / 20


Unnamed: 0,country,sku,model_type,std,min,max,n,is_flat
11,Romania,07808-016-000,ml_recursive,0.532951,1.267461,3.205173,52,False
12,Romania,76518-000-000,other_hybrid,1.460248,0.80931,9.786552,52,False
7,Portugal,00041-097-000,other_hybrid,2.927641,3.416812,15.462899,52,False
5,Poland,62170-027-000,ml_recursive,6.868571,27.254284,69.558327,52,False
4,Poland,16333-000-000,other_hybrid,7.262334,13.199144,49.01068,52,False
17,Sweden,00295-629-000,other_hybrid,10.970135,99.735874,143.428161,52,False
13,Spain,00023-189-000,other_hybrid,15.445286,7.780235,61.716255,52,False
8,Romania,00012-432-000,ml_recursive,18.552101,470.418816,544.334768,52,False
18,Sweden,00397-117-000,other_hybrid,21.220021,136.365928,233.335244,52,False
10,Romania,05243-115-000,other_hybrid,41.44441,17.86069,173.561379,52,False


In [284]:
# --- Czy ML też bywa płaski? ---
flat_ml = flat[(flat["is_flat"]) & (flat["model_type"] == "ml_recursive")]
print("Flat ML series:", len(flat_ml))
display(flat_ml)

Flat ML series: 0


Unnamed: 0,country,sku,model_type,std,min,max,n,is_flat


In [285]:
# === QUICK DIAG: skala forecast vs historia (ostatnie 52 tyg.) ===
hist52 = (
    df_hist.sort_values(["country","sku","week_start"])
    .groupby(["country","sku"], observed=True)
    .tail(52)
)

hist_stats = (hist52.groupby(["country","sku"], observed=True)[TARGET_RAW]
              .agg(hist_mean_52="mean", hist_last="last")
              .reset_index())

fc_stats = (forecast_ml.groupby(["country","sku"], observed=True)["y_pred"]
            .agg(fc_mean="mean", fc_min="min", fc_max="max", fc_std="std")
            .reset_index())

diag = fc_stats.merge(hist_stats, on=["country","sku"], how="left")
diag["ratio_fc_to_hist_mean"] = diag["fc_mean"] / diag["hist_mean_52"].replace(0, np.nan)

print("TOP najniższe ratio (forecast za mały vs historia):")
display(diag.sort_values("ratio_fc_to_hist_mean", ascending=True).head(10))

print("TOP najwyższe ratio (forecast za duży vs historia):")
display(diag.sort_values("ratio_fc_to_hist_mean", ascending=False).head(10))

TOP najniższe ratio (forecast za mały vs historia):


Unnamed: 0,country,sku,fc_mean,fc_min,fc_max,fc_std,hist_mean_52,hist_last,ratio_fc_to_hist_mean
3,Romania,00012-432-000,516.344935,470.418816,544.334768,18.552101,1061.134615,638.0,0.486597
4,Romania,00077-010-000,499.39237,200.454958,546.724873,59.325706,632.811538,357.0,0.789164
2,Poland,62170-027-000,41.568539,27.254284,69.558327,6.868571,46.25,27.0,0.898779
0,Germany,00019-003-003,8200.215488,6156.568353,9071.467404,542.550197,8682.346154,5571.0,0.94447
1,Poland,02589-489-000,9111.899331,8135.922251,9892.266811,401.316536,9232.557692,5821.0,0.986931
5,Romania,07808-016-000,2.198854,1.267461,3.205173,0.532951,2.211538,2.0,0.994265
8,Sweden,75-072-000,2512.879573,2149.576578,3161.880183,265.730868,2503.615385,2528.0,1.0037
6,Spain,04592-030-000,1549.375574,1138.049996,1808.436296,170.098024,1541.817308,635.0,1.004902
7,Sweden,00011-294-000,3060.407259,2034.595135,3900.475695,410.605175,2873.807692,2120.0,1.064931


TOP najwyższe ratio (forecast za duży vs historia):


Unnamed: 0,country,sku,fc_mean,fc_min,fc_max,fc_std,hist_mean_52,hist_last,ratio_fc_to_hist_mean
7,Sweden,00011-294-000,3060.407259,2034.595135,3900.475695,410.605175,2873.807692,2120.0,1.064931
6,Spain,04592-030-000,1549.375574,1138.049996,1808.436296,170.098024,1541.817308,635.0,1.004902
8,Sweden,75-072-000,2512.879573,2149.576578,3161.880183,265.730868,2503.615385,2528.0,1.0037
5,Romania,07808-016-000,2.198854,1.267461,3.205173,0.532951,2.211538,2.0,0.994265
1,Poland,02589-489-000,9111.899331,8135.922251,9892.266811,401.316536,9232.557692,5821.0,0.986931
0,Germany,00019-003-003,8200.215488,6156.568353,9071.467404,542.550197,8682.346154,5571.0,0.94447
2,Poland,62170-027-000,41.568539,27.254284,69.558327,6.868571,46.25,27.0,0.898779
4,Romania,00077-010-000,499.39237,200.454958,546.724873,59.325706,632.811538,357.0,0.789164
3,Romania,00012-432-000,516.344935,470.418816,544.334768,18.552101,1061.134615,638.0,0.486597


In [286]:
import pandas as pd
import numpy as np

# wybierz problematyczny przykład:
C = "Poland"
SKU = "62170-027-000"   # Chipsy PAPRYKA

# df_hist = features_level_a.parquet (masz to w 09 jako df_hist)
# raw = master_raw.parquet (masz to w 09 jako raw)
# albo jeśli chcesz porównać do master_model_ready.parquet używanego w Streamlit:
m = pd.read_parquet("data/master_model_ready.parquet")

a = df_hist[(df_hist["country"].astype(str)==C) & (df_hist["sku"].astype(str)==SKU)].copy()
b = raw[(raw["country"].astype(str)==C) & (raw["sku"].astype(str)==SKU)].copy()
c = m[(m["country"].astype(str)==C) & (m["sku"].astype(str)==SKU)].copy()

for name, g, ycol, dcol in [
    ("features_level_a", a, "demand", "week_start"),
    ("master_raw",       b, "demand_raw", "week_start"),
    ("master_model_ready", c, "demand_raw", "date" if "date" in c.columns else "week_start"),
]:
    if g.empty:
        print(name, "EMPTY")
        continue
    g[dcol] = pd.to_datetime(g[dcol], errors="coerce")
    g = g[g[dcol].notna()].sort_values(dcol)
    y = pd.to_numeric(g[ycol], errors="coerce").fillna(0.0)
    print("\n===", name, "===")
    print("rows:", len(g), "date range:", g[dcol].min(), "->", g[dcol].max())
    print("last 5 y:", y.tail(5).tolist())
    print("mean:", float(y.mean()), "median:", float(y.median()), "max:", float(y.max()))


=== features_level_a ===
rows: 129 date range: 2023-05-22 00:00:00 -> 2025-11-03 00:00:00
last 5 y: [31.0, 23.0, 17.0, 37.0, 27.0]
mean: 56.26201550387597 median: 40.0 max: 195.0

=== master_raw ===
rows: 542 date range: 2022-05-23 00:00:00 -> 2025-11-03 00:00:00
last 5 y: [12.0, 9.0, 8.0, 9.0, 10.0]
mean: 21.689298892988926 median: 16.0 max: 103.0

=== master_model_ready ===
rows: 542 date range: 2022-05-23 00:00:00 -> 2025-11-03 00:00:00
last 5 y: [12.0, 16.0, 8.0, 9.0, 10.0]
mean: 21.689298892988926 median: 16.0 max: 103.0


In [287]:
fc = pd.read_parquet("data/forecast_future.parquet")
fc["date"] = pd.to_datetime(fc["date"], errors="coerce")

# bierzemy pierwszy tydzień forecastu
fc1 = fc[fc["date"] == fc["date"].min()].copy()

# last history from features_level_a:
last_feat = (df_hist.sort_values("week_start")
             .groupby(["country","sku"], as_index=False)
             .tail(1)[["country","sku","week_start","demand"]]
             .rename(columns={"week_start":"last_feat_date","demand":"last_feat_demand"}))

tmp = fc1.merge(last_feat, on=["country","sku"], how="left")
tmp["y_pred"] = pd.to_numeric(tmp["y_pred"], errors="coerce")
tmp["ratio_pred_to_feat_last"] = tmp["y_pred"] / tmp["last_feat_demand"].replace(0, np.nan)

print(tmp[["country","sku","y_pred","last_feat_demand","ratio_pred_to_feat_last","model_type"]]
      .sort_values("ratio_pred_to_feat_last")
      .head(20))

     country            sku       y_pred  last_feat_demand  \
10   Romania  05243-115-000    55.930345           181.000   
7   Portugal  00041-097-000     5.309855            12.000   
17    Sweden  00295-629-000   125.965291           259.000   
9    Romania  00077-010-000   200.454958           357.000   
13     Spain  00023-189-000    61.716255            96.936   
6   Portugal  00012-619-000  2121.742439          2910.000   
8    Romania  00012-432-000   508.406974           638.000   
19    Sweden     75-072-000  2350.886823          2528.000   
11   Romania  07808-016-000     2.173945             2.000   
16    Sweden  00011-294-000  2486.219864          2120.000   
1    Germany  00019-003-003  7332.782283          5571.000   
2     Poland  02589-489-000  8300.907959          5821.000   
5     Poland  62170-027-000    45.486956            27.000   
15     Spain  04592-030-000  1419.798037           635.000   
0    Germany  00004-807-019  2173.760566               NaN   
3     Po

In [288]:
m = pd.read_parquet("data/master_model_ready.parquet")
g = m[(m["country"]=="Spain") & (m["sku"]=="00119-066-001")]
print("master_model_ready unique names:")
print(g["product_name"].value_counts())

fc = pd.read_parquet("data/forecast_future.parquet")
g2 = fc[(fc["country"]=="Spain") & (fc["sku"]=="00119-066-001")]
print("\nforecast_future unique names:")
print(g2["product_name"].value_counts())

master_model_ready unique names:
product_name
GAZPACHO (17783-000)    2077
Name: count, dtype: int64

forecast_future unique names:
product_name
GAZPACHO (17783-000)    52
Name: count, dtype: int64


In [289]:
# =========================
# 09 -> ADD BACKTEST FOR MISSING SERIES (OTHER/FALLBACK)
# Produces:
# - data/forecast_backtest.parquet (appended)
# - data/metrics_summary.parquet (optional summary)
# =========================

import numpy as np
import pandas as pd
from pathlib import Path

DATA_DIR = Path("data")
BACKTEST_PATH = DATA_DIR / "forecast_backtest.parquet"
METRICS_PATH  = DATA_DIR / "metrics_summary.parquet"

H = 8  # horizon per cutoff (weeks) - możesz zmienić na 12 jeśli wolisz

def wape(y_true, y_pred):
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    denom = np.abs(y_true).sum()
    return np.nan if denom <= 1e-12 else np.abs(y_true - y_pred).sum() / denom

def mae(y_true, y_pred):
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return float(np.mean(np.abs(y_true - y_pred)))

def bias(y_true, y_pred):
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return float(np.mean(y_pred - y_true))

def naive_last(y_hist, horizon):
    last = float(y_hist.iloc[-1]) if len(y_hist) else 0.0
    return np.repeat(last, horizon)

def seasonal_naive(y_hist, horizon, season=52):
    if len(y_hist) < season:
        return naive_last(y_hist, horizon)
    start = len(y_hist) - season
    return y_hist.iloc[start:start+horizon].values[:horizon]

def drift(y_hist, horizon):
    if len(y_hist) < 2:
        return naive_last(y_hist, horizon)
    slope = (float(y_hist.iloc[-1]) - float(y_hist.iloc[0])) / (len(y_hist) - 1)
    return float(y_hist.iloc[-1]) + slope * np.arange(1, horizon + 1)

# ---- LOAD EXISTING BACKTEST (if any) ----
if BACKTEST_PATH.exists():
    bt_existing = pd.read_parquet(BACKTEST_PATH)
else:
    bt_existing = pd.DataFrame()

# ---- pick cutoff_dates ----
# Prefer: use same cutoffs as existing backtest (so UI looks consistent)
if not bt_existing.empty and "cutoff_date" in bt_existing.columns:
    cutoff_dates = sorted(pd.to_datetime(bt_existing["cutoff_date"]).dropna().unique())
else:
    # fallback: last 3 cutoffs from master_raw (monthly-ish)
    all_dates = raw["week_start"].dropna().sort_values().unique()
    all_dates = pd.to_datetime(all_dates)
    cutoff_dates = list(pd.Series(all_dates).iloc[-(H*3 + 10):].dropna().unique())
    # pick 3 cutoffs spaced ~4 weeks
    cutoff_dates = sorted(pd.to_datetime(cutoff_dates))[-3:]
cutoff_dates = [pd.to_datetime(d) for d in cutoff_dates]

# ---- determine which series are missing in backtest ----
all_series = raw[["country", "sku"]].drop_duplicates().copy()
all_series["country"] = all_series["country"].astype(str)
all_series["sku"] = all_series["sku"].astype(str)

if not bt_existing.empty:
    existing_series = bt_existing[["country", "sku"]].drop_duplicates().copy()
    existing_series["country"] = existing_series["country"].astype(str)
    existing_series["sku"] = existing_series["sku"].astype(str)
    missing_series = all_series.merge(existing_series, on=["country","sku"], how="left", indicator=True)
    missing_series = missing_series[missing_series["_merge"]=="left_only"][["country","sku"]]
else:
    missing_series = all_series.copy()

print("Missing series for backtest:", len(missing_series))

# ---- baseline map ----
baseline_map = df_base.set_index(["country","sku"])["best_baseline"].astype(str).str.lower().to_dict()

rows = []
raw2 = raw.copy()
raw2["country"] = raw2["country"].astype(str)
raw2["sku"] = raw2["sku"].astype(str)
raw2["week_start"] = pd.to_datetime(raw2["week_start"], errors="coerce")
raw2 = raw2[raw2["week_start"].notna()].copy()
raw2["demand_raw"] = pd.to_numeric(raw2["demand_raw"], errors="coerce").fillna(0.0).clip(lower=0.0)

for (c, sku) in missing_series[["country","sku"]].itertuples(index=False, name=None):
    g = raw2[(raw2["country"]==c) & (raw2["sku"]==sku)].sort_values("week_start")
    if g.empty:
        continue

    y_all = g.set_index("week_start")["demand_raw"].astype(float)

    model = baseline_map.get((c, sku), "naive").strip().lower()

    for cd in cutoff_dates:
        hist = y_all[y_all.index <= cd]
        fut  = y_all[(y_all.index > cd)].head(H)

        if len(hist) < 4 or len(fut) < 1:
            continue

        if model == "seasonal_naive":
            pred = seasonal_naive(hist.reset_index(drop=True), len(fut), season=52)
        elif model == "drift":
            pred = drift(hist.reset_index(drop=True), len(fut))
        else:
            pred = naive_last(hist.reset_index(drop=True), len(fut))

        pred = np.clip(np.asarray(pred, float), 0.0, None)

        for d, yt, yp in zip(fut.index, fut.values, pred):
            rows.append({
                "country": c,
                "dc_id": "ALL",
                "sku": sku,
                "date": pd.to_datetime(d),
                "cutoff_date": pd.to_datetime(cd),
                "y_true": float(yt),
                "y_pred_raw": float(yp),
                "y_pred_cal": float(yp),  # jeśli nie masz kalibracji -> kopia raw
            })

bt_new = pd.DataFrame(rows)
print("New backtest rows:", len(bt_new))

# ---- append + save ----
bt_all = pd.concat([bt_existing, bt_new], ignore_index=True)
bt_all["date"] = pd.to_datetime(bt_all["date"], errors="coerce")
bt_all["cutoff_date"] = pd.to_datetime(bt_all["cutoff_date"], errors="coerce")
bt_all = bt_all.dropna(subset=["date","cutoff_date"])
bt_all = bt_all.sort_values(["country","sku","cutoff_date","date"])
bt_all.to_parquet(BACKTEST_PATH, index=False)
print("Saved:", BACKTEST_PATH.resolve(), bt_all.shape)

# ---- (optional) metrics_summary per series ----
mrows = []
for (c, sku), gg in bt_all.groupby(["country","sku"], observed=True):
    yt = gg["y_true"].to_numpy(float)
    yr = gg["y_pred_raw"].to_numpy(float)
    yc = gg["y_pred_cal"].to_numpy(float)
    mrows.append({
        "country": str(c),
        "sku": str(sku),
        "n_obs": int(len(gg)),
        "WAPE_raw": wape(yt, yr),
        "WAPE_cal": wape(yt, yc),
        "MAE_raw": mae(yt, yr),
        "MAE_cal": mae(yt, yc),
        "Bias_raw": bias(yt, yr),
        "Bias_cal": bias(yt, yc),
    })

metrics_summary = pd.DataFrame(mrows).sort_values(["WAPE_cal","n_obs"], ascending=[True, False])
metrics_summary.to_parquet(METRICS_PATH, index=False)
print("Saved:", METRICS_PATH.resolve(), metrics_summary.shape)

Missing series for backtest: 9
New backtest rows: 182
Saved: C:\Users\48573\Desktop\Programy_rozwój\TopYoung100\HAVI\data\forecast_backtest.parquet (734, 8)
Saved: C:\Users\48573\Desktop\Programy_rozwój\TopYoung100\HAVI\data\metrics_summary.parquet (20, 9)
