In [None]:
# --- Cell 1: imports ---
from src import etl
from src.modeling import fit_sarimax, forecast_sarimax
from src.evaluation import rolling_backtest, mape

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# --- Cell 2: load processed dataset produced in Step 5 ---
# Keep the input location in one named variable so it is easy to spot and change.
processed_path = "data/processed/model_ready.csv"
df = etl.load_processed(processed_path)

# Quick sanity check: dimensions first, then a preview of the first rows.
print(df.shape)
df.head()

In [None]:
# Backfill calendar features when the processed file does not already carry them.
from src.features import add_calendar_features

calendar_cols = {"dayofweek", "month", "is_weekend"}

if not calendar_cols.issubset(df.columns):
    # A 'dow' column (as saved by simulate.py) can stand in for 'dayofweek'.
    has_dow_only = "dow" in df.columns and "dayofweek" not in df.columns
    if has_dow_only:
        # assign() returns a new frame; later keywords can read columns created
        # by earlier ones, so is_weekend is derived from the fresh dayofweek.
        # NOTE(review): `.dt` requires 'ds' to be datetime-like — confirm the loader parses it.
        df = df.assign(
            dayofweek=df["dow"],
            is_weekend=lambda d: (d["dayofweek"] >= 5).astype(int),
            month=lambda d: d["ds"].dt.month,
        )
    else:
        # No usable 'dow' column: let the project helper build the features.
        df = add_calendar_features(df)

df[["ds", "y", "promo", "dayofweek", "month", "is_weekend"]].head()

In [None]:
# --- Cell 3: choose exogenous features ---
feature_cols = ["promo", "is_weekend", "dayofweek", "month"]
target = "y"

# Fail early, with the offending names, if an expected column is absent.
missing = [c for c in feature_cols if c not in df.columns]
assert not missing, f"Missing columns: {missing}"

# SARIMAX can't handle NaNs, but only the columns that are actually used matter:
# the date column, the target and the exogenous features. Restricting dropna to
# those avoids discarding rows just because an unrelated column has a gap.
model_cols = ["ds", target] + feature_cols
df_model = df.dropna(subset=model_cols).reset_index(drop=True)
print(df_model[[target] + feature_cols].head())

In [None]:
# --- Cell 4: rolling backtest with SARIMAX ---
HORIZON = 14  # forecast 14 days ahead each fold
FOLDS = 6

# Model settings handed to the fitting function through the backtest helper.
sarimax_kwargs = {"order": (1, 1, 1), "seasonal_order": (1, 1, 1, 7)}


def _fit_sarimax_fold(y, X, **k):
    """Adapter for rolling_backtest: forwards X as `exog` and returns the fitted model."""
    return fit_sarimax(y, exog=X, **k)


bt_sarimax = rolling_backtest(
    df=df_model,
    target=target,
    feature_cols=feature_cols,
    folds=FOLDS,
    horizon=HORIZON,
    model_fn=_fit_sarimax_fold,
    model_kwargs=sarimax_kwargs,
)

bt_sarimax

In [None]:
# --- Cell 5: summarize backtest ---
print("SARIMAX backtest results:")
display(bt_sarimax)

# Report the fold-averaged value of each metric, rounded to three decimals.
for label, metric in (("\nMAPE mean:", "MAPE"), ("MAE  mean:", "MAE")):
    print(label, bt_sarimax[metric].mean().round(3))

In [None]:
# --- Cell 6: seasonal-naive baseline for comparison ---
def seasonal_naive_forecast(train_y, horizon, season=7):
    """Forecast by repeating the last observed seasonal cycle.

    Parameters
    ----------
    train_y : pandas.Series
        Training history; only its last ``season`` values are used.
    horizon : int
        Number of steps to forecast.
    season : int, default 7
        Length of the seasonal cycle, in observations.

    Returns
    -------
    numpy.ndarray
        Array of length ``horizon`` (empty when ``horizon <= 0``).

    Raises
    ------
    ValueError
        If ``season`` is smaller than 1, or the history is empty while a
        positive horizon is requested.
    """
    if season < 1:
        raise ValueError(f"season must be >= 1, got {season}")

    last_season = train_y.iloc[-season:].values
    if horizon <= 0:
        # Nothing to forecast; return an empty array of the matching dtype.
        return last_season[:0]
    if len(last_season) == 0:
        raise ValueError("train_y is empty; cannot build a seasonal-naive forecast")

    # Size the repetition from the values actually available: when the history
    # is shorter than `season`, dividing by `season` would yield too few values
    # and the result would come back shorter than `horizon`.
    reps = int(np.ceil(horizon / len(last_season)))
    return np.tile(last_season, reps)[:horizon]

def backtest_seasonal_naive(df, target, horizon=14, folds=6, season=7):
    """Expanding-window backtest of the seasonal-naive baseline.

    The first ``len(df) - horizon`` rows are split into ``folds`` equal steps.
    Fold ``i`` trains on the first ``(i + 1) * fold_size`` rows and is scored on
    the ``horizon`` rows that follow.

    Parameters
    ----------
    df : pandas.DataFrame
        Data in time order, containing the ``target`` column.
    target : str
        Name of the column to forecast.
    horizon : int, default 14
        Number of rows forecast in each fold.
    folds : int, default 6
        Number of folds requested.
    season : int, default 7
        Seasonal period passed on to ``seasonal_naive_forecast``.

    Returns
    -------
    pandas.DataFrame
        One row per fold with columns ``fold``, ``MAE`` and ``MAPE`` (the
        latter as computed by the imported ``mape`` helper).

    Raises
    ------
    ValueError
        If ``folds`` or ``horizon`` is below 1, or ``df`` is too short to give
        every fold at least one training row.
    """
    if folds < 1:
        raise ValueError(f"folds must be >= 1, got {folds}")
    if horizon < 1:
        raise ValueError(f"horizon must be >= 1, got {horizon}")

    n = len(df)
    fold_size = (n - horizon) // folds
    if fold_size < 1:
        # Without this check a zero or negative fold size produces empty or
        # negatively sliced training windows and a silently empty result.
        raise ValueError(
            f"Not enough rows ({n}) for {folds} folds with horizon {horizon}; "
            f"need at least {folds + horizon}."
        )

    rows = []
    for i in range(folds):
        train_end = (i + 1) * fold_size
        train = df.iloc[:train_end]
        test = df.iloc[train_end:train_end + horizon]
        if len(test) < horizon:
            # Defensive: stop rather than score a truncated test window.
            break
        actual = test[target].values
        preds = seasonal_naive_forecast(train[target], horizon=horizon, season=season)
        rows.append({
            "fold": i + 1,
            "MAE": np.mean(np.abs(actual - preds)),
            "MAPE": mape(actual, preds),
        })
    # Fixing the columns keeps the result shape stable even with no rows.
    return pd.DataFrame(rows, columns=["fold", "MAE", "MAPE"])

# Run the baseline with the same horizon and fold count as the SARIMAX backtest.
bt_naive = backtest_seasonal_naive(
    df_model,
    target=target,
    horizon=HORIZON,
    folds=FOLDS,
    season=7,
)

print("Seasonal-naive backtest:")
display(bt_naive)

# Fold-averaged MAPE, rounded to three decimals.
naive_mape = bt_naive["MAPE"]
print("\nMAPE mean:", naive_mape.mean().round(3))

In [None]:
# --- Cell 7: fit on full history and forecast the next HORIZON days ---
y_all = df_model[target]
X_all = df_model[feature_cols]

# Build exogenous inputs for the forecast window (simple demo):
# calendar variables are recomputed from the future dates and promo is set to 0.
# In production you'd create real future exog (e.g., calendar/holiday/promo plan).
future_index = pd.date_range(df_model["ds"].iloc[-1] + pd.Timedelta(days=1), periods=HORIZON, freq="D")
future_exog = pd.DataFrame(index=range(HORIZON))
future_exog["dayofweek"]  = future_index.dayofweek
future_exog["month"]      = future_index.month
future_exog["is_weekend"] = (future_exog["dayofweek"] >= 5).astype(int)
future_exog["promo"]      = 0  # set to 1 on dates you want to simulate promotions

# Put the columns in the same order as the training exog (X_all). Out-of-sample
# exog is matched to the fitted coefficients by position rather than by name,
# so a different order would apply each coefficient to the wrong regressor.
# NOTE(review): forecast_sarimax is not visible here — confirm it does not reorder itself.
# This also raises a KeyError if a feature in feature_cols was not built above.
future_exog = future_exog[feature_cols]

fitted = fit_sarimax(y_all, exog=X_all, order=(1,1,1), seasonal_order=(1,1,1,7))
pred   = forecast_sarimax(fitted, exog_future=future_exog, steps=HORIZON)

# Plot actual + forecast
plt.figure(figsize=(12,4))
plt.plot(df_model["ds"], y_all, label="actual")
plt.plot(future_index, pred.values, label="forecast")
plt.title(f"SARIMAX {HORIZON}-day forecast")
plt.legend()
plt.tight_layout()
plt.show()

pd.DataFrame({"ds": future_index, "forecast": pred.values}).head()

In [None]:
# Silence statsmodels ConvergenceWarning for any code executed after this cell.
# NOTE(review): on a top-to-bottom run this filter is installed after the
# SARIMAX fits in the earlier cells, so it does not suppress their warnings;
# consider moving it to the imports cell at the top of the notebook.
import warnings
from statsmodels.tools.sm_exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [None]:
# Redraw the actual-vs-forecast chart and write it to disk.
# Relies on df_model, y_all, future_index and pred from the forecasting cell.
import os

plot_path = "reports/figures/sarimax_14day_forecast.png"
os.makedirs("reports/figures", exist_ok=True)

fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(df_model["ds"], y_all, label="actual")
ax.plot(future_index, pred.values, label="forecast")
ax.set_title(f"SARIMAX {HORIZON}-day forecast")
ax.legend()
fig.tight_layout()

fig.savefig(plot_path, dpi=150, bbox_inches="tight")
print("Saved figure ->", plot_path)

In [None]:
import pandas as pd

# Persist the forecast as a two-column table: date and predicted value.
csv_path = "reports/tables/forecast_14day_sarimax.csv"
os.makedirs(os.path.dirname(csv_path), exist_ok=True)

fcst_df = pd.DataFrame({"ds": future_index, "forecast": pred.values})
fcst_df.to_csv(csv_path, index=False)
print("Saved table ->", csv_path)

In [None]:
# Persist the per-fold backtest metrics and a small JSON summary.
import json
import pathlib

metrics_path = "reports/tables/backtest_sarimax.csv"
# Create the target directory here rather than relying on an earlier cell
# having made it, so this cell also works when run on its own.
pathlib.Path(metrics_path).parent.mkdir(parents=True, exist_ok=True)
bt_sarimax.to_csv(metrics_path, index=False)

# Cast to built-in types so the values serialize cleanly to JSON.
summary = {
    "folds": int(bt_sarimax.shape[0]),
    "horizon_days": int(HORIZON),
    "mape_mean": float(bt_sarimax["MAPE"].mean()),
    "mae_mean": float(bt_sarimax["MAE"].mean())
}

summary_path = pathlib.Path("reports/metrics/sarimax_summary.json")
summary_path.parent.mkdir(parents=True, exist_ok=True)
with summary_path.open("w", encoding="utf-8") as f:
    json.dump(summary, f, indent=2)

summary