In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Read the pre-cleaned training data; the `date` column is parsed into datetimes on load.
TRAIN_CLEAN_PATH = "../data/processed/train_clean.csv"
df = pd.read_csv(TRAIN_CLEAN_PATH, parse_dates=["date"])

# Quick look at the frame's dimensions and first rows to confirm the load worked.
print(df.shape)
print(df.head())

(913000, 7)
        date  store  item  sales  year  month  dayofweek
0 2013-01-01      1     1     13  2013      1          1
1 2013-01-02      1     1     11  2013      1          2
2 2013-01-03      1     1     14  2013      1          3
3 2013-01-04      1     1     13  2013      1          4
4 2013-01-05      1     1     10  2013      1          5


In [None]:
def time_series_for_store_item(data, store_id, item_id):
    """Extract the date-ordered sales series for one store/item pair.

    Parameters
    ----------
    data : DataFrame with at least "store", "item", "date" and "sales" columns.
    store_id : value matched against the "store" column.
    item_id : value matched against the "item" column.

    Returns
    -------
    DataFrame holding only the "date" and "sales" columns of the matching
    rows, sorted by "date" and re-indexed from 0. Empty when nothing matches.
    """
    is_store = data["store"] == store_id
    is_item = data["item"] == item_id
    matching_rows = data[is_store & is_item]

    # Order chronologically, then discard the original row labels.
    ordered = matching_rows.sort_values("date").reset_index(drop=True)
    return ordered[["date", "sales"]]

def train_test_split_time(series_df, test_size=0.2):
    """Split the series by position: the last `test_size` fraction is the test set.

    The input is assumed to be already ordered chronologically; rows are
    split purely by position, no sorting happens here.

    Parameters
    ----------
    series_df : DataFrame (or any object supporting `len` and `.iloc` slicing).
    test_size : fraction of rows, between 0 and 1 inclusive, placed in the
        test set.

    Returns
    -------
    (train, test) : independent copies of the leading and trailing slices.

    Raises
    ------
    ValueError
        If `test_size` is outside [0, 1]. Such values would produce a
        negative or out-of-range split index and a silently wrong split.
    """
    if not 0 <= test_size <= 1:
        raise ValueError(f"test_size must be between 0 and 1, got {test_size!r}")

    n = len(series_df)
    # Round before truncating: binary floating point can land a hair below the
    # intended integer (e.g. 10 * (1 - 0.9) == 0.9999999999999998), which plain
    # int() would truncate to one row too few in the training set.
    split_idx = int(round(n * (1 - test_size), 8))
    train = series_df.iloc[:split_idx].copy()
    test = series_df.iloc[split_idx:].copy()
    return train, test

def mape(y_true, y_pred):
    """Mean absolute percentage error, in percent.

    Observations whose actual value is exactly zero are excluded, since the
    percentage error is undefined for them.

    Parameters
    ----------
    y_true : array-like of actual values.
    y_pred : array-like of predicted values, same shape as `y_true`.

    Returns
    -------
    The mean of |(y_true - y_pred) / y_true| * 100 over the non-zero actuals,
    or NaN when there are no non-zero actuals (including empty input).
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    mask = y_true != 0

    # With no usable observations np.mean would run on an empty slice and
    # emit RuntimeWarnings; return NaN explicitly instead.
    if not mask.any():
        return np.nan

    actual = y_true[mask]
    return np.mean(np.abs((actual - y_pred[mask]) / actual)) * 100

def evaluate_forecast(name, y_true, y_pred):
    """Compute MAE, RMSE and MAPE for a forecast, print them, and return them.

    Parameters
    ----------
    name : str label for the model; used in the printed line and the result.
    y_true : array-like of actual values.
    y_pred : array-like of predicted values.

    Returns
    -------
    dict with keys "model", "mae", "rmse" and "mape".
    """
    mae = mean_absolute_error(y_true, y_pred)
    # The `squared=False` keyword was deprecated in scikit-learn 1.4 and
    # removed in 1.6; taking the square root of the MSE ourselves works on
    # every version and is identical for single-output targets.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mape_val = mape(y_true, y_pred)
    print(f"{name:22s} MAE={mae:7.2f}  RMSE={rmse:7.2f}  MAPE={mape_val:7.2f}")
    return {"model": name, "mae": mae, "rmse": rmse, "mape": mape_val}
