<a href="https://colab.research.google.com/github/eghib22/Store-Sales-Forecasting/blob/main/model_experiment_SARIMA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

!mkdir ~/.kaggle
from google.colab import files
files.upload()
!mv "kaggle.json" ~/.kaggle/kaggle.json
!chmod 600 ~/.kaggle/kaggle.json
!ls -l ~/.kaggle/

!kaggle competitions download -c walmart-recruiting-store-sales-forecasting
!unzip walmart-recruiting-store-sales-forecasting
!unzip '*.csv.zip'
!pip install -q dagshub mlflow scikit-learn joblib


In [None]:
import pandas as pd
import numpy as np
import gc
import mlflow
import dagshub

from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import joblib

import warnings
warnings.filterwarnings("ignore")

# Route MLflow tracking to the DagsHub-hosted server for this repository and
# select the experiment that all runs of this notebook are grouped under.
dagshub.init(repo_owner='eghib22', repo_name='Store-Sales-Forecasting', mlflow=True)
mlflow.set_experiment("SARIMAX_Modeling")

# Raw competition tables, expected in the current working directory.
train = pd.read_csv('train.csv')
features = pd.read_csv('features.csv')
stores = pd.read_csv('stores.csv')

train['Date'] = pd.to_datetime(train['Date'])
features['Date'] = pd.to_datetime(features['Date'])

# Impute the macro-economic columns on the per-store feature table *before*
# merging. Filling inside each store, in date order, guarantees a gap is only
# ever filled from the same store's neighbouring weeks. Filling the merged
# frame row-by-row could carry values across department/store boundaries.
# `.ffill()` / `.bfill()` replace the deprecated `fillna(method=...)` form.
macro_cols = ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment']
features = features.sort_values(['Store', 'Date']).reset_index(drop=True)
features[macro_cols] = features.groupby('Store')[macro_cols].ffill()
features[macro_cols] = features.groupby('Store')[macro_cols].bfill()

# Left joins keep every training row, in the original training order.
df = train.merge(features, on=['Store', 'Date', 'IsHoliday'], how='left')
df = df.merge(stores, on='Store', how='left')

# Calendar features derived from the week's date.
df['Month'] = df['Date'].dt.month
# isocalendar() returns a nullable UInt32 column; cast to a plain integer.
df['WeekOfYear'] = df['Date'].dt.isocalendar().week.astype('int64')
# Per-week holiday indicator (1 = holiday week, 0 = regular week). This must be
# row-wise: a group-level max would be the same constant for every week of a
# Store/Dept series and therefore carry no information as a regressor.
df['HolidayWeek'] = df['IsHoliday'].astype(int)
exog_cols = ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'Month', 'WeekOfYear', 'HolidayWeek']

print(f"Using exogenous features: {exog_cols}")

def weighted_mae(y_true, y_pred, weights):
    """Weighted mean absolute error: sum(w * |y_true - y_pred|) / sum(w).

    Inputs are compared element-by-element *by position*. They are converted
    to NumPy arrays first, so pandas objects with different indexes are not
    label-aligned (alignment would turn every term into NaN, which pandas then
    skips when summing, silently reporting an error of 0).

    Parameters
    ----------
    y_true, y_pred : array-like of numbers, same length
        Observed and predicted values.
    weights : array-like of numbers, same length as ``y_true``
        Non-negative weight of each observation.

    Returns
    -------
    float
        The weighted mean absolute error.

    Raises
    ------
    ValueError
        If the three inputs do not have the same shape.
    ZeroDivisionError
        If the weights sum to zero (raised by ``numpy.average``).
    """
    true_arr = np.asarray(y_true, dtype=float)
    pred_arr = np.asarray(y_pred, dtype=float)
    weight_arr = np.asarray(weights, dtype=float)

    if not (true_arr.shape == pred_arr.shape == weight_arr.shape):
        raise ValueError(
            "y_true, y_pred and weights must have the same shape, got "
            f"{true_arr.shape}, {pred_arr.shape} and {weight_arr.shape}"
        )

    return np.average(np.abs(true_arr - pred_arr), weights=weight_arr)


In [None]:
class SARIMAXWrapper(BaseEstimator, RegressorMixin):
    """scikit-learn style adapter around statsmodels' ``SARIMAX``.

    The exogenous regressors play the role of ``X`` and the observed series
    the role of ``y``, so the model can be used as the final step of a
    ``Pipeline``.

    Parameters
    ----------
    order : tuple
        Passed to ``SARIMAX`` as ``order``.
    seasonal_order : tuple
        Passed to ``SARIMAX`` as ``seasonal_order``.

    Attributes
    ----------
    model_ : fitted statsmodels results object
        Set by ``fit``.
    train_index_ : index of the training target, or None
        Set by ``fit``; used by ``predict`` to tell in-sample requests from
        out-of-sample ones.
    """

    def __init__(self, order=(1,1,1), seasonal_order=(1,1,1,52)):
        # Only hyper-parameters are stored here. Fitted state (trailing
        # underscore) is created in ``fit`` so that scikit-learn's fitted-state
        # detection does not mistake a fresh estimator for a fitted one.
        self.order = order
        self.seasonal_order = seasonal_order

    def fit(self, X, y):
        """Fit SARIMAX on target ``y`` with exogenous regressors ``X``.

        Returns ``self`` so calls can be chained.
        """
        self.model_ = SARIMAX(
            y,
            exog=X,
            order=self.order,
            seasonal_order=self.seasonal_order,
            enforce_stationarity=False,
            enforce_invertibility=False
        ).fit(disp=False)
        # Remember the training labels (None when ``y`` is not a pandas object).
        self.train_index_ = getattr(y, "index", None)
        return self

    def predict(self, X):
        """Predict one value per row of ``X``.

        * If ``X`` starts at a label that belongs to the training sample, the
          prediction is requested by label (``start``/``end``), as before.
        * Otherwise the rows of ``X`` are treated as the consecutive periods
          immediately following the training sample and forecast by step
          count. This does not depend on statsmodels being able to infer a
          date frequency, which it cannot do for series with missing weeks.
          The result is labelled with ``X``'s own index when it has one.

        Raises
        ------
        NotFittedError
            If ``fit`` has not been called.
        ValueError
            If ``X`` has no rows.
        """
        from sklearn.exceptions import NotFittedError

        fitted = getattr(self, "model_", None)
        if fitted is None:
            raise NotFittedError(
                "This SARIMAXWrapper instance is not fitted yet. "
                "Call 'fit' before using 'predict'."
            )

        n_steps = len(X)
        if n_steps == 0:
            raise ValueError("X must contain at least one row to predict.")

        x_index = getattr(X, "index", None)
        train_index = getattr(self, "train_index_", None)

        starts_in_sample = (
            x_index is not None
            and train_index is not None
            and x_index[0] in train_index
        )
        if starts_in_sample:
            return fitted.predict(start=x_index[0], end=x_index[-1], exog=X)

        forecast = fitted.forecast(steps=n_steps, exog=X)
        if x_index is not None:
            # Re-label with the caller's index so results line up with y_val.
            forecast = pd.Series(
                np.asarray(forecast, dtype=float),
                index=x_index,
                name=getattr(forecast, "name", None),
            )
        return forecast


In [None]:
# --- Experiment configuration (single source of truth for this run) ----------
SARIMAX_ORDER = (1, 1, 2)
SARIMAX_SEASONAL_ORDER = (1, 1, 1, 52)
TRAIN_END = '2012-01-01'   # training uses dates strictly before this day
VAL_END = '2012-07-01'     # validation uses dates in [TRAIN_END, VAL_END)
MIN_TRAIN_OBS = 100        # groups with fewer training weeks are skipped
MIN_VAL_OBS = 20           # groups with fewer validation weeks are skipped
HOLIDAY_WEIGHT = 5         # holiday weeks weigh 5x in WMAE, other weeks 1x
RESULTS_PATH = '/content/drive/MyDrive/sarimax_results.csv'


def _format_order(order):
    """Render an order tuple without spaces, e.g. (1, 1, 2) -> "(1,1,2)"."""
    return "(" + ",".join(str(v) for v in order) + ")"


results = []      # one metrics record per successfully evaluated group
all_preds = []    # per-group validation predictions, pooled for overall WMAE
n_skipped = 0     # groups without enough history
n_failed = 0      # groups where fitting or prediction raised

store_dept_groups = df.groupby(['Store', 'Dept'])
total_groups = store_dept_groups.ngroups

print(f"--- Starting SARIMAX for {total_groups} Store-Dept combos ---")

# The context manager closes the MLflow run on exit (also when an exception
# propagates), so no explicit mlflow.end_run() is needed afterwards.
with mlflow.start_run(run_name="SARIMAX_Improved_CV_Run"):
    # Logged from the same constants that configure the model below, so the
    # recorded parameters cannot drift from what was actually fitted.
    mlflow.log_param("SARIMAX_order", _format_order(SARIMAX_ORDER))
    mlflow.log_param("SARIMAX_seasonal_order", _format_order(SARIMAX_SEASONAL_ORDER))
    mlflow.log_param("exogenous_features", exog_cols)

    for idx, ((store_id, dept_id), group) in enumerate(store_dept_groups, start=1):
        print(f"\n--- Processing Store: {store_id}, Dept: {dept_id} ({idx}/{total_groups}) ---")

        g = group.sort_values('Date').set_index('Date')

        # Time-based split: history before TRAIN_END, validation up to VAL_END.
        train_mask = g.index < TRAIN_END
        val_mask = (g.index >= TRAIN_END) & (g.index < VAL_END)

        y_train = g.loc[train_mask, 'Weekly_Sales']
        y_val = g.loc[val_mask, 'Weekly_Sales']
        weights_val = np.where(g.loc[val_mask, 'IsHoliday'], HOLIDAY_WEIGHT, 1)

        if len(y_train) < MIN_TRAIN_OBS or len(y_val) < MIN_VAL_OBS:
            n_skipped += 1
            print(f"   Skipped: Not enough data (Train: {len(y_train)}, Val: {len(y_val)})")
            continue

        try:
            X_train = g.loc[train_mask, exog_cols].astype('float64')
            X_val = g.loc[val_mask, exog_cols].astype('float64')

            pipe = Pipeline([
                ('model', SARIMAXWrapper(order=SARIMAX_ORDER,
                                         seasonal_order=SARIMAX_SEASONAL_ORDER))
            ])

            pipe.fit(X_train, y_train)

            # Work on plain arrays: metrics are computed by position and never
            # depend on how the prediction happens to be indexed.
            y_obs = y_val.to_numpy(dtype=float)
            y_hat = np.asarray(pipe.predict(X_val), dtype=float)
            if y_hat.shape != y_obs.shape:
                raise ValueError(
                    f"prediction length {y_hat.shape} does not match "
                    f"validation length {y_obs.shape}"
                )

            wmae = weighted_mae(y_obs, y_hat, weights_val)
            rmse = np.sqrt(mean_squared_error(y_obs, y_hat))

            print(f"   WMAE: {wmae:.2f} | RMSE: {rmse:.2f}")

            results.append({
                'Store': store_id,
                'Dept': dept_id,
                'RMSE': rmse,
                'WMAE': wmae
            })

            all_preds.append(pd.DataFrame({
                'y_true': y_obs,
                'y_pred': y_hat,
                'weight': weights_val
            }))

        except Exception as e:
            # Best effort: one bad series must not abort the whole sweep, but
            # the failure is counted so it shows up in the run's metrics.
            n_failed += 1
            print(f"   Failed: {e}")

        finally:
            # Release the fitted model's memory after every attempt.
            gc.collect()

    if len(all_preds) > 0:
        all_df = pd.concat(all_preds, ignore_index=True)
        # Pooled over all validation rows of all evaluated groups.
        overall_wmae = weighted_mae(
            all_df['y_true'].to_numpy(),
            all_df['y_pred'].to_numpy(),
            all_df['weight'].to_numpy(),
        )
        print(f"\n Overall WMAE: {overall_wmae:.2f}")
        mlflow.log_metric('Overall_WMAE', overall_wmae)
    else:
        print("No valid predictions generated.")

    mlflow.log_metric('total_groups_processed', len(results))
    mlflow.log_metric('total_groups_skipped', n_skipped)
    mlflow.log_metric('total_groups_failed', n_failed)

    # NOTE: this artifact is an *unfitted* pipeline. It records the model
    # configuration only; it contains no trained parameters and must be fitted
    # on a Store/Dept series before it can predict.
    pipeline_template = Pipeline([
        ('model', SARIMAXWrapper(order=SARIMAX_ORDER,
                                 seasonal_order=SARIMAX_SEASONAL_ORDER))
    ])
    joblib.dump(pipeline_template, "sarimax_pipeline.pkl")
    mlflow.log_artifact("sarimax_pipeline.pkl")

    results_df = pd.DataFrame(results)
    results_df.to_csv(RESULTS_PATH, index=False)

    print(results_df.head())

print("Done. Model and results saved.")
