<a href="https://colab.research.google.com/github/eghib22/Store-Sales-Forecasting/blob/main/model_experiment_prophet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

!mkdir ~/.kaggle
from google.colab import files
files.upload()
!mv "kaggle.json" ~/.kaggle/kaggle.json
!chmod 600 ~/.kaggle/kaggle.json
!ls -l ~/.kaggle/

!kaggle competitions download -c walmart-recruiting-store-sales-forecasting
!unzip walmart-recruiting-store-sales-forecasting
!unzip '*.csv.zip'
!unzip '*.csv.zip'

!pip install prophet
!pip install -q dagshub mlflow scikit-learn joblib


In [None]:
# Experiment tracking: initialise the DagsHub integration for this repository.
# NOTE(review): mlflow=True presumably points MLflow tracking at the DagsHub
# repo, so this should run before any mlflow.* call — confirm against DagsHub docs.
import dagshub
dagshub.init(repo_owner='eghib22', repo_name='Store-Sales-Forecasting', mlflow=True)

import mlflow
import logging
# Raise the log level of the Prophet / cmdstanpy loggers to WARNING so that
# fitting many models in a loop does not flood the cell output.
logging.getLogger("cmdstanpy").setLevel(logging.WARNING)
logging.getLogger("prophet").setLevel(logging.WARNING)

import warnings
# Caution: this silences *all* Python warnings for the rest of the session,
# including deprecation warnings raised by pandas and other libraries.
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import gc

from prophet import Prophet
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
import joblib

# Load the three competition tables extracted into the working directory.
train = pd.read_csv('train.csv')
features = pd.read_csv('features.csv')
stores = pd.read_csv('stores.csv')

# Parse dates so that merging and chronological sorting work on real timestamps.
train['Date'] = pd.to_datetime(train['Date'])
features['Date'] = pd.to_datetime(features['Date'])

# One row per (Store, Dept, Date): sales joined with the store/date level
# features and the static store attributes.
df = train.merge(features, on=['Store', 'Date', 'IsHoliday'], how='left')
df = df.merge(stores, on='Store', how='left')
df = df.sort_values('Date')

# Fill gaps in the exogenous columns. The frame is sorted by Date only, so rows
# of different stores are interleaved; filling per store keeps one store's
# values from being copied into another store's gaps. `.ffill()` / `.bfill()`
# replace the deprecated `fillna(method=...)` form.
FILL_COLS = ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment']
df[FILL_COLS] = df.groupby('Store')[FILL_COLS].ffill()
df[FILL_COLS] = df.groupby('Store')[FILL_COLS].bfill()
# Fallback for a store with no observed value at all in a column: use the
# nearest value in global date order, as the original global fill did.
df[FILL_COLS] = df[FILL_COLS].ffill().bfill()

def weighted_mae(y_true, y_pred, weights):
    """Return the weighted mean absolute error.

    Computes ``sum(w * |y_true - y_pred|) / sum(w)``.

    Parameters
    ----------
    y_true, y_pred, weights : array-like of equal length
        Observed values, predicted values and per-observation weights.
        Elements are matched by position.

    Returns
    -------
    numpy.float64
        The weighted MAE. If the weights sum to zero the division yields
        ``nan``/``inf`` following NumPy semantics.
    """
    # Convert to plain arrays first: pandas Series would otherwise be aligned
    # by index label, and inputs with different indexes would turn into NaNs
    # that np.sum silently skips, producing a wrong (too small) score.
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    w = np.asarray(weights, dtype=float)
    return np.sum(w * np.abs(actual - predicted)) / np.sum(w)


In [None]:
class ProphetWrapper(BaseEstimator, RegressorMixin):
    """scikit-learn style adapter around a single ``Prophet`` model.

    Lets a Prophet forecaster be used as the final step of a
    ``sklearn.pipeline.Pipeline``.

    Parameters
    ----------
    yearly_seasonality, weekly_seasonality, daily_seasonality
        Passed through unchanged to ``Prophet``.
    changepoint_prior_scale : float
        Passed through unchanged to ``Prophet``.

    Attributes
    ----------
    model_ : Prophet
        The fitted Prophet instance. It exists only after ``fit`` succeeded.
    """

    def __init__(self, yearly_seasonality=True, weekly_seasonality=True, daily_seasonality=False, changepoint_prior_scale=0.5):
        # Only hyperparameters are stored here. Following the scikit-learn
        # convention, the learned attribute `model_` is created in `fit`, so
        # fitted-state checks cannot mistake a fresh estimator for a fitted one.
        self.yearly_seasonality = yearly_seasonality
        self.weekly_seasonality = weekly_seasonality
        self.daily_seasonality = daily_seasonality
        self.changepoint_prior_scale = changepoint_prior_scale

    def fit(self, X, y=None):
        """Fit a new Prophet model.

        Parameters
        ----------
        X : pandas.DataFrame
            Training frame in Prophet format: a ``ds`` column with the dates
            and normally a ``y`` column with the target.
        y : array-like, optional
            Target values. Used only when ``X`` has no ``y`` column, so the
            wrapper also works with the usual ``fit(X, y)`` calling style.

        Returns
        -------
        self
        """
        if y is not None and 'y' not in X.columns:
            # np.asarray drops any index so the target is attached by position.
            X = X.assign(y=np.asarray(y))

        model = Prophet(
            yearly_seasonality=self.yearly_seasonality,
            weekly_seasonality=self.weekly_seasonality,
            daily_seasonality=self.daily_seasonality,
            changepoint_prior_scale=self.changepoint_prior_scale
        )
        model.fit(X)
        # Publish the model only once fitting succeeded, so a failed fit never
        # leaves a half-initialised estimator behind.
        self.model_ = model
        return self

    def predict(self, X):
        """Return the ``yhat`` point forecasts for the dates in ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame with a ``ds`` column holding the dates to forecast.

        Returns
        -------
        numpy.ndarray
            The ``yhat`` column of the Prophet forecast.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called successfully.
        """
        model = getattr(self, 'model_', None)
        if model is None:
            # Imported locally so the fix needs no change to the file's imports.
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "This ProphetWrapper instance is not fitted yet. "
                "Call 'fit' before using 'predict'."
            )
        forecast = model.predict(X)
        return forecast['yhat'].values


In [None]:
# --- Configuration ----------------------------------------------------------
# Hyperparameters shared by every per-series model. They are defined once so
# the values logged to MLflow cannot drift from the values actually used.
PROPHET_PARAMS = {
    'yearly_seasonality': True,
    'weekly_seasonality': True,
    'daily_seasonality': False,
    'changepoint_prior_scale': 0.5,
}
# Parameter names under which the hyperparameters are logged to MLflow.
MLFLOW_PARAM_NAMES = {
    'yearly_seasonality': 'seasonality_yearly',
    'weekly_seasonality': 'seasonality_weekly',
    'daily_seasonality': 'seasonality_daily',
    'changepoint_prior_scale': 'changepoint_prior_scale',
}
TRAIN_END = '2012-01-01'   # training data: ds <  TRAIN_END
VAL_END = '2012-07-01'     # validation:    TRAIN_END <= ds < VAL_END
MIN_TRAIN_ROWS = 100       # series with fewer training rows are skipped
MIN_VAL_ROWS = 20          # series with fewer validation rows are skipped
HOLIDAY_WEIGHT = 5         # weight of holiday weeks in WMAE (other weeks: 1)


def make_prophet_pipeline():
    """Build a fresh, unfitted single-step pipeline using PROPHET_PARAMS."""
    return Pipeline([('model', ProphetWrapper(**PROPHET_PARAMS))])


# --- Per-series training and evaluation -------------------------------------
results = []     # one metrics record per successfully evaluated series
all_preds = []   # per-series validation predictions, pooled for the overall WMAE

store_dept_groups = df.groupby(['Store', 'Dept'])
total_groups = len(store_dept_groups)

print(f"--- Starting Prophet for {total_groups} Store-Dept combos ---")

mlflow.set_experiment("Prophet_Forecasting")

with mlflow.start_run(run_name="Prophet_Improved_Seasonality_Run"):
    mlflow.log_param("model", "Prophet")
    for param_key, param_value in PROPHET_PARAMS.items():
        mlflow.log_param(MLFLOW_PARAM_NAMES[param_key], param_value)

    for idx, ((store_id, dept_id), group) in enumerate(store_dept_groups, start=1):
        print(f"\n--- Processing Store: {store_id}, Dept: {dept_id} ({idx}/{total_groups}) ---")

        # Prophet expects the date column to be named `ds` and the target `y`.
        g = group.sort_values('Date').copy()
        g['ds'] = g['Date']
        g['y'] = g['Weekly_Sales']

        # Chronological hold-out split.
        train_part = g[g['ds'] < TRAIN_END]
        val_part = g[(g['ds'] >= TRAIN_END) & (g['ds'] < VAL_END)]

        if len(train_part) < MIN_TRAIN_ROWS or len(val_part) < MIN_VAL_ROWS:
            print(f"   Skipped: Not enough data ({len(train_part)} train, {len(val_part)} val)")
            continue

        # Holiday weeks count HOLIDAY_WEIGHT times as much as ordinary weeks.
        # Vectorised, and kept as a plain array so it pairs positionally with
        # the prediction arrays below.
        weights_val = np.where(val_part['IsHoliday'].astype(bool), HOLIDAY_WEIGHT, 1)

        try:
            pipe = make_prophet_pipeline()
            pipe.fit(train_part[['ds', 'y']])
            y_pred = pipe.predict(val_part[['ds']])

            y_true = val_part['y'].values
            wmae = weighted_mae(y_true, y_pred, weights_val)
            rmse = np.sqrt(mean_squared_error(y_true, y_pred))

            print(f"   WMAE: {wmae:.2f} | RMSE: {rmse:.2f}")

            results.append({
                'Store': store_id,
                'Dept': dept_id,
                'RMSE': rmse,
                'WMAE': wmae
            })

            all_preds.append(pd.DataFrame({
                'y_true': y_true,
                'y_pred': y_pred,
                'weight': weights_val
            }))

        except Exception as e:
            # Best effort: one failing series must not abort the whole run.
            print(f"   Failed: {e}")
        finally:
            # Release the fitted model's memory after every attempt, including
            # failed ones (previously skipped by the `continue` on failure).
            gc.collect()

    if len(all_preds) > 0:
        # ignore_index avoids the repeated 0..k-1 labels of the per-series frames.
        all_df = pd.concat(all_preds, ignore_index=True)
        overall_wmae = weighted_mae(
            all_df['y_true'].to_numpy(),
            all_df['y_pred'].to_numpy(),
            all_df['weight'].to_numpy(),
        )
        print(f"\n Overall WMAE: {overall_wmae:.2f}")
        mlflow.log_metric("Overall_WMAE", overall_wmae)
    else:
        print("No valid predictions generated.")

    mlflow.log_metric("Total_StoreDept_Models", len(results))

    # NOTE(review): this logs an *unfitted* pipeline that only carries the
    # hyperparameters; none of the per-series models fitted above is stored.
    # Loading the artifact and calling predict() will not work without refitting.
    best_pipeline = make_prophet_pipeline()

    mlflow.sklearn.log_model(best_pipeline, artifact_path="Prophet_Pipeline")

    results_df = pd.DataFrame(results)
    results_df.to_csv('/content/drive/MyDrive/prophet_results.csv', index=False)

    print(results_df.head())

# The `with` block has already ended the MLflow run, so no explicit
# mlflow.end_run() is needed here.
print("Done. Model logged and saved.")
