<a href="https://colab.research.google.com/github/giorgitorro/2D_PlatformGame/blob/main/model_experiment_SARIMA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install mlflow==2.2.2



In [2]:
!pip install kaggle



In [3]:
!pip install pmdarima



In [4]:
from google.colab import files
files.upload()

Saving kaggle.json to kaggle (1).json


{'kaggle (1).json': b'{"username":"giorgitoronjadze","key":"9c4aebca47b5575e27b4c70560a74ddf"}'}

In [5]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [6]:
!pip install opendatasets
!pip install dagshub



In [7]:
import opendatasets as od
od.download("https://www.kaggle.com/competitions/walmart-recruiting-store-sales-forecasting/overview")

Skipping, found downloaded files in "./walmart-recruiting-store-sales-forecasting" (use force=True to force download)


In [8]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from datetime import datetime
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error, mean_absolute_error
import mlflow
import mlflow.sklearn
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import warnings
import os
from sklearn.model_selection import TimeSeriesSplit
import pickle
from pmdarima import auto_arima

Load training data

In [9]:
train = pd.read_csv('/content/walmart-recruiting-store-sales-forecasting/train.csv.zip')  # Store, Dept, Date, Weekly_Sales, IsHoliday
features = pd.read_csv('/content/walmart-recruiting-store-sales-forecasting/features.csv.zip')  # Store, Date, Temperature, Fuel_Price, CPI, Unemployment, IsHoliday
stores = pd.read_csv('/content/walmart-recruiting-store-sales-forecasting/stores.csv')  # Store, Type, Size

print("Train columns:", train.columns.tolist())
print("Features columns:", features.columns.tolist())
print("Stores columns:", stores.columns.tolist())

Train columns: ['Store', 'Dept', 'Date', 'Weekly_Sales', 'IsHoliday']
Features columns: ['Store', 'Date', 'Temperature', 'Fuel_Price', 'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment', 'IsHoliday']
Stores columns: ['Store', 'Type', 'Size']


Data Preprocessing

In [10]:
train_merged = train.merge(features, on=['Store', 'Date', 'IsHoliday'], how='left')
train_merged = train_merged.merge(stores, on='Store', how='left')

train_merged['Date'] = pd.to_datetime(train_merged['Date'])

train_merged = train_merged.sort_values(['Store', 'Dept', 'Date'])

train_merged.fillna({
    'MarkDown1': 0,
    'MarkDown2': 0,
    'MarkDown3': 0,
    'MarkDown4': 0,
    'MarkDown5': 0,
    'CPI': train_merged['CPI'].mean(),
    'Unemployment': train_merged['Unemployment'].mean()
}, inplace=True)

In [11]:
#unique identifier for each Store-Dept combination
train_merged['Store_Dept'] = train_merged['Store'].astype(str) + '_' + train_merged['Dept'].astype(str)

#exogenous variables
exog_columns = ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'IsHoliday', 'Size']
train_exog = train_merged[exog_columns]

In [12]:
import dagshub
dagshub.init(repo_owner='CarlTeapot', repo_name='Walmart-Recruiting', mlflow=True)

In [13]:
mlflow.set_experiment('SARIMA_Training')

with mlflow.start_run(run_name='SARIMA_Preprocessing'):
    mlflow.log_param('exog_columns', exog_columns)
    mlflow.log_param('missing_value_strategy', 'fill_with_zero_or_mean')
    mlflow.log_metric('train_data_rows', len(train_merged))
    mlflow.log_metric('unique_store_dept', train_merged['Store_Dept'].nunique())

2025/07/08 18:59:42 INFO mlflow.tracking.fluent: Experiment with name 'SARIMA_Training' does not exist. Creating a new experiment.


In [14]:
class SARIMAModel:
    def __init__(self, order=(1, 1, 1), seasonal_order=(1, 1, 1, 52), m=52):
        self.order = order
        self.seasonal_order = seasonal_order
        self.m = m
        self.model = None

    def fit(self, X, y):
        try:
            self.model = auto_arima(
                y,
                exogenous=X,
                start_p=0,
                start_q=0,
                max_p=3,
                max_q=3,
                start_P=0,
                start_Q=0,
                max_P=2,
                max_Q=2,
                m=self.m,
                seasonal=(self.m > 1),
                suppress_warnings=True,
                stepwise=True,
                error_action='ignore'  # Ignore errors for problematic fits
            )
        except ValueError as e:
            print(f"Error fitting SARIMA for m={self.m}: {e}")
            # Fallback to non-seasonal ARIMA if seasonal differencing fails
            self.model = auto_arima(
                y,
                exogenous=X,
                start_p=0,
                start_q=0,
                max_p=3,
                max_q=3,
                seasonal=False,
                suppress_warnings=True,
                stepwise=True
            )
        return self

    def predict(self, X):
        return self.model.predict(n_periods=len(X), exogenous=X)

 Train ARIMA Models with Cross-Validation



In [16]:
from sklearn.preprocessing import StandardScaler
import math

pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Scale exogenous variables
    ('sarima', SARIMAModel(order=(1, 1, 1), seasonal_order=(1, 1, 1, 52)))
])
tscv = TimeSeriesSplit(n_splits=5)

# Train and evaluate for each Store_Dept
skipped_store_depts = []
for store_dept in train_merged['Store_Dept'].unique()[:5]:  #only 5 dept for 1st store
    with mlflow.start_run(run_name=f'SARIMA_{store_dept}'):
        train_subset = train_merged[train_merged['Store_Dept'] == store_dept]
        y_train = train_subset['Weekly_Sales'].values
        X_train = train_subset[exog_columns].values
        is_holiday = train_subset['IsHoliday'].values
        if len(y_train) < 52:
            print(f"Skipping Store_Dept {store_dept}: Only {len(y_train)} weeks, need at least 52 for m=52")
            skipped_store_depts.append(store_dept)
            mlflow.log_param('skipped', True)
            mlflow.log_param('reason', f'Insufficient data: {len(y_train)} weeks')
            continue

        mlflow.log_param('store_dept', store_dept)
        mlflow.log_param('order', (1, 1, 1))
        mlflow.log_param('seasonal_order', (1, 1, 1, 52))
        mlflow.log_param('seasonal_period', 52)

        mse_scores = []
        mae_scores = []
        wmae_scores = []
        for train_idx, val_idx in tscv.split(y_train):
            if len(train_idx) < 52:
                print(f"Skipping fold for {store_dept}: Training fold has {len(train_idx)} weeks")
                continue
            X_tr, X_val = X_train[train_idx], X_train[val_idx]
            y_tr, y_val = y_train[train_idx], y_train[val_idx]
            is_holiday_val = is_holiday[val_idx]

            try:
                pipeline.fit(X=X_tr, y=y_tr)
                pred = pipeline.predict(X_val)

                mse = mean_squared_error(y_val, pred)
                mae = mean_absolute_error(y_val, pred)
                weights = np.where(is_holiday_val, 5, 1)  # Holiday weeks have 5x weight
                wmae = np.sum(weights * np.abs(y_val - pred)) / np.sum(weights)
                mse_scores.append(mse)
                mae_scores.append(mae)
                wmae_scores.append(wmae)
            except Exception as e:
                print(f"Error in fold for {store_dept}: {e}")
                continue

        if not mse_scores:
            print(f"No valid folds for {store_dept}")
            mlflow.log_param('skipped', True)
            mlflow.log_param('reason', 'No valid cross-validation folds')
            skipped_store_depts.append(store_dept)
            continue
        avg_mse = np.mean(mse_scores)
        avg_rmse = math.sqrt(avg_mse)
        avg_mae = np.mean(mae_scores)
        avg_wmae = np.mean(wmae_scores)
        mlflow.log_metric('avg_mse', avg_mse)
        mlflow.log_metric('avg_rmse', avg_rmse)
        mlflow.log_metric('avg_mae', avg_mae)
        mlflow.log_metric('avg_wmae', avg_wmae)
        print(f'Store_Dept: {store_dept}')
        print(f'Average MSE: {avg_mse}')
        print(f'Average RMSE: {avg_rmse}')
        print(f'Average MAE: {avg_mae}')
        print(f'Average WMAE: {avg_wmae}')

        pipeline.fit(X=X_train, y=y_train)

        pipeline_path = f'model_SARIMA_{store_dept}.pkl'
        with open(pipeline_path, 'wb') as f:
            pickle.dump(pipeline, f)
        mlflow.log_artifact(pipeline_path)

        mlflow.sklearn.log_model(pipeline, f'sarima_pipeline_{store_dept}')

# Log skipped Store_Dept combinations
if skipped_store_depts:
    with open('skipped_store_depts.txt', 'w') as f:
        f.write('\n'.join(skipped_store_depts))
    mlflow.log_artifact('skipped_store_depts.txt')

Skipping fold for 1_1: Training fold has 28 weeks
Skipping fold for 1_1: Training fold has 51 weeks




Store_Dept: 1_1
Average MSE: 51627322.604190744
Average RMSE: 7185.215557253015
Average MAE: 4791.498442091329
Average WMAE: 4410.024201579198




Skipping fold for 1_2: Training fold has 28 weeks
Skipping fold for 1_2: Training fold has 51 weeks




Store_Dept: 1_2
Average MSE: 5450142.515697577
Average RMSE: 2334.5540292950122
Average MAE: 1732.6853900255749
Average WMAE: 2109.429351500457




Skipping fold for 1_3: Training fold has 28 weeks
Skipping fold for 1_3: Training fold has 51 weeks




Store_Dept: 1_3
Average MSE: 18716141.495028988
Average RMSE: 4326.215608939178
Average MAE: 1873.8405794572245
Average WMAE: 1912.3585940193734




Skipping fold for 1_4: Training fold has 28 weeks
Skipping fold for 1_4: Training fold has 51 weeks




Store_Dept: 1_4
Average MSE: 4249055.464814525
Average RMSE: 2061.3237166477575
Average MAE: 1601.996316071636
Average WMAE: 1607.861407135102




Skipping fold for 1_5: Training fold has 28 weeks
Skipping fold for 1_5: Training fold has 51 weeks




Store_Dept: 1_5
Average MSE: 82331147.68206201
Average RMSE: 9073.651287219605
Average MAE: 5336.278591736364
Average WMAE: 7051.696233173437


