In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input tree and print the full path of every file found.
# `_` receives (and ignores) the list of sub-directory names that os.walk yields.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/walmart-recruiting-store-sales-forecasting/train.csv.zip
/kaggle/input/walmart-recruiting-store-sales-forecasting/sampleSubmission.csv.zip
/kaggle/input/walmart-recruiting-store-sales-forecasting/stores.csv
/kaggle/input/walmart-recruiting-store-sales-forecasting/features.csv.zip
/kaggle/input/walmart-recruiting-store-sales-forecasting/test.csv.zip


In [5]:
# Read the training table directly from the zipped CSV, then preview its first rows
# (the trailing expression is what the notebook renders as the cell output).
train_csv = pd.read_csv('/kaggle/input/walmart-recruiting-store-sales-forecasting/train.csv.zip')
train_csv.head()

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday
0,1,1,2010-02-05,24924.5,False
1,1,1,2010-02-12,46039.49,True
2,1,1,2010-02-19,41595.55,False
3,1,1,2010-02-26,19403.54,False
4,1,1,2010-03-05,21827.9,False


In [15]:
# Distinct department ids present in the training data (the result is not sorted).
train_csv['Dept'].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 16, 17, 18,
       19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
       36, 37, 38, 40, 41, 42, 44, 45, 46, 47, 48, 49, 51, 52, 54, 55, 56,
       58, 59, 60, 67, 71, 72, 74, 77, 78, 79, 80, 81, 82, 83, 85, 87, 90,
       91, 92, 93, 94, 95, 96, 97, 98, 99, 39, 50, 43, 65])

In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from sklearn.metrics import mean_absolute_error, mean_squared_error
import warnings
# Silences every warning for the rest of the session: no category or module filter is given.
# NOTE(review): presumably meant to quiet noisy model-fitting warnings, but it also hides
# all other warnings — consider narrowing the filter.
warnings.filterwarnings('ignore')

def weighted_mean_absolute_error(y_true, y_pred, weights):
    """Return the weighted mean absolute error between two sequences.

    The result is ``sum(w * |y_true - y_pred|) / sum(w)``. When the weights
    sum to zero, the plain (unweighted) mean absolute error is returned
    instead so the division is never by zero.

    Args:
        y_true: Observed values (any array-like).
        y_pred: Predicted values, same length as ``y_true``.
        weights: Per-observation weights, same length as ``y_true``.

    Returns:
        The weighted (or, for zero total weight, unweighted) mean absolute error.
    """
    weight_arr = np.array(weights)
    abs_errors = np.abs(np.array(y_true) - np.array(y_pred))

    total_weight = weight_arr.sum()
    if total_weight == 0:
        # No usable weights: fall back to the ordinary MAE
        return abs_errors.mean()

    return (weight_arr * abs_errors).sum() / total_weight

class WalmartSARIMAForecaster:
    """Per store-department SARIMA helper for the Walmart weekly sales data.

    Attributes:
        models: Maps a ``"<Store>_<Dept>"`` id to a fitted model that exposes
            ``get_forecast``. No method of this class populates it; callers
            must fill it before ``forecast_sales`` / ``generate_submission``
            can use model-based forecasts.
        forecasts: Initialised empty; not used by any method of this class.
        holiday_weights: Maps ``(Store, Date)`` to the WMAE weight
            (5 for holiday rows of the features file, otherwise 1).
    """

    def __init__(self):
        self.models = {}
        self.forecasts = {}
        self.holiday_weights = {}

    def load_and_preprocess_data(self, train_path, test_path, stores_path=None, features_path=None):
        """Load the CSV files and prepare the train/test frames.

        Dates are parsed, both frames are sorted by ``Store``/``Dept``/``Date``,
        missing training sales are replaced by 0, and the ``Store_Dept`` and
        ``Weight`` columns are added.

        Args:
            train_path: Path of the training CSV.
            test_path: Path of the test CSV.
            stores_path: Optional path of the stores CSV (stored as ``stores_data``).
            features_path: Optional path of the features CSV; when given it is
                also used to build the holiday weight lookup.

        Returns:
            Tuple ``(train_data, test_data)`` of the prepared DataFrames.
        """
        self.train_data = pd.read_csv(train_path)
        self.test_data = pd.read_csv(test_path)

        if stores_path:
            self.stores_data = pd.read_csv(stores_path)
        if features_path:
            self.features_data = pd.read_csv(features_path)
            self.features_data['Date'] = pd.to_datetime(self.features_data['Date'])
            self._process_holiday_weights()

        self.train_data['Date'] = pd.to_datetime(self.train_data['Date'])
        self.test_data['Date'] = pd.to_datetime(self.test_data['Date'])

        self.train_data = self.train_data.sort_values(['Store', 'Dept', 'Date'])
        self.test_data = self.test_data.sort_values(['Store', 'Dept', 'Date'])

        print(f"Train data shape: {self.train_data.shape}")
        print(f"Test data shape: {self.test_data.shape}")

        self.train_data['Weekly_Sales'] = self.train_data['Weekly_Sales'].fillna(0)

        self.train_data['Store_Dept'] = self.train_data['Store'].astype(str) + '_' + self.train_data['Dept'].astype(str)
        self.test_data['Store_Dept'] = self.test_data['Store'].astype(str) + '_' + self.test_data['Dept'].astype(str)

        self._add_holiday_weights_to_data()

        return self.train_data, self.test_data

    def _process_holiday_weights(self):
        """Fill ``holiday_weights`` from ``features_data`` (5 if holiday, else 1)."""
        if not hasattr(self, 'features_data'):
            return

        features = self.features_data
        if 'IsHoliday' in features.columns:
            holiday_flags = features['IsHoliday']
        else:
            # Without the flag column every week counts as a regular week
            holiday_flags = [False] * len(features)

        # Iterate the columns directly rather than building one Series per row
        for store, date, is_holiday in zip(features['Store'], features['Date'], holiday_flags):
            self.holiday_weights[(store, date)] = 5 if is_holiday else 1

    def _add_holiday_weights_to_data(self):
        """Add a ``Weight`` column to the train and test frames.

        Each row's weight is looked up by ``(Store, Date)`` in
        ``holiday_weights``; rows without an entry get weight 1.
        """
        for frame in (self.train_data, self.test_data):
            frame['Weight'] = [
                self.holiday_weights.get(key, 1)
                for key in zip(frame['Store'], frame['Date'])
            ]

    def explore_data(self):
        """Print summary statistics and show a 2x2 grid of overview plots."""
        print("Data Exploration:")
        print(f"Date range: {self.train_data['Date'].min()} to {self.train_data['Date'].max()}")
        print(f"Number of stores: {self.train_data['Store'].nunique()}")
        print(f"Number of departments: {self.train_data['Dept'].nunique()}")
        print(f"Number of store-department combinations: {self.train_data['Store_Dept'].nunique()}")

        plt.figure(figsize=(15, 8))

        weekly_total = self.train_data.groupby('Date')['Weekly_Sales'].sum().reset_index()

        # Panel 1: total sales per week
        plt.subplot(2, 2, 1)
        plt.plot(weekly_total['Date'], weekly_total['Weekly_Sales'])
        plt.title('Total Weekly Sales Over Time')
        plt.xlabel('Date')
        plt.ylabel('Weekly Sales')
        plt.xticks(rotation=45)

        # Panel 2: distribution of individual weekly sales values
        plt.subplot(2, 2, 2)
        plt.hist(self.train_data['Weekly_Sales'], bins=30, alpha=0.7)
        plt.title('Distribution of Weekly Sales')
        plt.xlabel('Weekly Sales')
        plt.ylabel('Frequency')
        plt.xlim(0, 300000)

        # Panel 3: the ten stores with the highest total sales
        plt.subplot(2, 2, 3)
        store_sales = self.train_data.groupby('Store')['Weekly_Sales'].sum().sort_values(ascending=False).head(10)
        plt.bar(range(len(store_sales)), store_sales.values)
        plt.title('Top 10 Stores by Total Sales')
        plt.xlabel('Store Rank')
        plt.ylabel('Total Sales')

        # Panel 4: the ten departments with the highest total sales
        plt.subplot(2, 2, 4)
        dept_sales = self.train_data.groupby('Dept')['Weekly_Sales'].sum().sort_values(ascending=False).head(10)
        plt.bar(range(len(dept_sales)), dept_sales.values)
        plt.title('Top 10 Departments by Total Sales')
        plt.xlabel('Department Rank')
        plt.ylabel('Total Sales')

        plt.tight_layout()
        plt.show()

    def plot_decomposition(self, timeseries, title):
        """Plot an additive seasonal decomposition (period 52) of ``timeseries``.

        Args:
            timeseries: Series passed to ``seasonal_decompose``.
            title: Prefix for the title of the first panel.

        Returns:
            The decomposition result object.
        """
        plt.figure(figsize=(15, 10))
        decomposition = seasonal_decompose(timeseries, model='additive', period=52)  # Weekly data, yearly seasonality

        plt.subplot(4, 1, 1)
        decomposition.observed.plot(title=f'{title} - Original')
        plt.subplot(4, 1, 2)
        decomposition.trend.plot(title='Trend')
        plt.subplot(4, 1, 3)
        decomposition.seasonal.plot(title='Seasonal')
        plt.subplot(4, 1, 4)
        decomposition.resid.plot(title='Residual')
        plt.tight_layout()
        plt.show()

        return decomposition

    def forecast_sales(self, store_dept_id, forecast_periods):
        """Forecast ``forecast_periods`` steps ahead for one store-department.

        Args:
            store_dept_id: Key into ``models`` (``"<Store>_<Dept>"``).
            forecast_periods: Number of steps to forecast.

        Returns:
            ``None`` when no model is registered for ``store_dept_id``;
            otherwise ``(forecast, forecast_ci)`` where ``forecast`` is the
            point forecast clipped below at 0 and ``forecast_ci`` is the
            model's confidence interval (not clipped).
        """
        if store_dept_id not in self.models:
            print(f"No model found for {store_dept_id}")
            return None

        model = self.models[store_dept_id]

        # One prediction object supplies both the point forecast and its
        # interval, so the forecast is computed only once.
        prediction = model.get_forecast(steps=forecast_periods)
        forecast = np.maximum(prediction.predicted_mean, 0)
        forecast_ci = prediction.conf_int()

        return forecast, forecast_ci

    def generate_submission(self, submission_path='submission.csv'):
        """Build the submission frame and write it to ``submission_path``.

        Series with an entry in ``models`` get model forecasts; all others get
        their historical mean from the training data, or the global training
        mean when the series has no history.

        Args:
            submission_path: Destination CSV path.

        Returns:
            DataFrame with columns ``Store``, ``Dept``, ``Date``,
            ``Weekly_Sales`` and ``Store_Dept``. The CSV omits ``Store_Dept``.
        """
        print("Generating submission file...")

        test_combinations = self.test_data['Store_Dept'].unique()

        submission = self.test_data[['Store', 'Dept', 'Date']].copy()
        # Float from the start: forecasts and means are floats
        submission['Weekly_Sales'] = 0.0
        # The key column must exist before the loop, which selects rows by it
        submission['Store_Dept'] = submission['Store'].astype(str) + '_' + submission['Dept'].astype(str)

        for store_dept in test_combinations:
            mask = submission['Store_Dept'] == store_dept

            result = None
            if store_dept in self.models:
                result = self.forecast_sales(store_dept, int(mask.sum()))

            if result is not None:
                forecast, _ = result
                # Assign positionally: the forecast carries the model's own
                # index, and label alignment against the submission index
                # would write NaN. Rows are assumed to be in date order
                # (load_and_preprocess_data sorts the test frame that way).
                submission.loc[mask, 'Weekly_Sales'] = np.asarray(forecast, dtype=float)
                continue

            store_text, dept_text = store_dept.split('_')
            store, dept = int(store_text), int(dept_text)

            history = self.train_data.loc[
                (self.train_data['Store'] == store) & (self.train_data['Dept'] == dept),
                'Weekly_Sales',
            ]
            historical_mean = history.mean()

            if pd.isna(historical_mean):
                # No training history for this series: use the global mean
                historical_mean = self.train_data['Weekly_Sales'].mean()

            submission.loc[mask, 'Weekly_Sales'] = historical_mean

        submission[['Store', 'Dept', 'Date', 'Weekly_Sales']].to_csv(submission_path, index=False)
        print(f"Submission saved to {submission_path}")

        return submission

    def evaluate_model(self, test_size=0.2, top_n=50):
        """Back-test a fixed SARIMA specification on the top-selling series.

        For each of the ``top_n`` store-department series with the highest
        total sales, the series is split chronologically, a
        ``SARIMAX(0,1,1)x(0,1,1,52)`` model is fitted on the first part and
        used to forecast the held-out tail.

        Args:
            test_size: Fraction of each series held out at the end.
            top_n: Number of series (ranked by total sales) to evaluate.

        Returns:
            Dict keyed by ``Store_Dept``. Each value is a dict with keys
            ``'forecast'``, ``'actual'``, ``'weights'`` and ``'wmae'``, or
            ``None`` when the series could not be evaluated (empty split or
            failed fit). ``calculate_overall_wmae`` skips ``None`` entries.
        """
        top_combinations = (self.train_data.groupby('Store_Dept')['Weekly_Sales']
                          .sum()
                          .sort_values(ascending=False)
                          .head(top_n)
                          .index.tolist())

        print(f"combs - {top_combinations}")
        evaluations = {}
        for i, store_dept_id in enumerate(top_combinations):
            # Value comparison: `is` on an int literal only works by accident
            if i % 10 == 0:
                print(f"checkpoint - {i+1}")
            full_data = self.train_data[self.train_data['Store_Dept'] == store_dept_id].copy()
            full_data = full_data.set_index('Date').sort_index()

            ts_data = full_data['Weekly_Sales']
            weights = full_data['Weight'] if 'Weight' in full_data.columns else pd.Series([1] * len(ts_data), index=ts_data.index)

            # Chronological split: the last `test_size` share is held out
            split_point = int(len(ts_data) * (1 - test_size))
            train_data = ts_data.iloc[:split_point]
            test_data = ts_data.iloc[split_point:]
            test_weights = weights.iloc[split_point:]

            if len(train_data) == 0 or len(test_data) == 0:
                print(f"Skipping {store_dept_id}: not enough observations to split")
                evaluations[store_dept_id] = None
                continue

            try:
                model = SARIMAX(train_data,
                               order=(0, 1, 1),
                               seasonal_order=(0, 1, 1, 52),
                               enforce_stationarity=False,
                               enforce_invertibility=False)

                fitted_model = model.fit(disp=False, maxiter=30)
                forecast = fitted_model.forecast(steps=len(test_data))
            except (ValueError, np.linalg.LinAlgError) as exc:
                # One unusable series should not abort the whole run
                print(f"Skipping {store_dept_id}: {exc}")
                evaluations[store_dept_id] = None
                continue

            wmae = weighted_mean_absolute_error(test_data, forecast, test_weights)

            evaluations[store_dept_id] = {
                'forecast': forecast,
                'actual': test_data,
                'weights': test_weights,
                'wmae': wmae,
            }

        return evaluations

    def calculate_overall_wmae(self, predictions_dict):
        """Pool all evaluated series and compute a single WMAE.

        Args:
            predictions_dict: Mapping as returned by ``evaluate_model``;
                ``None`` values are skipped.

        Returns:
            The pooled WMAE, or ``None`` when there is nothing to evaluate.
        """
        all_actual = []
        all_predicted = []
        all_weights = []

        for store_dept, results in predictions_dict.items():
            if results is not None:
                all_actual.extend(results['actual'])
                all_predicted.extend(results['forecast'])
                all_weights.extend(results['weights'])

        if len(all_actual) == 0:
            return None

        overall_wmae = weighted_mean_absolute_error(all_actual, all_predicted, all_weights)
        print(f"Overall WMAE across all combinations: {overall_wmae:.2f}")

        return overall_wmae

In [None]:
if __name__ == "__main__":
    # End-to-end run: load the competition files, back-test the SARIMA
    # specification on the highest-volume store-department series, and
    # report the pooled WMAE.
    forecaster = WalmartSARIMAForecaster()

    train_data, test_data = forecaster.load_and_preprocess_data(
        train_path='/kaggle/input/walmart-recruiting-store-sales-forecasting/train.csv.zip',
        test_path='/kaggle/input/walmart-recruiting-store-sales-forecasting/test.csv.zip',
        stores_path='/kaggle/input/walmart-recruiting-store-sales-forecasting/stores.csv',
        features_path='/kaggle/input/walmart-recruiting-store-sales-forecasting/features.csv.zip'
    )

    evaluations = forecaster.evaluate_model(top_n=2500)
    overall_wmae = forecaster.calculate_overall_wmae(evaluations)

    # evaluate_model does not store anything in forecaster.models, so count
    # the series that actually produced an evaluation instead.
    evaluated_count = sum(result is not None for result in evaluations.values())

    print("\nSARIMA modeling complete!")
    print(f"Trained models: {evaluated_count}")
    # Compare against None explicitly: a WMAE of exactly 0.0 is a valid result
    print(f"Overall WMAE: {overall_wmae:.2f}" if overall_wmae is not None else "WMAE calculation failed")
    # NOTE: this cell only back-tests; it does not write a submission file.