In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/walmart-recruiting-store-sales-forecasting/train.csv.zip
/kaggle/input/walmart-recruiting-store-sales-forecasting/sampleSubmission.csv.zip
/kaggle/input/walmart-recruiting-store-sales-forecasting/stores.csv
/kaggle/input/walmart-recruiting-store-sales-forecasting/features.csv.zip
/kaggle/input/walmart-recruiting-store-sales-forecasting/test.csv.zip


In [5]:
features = pd.read_csv("/kaggle/input/walmart-recruiting-store-sales-forecasting/features.csv.zip")
train = pd.read_csv("/kaggle/input/walmart-recruiting-store-sales-forecasting/train.csv.zip")
stores = pd.read_csv("/kaggle/input/walmart-recruiting-store-sales-forecasting/stores.csv")
test = pd.read_csv("/kaggle/input/walmart-recruiting-store-sales-forecasting/test.csv.zip")

In [None]:
!pip install darts
!pip install mlflow
!pip install dagshub

In [73]:
import dagshub
import mlflow
dagshub.init(repo_owner='dimna21', repo_name='ML_Final_Project', mlflow=True)

Output()



Open the following link in your browser to authorize the client:
https://dagshub.com/login/oauth/authorize?state=30f0c980-266f-47c9-aae1-79ed1f686cca&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=49db9bdb8f7808f7f6262904eba4aac6e5433bdd41fabf82245a867ec93cd91b




In [14]:
from sklearn.base import BaseEstimator, TransformerMixin

class BaseMerger(BaseEstimator, TransformerMixin):
    
    def __init__(self, features, stores):
        self.feature_store = features.merge(stores, how='inner', on='Store')
        self.feature_store['Date'] = pd.to_datetime(self.feature_store['Date'])

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        X['Date'] = pd.to_datetime(X['Date'])
        merged = X.merge(self.feature_store, how='inner', on=['Store', 'Date', 'IsHoliday'])
        merged = merged.sort_values(by=['Date', 'Store', 'Dept']).reset_index(drop=True)
        return merged

In [15]:
class FeatureAdder(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.superbowl = pd.to_datetime(['2010-02-12', '2011-02-11', '2012-02-10', '2013-02-08'])
        self.labor_day = pd.to_datetime(['2010-09-10', '2011-09-09', '2012-09-07', '2013-09-06'])
        self.thanksgiving = pd.to_datetime(['2010-11-26', '2011-11-25', '2012-11-23', '2013-11-29'])
        self.christmas = pd.to_datetime(['2010-12-31', '2011-12-30', '2012-12-28', '2013-12-27'])

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()

        # Convert temperature to Celsius
        if 'Temperature' in X.columns:
            X['Temperature'] = (X['Temperature'] - 32) * (5.0 / 9.0)

        # Basic date parts
        X['Day'] = X['Date'].dt.day
        X['Month'] = X['Date'].dt.month
        X['Year'] = X['Date'].dt.year

        # Extract ISO week and year for holiday matching
        X['Week'] = X['Date'].dt.isocalendar().week
        X['YearNum'] = X['Date'].dt.year

        # Helper to flag if a date is in same ISO week/year as a known holiday
        def is_holiday_week(date_series, holidays):
            holiday_weeks = set((d.isocalendar().week, d.year) for d in holidays)
            return date_series.apply(lambda d: (d.isocalendar().week, d.year) in holiday_weeks if pd.notnull(d) else False).astype(int)

        X['SuperbowlWeek'] = is_holiday_week(X['Date'], self.superbowl)
        X['LaborDayWeek'] = is_holiday_week(X['Date'], self.labor_day)
        X['ThanksgivingWeek'] = is_holiday_week(X['Date'], self.thanksgiving)
        X['ChristmasWeek'] = is_holiday_week(X['Date'], self.christmas)

        # Calculate days to Thanksgiving and Christmas (using Nov 24 and Dec 24 as anchor dates)
        thanksgiving_dates = pd.to_datetime(X['Year'].astype(str) + "-11-24")
        christmas_dates = pd.to_datetime(X['Year'].astype(str) + "-12-24")

        X['Days_to_Thanksgiving'] = (thanksgiving_dates - X['Date']).dt.days
        X['Days_to_Christmas'] = (christmas_dates - X['Date']).dt.days

        # Clean up helper cols
        X = X.drop(columns=['Week', 'YearNum'])

        return X

In [16]:
class MissingValueFiller(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.markdown_cols = ['MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']
        self.mean_cols = ['CPI', 'Unemployment']
        self.mean_values = {}

    def fit(self, X, y=None):
        for col in self.mean_cols:
            if col in X.columns:
                self.mean_values[col] = X[col].mean()
        return self

    def transform(self, X):
        X = X.copy()

        # Fill markdowns with 0
        for col in self.markdown_cols:
            if col in X.columns:
                X[col] = X[col].fillna(0.0)

        # Fill CPI and Unemployment with learned mean
        for col in self.mean_cols:
            if col in X.columns and col in self.mean_values:
                X[col] = X[col].fillna(self.mean_values[col])

        return X

In [17]:
class CategoricalEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.type_mapping = {'A': 3, 'B': 2, 'C': 1}
        self.holiday_mapping = {False: 0, True: 1}

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()

        if 'Type' in X.columns:
            X['Type'] = X['Type'].map(self.type_mapping)

        if 'IsHoliday' in X.columns:
            X['IsHoliday'] = X['IsHoliday'].map(self.holiday_mapping)

        return X

In [18]:
class StoreAggregator(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.timeseries = {}
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        self.timeseries = {}
        for store in X['Store'].unique():
            self.aggregate_store_info(store, X)
        return self.timeseries
    
    def aggregate_store_info(self, store_id, X):
        store_data = X[X['Store'] == store_id].copy()
        
        # Check if Weekly_Sales exists (train data) or not (test data)
        has_weekly_sales = 'Weekly_Sales' in store_data.columns
        
        if has_weekly_sales:
            sum_columns = ['Weekly_Sales', 'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']
        else:
            sum_columns = ['MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']
        
        first_columns = ['IsHoliday', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment',
                        'Type', 'Size', 'Day', 'Month', 'Year', 'SuperbowlWeek',
                        'LaborDayWeek', 'ThanksgivingWeek', 'ChristmasWeek',
                        'Days_to_Thanksgiving', 'Days_to_Christmas']
        
        agg_dict = {}
        
        # Add sum columns that exist in the data
        for col in sum_columns:
            if col in store_data.columns:
                agg_dict[col] = 'sum'
        
        # Add first columns that exist in the data
        for col in first_columns:
            if col in store_data.columns:
                agg_dict[col] = 'first'
        
        aggregated = store_data.groupby(['Date', 'Store']).agg(agg_dict).reset_index()
        aggregated = aggregated.sort_values('Date').reset_index(drop=True)
        
        # Calculate department proportions only if Weekly_Sales exists
        if has_weekly_sales:
            dept_proportions = self.calculate_dept_proportions(store_data)
        else:
            dept_proportions = None
        
        self.timeseries[store_id] = (aggregated, dept_proportions)
        return aggregated
    
    def calculate_dept_proportions(self, store_data):
        dept_totals = store_data.groupby('Dept')['Weekly_Sales'].sum()
        store_total = store_data['Weekly_Sales'].sum()
        
        if store_total == 0:
            num_depts = len(dept_totals)
            return {dept: 1.0/num_depts for dept in dept_totals.index}
        
        dept_proportions = (dept_totals / store_total).to_dict()
        return dept_proportions

In [19]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('merge', BaseMerger(features, stores)),
    ('feature_add', FeatureAdder()),
    ('value_fill', MissingValueFiller()),
    ('cat_encoder', CategoricalEncoder()),
    ('store_aggregator', StoreAggregator())
])

In [21]:
train_dict = pipeline.fit_transform(train)

In [74]:
from darts import TimeSeries
from darts.models.forecasting.arima import ARIMA

def train_store_models(p, d, q, train_weeks):
    mape_array = []
    store_models = {}
    
    for store in train_dict.keys():
        #extract store data
        df, prop = train_dict[store]
    
        #Make timeseries object
        full_ts = TimeSeries.from_dataframe(df,time_col='Date',value_cols=['Weekly_Sales'])
        train_ts = full_ts[:train_weeks]
        val_ts = full_ts[train_weeks:]
    
        #Train and validate model
        model = ARIMA(p=p,d=d,q=q)
        model.fit(train_ts)
        predictions = model.predict(len(val_ts))
        
        # Calculate MAPE
        actual_values = val_ts.values().flatten()
        pred_values = predictions.values().flatten()
        mask = actual_values != 0
        mape = np.mean(np.abs((actual_values[mask] - pred_values[mask]) / actual_values[mask])) * 100
        mape_array.append(mape)

        #Save store model
        store_models[store] = (model, prop)
        
    return (mape_array, store_models)

In [None]:
import dagshub
import mlflow
import pickle
import numpy as np
from datetime import datetime

# Set experiment name
mlflow.set_experiment("ARIMA_Parameter_Optimization")

# Start MLflow run for the entire parameter search
with mlflow.start_run(run_name=f"ARIMA_Grid_Search_{datetime.now().strftime('%Y%m%d_%H%M%S')}"):
    
    # Log experiment configuration
    mlflow.log_param("model_type", "ARIMA")
    mlflow.log_param("validation_weeks", 123)
    mlflow.log_param("total_param_combinations", len(param_list))
    mlflow.log_param("param_list", str(param_list))
    
    from tqdm import tqdm
    param_list = [
        [1,1,0],
        [0,1,1], 
        [1,1,1],
        [2,1,1],
        [1,1,2],
        [2,1,2],
        [0,1,2],
        [3,1,1],
        [1,1,3],
        [4,1,1]
    ]
    
    max_mape = 100
    best_models = {}
    best_params = []
    
    for p in tqdm(param_list):
        # Create nested run for each parameter combination
        with mlflow.start_run(run_name=f"ARIMA({p[0]},{p[1]},{p[2]})", nested=True):
            
            # Log individual parameters
            mlflow.log_param("p", p[0])
            mlflow.log_param("d", p[1]) 
            mlflow.log_param("q", p[2])
            
            # Train models
            mape_array, store_models = train_store_models(p[0], p[1], p[2], 123)
            mean_mape = sum(mape_array)/len(mape_array)
            
            # Log metrics
            mlflow.log_metric("mean_mape", mean_mape)
            mlflow.log_metric("mape_std", np.std(mape_array))
            mlflow.log_metric("num_successful_stores", len(store_models))
            
            print(f'Storewise mean mape: {mean_mape} for params{p[0], p[1], p[2]}')
            
            # Check if this is the best model
            if mean_mape < max_mape:
                best_models = store_models
                max_mape = mean_mape
                best_params = [p[0], p[1], p[2]]
                
                # Log as best model so far
                mlflow.log_metric("is_best_model", 1)
            else:
                mlflow.log_metric("is_best_model", 0)
    
    # Log best model results
    mlflow.log_param("best_p", best_params[0])
    mlflow.log_param("best_d", best_params[1])
    mlflow.log_param("best_q", best_params[2])
    mlflow.log_metric("best_mean_mape", max_mape)
    
    # Save final best model
    with open("final_best_arima_models.pkl", "wb") as f:
        pickle.dump(best_models, f)
    mlflow.log_artifact("final_best_arima_models.pkl")

In [65]:
predictions = {}
train_end_date = pd.to_datetime(train['Date'].max())
for index, entry in tqdm(test.iterrows()):
    store = entry['Store']
    dept = entry['Dept']
    date = pd.to_datetime(entry['Date'])  # Convert to datetime
    pred_weeks = (date - train_end_date).days // 7

    model, sale_proportions = best_models[store]
    prediction = model.predict(pred_weeks).values()[-1, 0]
    
    if dept in sale_proportions.keys():
        prediction = prediction*sale_proportions[dept]
    else:
        prediction = prediction/len(sale_proportions)
        
    predictions[(store, dept, date)] = prediction

115064it [05:32, 346.04it/s]


In [66]:
submission_df = pd.DataFrame([
    {
        'Id': f"{store}_{dept}_{date.strftime('%Y-%m-%d')}",
        'Weekly_Sales': weekly_sales
    }
    for (store, dept, date), weekly_sales in predictions.items()
])

submission_df.to_csv('submission_arima.csv', index=False)