In [2]:
import pandas as pd
features = pd.read_csv("/content/drive/MyDrive/ML_Final_Project/features.csv")
train = pd.read_csv("/content/drive/MyDrive/ML_Final_Project/train.csv")
stores = pd.read_csv("/content/drive/MyDrive/ML_Final_Project/stores.csv")
test = pd.read_csv("/content/drive/MyDrive/ML_Final_Project/test.csv")

In [3]:
from sklearn.base import BaseEstimator, TransformerMixin

class BaseMerger(BaseEstimator, TransformerMixin):

    def __init__(self, features, stores):
        self.feature_store = features.merge(stores, how='inner', on='Store')
        self.feature_store['Date'] = pd.to_datetime(self.feature_store['Date'])

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        X['Date'] = pd.to_datetime(X['Date'])
        merged = X.merge(self.feature_store, how='inner', on=['Store', 'Date', 'IsHoliday'])
        merged = merged.sort_values(by=['Date', 'Store', 'Dept']).reset_index(drop=True)
        return merged

class FeatureAdder(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.superbowl = pd.to_datetime(['2010-02-12', '2011-02-11', '2012-02-10', '2013-02-08'])
        self.labor_day = pd.to_datetime(['2010-09-10', '2011-09-09', '2012-09-07', '2013-09-06'])
        self.thanksgiving = pd.to_datetime(['2010-11-26', '2011-11-25', '2012-11-23', '2013-11-29'])
        self.christmas = pd.to_datetime(['2010-12-31', '2011-12-30', '2012-12-28', '2013-12-27'])

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()

        # Convert temperature to Celsius
        if 'Temperature' in X.columns:
            X['Temperature'] = (X['Temperature'] - 32) * (5.0 / 9.0)

        # Basic date parts
        X['Day'] = X['Date'].dt.day
        X['Month'] = X['Date'].dt.month
        X['Year'] = X['Date'].dt.year

        # Extract ISO week and year for holiday matching
        X['Week'] = X['Date'].dt.isocalendar().week
        X['YearNum'] = X['Date'].dt.year

        # Helper to flag if a date is in same ISO week/year as a known holiday
        def is_holiday_week(date_series, holidays):
            holiday_weeks = set((d.isocalendar().week, d.year) for d in holidays)
            return date_series.apply(lambda d: (d.isocalendar().week, d.year) in holiday_weeks if pd.notnull(d) else False).astype(int)

        X['SuperbowlWeek'] = is_holiday_week(X['Date'], self.superbowl)
        X['LaborDayWeek'] = is_holiday_week(X['Date'], self.labor_day)
        X['ThanksgivingWeek'] = is_holiday_week(X['Date'], self.thanksgiving)
        X['ChristmasWeek'] = is_holiday_week(X['Date'], self.christmas)

        # Calculate days to Thanksgiving and Christmas (using Nov 24 and Dec 24 as anchor dates)
        thanksgiving_dates = pd.to_datetime(X['Year'].astype(str) + "-11-24")
        christmas_dates = pd.to_datetime(X['Year'].astype(str) + "-12-24")

        X['Days_to_Thanksgiving'] = (thanksgiving_dates - X['Date']).dt.days
        X['Days_to_Christmas'] = (christmas_dates - X['Date']).dt.days

        # Clean up helper cols
        X = X.drop(columns=['Week', 'YearNum'])

        return X

class MissingValueFiller(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.markdown_cols = ['MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']
        self.mean_cols = ['CPI', 'Unemployment']
        self.mean_values = {}

    def fit(self, X, y=None):
        for col in self.mean_cols:
            if col in X.columns:
                self.mean_values[col] = X[col].mean()
        return self

    def transform(self, X):
        X = X.copy()

        # Fill markdowns with 0
        for col in self.markdown_cols:
            if col in X.columns:
                X[col] = X[col].fillna(0.0)

        # Fill CPI and Unemployment with learned mean
        for col in self.mean_cols:
            if col in X.columns and col in self.mean_values:
                X[col] = X[col].fillna(self.mean_values[col])

        return X

class CategoricalEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.type_mapping = {'A': 3, 'B': 2, 'C': 1}
        self.holiday_mapping = {False: 0, True: 1}

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()

        if 'Type' in X.columns:
            X['Type'] = X['Type'].map(self.type_mapping)

        if 'IsHoliday' in X.columns:
            X['IsHoliday'] = X['IsHoliday'].map(self.holiday_mapping)

        return X

class StoreAggregator(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.timeseries = {}

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        self.timeseries = {}
        for store in X['Store'].unique():
            self.aggregate_store_info(store, X)
        return self.timeseries

    def aggregate_store_info(self, store_id, X):
        store_data = X[X['Store'] == store_id].copy()

        # Check if Weekly_Sales exists (train data) or not (test data)
        has_weekly_sales = 'Weekly_Sales' in store_data.columns

        if has_weekly_sales:
            sum_columns = ['Weekly_Sales', 'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']
        else:
            sum_columns = ['MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']

        first_columns = ['IsHoliday', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment',
                        'Type', 'Size', 'Day', 'Month', 'Year', 'SuperbowlWeek',
                        'LaborDayWeek', 'ThanksgivingWeek', 'ChristmasWeek',
                        'Days_to_Thanksgiving', 'Days_to_Christmas']

        agg_dict = {}

        # Add sum columns that exist in the data
        for col in sum_columns:
            if col in store_data.columns:
                agg_dict[col] = 'sum'

        # Add first columns that exist in the data
        for col in first_columns:
            if col in store_data.columns:
                agg_dict[col] = 'first'

        aggregated = store_data.groupby(['Date', 'Store']).agg(agg_dict).reset_index()
        aggregated = aggregated.sort_values('Date').reset_index(drop=True)

        # Calculate department proportions only if Weekly_Sales exists
        if has_weekly_sales:
            dept_proportions = self.calculate_dept_proportions(store_data)
        else:
            dept_proportions = None

        self.timeseries[store_id] = (aggregated, dept_proportions)
        return aggregated

    def calculate_dept_proportions(self, store_data):
        dept_totals = store_data.groupby('Dept')['Weekly_Sales'].sum()
        store_total = store_data['Weekly_Sales'].sum()

        if store_total == 0:
            num_depts = len(dept_totals)
            return {dept: 1.0/num_depts for dept in dept_totals.index}

        dept_proportions = (dept_totals / store_total).to_dict()
        return dept_proportions

In [4]:
from sklearn.pipeline import Pipeline

# Pipeline with store aggregation
pipeline = Pipeline([
   ('merge', BaseMerger(features, stores)),
   ('feature_add', FeatureAdder()),
   ('value_fill', MissingValueFiller()),
   ('cat_encoder', CategoricalEncoder()),
   ('store_agg', StoreAggregator())
])

# Transform training data
train_aggregated = pipeline.fit_transform(train)
test_aggregated = pipeline.transform(test)




In [None]:
!pip install dagshub
!pip install mlflow

In [5]:
import dagshub
import mlflow
dagshub.init(repo_owner='dimna21', repo_name='ML_Final_Project', mlflow=True)

In [7]:
import warnings
warnings.filterwarnings('ignore')
from statsmodels.tsa.statespace.sarimax import SARIMAX
from tqdm import tqdm
ls = ['IsHoliday', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment',
                        'Type', 'Size', 'SuperbowlWeek',
                        'LaborDayWeek', 'ThanksgivingWeek', 'ChristmasWeek',
                        'Days_to_Thanksgiving', 'Days_to_Christmas', 'MarkDown1',
                        'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']
store_models = {}
for store_id in tqdm(train_aggregated.keys()):
   store_ts, dept_props = train_aggregated[store_id]

   sales = store_ts['Weekly_Sales'].values

   exog = store_ts[ls].values

   model = SARIMAX(
       endog=sales,
       exog=exog,
       order=(2, 1, 1),
       seasonal_order=(1, 1, 1, 52),
       enforce_stationarity=False,
       enforce_invertibility=False
   )

   fitted_model = model.fit(disp=False, maxiter=80, method='lbfgs')
   store_models[store_id] = (fitted_model, dept_props)

100%|██████████| 45/45 [1:13:40<00:00, 98.23s/it] 


In [12]:
import mlflow
import pickle
import os

mlflow.set_experiment("SARIMAX_Store_Models (2,1,1)")
run = mlflow.start_run(run_name="SARIMAX_Model_Training(2,1,1)")

# Log training features
feature_list = ['IsHoliday', 'Type', 'Size', 'SuperbowlWeek',
                'LaborDayWeek', 'ThanksgivingWeek', 'ChristmasWeek']
mlflow.log_param("features_used", ", ".join(feature_list))

# Log SARIMAX model parameters
mlflow.log_param("order", "(2,1,1)")
mlflow.log_param("seasonal_order", "(1,1,1,52)")
mlflow.log_param("enforce_stationarity", False)
mlflow.log_param("enforce_invertibility", False)
mlflow.log_param("maxiter", "80")
mlflow.log_param("method", "lbfgs")


# Finish MLflow run
mlflow.end_run()

2025/07/27 21:59:12 INFO mlflow.tracking.fluent: Experiment with name 'SARIMAX_Store_Models (2,1,1)' does not exist. Creating a new experiment.


🏃 View run SARIMAX_Model_Training(2,1,1) at: https://dagshub.com/dimna21/ML_Final_Project.mlflow/#/experiments/8/runs/051e53d25b7447bbab4897b2091eff31
🧪 View experiment at: https://dagshub.com/dimna21/ML_Final_Project.mlflow/#/experiments/8


In [10]:
max_train_date = pd.to_datetime(train['Date'].max())

def get_forecast(idx):
    sample = test.iloc[idx]
    store = sample['Store']
    dept = sample['Dept']
    pred_date = pd.to_datetime(sample['Date'])
    weeks = (pred_date-max_train_date).days//7
    store_model, dept_props = store_models[store]
    df, _ = test_aggregated[store]
    exog = df[ls].values[:weeks]
    forecast = store_model.forecast(steps=weeks, exog=exog)[-1]

    if dept in dept_props.keys():
        return forecast*dept_props[dept]
    else:
        return forecast/len(dept_props)



In [11]:
import pandas as pd
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime

# Function to process a single row
def process_row(idx):
    sample = test.iloc[idx]
    store = sample['Store']
    dept = sample['Dept']
    date = pd.to_datetime(sample['Date'])

    prediction = get_forecast(idx)

    return {
        'Id': f"{store}_{dept}_{date.strftime('%Y-%m-%d')}",
        'Weekly_Sales': max(0, prediction)
    }

# Run in parallel with progress bar
submission_data = []
with ThreadPoolExecutor() as executor:
    futures = [executor.submit(process_row, idx) for idx in range(len(test))]
    for future in tqdm(as_completed(futures), total=len(futures)):
        submission_data.append(future.result())

# Save the results
submission_df = pd.DataFrame(submission_data)
submission_df.to_csv("/content/drive/MyDrive/ML_Final_Project/SARIMAX_Submission2.csv", index=False)
print(f"Submission created with {len(submission_df)} predictions")


100%|██████████| 115064/115064 [30:11<00:00, 63.51it/s]


Submission created with 115064 predictions
