## PatchTST Model Experiment for Walmart Sales Forecasting
Final Project - ML Course

This script is written with clear Colab cell/markdown demarcations so that it can be
copied into a notebook and executed cell-by-cell.

### CELL 1: Setup and Installations

Install required packages

In [1]:
!pip install einops
!pip install pytorch-lightning
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install mlflow
!pip install dagshub
!pip install joblib
!pip install scikit-learn
!pip install pandas numpy matplotlib seaborn

Collecting pytorch-lightning
  Downloading pytorch_lightning-2.5.2-py3-none-any.whl.metadata (21 kB)
Collecting torchmetrics>=0.7.0 (from pytorch-lightning)
  Downloading torchmetrics-1.7.4-py3-none-any.whl.metadata (21 kB)
Collecting lightning-utilities>=0.10.0 (from pytorch-lightning)
  Downloading lightning_utilities-0.14.3-py3-none-any.whl.metadata (5.6 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.1.0->pytorch-lightning)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.1.0->pytorch-lightning)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.1.0->pytorch-lightning)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.1.0->pytorch-lightning)
  Downloadi

### CELL 2: Mount Google Drive

Mount Google Drive

In [3]:
from google.colab import drive
drive.mount('/content/drive')

import os
os.chdir('/content/drive/MyDrive/ML-Final')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### CELL 3: Import Libraries

Import required libraries

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# PyTorch and Deep Learning
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Sklearn utilities
from sklearn.metrics import mean_absolute_error, mean_squared_error
import joblib

# MLflow and experiment tracking
import mlflow
import mlflow.pytorch
import dagshub

# System utilities
import sys
sys.path.append('.')

### CELL 4: Initialize MLflow and Load Data

Initialize MLflow experiment tracking

In [5]:
dagshub.init(repo_owner='egval20', repo_name='ML-Final', mlflow=True)
mlflow.set_experiment("PatchTST")

Output()



Open the following link in your browser to authorize the client:
https://dagshub.com/login/oauth/authorize?state=87c719d9-0324-49b7-954a-e5ac8402552a&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=a47e86de90962382b8d936cec10e0cbb2d4afc9692f2269b869336fa13c8604b




<Experiment: artifact_location='mlflow-artifacts:/e98e1cc06f3a4654b9ba8710a9bb468d', creation_time=1751820253864, experiment_id='9', last_update_time=1751820253864, lifecycle_stage='active', name='PatchTST', tags={}>

### CELL 5: Load Preprocessing Pipeline and Data

Load preprocessing pipeline and data

In [6]:
from data_preprocessing_pipeline import *

def get_model_ready_data(pipeline_path='preprocessing_pipeline.pkl'):
    pipeline = joblib.load(pipeline_path)
    def preprocess_for_model(raw_data):
        return pipeline.transform(raw_data)
    return preprocess_for_model, pipeline

# Load preprocessing pipeline
preprocess_fn, loaded_pipeline = get_model_ready_data()

# Load raw data
print("Loading raw data...")
train_raw = pd.read_csv('data/train.csv')
test_raw = pd.read_csv('data/test.csv')
stores = pd.read_csv('data/stores.csv')
features = pd.read_csv('data/features.csv')

print(f"Train shape: {train_raw.shape}")
print(f"Test shape: {test_raw.shape}")
print(f"Date range - Train: {train_raw['Date'].min()} to {train_raw['Date'].max()}")
print(f"Date range - Test: {test_raw['Date'].min()} to {test_raw['Date'].max()}")

Loading raw data...
Train shape: (421570, 5)
Test shape: (115064, 4)
Date range - Train: 2010-02-05 to 2012-10-26
Date range - Test: 2012-11-02 to 2013-07-26


### CELL 6: Apply Preprocessing

Apply preprocessing pipeline

In [7]:
train_processed = preprocess_fn(train_raw)
test_processed = preprocess_fn(test_raw)

# Drop lag features (same list as ARIMA notebook)
lag_columns_to_drop = [
    'Sales_Lag_1', 'Sales_Lag_2', 'Sales_Lag_3', 'Sales_Lag_4', 'Sales_Lag_8', 'Sales_Lag_52',
    'Sales_MA_4', 'Sales_MA_8', 'Sales_MA_12',
    'Sales_STD_4', 'Sales_STD_8', 'Sales_STD_12'
]

def drop_lag_features(data, columns_to_drop):
    existing_cols = [col for col in columns_to_drop if col in data.columns]
    cleaned_data = data.drop(columns=existing_cols)
    print(f"Dropped {len(existing_cols)} lag/MA columns: {existing_cols}")
    return cleaned_data

train_processed_clean = drop_lag_features(train_processed, lag_columns_to_drop)
test_processed_clean = drop_lag_features(test_processed, lag_columns_to_drop)

# Log cleaning step
with mlflow.start_run(run_name="PatchTST_Cleaning"):
    mlflow.log_param("columns_to_drop_count", len(lag_columns_to_drop))
    mlflow.log_metric("original_train_rows", train_processed.shape[0])
    mlflow.log_metric("final_train_rows", train_processed_clean.shape[0])

print(f"Cleaned train shape: {train_processed_clean.shape}")

Dropped 12 lag/MA columns: ['Sales_Lag_1', 'Sales_Lag_2', 'Sales_Lag_3', 'Sales_Lag_4', 'Sales_Lag_8', 'Sales_Lag_52', 'Sales_MA_4', 'Sales_MA_8', 'Sales_MA_12', 'Sales_STD_4', 'Sales_STD_8', 'Sales_STD_12']
Dropped 12 lag/MA columns: ['Sales_Lag_1', 'Sales_Lag_2', 'Sales_Lag_3', 'Sales_Lag_4', 'Sales_Lag_8', 'Sales_Lag_52', 'Sales_MA_4', 'Sales_MA_8', 'Sales_MA_12', 'Sales_STD_4', 'Sales_STD_8', 'Sales_STD_12']
🏃 View run PatchTST_Cleaning at: https://dagshub.com/egval20/ML-Final.mlflow/#/experiments/9/runs/7b0f4b262b1c4075abdbca6103194cfd
🧪 View experiment at: https://dagshub.com/egval20/ML-Final.mlflow/#/experiments/9
Cleaned train shape: (421570, 52)


### CELL 7: Prepare Time Series Data for PatchTST

Prepare time series data for PatchTST

In [8]:
class TimeSeriesDataPreparation:
    def __init__(self, seq_len=52, pred_len=8, stride=1):
        self.seq_len = seq_len
        self.pred_len = pred_len
        self.stride = stride
        self.scalers = {}

    def prepare_hierarchical_data(self, data, level='total'):
        data = data.copy()
        data['Date'] = pd.to_datetime(data['Date'])
        if level == 'total':
            agg_data = data.groupby('Date')['Weekly_Sales'].sum().reset_index().sort_values('Date')
            return agg_data

    def create_sequences(self, data, target_col='Weekly_Sales'):
        sequences, targets = [], []
        values = data[target_col].values
        for i in range(0, len(values) - self.seq_len - self.pred_len + 1, self.stride):
            seq = values[i:i + self.seq_len]
            target = values[i + self.seq_len:i + self.seq_len + self.pred_len]
            sequences.append(seq)
            targets.append(target)
        return np.array(sequences), np.array(targets)

data_prep = TimeSeriesDataPreparation(seq_len=52, pred_len=8, stride=1)

total_sales_df = data_prep.prepare_hierarchical_data(train_processed_clean, level='total')

# Train-validation split (80/20) based on date
validation_size = 0.2
split_idx = int(len(total_sales_df) * (1 - validation_size))
train_series_df = total_sales_df.iloc[:split_idx]
val_series_df = total_sales_df.iloc[split_idx - data_prep.seq_len:]  # keep overlap

# Scale the data (fit on training only)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
train_series_df['Scaled'] = scaler.fit_transform(train_series_df[['Weekly_Sales']])
val_series_df['Scaled'] = scaler.transform(val_series_df[['Weekly_Sales']])

train_seqs, train_tgts = data_prep.create_sequences(train_series_df, target_col='Scaled')
val_seqs, val_tgts = data_prep.create_sequences(val_series_df, target_col='Scaled')

print(f"Train sequences: {train_seqs.shape}")
print(f"Validation sequences: {val_seqs.shape}")

Train sequences: (55, 52)
Validation sequences: (22, 52)


### CELL 8: Define PatchTST Architecture

PatchTST Model Implementation (simplified)

In [9]:
class PatchTST(nn.Module):
    def __init__(self, seq_len, pred_len, n_features=1, patch_len=16, stride=16,
                 d_model=128, n_heads=8, num_layers=4, d_ff=256, dropout=0.1):
        super(PatchTST, self).__init__()
        self.seq_len = seq_len
        self.pred_len = pred_len
        self.n_features = n_features
        self.patch_len = patch_len
        self.stride = stride
        self.num_patches = ((seq_len - patch_len) // stride) + 1
        self.patch_embedding = nn.Linear(patch_len * n_features, d_model)
        self.position_embedding = nn.Parameter(torch.randn(1, self.num_patches, d_model))
        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=n_heads,
                                                    dim_feedforward=d_ff, dropout=dropout,
                                                    activation='gelu')
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(self.num_patches * d_model, pred_len * n_features)

    def forward(self, x):  # x: (batch, seq_len, n_features)
        b = x.size(0)
        # Create patches
        # (batch, n_features, seq_len)
        x = x.permute(0, 2, 1)
        patches = x.unfold(dimension=2, size=self.patch_len, step=self.stride)  # (b, n_feat, num_patches, patch_len)
        patches = patches.contiguous().view(b, self.n_features, self.num_patches, self.patch_len)
        patches = patches.permute(0, 2, 1, 3).contiguous().view(b, self.num_patches, -1)  # flatten
        tokens = self.patch_embedding(patches) + self.position_embedding  # (b, num_patches, d_model)
        tokens = tokens.permute(1, 0, 2)  # (num_patches, b, d_model)
        encoded = self.transformer(tokens)
        encoded = encoded.permute(1, 0, 2).contiguous().view(b, -1)
        out = self.fc(encoded).view(b, self.pred_len, self.n_features)
        return out

### CELL 9: Prepare PyTorch Datasets & Loaders

Build Dataset and DataLoader for training

In [10]:
class SequenceDataset(Dataset):
    def __init__(self, sequences, targets):
        self.x = torch.from_numpy(sequences).float().unsqueeze(-1)  # (N, seq_len, 1)
        self.y = torch.from_numpy(targets).float().unsqueeze(-1)    # (N, pred_len, 1)
    def __len__(self):
        return len(self.x)
    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_loader = DataLoader(SequenceDataset(train_seqs, train_tgts), batch_size=64, shuffle=True)
val_loader = DataLoader(SequenceDataset(val_seqs, val_tgts), batch_size=64, shuffle=False)

### CELL 10: Train PatchTST Model

Training loop with MLflow tracking

In [11]:
with mlflow.start_run(run_name="PatchTST_Training"):
    # Hyperparameters
    seq_len = data_prep.seq_len
    pred_len = data_prep.pred_len
    model_params = {
        'patch_len': 16,
        'stride': 16,
        'd_model': 128,
        'n_heads': 8,
        'num_layers': 4,
        'd_ff': 256,
        'dropout': 0.1
    }

    # Log parameters
    mlflow.log_params(model_params)
    mlflow.log_param('seq_len', seq_len)
    mlflow.log_param('pred_len', pred_len)

    model = PatchTST(seq_len=seq_len, pred_len=pred_len, **model_params).to(device)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    scheduler = ReduceLROnPlateau(optimizer, mode='min', patience=5, factor=0.5, verbose=True)

    best_val_loss = float('inf')
    epochs = 50

    for epoch in range(1, epochs + 1):
        model.train()
        train_losses = []
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            optimizer.zero_grad()
            pred = model(xb)
            loss = criterion(pred, yb)
            loss.backward()
            optimizer.step()
            train_losses.append(loss.item())

        model.eval()
        val_losses = []
        with torch.no_grad():
            for xb, yb in val_loader:
                xb, yb = xb.to(device), yb.to(device)
                pred = model(xb)
                loss = criterion(pred, yb)
                val_losses.append(loss.item())

        train_loss = np.mean(train_losses)
        val_loss = np.mean(val_losses)
        scheduler.step(val_loss)

        mlflow.log_metric('train_loss', train_loss, step=epoch)
        mlflow.log_metric('val_loss', val_loss, step=epoch)

        print(f"Epoch {epoch:02d}/{epochs} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")

        # Save best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), 'patchtst_best.pth')
            mlflow.log_artifact('patchtst_best.pth')

    print(f"Training complete. Best Val Loss: {best_val_loss:.4f}")
    mlflow.log_metric('best_val_loss', best_val_loss)

Epoch 01/50 | Train Loss: 1.2394 | Val Loss: 0.6575
Epoch 02/50 | Train Loss: 1.3869 | Val Loss: 0.2548
Epoch 03/50 | Train Loss: 0.9273 | Val Loss: 0.1544
Epoch 04/50 | Train Loss: 0.7795 | Val Loss: 0.2077
Epoch 05/50 | Train Loss: 0.7958 | Val Loss: 0.2379
Epoch 06/50 | Train Loss: 0.7432 | Val Loss: 0.1914
Epoch 07/50 | Train Loss: 0.6203 | Val Loss: 0.1375
Epoch 08/50 | Train Loss: 0.5035 | Val Loss: 0.1570
Epoch 09/50 | Train Loss: 0.4478 | Val Loss: 0.1972
Epoch 10/50 | Train Loss: 0.4187 | Val Loss: 0.2171
Epoch 11/50 | Train Loss: 0.3734 | Val Loss: 0.2008
Epoch 12/50 | Train Loss: 0.3223 | Val Loss: 0.1862
Epoch 13/50 | Train Loss: 0.2948 | Val Loss: 0.2057
Epoch 14/50 | Train Loss: 0.2630 | Val Loss: 0.2107
Epoch 15/50 | Train Loss: 0.2606 | Val Loss: 0.1944
Epoch 16/50 | Train Loss: 0.2412 | Val Loss: 0.1759
Epoch 17/50 | Train Loss: 0.2265 | Val Loss: 0.1563
Epoch 18/50 | Train Loss: 0.2138 | Val Loss: 0.1400
Epoch 19/50 | Train Loss: 0.1871 | Val Loss: 0.1358
Epoch 20/50 

### CELL 11: Hyperparameter Tuning

Simple grid search over a handful of PatchTST hyper-parameters to find a
better configuration.  Runs are deliberately kept short (epochs=30) so that
the search finishes in Colab.  All trials are logged to MLflow.

In [16]:
from itertools import product

param_grid = {
    'patch_len': [8, 16],
    'stride': [4, 8, 16],
    'd_model': [64, 128],
    'n_heads': [4, 8],
    'num_layers': [3, 4],
    'd_ff': [128, 256],
    'dropout': [0.1],
}

best_params = None
best_val_mae = float('inf')

with mlflow.start_run(run_name="PatchTST_Hyperparameter_Search"):
    trial_id = 0
    for patch_len, stride, d_model, n_heads, num_layers, d_ff, dropout in product(
        param_grid['patch_len'],
        param_grid['stride'],
        param_grid['d_model'],
        param_grid['n_heads'],
        param_grid['num_layers'],
        param_grid['d_ff'],
        param_grid['dropout']
    ):
        # Skip invalid combinations
        if stride > patch_len:
            continue

        trial_params = {
            'patch_len': patch_len,
            'stride': stride,
            'd_model': d_model,
            'n_heads': n_heads,
            'num_layers': num_layers,
            'd_ff': d_ff,
            'dropout': dropout,
        }

        trial_id += 1
        print(f"\nTrial {trial_id}: {trial_params}")
        mlflow.log_params({f"trial{trial_id}_{k}": v for k, v in trial_params.items()})

        # Build model
        model_tune = PatchTST(seq_len=data_prep.seq_len, pred_len=data_prep.pred_len, **trial_params).to(device)
        criterion = nn.L1Loss()  # MAE
        optimizer = torch.optim.Adam(model_tune.parameters(), lr=1e-3)

        # Short training
        epochs_search = 30
        for ep in range(epochs_search):
            model_tune.train()
            for xb, yb in train_loader:
                xb, yb = xb.to(device), yb.to(device)
                opt = torch.optim.Adam(model_tune.parameters(), lr=1e-3)
                opt.zero_grad()
                y_pred = model_tune(xb)
                loss = criterion(y_pred, yb)
                loss.backward()
                opt.step()

        # Validate
        model_tune.eval()
        val_preds, val_trues = [], []
        with torch.no_grad():
            for xb, yb in val_loader:
                xb = xb.to(device)
                pred = model_tune(xb).cpu()
                val_preds.append(pred)
                val_trues.append(yb)

        val_preds = torch.cat(val_preds, 0).numpy().flatten()
        val_trues = torch.cat(val_trues, 0).numpy().flatten()
        mae_val = mean_absolute_error(val_trues, val_preds)
        rmse_val = np.sqrt(mean_squared_error(val_trues, val_preds))

        print(f"MAE={mae_val:.4f}, RMSE={rmse_val:.4f}")
        mlflow.log_metric(f"trial{trial_id}_mae", mae_val)
        mlflow.log_metric(f"trial{trial_id}_rmse", rmse_val)

        if mae_val < best_val_mae:
            best_val_mae = mae_val
            best_params = trial_params
            mlflow.log_metric("best_val_mae", best_val_mae)

    # ✅ After all trials, log best params (safe logging)
    if best_params:
        safe_params = {f"best_{k}": str(v) for k, v in best_params.items()}
        mlflow.log_params(safe_params)

print(f"\nBest params after search: {best_params}, MAE={best_val_mae:.4f}")



Trial 1: {'patch_len': 8, 'stride': 4, 'd_model': 64, 'n_heads': 4, 'num_layers': 3, 'd_ff': 128, 'dropout': 0.1}
MAE=0.4092, RMSE=0.4780

Trial 2: {'patch_len': 8, 'stride': 4, 'd_model': 64, 'n_heads': 4, 'num_layers': 3, 'd_ff': 256, 'dropout': 0.1}
MAE=0.4054, RMSE=0.4729

Trial 3: {'patch_len': 8, 'stride': 4, 'd_model': 64, 'n_heads': 4, 'num_layers': 4, 'd_ff': 128, 'dropout': 0.1}
MAE=0.3884, RMSE=0.4673

Trial 4: {'patch_len': 8, 'stride': 4, 'd_model': 64, 'n_heads': 4, 'num_layers': 4, 'd_ff': 256, 'dropout': 0.1}
MAE=0.3974, RMSE=0.4787

Trial 5: {'patch_len': 8, 'stride': 4, 'd_model': 64, 'n_heads': 8, 'num_layers': 3, 'd_ff': 128, 'dropout': 0.1}
MAE=0.3785, RMSE=0.4557

Trial 6: {'patch_len': 8, 'stride': 4, 'd_model': 64, 'n_heads': 8, 'num_layers': 3, 'd_ff': 256, 'dropout': 0.1}
MAE=0.4006, RMSE=0.4910

Trial 7: {'patch_len': 8, 'stride': 4, 'd_model': 64, 'n_heads': 8, 'num_layers': 4, 'd_ff': 128, 'dropout': 0.1}
MAE=0.2825, RMSE=0.3510

Trial 8: {'patch_len': 8, 

### CELL 12: Train Final PatchTST on Full Data

Re-prepare the *full* training set (train + previous validation) and fit the
best-configuration PatchTST for longer.  We fit a fresh StandardScaler over
the entire series to avoid data leakage.

In [17]:
full_series_df = total_sales_df.copy()

final_scaler = StandardScaler()
full_series_df['Scaled'] = final_scaler.fit_transform(full_series_df[['Weekly_Sales']])

full_seqs, full_tgts = data_prep.create_sequences(full_series_df, target_col='Scaled')
full_loader = DataLoader(SequenceDataset(full_seqs, full_tgts), batch_size=64, shuffle=True)

final_epochs = 150
with mlflow.start_run(run_name="PatchTST_Final_Model"):
    mlflow.log_params(best_params)
    mlflow.log_param("epochs", final_epochs)

    final_model = PatchTST(seq_len=data_prep.seq_len, pred_len=data_prep.pred_len, **best_params).to(device)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(final_model.parameters(), lr=1e-3)
    scheduler = ReduceLROnPlateau(optimizer, mode='min', patience=10, factor=0.5, verbose=True)

    for ep in range(1, final_epochs + 1):
        final_model.train()
        losses = []
        for xb, yb in full_loader:
            xb, yb = xb.to(device), yb.to(device)
            optimizer.zero_grad()
            pred = final_model(xb)
            loss = criterion(pred, yb)
            loss.backward()
            optimizer.step()
            losses.append(loss.item())
        epoch_loss = np.mean(losses)
        scheduler.step(epoch_loss)
        mlflow.log_metric('train_loss', epoch_loss, step=ep)
        if ep % 25 == 0 or ep == 1:
            print(f"Epoch {ep}/{final_epochs} - Loss: {epoch_loss:.4f}")

    torch.save(final_model.state_dict(), 'patchtst_final.pth')
    mlflow.log_artifact('patchtst_final.pth')
    joblib.dump(final_scaler, 'patchtst_final_scaler.pkl')
    mlflow.log_artifact('patchtst_final_scaler.pkl')

Epoch 1/150 - Loss: 1.1454
Epoch 25/150 - Loss: 0.1924
Epoch 50/150 - Loss: 0.0793
Epoch 75/150 - Loss: 0.0579
Epoch 100/150 - Loss: 0.0459
Epoch 125/150 - Loss: 0.0453
Epoch 150/150 - Loss: 0.0390
🏃 View run PatchTST_Final_Model at: https://dagshub.com/egval20/ML-Final.mlflow/#/experiments/9/runs/c2e38d6b3da147aca399b08ab06dbb7c
🧪 View experiment at: https://dagshub.com/egval20/ML-Final.mlflow/#/experiments/9


### CELL 13: Store-Level PatchTST Models (Top 5 Stores)

Train smaller PatchTST models for the top-5 stores to better capture local
patterns.

In [18]:
store_sales = train_processed_clean.groupby('Store')['Weekly_Sales'].sum().nlargest(5)

top_stores = store_sales.index.tolist()
print(f"Top 5 stores: {top_stores}")

store_models = {}
store_metrics = {}

with mlflow.start_run(run_name="PatchTST_Store_Models"):
    mlflow.log_param("top_stores", top_stores)
    for store in top_stores:
        print(f"\nTraining model for Store {store}")
        store_df = train_processed_clean[train_processed_clean['Store'] == store]
        store_series = data_prep.prepare_hierarchical_data(store_df, level='total')

        # Use its own scaler to avoid scale mismatch
        s_scaler = StandardScaler()
        store_series['Scaled'] = s_scaler.fit_transform(store_series[['Weekly_Sales']])

        s_seqs, s_tgts = data_prep.create_sequences(store_series, target_col='Scaled')
        if len(s_seqs) < 10:
            print("  Skipping – not enough data after sequencing.")
            continue
        s_train_len = int(len(s_seqs) * 0.8)
        s_train_ds = SequenceDataset(s_seqs[:s_train_len], s_tgts[:s_train_len])
        s_val_ds   = SequenceDataset(s_seqs[s_train_len:], s_tgts[s_train_len:])
        s_train_loader = DataLoader(s_train_ds, batch_size=32, shuffle=True)
        s_val_loader   = DataLoader(s_val_ds, batch_size=32, shuffle=False)

        s_model = PatchTST(seq_len=data_prep.seq_len, pred_len=data_prep.pred_len, **best_params).to(device)
        crit = nn.L1Loss()
        opt  = torch.optim.Adam(s_model.parameters(), lr=5e-4)

        best_s_val_mae = float('inf')
        for ep in range(1, 60):
            s_model.train()
            for xb, yb in s_train_loader:
                xb, yb = xb.to(device), yb.to(device)
                opt.zero_grad()
                pred = s_model(xb)
                loss = crit(pred, yb)
                loss.backward()
                opt.step()

            # quick val
            s_model.eval()
            preds, trues = [], []
            with torch.no_grad():
                for xb, yb in s_val_loader:
                    xb = xb.to(device)
                    p = s_model(xb).cpu()
                    preds.append(p)
                    trues.append(yb)
            preds = torch.cat(preds, 0).numpy().flatten()
            trues = torch.cat(trues, 0).numpy().flatten()
            mae_s = mean_absolute_error(trues, preds)
            if mae_s < best_s_val_mae:
                best_s_val_mae = mae_s
                torch.save(s_model.state_dict(), f'patchtst_store_{store}.pth')
            if ep % 20 == 0:
                print(f"  Store {store} Epoch {ep} MAE={mae_s:.4f}")

        mlflow.log_metric(f"store_{store}_best_mae", best_s_val_mae)
        store_models[store] = {
            'model_path': f'patchtst_store_{store}.pth',
            'scaler': s_scaler
        }

joblib.dump(store_models, 'patchtst_store_models.pkl')
mlflow.log_artifact('patchtst_store_models.pkl')



Top 5 stores: [20, 4, 14, 13, 2]

Training model for Store 20
  Store 20 Epoch 20 MAE=0.3386
  Store 20 Epoch 40 MAE=0.3184

Training model for Store 4
  Store 4 Epoch 20 MAE=0.2896
  Store 4 Epoch 40 MAE=0.2344

Training model for Store 14
  Store 14 Epoch 20 MAE=0.9341
  Store 14 Epoch 40 MAE=0.8775

Training model for Store 13
  Store 13 Epoch 20 MAE=0.3031
  Store 13 Epoch 40 MAE=0.2526

Training model for Store 2
  Store 2 Epoch 20 MAE=0.3539
  Store 2 Epoch 40 MAE=0.3882
🏃 View run PatchTST_Store_Models at: https://dagshub.com/egval20/ML-Final.mlflow/#/experiments/9/runs/7710f4fc7c674578b489f273447506be
🧪 View experiment at: https://dagshub.com/egval20/ML-Final.mlflow/#/experiments/9


### CELL 14: Final Forecast for Kaggle Submission (using Final Model)

Produce predictions for the public test set using the final aggregated model.

In [19]:
# Reload final model & scaler
final_model_loaded = PatchTST(seq_len=data_prep.seq_len, pred_len=data_prep.pred_len, **best_params).to(device)
final_model_loaded.load_state_dict(torch.load('patchtst_final.pth', map_location=device))
final_model_loaded.eval()

# Prepare full (train + test) series scaled by final_scaler
full_series_all = pd.concat([train_processed_clean, test_processed_clean])
full_total_series = data_prep.prepare_hierarchical_data(full_series_all, level='total')
full_total_series['Scaled'] = final_scaler.transform(full_total_series[['Weekly_Sales']])

test_start_idx = len(total_sales_df)
num_test_weeks = len(test_raw['Date'].unique())

init_input = full_total_series['Scaled'].values[test_start_idx - data_prep.seq_len:test_start_idx]
cur_inp = torch.from_numpy(init_input).float().view(1, -1, 1).to(device)

scaled_preds = []
with torch.no_grad():
    for i in range(0, num_test_weeks, data_prep.pred_len):
        pred = final_model_loaded(cur_inp)
        pred_np = pred.cpu().numpy().flatten()
        scaled_preds.extend(pred_np)
        # slide window
        cur_inp = torch.from_numpy(
            np.concatenate([cur_inp.cpu().numpy().flatten()[data_prep.pred_len:], pred_np])
        ).float().view(1, -1, 1).to(device)

scaled_preds = scaled_preds[:num_test_weeks]
final_preds = final_scaler.inverse_transform(np.array(scaled_preds).reshape(-1, 1)).flatten()

# Distribute to store / dept (same simple proportion method)
hist_props = train_processed_clean.groupby(['Store', 'Dept'])['Weekly_Sales'].sum()
hist_props = hist_props / hist_props.sum()

submission_rows = []
unique_dates = list(test_raw.sort_values('Date')['Date'].unique())
for idx, row in test_raw.iterrows():
    store, dept, date = row['Store'], row['Dept'], row['Date']
    week_idx = unique_dates.index(date)
    prop = hist_props.get((store, dept), 1.0 / len(hist_props))
    submission_rows.append({
        'Id': f"{store}_{dept}_{date}",
        'Weekly_Sales': max(0, final_preds[week_idx] * prop)
    })

submission_df = pd.DataFrame(submission_rows)
submission_df.to_csv('patchtst_submission.csv', index=False)
print(f"Submission file saved – shape: {submission_df.shape}")
mlflow.log_artifact('patchtst_submission.csv')

Submission file saved – shape: (115064, 2)


### CELL 15: Comprehensive Summary

In [20]:
print("\n" + "="*60)
print("PATCHTST IMPLEMENTATION COMPLETE")
print("="*60)
print(f"\nBest Validation MAE after search: {best_val_mae:.4f}")
print(f"Final training epochs: {final_epochs}")
print(f"Top-5 store models trained: {len(store_models)}")
print("Submission file: patchtst_submission.csv")
print("All experiments, models & artifacts logged to MLflow")
print("="*60)


PATCHTST IMPLEMENTATION COMPLETE

Best Validation MAE after search: 0.2262
Final training epochs: 150
Top-5 store models trained: 5
Submission file: patchtst_submission.csv
All experiments, models & artifacts logged to MLflow


Use the trained model to predict on the test period

In [21]:
# Reload best model
best_model = PatchTST(seq_len=data_prep.seq_len, pred_len=data_prep.pred_len, **model_params).to(device)
best_model.load_state_dict(torch.load('patchtst_best.pth', map_location=device))
best_model.eval()

# Create full series (train + test) for iterative forecasting
full_series_df = pd.concat([train_processed_clean, test_processed_clean])
full_sales_df = data_prep.prepare_hierarchical_data(full_series_df, level='total')
full_sales_df['Scaled'] = scaler.transform(full_sales_df[['Weekly_Sales']])

# Dates
train_dates = total_sales_df['Date']
full_dates = full_sales_df['Date']

test_start_idx = len(total_sales_df)

test_weeks = len(test_raw['Date'].unique())

# Prepare initial input (last seq_len from training)
input_seq = full_sales_df['Scaled'].values[test_start_idx - data_prep.seq_len:test_start_idx]
current_input = torch.from_numpy(input_seq).float().view(1, -1, 1).to(device)

predictions_scaled = []
with torch.no_grad():
    for i in range(0, test_weeks, data_prep.pred_len):
        pred = best_model(current_input)
        pred_np = pred.cpu().numpy().flatten()
        predictions_scaled.extend(pred_np)

        # Update the input sequence with new predictions
        new_input = np.concatenate([
            current_input.cpu().numpy().flatten()[data_prep.pred_len:],
            pred_np
        ])
        current_input = torch.from_numpy(new_input).float().view(1, -1, 1).to(device)

# Trim to exact number of test weeks
predictions_scaled = predictions_scaled[:test_weeks]

# Inverse transform predictions
predictions = scaler.inverse_transform(np.array(predictions_scaled).reshape(-1, 1)).flatten()

# Distribute predictions to store-department combinations using historical proportions
historical_props = train_processed_clean.groupby(['Store', 'Dept'])['Weekly_Sales'].sum()
historical_props = historical_props / historical_props.sum()

submission_rows = []
test_raw_sorted = test_raw.sort_values('Date')  # Ensure alignment
for idx, row in test_raw_sorted.iterrows():
    store, dept, date = row['Store'], row['Dept'], row['Date']
    week_idx = list(test_raw_sorted['Date'].unique()).index(date)  # 0-based index
    prop = historical_props.get((store, dept), 1.0 / len(test_raw_sorted))
    pred_sales = max(0, predictions[week_idx] * prop)
    submission_rows.append({'Id': f"{store}_{dept}_{date}", 'Weekly_Sales': pred_sales})

submission_df = pd.DataFrame(submission_rows)
submission_df.to_csv('patchtst_submission.csv', index=False)
print(f"Submission saved! Shape: {submission_df.shape}")

# Log submission
mlflow.log_artifact('patchtst_submission.csv')

Submission saved! Shape: (115064, 2)


### CELL 12: Summary & Next Steps

Summary of PatchTST experiment

In [22]:
print("\n" + "="*60)
print("PATCHTST MODEL TRAINING SUMMARY")
print("="*60)
print(f"Best Validation Loss: {best_val_loss:.4f}")
print(f"Total Train Sequences: {train_seqs.shape[0]}")
print(f"Total Validation Sequences: {val_seqs.shape[0]}")
print("Submission file: patchtst_submission.csv")
print("All artifacts & metrics logged to MLflow")
print("="*60)


PATCHTST MODEL TRAINING SUMMARY
Best Validation Loss: 0.1186
Total Train Sequences: 55
Total Validation Sequences: 22
Submission file: patchtst_submission.csv
All artifacts & metrics logged to MLflow
