<a href="https://colab.research.google.com/github/ekvirika/WalmartRecruiting/blob/main/notebooks/model_experiment_patchtst.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [17]:
# Install required packages
!pip install wandb torch torchvision pandas numpy matplotlib seaborn scikit-learn mlflow

# Set up Kaggle API
!pip install kaggle



In [18]:
# Upload your kaggle.json to Colab and run:
!mkdir -p ~/.kaggle
!cp /content/drive/MyDrive/ColabNotebooks/kaggle_API_credentials/kaggle.json ~/.kaggle/kaggle.json
! chmod 600 ~/.kaggle/kaggle.json

In [19]:
# Download the dataset
!kaggle competitions download -c walmart-recruiting-store-sales-forecasting
!unzip -q walmart-recruiting-store-sales-forecasting.zip

walmart-recruiting-store-sales-forecasting.zip: Skipping, found more recently modified local copy (use --force to force download)
replace features.csv.zip? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
replace sampleSubmission.csv.zip? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
replace stores.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
replace test.csv.zip? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
replace train.csv.zip? [y]es, [n]o, [A]ll, [N]one, [r]ename: y


In [20]:
!unzip -q train.csv.zip
!unzip -q stores.csv.zip
!unzip -q test.csv.zip
!unzip -q features.csv.zip

replace train.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
unzip:  cannot find or open stores.csv.zip, stores.csv.zip.zip or stores.csv.zip.ZIP.
replace test.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
replace features.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y


# 1. Setup and Imports

In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder, RobustScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import mlflow
import mlflow.pytorch
import mlflow.sklearn
import joblib
import warnings
warnings.filterwarnings('ignore')

# Set random seeds for reproducibility
np.random.seed(42)
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed(42)

# MLflow setup
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("PatchTST_Training")

print("🚀 Setup completed successfully!")


🚀 Setup completed successfully!


# 2. Data Loading

In [22]:
def load_walmart_data():
    """Load Walmart sales data with proper data types"""
    print("📊 Loading Walmart datasets...")

    # Load datasets
    train_df = pd.read_csv('train.csv')
    test_df = pd.read_csv('test.csv')
    features_df = pd.read_csv('features.csv')
    stores_df = pd.read_csv('stores.csv')

    # Convert date columns
    train_df['Date'] = pd.to_datetime(train_df['Date'])
    test_df['Date'] = pd.to_datetime(test_df['Date'])
    features_df['Date'] = pd.to_datetime(features_df['Date'])

    print(f"✅ Data loaded successfully!")
    print(f"📈 Train shape: {train_df.shape}")
    print(f"🔮 Test shape: {test_df.shape}")
    print(f"🏪 Stores shape: {stores_df.shape}")
    print(f"📋 Features shape: {features_df.shape}")

    return train_df, test_df, features_df, stores_df

# Load data
train_df, test_df, features_df, stores_df = load_walmart_data()


📊 Loading Walmart datasets...
✅ Data loaded successfully!
📈 Train shape: (421570, 5)
🔮 Test shape: (115064, 4)
🏪 Stores shape: (45, 3)
📋 Features shape: (8190, 12)


# 3. Data Cleaning Pipeline

In [28]:
class DataCleaner(BaseEstimator, TransformerMixin):
    """Data cleaning transformer for Walmart sales data"""

    def __init__(self):
        self.outlier_bounds = {}
        self.fill_values = {}

    def fit(self, X, y=None):
        """Fit the data cleaner"""
        X_clean = X.copy()

        # Calculate outlier bounds for Weekly_Sales (if present)
        if 'Weekly_Sales' in X_clean.columns:
            Q1 = X_clean['Weekly_Sales'].quantile(0.25)
            Q3 = X_clean['Weekly_Sales'].quantile(0.75)
            IQR = Q3 - Q1
            self.outlier_bounds['Weekly_Sales'] = {
                'lower': Q1 - 1.5 * IQR,
                'upper': Q3 + 1.5 * IQR
            }

        # Calculate fill values for missing data
        numeric_columns = X_clean.select_dtypes(include=[np.number]).columns
        for col in numeric_columns:
            if col in ['MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']:
                self.fill_values[col] = 0.0  # Markdowns are 0 when not present
            else:
                self.fill_values[col] = X_clean[col].median()

        return self

    def transform(self, X):
        """Transform the data"""
        X_clean = X.copy()

        # Handle missing values
        for col, fill_value in self.fill_values.items():
            if col in X_clean.columns:
                X_clean[col] = X_clean[col].fillna(fill_value)

        # Handle outliers in Weekly_Sales (only for training data)
        if 'Weekly_Sales' in X_clean.columns and 'Weekly_Sales' in self.outlier_bounds:
            bounds = self.outlier_bounds['Weekly_Sales']
            # Cap outliers instead of removing them
            X_clean['Weekly_Sales'] = X_clean['Weekly_Sales'].clip(
                lower=bounds['lower'], upper=bounds['upper']
            )

        # Handle negative sales (set to 0)
        if 'Weekly_Sales' in X_clean.columns:
            X_clean['Weekly_Sales'] = X_clean['Weekly_Sales'].clip(lower=0)

        return X_clean

# Run data cleaning experiment
with mlflow.start_run(run_name="PatchTST_Cleaning"):
    print("🧹 Starting data cleaning process...")

    # Log parameters
    mlflow.log_param("cleaning_method", "IQR_outlier_detection")
    mlflow.log_param("missing_value_strategy", "median_fill")

    # Initialize cleaner
    cleaner = DataCleaner()

    # Drop 'IsHoliday' from features_df before merging
    features_df = features_df.drop(columns=['IsHoliday'])

    # Safe merge
    train_merged = train_df.merge(stores_df, on='Store', how='left')
    train_merged = train_merged.merge(features_df, on=['Store', 'Date'], how='left')


    # Fit and transform training data
    train_cleaned = cleaner.fit_transform(train_merged)

    # Log cleaning statistics
    original_sales_std = train_df['Weekly_Sales'].std()
    cleaned_sales_std = train_cleaned['Weekly_Sales'].std()

    mlflow.log_metric("original_sales_std", original_sales_std)
    mlflow.log_metric("cleaned_sales_std", cleaned_sales_std)
    mlflow.log_metric("outlier_reduction", (original_sales_std - cleaned_sales_std) / original_sales_std)

    # Log missing value statistics
    missing_before = train_merged.isnull().sum().sum()
    missing_after = train_cleaned.isnull().sum().sum()

    mlflow.log_metric("missing_values_before", missing_before)
    mlflow.log_metric("missing_values_after", missing_after)

    print(f"✅ Data cleaning completed!")
    print(f"📊 Missing values reduced from {missing_before} to {missing_after}")
    print(f"📈 Sales volatility reduced by {((original_sales_std - cleaned_sales_std) / original_sales_std) * 100:.2f}%")


🧹 Starting data cleaning process...
✅ Data cleaning completed!
📊 Missing values reduced from 1422431 to 0
📈 Sales volatility reduced by 34.36%


In [29]:
print(train_cleaned.columns)

Index(['Store', 'Dept', 'Date', 'Weekly_Sales', 'IsHoliday', 'Type', 'Size',
       'Temperature', 'Fuel_Price', 'MarkDown1', 'MarkDown2', 'MarkDown3',
       'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment'],
      dtype='object')


# 4. Feature Engineering Pipeline

In [30]:
class FeatureEngineer(BaseEstimator, TransformerMixin):
    """Comprehensive feature engineering for time series forecasting"""

    def __init__(self, lag_periods=[1, 2, 3, 4, 5, 8, 12, 52],
                 rolling_windows=[3, 4, 8, 12, 26, 52]):
        self.lag_periods = lag_periods
        self.rolling_windows = rolling_windows
        self.label_encoders = {}
        self.fitted = False

    def fit(self, X, y=None):
        """Fit the feature engineer"""
        X_features = X.copy()

        # Fit label encoders
        for col in ['Type']:
            if col in X_features.columns:
                self.label_encoders[col] = LabelEncoder()
                self.label_encoders[col].fit(X_features[col].astype(str))

        self.fitted = True
        return self

    def transform(self, X):
        """Transform data with comprehensive feature engineering"""
        X_features = X.copy()

        # 1. Time-based features
        X_features = self._create_time_features(X_features)

        # 2. Lag features (only for training data with Weekly_Sales)
        if 'Weekly_Sales' in X_features.columns:
            X_features = self._create_lag_features(X_features)
            X_features = self._create_rolling_features(X_features)

        # 3. Store and department features
        X_features = self._create_store_features(X_features)

        # 4. Economic and promotional features
        X_features = self._create_economic_features(X_features)

        # 5. Holiday and seasonal features
        X_features = self._create_holiday_features(X_features)

        # 6. Interaction features
        X_features = self._create_interaction_features(X_features)

        # 7. Encode categorical variables
        X_features = self._encode_categorical(X_features)

        # Remove rows with too many NaN values (mainly from lag features)
        if 'Weekly_Sales' in X_features.columns:
            # For training data, we can afford to lose some rows
            X_features = X_features.dropna(subset=['Weekly_Sales'])
            # Fill remaining NaN values with forward fill then backward fill
            X_features = X_features.fillna(method='ffill').fillna(method='bfill')
        else:
            # For test data, we need to keep all rows
            X_features = X_features.fillna(method='ffill').fillna(method='bfill')
            # Fill any remaining NaN with median
            numeric_cols = X_features.select_dtypes(include=[np.number]).columns
            X_features[numeric_cols] = X_features[numeric_cols].fillna(X_features[numeric_cols].median())

        return X_features

    def _create_time_features(self, df):
        """Create time-based features"""
        df['Year'] = df['Date'].dt.year
        df['Month'] = df['Date'].dt.month
        df['Week'] = df['Date'].dt.isocalendar().week
        df['Day'] = df['Date'].dt.day
        df['DayOfWeek'] = df['Date'].dt.dayofweek
        df['Quarter'] = df['Date'].dt.quarter
        df['IsWeekend'] = (df['DayOfWeek'] >= 5).astype(int)
        df['DayOfYear'] = df['Date'].dt.dayofyear

        # Cyclical features
        df['Month_sin'] = np.sin(2 * np.pi * df['Month'] / 12)
        df['Month_cos'] = np.cos(2 * np.pi * df['Month'] / 12)
        df['Week_sin'] = np.sin(2 * np.pi * df['Week'] / 52)
        df['Week_cos'] = np.cos(2 * np.pi * df['Week'] / 52)
        df['DayOfWeek_sin'] = np.sin(2 * np.pi * df['DayOfWeek'] / 7)
        df['DayOfWeek_cos'] = np.cos(2 * np.pi * df['DayOfWeek'] / 7)
        df['Quarter_sin'] = np.sin(2 * np.pi * df['Quarter'] / 4)
        df['Quarter_cos'] = np.cos(2 * np.pi * df['Quarter'] / 4)

        return df

    def _create_lag_features(self, df):
        """Create lag features for time series"""
        df = df.sort_values(['Store', 'Dept', 'Date'])

        for lag in self.lag_periods:
            df[f'Weekly_Sales_lag_{lag}'] = df.groupby(['Store', 'Dept'])['Weekly_Sales'].shift(lag)

        return df

    def _create_rolling_features(self, df):
        """Create rolling statistical features"""
        df = df.sort_values(['Store', 'Dept', 'Date'])

        for window in self.rolling_windows:
          # Rolling statistics using transform to preserve index alignment
          df[f'Weekly_Sales_rolling_mean_{window}'] = (
              df.groupby(['Store', 'Dept'])['Weekly_Sales']
              .transform(lambda x: x.rolling(window=window, min_periods=1).mean())
          )

          df[f'Weekly_Sales_rolling_std_{window}'] = (
              df.groupby(['Store', 'Dept'])['Weekly_Sales']
              .transform(lambda x: x.rolling(window=window, min_periods=1).std())
          )

          df[f'Weekly_Sales_rolling_min_{window}'] = (
              df.groupby(['Store', 'Dept'])['Weekly_Sales']
              .transform(lambda x: x.rolling(window=window, min_periods=1).min())
          )

          df[f'Weekly_Sales_rolling_max_{window}'] = (
              df.groupby(['Store', 'Dept'])['Weekly_Sales']
              .transform(lambda x: x.rolling(window=window, min_periods=1).max())
          )

        if 'Temperature' in df.columns:
          df[f'Temperature_rolling_mean_{window}'] = (
              df.groupby(['Store'])['Temperature']
              .transform(lambda x: x.rolling(window=window, min_periods=1).mean())
          )

        return df

    def _create_store_features(self, df):
        """Create store-specific features"""
        if 'Size' in df.columns:
            # Store size categories
            df['Size_category'] = pd.cut(df['Size'], bins=3, labels=['Small', 'Medium', 'Large'])

            # Store size relative to average
            df['Size_relative'] = df['Size'] / df['Size'].mean()

        return df

    def _create_economic_features(self, df):
        """Create economic and promotional features"""
        # Total markdowns
        markdown_cols = ['MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']
        available_markdowns = [col for col in markdown_cols if col in df.columns]

        if available_markdowns:
            df['Total_MarkDown'] = df[available_markdowns].sum(axis=1)
            df['Has_MarkDown'] = (df['Total_MarkDown'] > 0).astype(int)
            df['MarkDown_Count'] = (df[available_markdowns] > 0).sum(axis=1)

        # Economic indicators
        if 'CPI' in df.columns and 'Unemployment' in df.columns:
            df['Economic_Index'] = df['CPI'] / (df['Unemployment'] + 1)  # Add 1 to avoid division by zero

        # Fuel price impact
        if 'Fuel_Price' in df.columns:
            df['Fuel_Price_High'] = (df['Fuel_Price'] > df['Fuel_Price'].quantile(0.75)).astype(int)

        return df

    def _create_holiday_features(self, df):
        """Create holiday and seasonal features"""
        # Holiday impact
        df['IsHoliday_int'] = df['IsHoliday'].astype(int)

        # Seasonal patterns
        df['Is_BackToSchool'] = ((df['Month'] == 8) | (df['Month'] == 9)).astype(int)
        df['Is_Christmas'] = (df['Month'] == 12).astype(int)
        df['Is_Thanksgiving'] = ((df['Month'] == 11) & (df['Day'] >= 22)).astype(int)
        df['Is_Summer'] = ((df['Month'] >= 6) & (df['Month'] <= 8)).astype(int)

        return df

    def _create_interaction_features(self, df):
        """Create interaction features"""
        # Store type and size interaction
        if 'Type' in df.columns and 'Size' in df.columns:
            df['Type_Size_interaction'] = df['Type'].astype(str) + '_' + df['Size'].astype(str)

        # Holiday and markdown interaction
        if 'IsHoliday' in df.columns and 'Total_MarkDown' in df.columns:
            df['Holiday_MarkDown'] = df['IsHoliday'].astype(int) * df['Total_MarkDown']

        return df

    def _encode_categorical(self, df):
        """Encode categorical variables"""
        for col, encoder in self.label_encoders.items():
            if col in df.columns:
                df[f'{col}_encoded'] = encoder.transform(df[col].astype(str))

        return df

# Run feature engineering experiment
with mlflow.start_run(run_name="PatchTST_Feature_Engineering"):
    print("⚙️ Starting feature engineering process...")

    # Log parameters
    mlflow.log_param("lag_periods", [1, 2, 3, 4, 5, 8, 12, 52])
    mlflow.log_param("rolling_windows", [3, 4, 8, 12, 26, 52])
    mlflow.log_param("feature_types", ["time", "lag", "rolling", "store", "economic", "holiday", "interaction"])

    # Initialize feature engineer
    feature_engineer = FeatureEngineer()

    # Fit and transform training data
    train_features = feature_engineer.fit_transform(train_cleaned)

    # Log feature engineering statistics
    original_features = train_cleaned.shape[1]
    new_features = train_features.shape[1]

    mlflow.log_metric("original_features", original_features)
    mlflow.log_metric("engineered_features", new_features)
    mlflow.log_metric("features_added", new_features - original_features)

    # Log data quality metrics
    data_quality_score = (train_features.notna().sum().sum()) / (train_features.shape[0] * train_features.shape[1])
    mlflow.log_metric("data_quality_score", data_quality_score)

    print(f"✅ Feature engineering completed!")
    print(f"📊 Features increased from {original_features} to {new_features}")
    print(f"🎯 Data quality score: {data_quality_score:.3f}")


⚙️ Starting feature engineering process...
✅ Feature engineering completed!
📊 Features increased from 16 to 80
🎯 Data quality score: 1.000


# 5. Feature Selection and Preprocessing

In [31]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    """Feature selection and preprocessing for PatchTST"""

    def __init__(self, max_features=50):
        self.max_features = max_features
        self.selected_features = None
        self.scaler = RobustScaler()

    def fit(self, X, y=None):
        """Select features based on importance and correlation"""
        X_select = X.copy()

        # Remove non-numeric columns and ID columns
        exclude_cols = ['Date', 'Store', 'Dept', 'Type', 'Size_category', 'Type_Size_interaction']
        numeric_cols = X_select.select_dtypes(include=[np.number]).columns
        feature_cols = [col for col in numeric_cols if col not in exclude_cols]

        if y is not None:
            # Calculate correlation with target
            correlations = {}
            for col in feature_cols:
                if col != 'Weekly_Sales':
                    corr = abs(X_select[col].corr(y))
                    if not np.isnan(corr):
                        correlations[col] = corr

            # Sort by correlation and select top features
            sorted_features = sorted(correlations.items(), key=lambda x: x[1], reverse=True)
            self.selected_features = [feat[0] for feat in sorted_features[:self.max_features]]
        else:
            # For test data, use all available numeric features
            self.selected_features = feature_cols[:self.max_features]

        # Always include basic features
        basic_features = ['Store', 'Dept', 'IsHoliday_int', 'Month', 'Week', 'DayOfWeek']
        for feat in basic_features:
            if feat in X_select.columns and feat not in self.selected_features:
                self.selected_features.append(feat)

        # Fit scaler on selected features
        if self.selected_features:
            self.scaler.fit(X_select[self.selected_features])

        return self

    def transform(self, X):
        """Transform data with selected features"""
        X_transformed = X[self.selected_features].copy()
        X_scaled = self.scaler.transform(X_transformed)

        # Return as DataFrame to maintain feature names
        return pd.DataFrame(X_scaled, columns=self.selected_features, index=X.index)

# Run feature selection experiment
with mlflow.start_run(run_name="PatchTST_Feature_Selection"):
    print("🎯 Starting feature selection process...")

    # Log parameters
    mlflow.log_param("selection_method", "correlation_based")
    mlflow.log_param("max_features", 50)
    mlflow.log_param("scaler_type", "RobustScaler")

    # Initialize feature selector
    feature_selector = FeatureSelector(max_features=50)

    # Fit and transform
    X_selected = feature_selector.fit_transform(train_features, train_features['Weekly_Sales'])

    # Log feature selection statistics
    mlflow.log_metric("selected_features_count", len(feature_selector.selected_features))
    mlflow.log_metric("feature_reduction_ratio", len(feature_selector.selected_features) / train_features.shape[1])

    # Log top features
    top_features = feature_selector.selected_features[:10]
    mlflow.log_param("top_10_features", top_features)

    print(f"✅ Feature selection completed!")
    print(f"📊 Selected {len(feature_selector.selected_features)} features from {train_features.shape[1]}")
    print(f"🏆 Top 10 features: {top_features}")


🎯 Starting feature selection process...
✅ Feature selection completed!
📊 Selected 54 features from 80
🏆 Top 10 features: ['Weekly_Sales_rolling_mean_3', 'Weekly_Sales_rolling_mean_4', 'Weekly_Sales_rolling_min_3', 'Weekly_Sales_rolling_min_4', 'Weekly_Sales_rolling_max_3', 'Weekly_Sales_rolling_mean_8', 'Weekly_Sales_lag_1', 'Weekly_Sales_rolling_mean_12', 'Weekly_Sales_rolling_max_4', 'Weekly_Sales_rolling_min_8']


# 6. PatchTST Model Implementation

In [32]:
class PatchTSTDataset(Dataset):
    """Dataset class for PatchTST model"""

    def __init__(self, X, y, seq_len=52, pred_len=1, stride=1):
        self.X = X.values if hasattr(X, 'values') else X
        self.y = y.values if hasattr(y, 'values') else y
        self.seq_len = seq_len
        self.pred_len = pred_len
        self.stride = stride

        # Create sequences
        self.sequences = []
        self.targets = []

        for i in range(0, len(self.X) - seq_len - pred_len + 1, stride):
            self.sequences.append(self.X[i:i + seq_len])
            self.targets.append(self.y[i + seq_len:i + seq_len + pred_len])

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        return torch.FloatTensor(self.sequences[idx]), torch.FloatTensor(self.targets[idx])

class PatchEmbedding(nn.Module):
    """Patch embedding layer for time series"""

    def __init__(self, patch_len, stride, padding, in_channels, embed_dim):
        super().__init__()
        self.patch_len = patch_len
        self.stride = stride
        self.padding = padding

        self.conv = nn.Conv1d(in_channels, embed_dim, kernel_size=patch_len, stride=stride, padding=padding)
        self.norm = nn.LayerNorm(embed_dim)

    def forward(self, x):
        # x shape: (batch_size, seq_len, n_features)
        x = x.transpose(1, 2)  # (batch_size, n_features, seq_len)
        x = self.conv(x)  # (batch_size, embed_dim, n_patches)
        x = x.transpose(1, 2)  # (batch_size, n_patches, embed_dim)
        x = self.norm(x)
        return x

class PatchTST(nn.Module):
    """PatchTST model for time series forecasting"""

    def __init__(self, seq_len, pred_len, patch_len, stride, n_features,
                 embed_dim=128, num_heads=8, num_layers=3, dropout=0.1):
        super().__init__()

        self.seq_len = seq_len
        self.pred_len = pred_len
        self.patch_len = patch_len
        self.stride = stride
        self.n_features = n_features

        # Calculate number of patches
        self.n_patches = (seq_len - patch_len) // stride + 1

        # Patch embedding
        self.patch_embedding = PatchEmbedding(patch_len, stride, 0, n_features, embed_dim)

        # Positional encoding
        self.positional_encoding = nn.Parameter(torch.randn(1, self.n_patches, embed_dim))

        # Transformer encoder
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=num_heads,
            dim_feedforward=embed_dim * 4,
            dropout=dropout,
            batch_first=True,
            activation='gelu'
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        # Prediction head
        self.prediction_head = nn.Sequential(
            nn.Linear(embed_dim * self.n_patches, embed_dim * 2),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(embed_dim * 2, embed_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(embed_dim, pred_len)
        )

        # Initialize weights
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialize weights"""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Conv1d):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)

    def forward(self, x):
        # x shape: (batch_size, seq_len, n_features)
        batch_size = x.size(0)

        # Patch embedding
        x = self.patch_embedding(x)  # (batch_size, n_patches, embed_dim)

        # Add positional encoding
        x = x + self.positional_encoding

        # Transformer encoder
        x = self.transformer(x)  # (batch_size, n_patches, embed_dim)

        # Flatten and predict
        x = x.view(batch_size, -1)  # (batch_size, n_patches * embed_dim)
        output = self.prediction_head(x)  # (batch_size, pred_len)

        return output


# 7. Training Pipeline

In [33]:
def train_patchtst(model, train_loader, val_loader, epochs=100, lr=0.001, device='cpu'):
    """Train PatchTST model with advanced training techniques"""

    model.to(device)
    criterion = nn.MSELoss()
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=0.01)
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer, max_lr=lr, steps_per_epoch=len(train_loader), epochs=epochs)

    train_losses = []
    val_losses = []

    best_val_loss = float('inf')
    patience = 15
    patience_counter = 0

    for epoch in range(epochs):
        # Training
        model.train()
        train_loss = 0
        train_batches = 0

        for batch_x, batch_y in train_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)

            optimizer.zero_grad()
            output = model(batch_x)
            loss = criterion(output, batch_y)
            loss.backward()

            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()
            scheduler.step()

            train_loss += loss.item()
            train_batches += 1

        # Validation
        model.eval()
        val_loss = 0
        val_batches = 0

        with torch.no_grad():
            for batch_x, batch_y in val_loader:
                batch_x, batch_y = batch_x.to(device), batch_y.to(device)
                output = model(batch_x)
                loss = criterion(output, batch_y)
                val_loss += loss.item()
                val_batches += 1

        train_loss /= train_batches
        val_loss /= val_batches

        train_losses.append(train_loss)
        val_losses.append(val_loss)

        # Early stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
            # Save best model
            torch.save(model.state_dict(), 'best_patchtst_model.pth')
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"Early stopping at epoch {epoch}")
                break

        if epoch % 20 == 0:
            print(f"Epoch {epoch}: Train Loss = {train_loss:.4f}, Val Loss = {val_loss:.4f}")

    # Load best model
    model.load_state_dict(torch.load('best_patchtst_model.pth'))

    return model, train_losses, val_losses


# 8. Complete Pipeline Class

In [34]:
class PatchTSTCompletePipeline:
    """Complete end-to-end pipeline for PatchTST model"""

    def __init__(self, seq_len=52, pred_len=1, patch_len=8, stride=1,
                 embed_dim=128, num_heads=8, num_layers=3, dropout=0.1,
                 max_features=50, device=None):
        self.seq_len = seq_len
        self.pred_len = pred_len
        self.patch_len = patch_len
        self.stride = stride
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.num_layers = num_layers
        self.dropout = dropout
        self.max_features = max_features

        self.device = device if device else ('cuda' if torch.cuda.is_available() else 'cpu')

        # Initialize all pipeline steps as None, will set later
        self.cleaner = DataCleaner()
        self.feature_engineer = FeatureEngineer()
        self.feature_selector = FeatureSelector(max_features=self.max_features)
        self.model = None
        self.scaler = None

    def fit(self, train_df, stores_df, features_df, epochs=100, batch_size=64, lr=1e-3):
        # Merge data for training
        df = train_df.merge(stores_df, on='Store', how='left')
        df = df.merge(features_df, on=['Store', 'Date'], how='left')

        # Clean data
        df_clean = self.cleaner.fit_transform(df)

        # Feature engineering
        df_feat = self.feature_engineer.fit_transform(df_clean)

        # Separate target and features
        y = df_feat['Weekly_Sales']
        X = df_feat.drop(columns=['Weekly_Sales', 'Date'])  # Drop Date as non-numeric

        # Feature selection and scaling
        X_selected = self.feature_selector.fit_transform(X, y)

        # Save scaler for later use
        self.scaler = self.feature_selector.scaler

        # Create dataset and dataloaders
        dataset = PatchTSTDataset(X_selected, y, seq_len=self.seq_len, pred_len=self.pred_len)
        train_size = int(0.8 * len(dataset))
        val_size = len(dataset) - train_size
        train_ds, val_ds = torch.utils.data.random_split(dataset, [train_size, val_size])

        train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False)

        # Initialize model
        n_features = X_selected.shape[1]
        self.model = PatchTST(seq_len=self.seq_len, pred_len=self.pred_len,
                              patch_len=self.patch_len, stride=self.stride,
                              n_features=n_features,
                              embed_dim=self.embed_dim,
                              num_heads=self.num_heads,
                              num_layers=self.num_layers,
                              dropout=self.dropout)

        # Train model
        self.model, train_losses, val_losses = train_patchtst(
            self.model, train_loader, val_loader, epochs=epochs, lr=lr, device=self.device
        )

    def predict(self, test_df, stores_df, features_df):
        # Merge test data
        df = test_df.merge(stores_df, on='Store', how='left')
        df = df.merge(features_df, on=['Store', 'Date'], how='left')

        # Clean and engineer features
        df_clean = self.cleaner.transform(df)
        df_feat = self.feature_engineer.transform(df_clean)

        # Select features and scale
        X = df_feat.drop(columns=['Date'])
        X_selected = X[self.feature_selector.selected_features]
        X_scaled = self.scaler.transform(X_selected)
        X_scaled_df = pd.DataFrame(X_scaled, columns=self.feature_selector.selected_features)

        # Create dataset and dataloader for inference
        dataset = PatchTSTDataset(X_scaled_df, y=pd.Series([0]*len(X_scaled_df)), seq_len=self.seq_len, pred_len=self.pred_len)
        loader = DataLoader(dataset, batch_size=64, shuffle=False)

        self.model.eval()
        preds = []
        with torch.no_grad():
            for batch_x, _ in loader:
                batch_x = batch_x.to(self.device)
                output = self.model(batch_x)
                preds.append(output.cpu().numpy())

        preds = np.concatenate(preds, axis=0)
        return preds

    def save(self, path="patchtst_complete_pipeline"):
        import os
        if not os.path.exists(path):
            os.makedirs(path)
        # Save model weights
        torch.save(self.model.state_dict(), f"{path}/model.pth")
        # Save scaler and pipeline components
        joblib.dump(self.scaler, f"{path}/scaler.pkl")
        joblib.dump(self.cleaner, f"{path}/cleaner.pkl")
        joblib.dump(self.feature_engineer, f"{path}/feature_engineer.pkl")
        joblib.dump(self.feature_selector, f"{path}/feature_selector.pkl")

    def load(self, path="patchtst_complete_pipeline"):
        # Load scaler and pipeline components
        self.scaler = joblib.load(f"{path}/scaler.pkl")
        self.cleaner = joblib.load(f"{path}/cleaner.pkl")
        self.feature_engineer = joblib.load(f"{path}/feature_engineer.pkl")
        self.feature_selector = joblib.load(f"{path}/feature_selector.pkl")

        # Recreate model and load weights
        n_features = len(self.feature_selector.selected_features)
        self.model = PatchTST(seq_len=self.seq_len, pred_len=self.pred_len,
                              patch_len=self.patch_len, stride=self.stride,
                              n_features=n_features,
                              embed_dim=self.embed_dim,
                              num_heads=self.num_heads,
                              num_layers=self.num_layers,
                              dropout=self.dropout)
        self.model.load_state_dict(torch.load(f"{path}/model.pth", map_location=self.device))
        self.model.to(self.device)
        self.model.eval()
