<a href="https://colab.research.google.com/github/ekvirika/WalmartRecruiting/blob/main/notebooks/model_experiment_dlinear.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive (Colab-only); a later cell copies
# kaggle.json from a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Install required packages
!pip install wandb torch torchvision pandas numpy matplotlib seaborn scikit-learn mlflow

# Set up Kaggle API
!pip install kaggle



In [3]:
# Copy the Kaggle API token from the mounted Google Drive into ~/.kaggle
# (adjust the source path to wherever your kaggle.json is stored).
!mkdir -p ~/.kaggle
!cp /content/drive/MyDrive/ColabNotebooks/kaggle_API_credentials/kaggle.json ~/.kaggle/kaggle.json
# Restrict the token file to owner read/write.
! chmod 600 ~/.kaggle/kaggle.json

In [4]:
# Download the dataset
!kaggle competitions download -c walmart-recruiting-store-sales-forecasting
!unzip -q walmart-recruiting-store-sales-forecasting.zip

walmart-recruiting-store-sales-forecasting.zip: Skipping, found more recently modified local copy (use --force to force download)
replace features.csv.zip? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace sampleSubmission.csv.zip? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace stores.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace test.csv.zip? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace train.csv.zip? [y]es, [n]o, [A]ll, [N]one, [r]ename: n


In [5]:
!unzip -q train.csv.zip
!unzip -q stores.csv.zip
!unzip -q test.csv.zip
!unzip -q features.csv.zip

replace train.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
unzip:  cannot find or open stores.csv.zip, stores.csv.zip.zip or stores.csv.zip.ZIP.
replace test.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace features.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n


In [6]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
import mlflow
import mlflow.pytorch
import warnings
warnings.filterwarnings('ignore')

# 1. DATA PREPROCESSING CLASS

In [7]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

class WalmartDataPreprocessor:
    """
    Data preprocessing class for Walmart sales data.

    Loads and merges the raw CSV files, derives calendar features, imputes
    missing values and builds lag / rolling statistics of the target.

    Attributes:
        scaler: A ``StandardScaler`` instance. No method of this class fits or
            applies it.
        feature_columns: Model feature names selected by the most recent call
            to ``prepare_features`` (``None`` before the first call). Contains
            ``'Store'`` and ``'Dept'`` but never ``'Date'``.
        target_column: Name of the target column (``'Weekly_Sales'``).
    """

    def __init__(self):
        self.scaler = StandardScaler()
        self.feature_columns = None
        self.target_column = 'Weekly_Sales'

    def load_data(self, train_path, test_path, stores_path, features_path):
        """Load all datasets and merge store / feature information.

        Args:
            train_path: Path of the training CSV.
            test_path: Path of the test CSV.
            stores_path: Path of the stores CSV (joined on ``Store``).
            features_path: Path of the features CSV (joined on ``Store`` and
                ``Date``).

        Returns:
            Tuple ``(train, test)`` of merged DataFrames.
        """
        train = pd.read_csv(train_path)
        test = pd.read_csv(test_path)
        stores = pd.read_csv(stores_path)
        features = pd.read_csv(features_path)

        merged_frames = []
        for df in (train, test):
            df = df.merge(stores, on='Store', how='left')
            df = df.merge(features, on=['Store', 'Date'], how='left')

            # When both sides carry IsHoliday, pandas suffixes them _x / _y;
            # keep the left-hand (sales file) copy under the plain name.
            if 'IsHoliday_x' in df.columns and 'IsHoliday_y' in df.columns:
                df = (
                    df.assign(IsHoliday=df['IsHoliday_x'])
                    .drop(columns=['IsHoliday_x', 'IsHoliday_y'])
                )
            merged_frames.append(df)

        return merged_frames[0], merged_frames[1]

    def create_time_features(self, df):
        """Create time-based features from the ``Date`` column.

        Works on a copy, so the caller's DataFrame is left untouched.

        Args:
            df: DataFrame with ``Date`` and ``IsHoliday`` columns.

        Returns:
            A new DataFrame with ``Date`` parsed to datetime, ``IsHoliday``
            coerced to 0/1, and Year / Month / Week / Day / DayOfWeek columns
            plus sine / cosine encodings of Month, Week and DayOfWeek.
        """
        df = df.copy()

        df['Date'] = pd.to_datetime(df['Date'])
        df['Year'] = df['Date'].dt.year
        df['Month'] = df['Date'].dt.month
        df['Week'] = df['Date'].dt.isocalendar().week.astype(int)
        df['Day'] = df['Date'].dt.day
        df['DayOfWeek'] = df['Date'].dt.dayofweek

        # Ensure IsHoliday is binary (anything not recognised as true becomes 0)
        df['IsHoliday'] = df['IsHoliday'].apply(lambda x: 1 if x in [True, 1, 'True', '1'] else 0)

        # Cyclical encodings
        df['Month_sin'] = np.sin(2 * np.pi * df['Month'] / 12)
        df['Month_cos'] = np.cos(2 * np.pi * df['Month'] / 12)
        df['Week_sin'] = np.sin(2 * np.pi * df['Week'] / 52)
        df['Week_cos'] = np.cos(2 * np.pi * df['Week'] / 52)
        df['DayOfWeek_sin'] = np.sin(2 * np.pi * df['DayOfWeek'] / 7)
        df['DayOfWeek_cos'] = np.cos(2 * np.pi * df['DayOfWeek'] / 7)

        return df

    def handle_missing_values(self, df):
        """Impute missing values.

        Numeric columns other than the target are filled with their median;
        object columns are filled with their mode, or ``'Unknown'`` when the
        column has no mode. Works on a copy of ``df``.

        Args:
            df: DataFrame to impute.

        Returns:
            A new DataFrame with the fills applied.
        """
        df = df.copy()

        numeric_columns = df.select_dtypes(include=[np.number]).columns
        for col in numeric_columns:
            if col != self.target_column:
                df[col] = df[col].fillna(df[col].median())

        categorical_columns = df.select_dtypes(include=['object']).columns
        for col in categorical_columns:
            modes = df[col].mode()
            fill_value = 'Unknown' if modes.empty else modes[0]
            df[col] = df[col].fillna(fill_value)

        return df

    def create_lag_features(self, df, lags=(1, 2, 4)):
        """Create lag and rolling features of the target per (Store, Dept).

        Args:
            df: DataFrame containing ``Store``, ``Dept``, ``Date`` and the
                target column.
            lags: Iterable of lag offsets (in rows) to generate.

        Returns:
            A new DataFrame sorted by Store, Dept, Date with ``Sales_lag_<n>``
            columns and, for windows 4 and 8, ``Sales_rolling_mean_<w>`` /
            ``Sales_rolling_std_<w>`` columns. The leading rows of every group
            are NaN where not enough history exists.
        """
        df = df.sort_values(['Store', 'Dept', 'Date'])
        grouped_target = df.groupby(['Store', 'Dept'])[self.target_column]

        for lag in lags:
            df[f'Sales_lag_{lag}'] = grouped_target.shift(lag)

        # shift(1) keeps the current row's own target out of its rolling
        # statistics; transform keeps the result aligned with df's index.
        for window in [4, 8]:
            df[f'Sales_rolling_mean_{window}'] = grouped_target.transform(
                lambda x: x.shift(1).rolling(window).mean()
            )
            df[f'Sales_rolling_std_{window}'] = grouped_target.transform(
                lambda x: x.shift(1).rolling(window).std()
            )

        return df

    def prepare_features(self, df, is_train=True):
        """Prepare the final feature set.

        Args:
            df: Merged DataFrame as returned by ``load_data``.
            is_train: When True, lag / rolling features are created and the
                target column is appended to the result. When False, neither
                happens, so the returned frame has fewer feature columns than
                a training frame.

        Returns:
            DataFrame holding ``Date``, the selected feature columns and (for
            training data) the target column. The selected feature names are
            also stored in ``self.feature_columns``.
        """
        df = self.create_time_features(df)
        df = self.handle_missing_values(df)

        if is_train:
            df = self.create_lag_features(df)

        feature_cols = [
            'Store', 'Dept', 'Size', 'Type',
            'Temperature', 'Fuel_Price', 'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5',
            'CPI', 'Unemployment', 'IsHoliday',
            'Year', 'Month', 'Week', 'Day', 'DayOfWeek',
            'Month_sin', 'Month_cos', 'Week_sin', 'Week_cos', 'DayOfWeek_sin', 'DayOfWeek_cos'
        ]

        # Add lag/rolling features
        feature_cols.extend(col for col in df.columns if 'lag' in col or 'rolling' in col)

        # One-hot encode Type. A float dtype keeps the frame purely numeric;
        # bool dummies would turn a mixed-column `.values` array into object dtype.
        df = pd.get_dummies(df, columns=['Type'], prefix='Type', dtype=float)

        # Replace 'Type' with the dummy column names
        feature_cols = [col for col in feature_cols if col != 'Type']
        feature_cols.extend(col for col in df.columns if col.startswith('Type_'))

        # Retain only existing columns
        feature_cols = [col for col in feature_cols if col in df.columns]

        # Store an independent copy so adding bookkeeping columns below does
        # not leak into the saved feature list.
        self.feature_columns = list(feature_cols)

        # Always retain 'Date', 'Store', and 'Dept' for dataset construction
        output_cols = list(feature_cols)
        for col in ['Date', 'Store', 'Dept']:
            if col not in output_cols:
                output_cols.insert(0, col)

        if is_train:
            return df[output_cols + [self.target_column]]
        return df[output_cols]


# 2. DLINEAR MODEL IMPLEMENTATION

In [8]:
class DLinear(nn.Module):
    """
    DLinear: decomposition-linear forecaster.

    The input window is split by a moving-average block into a trend part and
    a remainder ("seasonal") part. Each part is mapped from ``seq_len`` to
    ``pred_len`` time steps by a linear layer and the two forecasts are summed.

    Args:
        seq_len: Length of the input window.
        pred_len: Number of future steps to emit.
        enc_in: Number of channels; only used to size the per-channel layer
            lists when ``individual`` is True.
        individual: If True, every channel gets its own pair of linear layers;
            otherwise one pair is shared by all channels.
        kernel_size: Window of the moving average used for decomposition.
    """

    def __init__(self, seq_len, pred_len, enc_in, individual=False, kernel_size=25):
        super().__init__()
        self.seq_len = seq_len
        self.pred_len = pred_len
        self.individual = individual
        self.kernel_size = kernel_size

        # Trend / remainder splitter
        self.decomposition = SeriesDecomposition(kernel_size)

        def make_head():
            # Linear map along the time axis: seq_len -> pred_len
            return nn.Linear(self.seq_len, self.pred_len)

        if not self.individual:
            self.Linear_Seasonal = make_head()
            self.Linear_Trend = make_head()
        else:
            self.Linear_Seasonal = nn.ModuleList([make_head() for _ in range(enc_in)])
            self.Linear_Trend = nn.ModuleList([make_head() for _ in range(enc_in)])

    def forward(self, x):
        """Forecast ``pred_len`` steps for every channel.

        Args:
            x: Tensor laid out as [Batch, Input length, Channel].

        Returns:
            Tensor laid out as [Batch, Output length, Channel].
        """
        seasonal, trend = self.decomposition(x)

        # Move time to the last axis so nn.Linear acts along it:
        # [Batch, Channel, Input length]
        seasonal = seasonal.permute(0, 2, 1)
        trend = trend.permute(0, 2, 1)

        if not self.individual:
            forecast = self.Linear_Seasonal(seasonal) + self.Linear_Trend(trend)
            return forecast.permute(0, 2, 1)

        # Per-channel heads: fill pre-allocated buffers one channel at a time.
        seasonal_forecast = seasonal.new_zeros((seasonal.size(0), seasonal.size(1), self.pred_len))
        trend_forecast = trend.new_zeros((trend.size(0), trend.size(1), self.pred_len))

        for channel in range(seasonal.size(1)):
            seasonal_forecast[:, channel, :] = self.Linear_Seasonal[channel](seasonal[:, channel, :])
            trend_forecast[:, channel, :] = self.Linear_Trend[channel](trend[:, channel, :])

        forecast = seasonal_forecast + trend_forecast
        return forecast.permute(0, 2, 1)  # [Batch, Output length, Channel]


class SeriesDecomposition(nn.Module):
    """
    Splits a series into a moving-average trend and the remainder.

    Args:
        kernel_size: Window length handed to the moving-average block.
    """

    def __init__(self, kernel_size):
        super().__init__()
        self.moving_avg = MovingAverage(kernel_size, stride=1)

    def forward(self, x):
        """Return ``(remainder, trend)`` where ``remainder = x - trend``."""
        trend = self.moving_avg(x)
        return x - trend, trend


class MovingAverage(nn.Module):
    """
    Moving average block for trend extraction.

    The series is padded by repeating its first and last time step so that,
    with ``stride=1``, the output has the same length as the input for both
    odd and even ``kernel_size``.

    Args:
        kernel_size: Averaging window length.
        stride: Stride passed to the underlying ``nn.AvgPool1d``.
    """

    def __init__(self, kernel_size, stride):
        super(MovingAverage, self).__init__()
        self.kernel_size = kernel_size
        self.avg = nn.AvgPool1d(kernel_size=kernel_size, stride=stride, padding=0)

    def forward(self, x):
        """Smooth ``x`` along the time axis.

        Args:
            x: Tensor laid out as [batch, seq_len, features].

        Returns:
            Tensor with the same layout holding the moving average.
        """
        # Total padding must be kernel_size - 1 to preserve the length. For an
        # odd kernel both sides get (k - 1) // 2; for an even kernel the extra
        # step goes to the end (symmetric padding would drop one time step).
        pad_front = (self.kernel_size - 1) // 2
        pad_end = self.kernel_size - 1 - pad_front

        front = x[:, 0:1, :].repeat(1, pad_front, 1)
        end = x[:, -1:, :].repeat(1, pad_end, 1)
        x = torch.cat([front, x, end], dim=1)

        # AvgPool1d pools over the last axis, so move time there and back.
        x = self.avg(x.permute(0, 2, 1))
        x = x.permute(0, 2, 1)
        return x


# 3. DATASET CLASS

In [9]:
class WalmartTimeSeriesDataset(Dataset):
    """
    Sliding-window dataset over per-(Store, Dept) series.

    Every sample is ``seq_len`` consecutive rows of feature values followed by
    the target values of the next ``pred_len`` rows of the same series.

    Args:
        data: DataFrame containing ``Store``, ``Dept``, ``Date``, the target
            column and the feature columns. Every column other than those
            four is used as a feature and must be convertible to float32.
        seq_len: Number of input rows per sample.
        pred_len: Number of target rows per sample.
        target_col: Name of the target column.
    """

    def __init__(self, data, seq_len=52, pred_len=1, target_col='Weekly_Sales'):
        self.data = data
        self.seq_len = seq_len
        self.pred_len = pred_len
        self.target_col = target_col

        # Prepare sequences
        self.sequences = self._create_sequences()

    def _create_sequences(self):
        """Build the list of ``(x, y, store, dept)`` samples.

        Returns:
            List of tuples where ``x`` is a float32 array of shape
            (seq_len, n_features) and ``y`` a float32 array of shape
            (pred_len,). Series shorter than ``seq_len + pred_len`` rows
            contribute no samples.
        """
        window = self.seq_len + self.pred_len

        # Feature columns do not depend on the group, so select them once.
        non_feature = {self.target_col, 'Date', 'Store', 'Dept'}
        feature_cols = [col for col in self.data.columns if col not in non_feature]

        sequences = []
        for (store, dept), group in self.data.groupby(['Store', 'Dept']):
            # Skip if not enough data
            if len(group) < window:
                continue

            group = group.sort_values('Date')

            # Convert each series to numeric arrays once. The explicit dtype
            # also handles bool (one-hot) columns, which would otherwise make
            # the mixed-column array object dtype.
            features = np.ascontiguousarray(group[feature_cols].to_numpy(dtype=np.float32))
            targets = group[self.target_col].to_numpy(dtype=np.float32)

            # Windows are views into the per-series arrays, so overlapping
            # samples share memory instead of being copied.
            for start in range(len(group) - window + 1):
                split = start + self.seq_len
                sequences.append(
                    (features[start:split], targets[split:split + self.pred_len], store, dept)
                )

        return sequences

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return ``(x, y, store, dept)`` with ``x`` and ``y`` as float32 tensors."""
        x, y, store, dept = self.sequences[idx]
        # torch.tensor copies, so callers cannot modify the stored windows.
        return torch.tensor(x, dtype=torch.float32), torch.tensor(y, dtype=torch.float32), store, dept


# 4. TRAINING CLASS

In [10]:
class DLinearTrainer:
    """
    Training class for a forecasting model with MLflow logging.

    Args:
        model: ``nn.Module`` to train; it is moved to ``device``.
        device: Device string; defaults to ``'cuda'`` when available,
            otherwise ``'cpu'``.

    Attributes:
        best_loss: Lowest validation loss seen by ``fit`` so far.
    """

    def __init__(self, model, device='cuda' if torch.cuda.is_available() else 'cpu'):
        self.model = model.to(device)
        self.device = device
        self.best_loss = float('inf')

    def _batch_loss(self, x, y, criterion):
        """Run the model on one batch and return the loss tensor.

        Raises:
            ValueError: If the model output (after dropping a trailing
                singleton dimension) does not have the same shape as ``y``.
        """
        x, y = x.to(self.device), y.to(self.device)

        # Remove last dimension if it's 1
        output = self.model(x).squeeze(-1)

        # Elementwise losses broadcast mismatched shapes instead of failing,
        # e.g. [B, pred_len, C] against [B, pred_len] becomes [B, B, C] and
        # yields a meaningless value. Fail loudly instead.
        if output.shape != y.shape:
            raise ValueError(
                f"Model output shape {tuple(output.shape)} does not match "
                f"target shape {tuple(y.shape)}"
            )
        return criterion(output, y)

    def train_epoch(self, dataloader, optimizer, criterion):
        """Train for one epoch.

        Args:
            dataloader: Iterable of ``(x, y, store, dept)`` batches.
            optimizer: Optimizer over ``self.model``'s parameters.
            criterion: Loss callable taking ``(output, target)``.

        Returns:
            Mean of the per-batch losses.

        Raises:
            ValueError: If the dataloader yields no batches or output and
                target shapes differ.
        """
        self.model.train()
        total_loss = 0.0
        n_batches = 0

        for x, y, _, _ in dataloader:
            optimizer.zero_grad()
            loss = self._batch_loss(x, y, criterion)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            n_batches += 1

        if n_batches == 0:
            raise ValueError("Training dataloader yielded no batches")
        return total_loss / n_batches

    def validate(self, dataloader, criterion):
        """Evaluate the model without updating it.

        Args:
            dataloader: Iterable of ``(x, y, store, dept)`` batches.
            criterion: Loss callable taking ``(output, target)``.

        Returns:
            Mean of the per-batch losses.

        Raises:
            ValueError: If the dataloader yields no batches or output and
                target shapes differ.
        """
        self.model.eval()
        total_loss = 0.0
        n_batches = 0

        with torch.no_grad():
            for x, y, _, _ in dataloader:
                total_loss += self._batch_loss(x, y, criterion).item()
                n_batches += 1

        if n_batches == 0:
            raise ValueError("Validation dataloader yielded no batches")
        return total_loss / n_batches

    def fit(self, train_loader, val_loader, epochs=100, lr=0.001,
            checkpoint_path='best_dlinear_model.pth'):
        """Train the model with MLflow logging.

        Uses Adam and MSE loss. Parameters and per-epoch losses are logged via
        the module-level ``mlflow.log_*`` functions.

        Args:
            train_loader: Batches used for optimisation.
            val_loader: Batches used for validation after every epoch.
            epochs: Number of epochs.
            lr: Adam learning rate.
            checkpoint_path: File the state dict is written to whenever the
                validation loss improves.

        Returns:
            ``self.model`` with the weights of the final epoch (the best
            weights are only on disk at ``checkpoint_path``).
        """
        optimizer = optim.Adam(self.model.parameters(), lr=lr)
        criterion = nn.MSELoss()

        # MLflow logging
        mlflow.log_param("learning_rate", lr)
        mlflow.log_param("epochs", epochs)
        mlflow.log_param("optimizer", "Adam")

        for epoch in range(epochs):
            train_loss = self.train_epoch(train_loader, optimizer, criterion)
            val_loss = self.validate(val_loader, criterion)

            # Log metrics
            mlflow.log_metric("train_loss", train_loss, step=epoch)
            mlflow.log_metric("val_loss", val_loss, step=epoch)

            # Save best model
            if val_loss < self.best_loss:
                self.best_loss = val_loss
                torch.save(self.model.state_dict(), checkpoint_path)
                mlflow.log_metric("best_val_loss", val_loss, step=epoch)

            if epoch % 10 == 0:
                print(f'Epoch {epoch}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')

        return self.model


# 5. MAIN PIPELINE CLASS

In [11]:
class DLinearPipeline:
    """
    Main pipeline class that orchestrates the entire DLinear workflow:
    data loading, feature preparation, dataset creation, training with MLflow
    tracking and prediction.

    Args:
        seq_len: Number of input rows per sample.
        pred_len: Number of target rows forecast per sample.
    """

    def __init__(self, seq_len=52, pred_len=1):
        self.seq_len = seq_len
        self.pred_len = pred_len
        self.preprocessor = WalmartDataPreprocessor()
        self.model = None
        self.trainer = None

    @staticmethod
    def _to_python(value):
        """Convert a collated batch entry (tensor / list of one) to a plain value."""
        if torch.is_tensor(value):
            return value.item() if value.numel() == 1 else value.tolist()
        if isinstance(value, (list, tuple)) and len(value) == 1:
            return value[0]
        return value

    def load_and_prepare_data(self, train_path, test_path, stores_path, features_path):
        """Load and prepare data for training.

        Returns:
            Tuple ``(train_prepared, test_prepared)``. Rows containing NaN
            (e.g. from lag / rolling features) are dropped from the training
            frame only.
        """
        print("Loading and preparing data...")

        # Load data
        train_data, test_data = self.preprocessor.load_data(
            train_path, test_path, stores_path, features_path
        )

        # Prepare features
        train_prepared = self.preprocessor.prepare_features(train_data, is_train=True)
        test_prepared = self.preprocessor.prepare_features(test_data, is_train=False)

        # Remove rows with NaN values (from lag features)
        train_prepared = train_prepared.dropna()

        return train_prepared, test_prepared

    def create_datasets(self, train_data, val_split=0.2):
        """Create train and validation datasets.

        The frame is split by row position: the first ``1 - val_split`` share
        of rows is used for training, the rest for validation. For a frame
        sorted by Store, Dept, Date (as produced by the preprocessor for
        training data) this holds out whole series rather than the most
        recent dates.

        Returns:
            Tuple ``(train_dataset, val_dataset)``.
        """
        print("Creating datasets...")

        # Split data
        train_size = int(len(train_data) * (1 - val_split))
        train_df = train_data.iloc[:train_size]
        val_df = train_data.iloc[train_size:]

        # Create datasets
        train_dataset = WalmartTimeSeriesDataset(train_df, self.seq_len, self.pred_len)
        val_dataset = WalmartTimeSeriesDataset(val_df, self.seq_len, self.pred_len)

        return train_dataset, val_dataset

    def build_model(self, input_dim):
        """Build the DLinear model with a single-target output head.

        DLinear emits one forecast per input channel
        ([Batch, pred_len, input_dim]) while the dataset's target is a single
        series that is not among the input channels. A linear layer over the
        channel axis maps the per-channel forecasts to one value per step
        ([Batch, pred_len, 1]), which the trainer squeezes to match the
        [Batch, pred_len] target.

        Args:
            input_dim: Number of feature channels per time step.

        Returns:
            The assembled ``nn.Module`` (also stored in ``self.model``).
        """
        print("Building DLinear model...")

        backbone = DLinear(
            seq_len=self.seq_len,
            pred_len=self.pred_len,
            enc_in=input_dim,
            individual=False,
            kernel_size=25
        )
        self.model = nn.Sequential(backbone, nn.Linear(input_dim, 1))

        return self.model

    def train_model(self, train_dataset, val_dataset, batch_size=32, epochs=100, lr=0.001):
        """Train the model.

        Raises:
            ValueError: If either dataset contains no samples.

        Returns:
            The trained model.
        """
        print("Training DLinear model...")

        if len(train_dataset) == 0 or len(val_dataset) == 0:
            raise ValueError(
                "Training and validation datasets must both contain at least one "
                f"sample (train={len(train_dataset)}, val={len(val_dataset)})"
            )

        # Create data loaders
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

        # Read the input dimension from the first sample instead of pulling a
        # whole (shuffled) batch through the loader.
        input_dim = train_dataset[0][0].shape[-1]

        # Build model
        self.model = self.build_model(input_dim)

        # Initialize trainer
        self.trainer = DLinearTrainer(self.model)

        # Train model
        self.model = self.trainer.fit(train_loader, val_loader, epochs, lr)

        return self.model

    def predict(self, test_data):
        """Make predictions for every window that can be built from ``test_data``.

        ``test_data`` is windowed with ``WalmartTimeSeriesDataset``, so it must
        contain the target column and the same feature columns the model was
        trained on, and each series needs at least ``seq_len + pred_len``
        rows; series that are shorter yield no predictions.

        Raises:
            RuntimeError: If called before ``train_model``.

        Returns:
            List of dicts with keys ``'Store'``, ``'Dept'`` and
            ``'Prediction'`` holding plain Python values (``'Prediction'`` is
            a float when a single value is produced, otherwise a list).
        """
        print("Making predictions...")

        if self.model is None or self.trainer is None:
            raise RuntimeError("Model has not been trained; call train_model() first.")

        self.model.eval()
        predictions = []

        # Create test dataset
        test_dataset = WalmartTimeSeriesDataset(test_data, self.seq_len, self.pred_len)
        test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

        with torch.no_grad():
            for x, _, store, dept in test_loader:
                x = x.to(self.trainer.device)
                output = self.model(x)
                # The loader collates ids into tensors; unwrap everything so the
                # records can go straight into a DataFrame / CSV.
                predictions.append({
                    'Store': self._to_python(store),
                    'Dept': self._to_python(dept),
                    'Prediction': output.squeeze().cpu().tolist()
                })

        return predictions

    def run_experiment(self, train_path, test_path, stores_path, features_path,
                      experiment_name="DLinear_Training"):
        """Run complete experiment with MLflow tracking.

        Returns:
            Tuple ``(model, predictions)``.
        """

        # Start MLflow run
        mlflow.set_experiment(experiment_name)

        with mlflow.start_run(run_name="DLinear_Full_Pipeline"):
            # Log parameters
            mlflow.log_param("seq_len", self.seq_len)
            mlflow.log_param("pred_len", self.pred_len)
            mlflow.log_param("model_type", "DLinear")

            # Step 1: Load and prepare data
            train_data, test_data = self.load_and_prepare_data(
                train_path, test_path, stores_path, features_path
            )

            # Step 2: Create datasets
            train_dataset, val_dataset = self.create_datasets(train_data)

            # Step 3: Train model
            model = self.train_model(train_dataset, val_dataset)

            # Step 4: Log model
            mlflow.pytorch.log_model(model, "dlinear_model")

            # Step 5: Make predictions
            predictions = self.predict(test_data)

            print("Experiment completed successfully!")

            return model, predictions


# 6. USAGE EXAMPLE

In [None]:
def main():
    """
    Entry point: run the DLinear pipeline on the CSV files in the working
    directory and write the predictions to ``dlinear_predictions.csv``.
    """
    # Input files, in the order run_experiment expects them
    csv_paths = ("train.csv", "test.csv", "stores.csv", "features.csv")

    dlinear_pipeline = DLinearPipeline(seq_len=52, pred_len=1)
    _model, predictions = dlinear_pipeline.run_experiment(*csv_paths)

    # Persist the predictions
    pd.DataFrame(predictions).to_csv("dlinear_predictions.csv", index=False)

    print("Pipeline completed successfully!")

if __name__ == "__main__":
    main()

Loading and preparing data...
Creating datasets...
