In [None]:
# Mount Google Drive at /content/drive; the Kaggle credentials copied in a
# later cell are read from a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install required packages
!pip install wandb torch torchvision pandas numpy matplotlib seaborn scikit-learn mlflow

# Set up Kaggle API
!pip install kaggle

In [None]:
# Copy the Kaggle API token from the mounted Google Drive folder into
# ~/.kaggle (kaggle.json must already exist at the Drive path below).
!mkdir -p ~/.kaggle
!cp /content/drive/MyDrive/ColabNotebooks/kaggle_API_credentials/kaggle.json ~/.kaggle/kaggle.json
# Restrict the token file to owner read/write only.
! chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the dataset
# Fetches the competition archive into the current working directory and
# unpacks it quietly (-q).
!kaggle competitions download -c walmart-recruiting-store-sales-forecasting
!unzip -q walmart-recruiting-store-sales-forecasting.zip

In [None]:
# Unpack the per-table archives (presumably extracted from the competition
# archive in the previous cell); load_data() reads the resulting CSV files.
!unzip -q train.csv.zip
!unzip -q stores.csv.zip
!unzip -q test.csv.zip
!unzip -q features.csv.zip

In [None]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error
import warnings
warnings.filterwarnings('ignore')

# Deep Learning Libraries
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from pytorch_lightning.loggers import MLFlowLogger

# Time Series Libraries
from pytorch_forecasting import TimeSeriesDataSet, TemporalFusionTransformer
from pytorch_forecasting.data import GroupNormalizer
from pytorch_forecasting.metrics import SMAPE, MAE, RMSE
from pytorch_forecasting.models.temporal_fusion_transformer.tuning import optimize_hyperparameters

# MLflow for experiment tracking
import mlflow
import mlflow.pytorch
import mlflow.sklearn
from mlflow.models.signature import infer_signature
import joblib
import os

In [None]:
# Seed every random number generator in play with one shared value so that
# repeated runs are reproducible.
SEED = 42
for seed_fn in (torch.manual_seed, np.random.seed, pl.seed_everything):
    seed_fn(SEED)

print("All libraries imported successfully!")


In [None]:
# MLflow experiment setup: runs are stored in a local SQLite database and
# grouped under a single experiment whose name later cells reuse.
MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"
experiment_name = "TFT_Training"

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(experiment_name)

print(f"MLflow experiment '{experiment_name}' is ready!")

In [None]:
# Data Loading and Initial Exploration
def load_data(data_dir='.'):
    """Read the four Walmart CSV files and print their shapes.

    Parameters
    ----------
    data_dir : str or os.PathLike, default '.'
        Directory that contains ``train.csv``, ``test.csv``, ``stores.csv``
        and ``features.csv``. The default reads from the current working
        directory, exactly as the function did before it took a parameter.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_df, test_df, stores_df, features_df)``, in that order.

    Raises
    ------
    FileNotFoundError
        If one of the four files does not exist in ``data_dir``.
    """
    # Label -> file name; dict order fixes both the print order and the
    # order of the returned tuple.
    csv_files = {
        'Train': 'train.csv',
        'Test': 'test.csv',
        'Stores': 'stores.csv',
        'Features': 'features.csv',
    }
    frames = {
        label: pd.read_csv(os.path.join(data_dir, file_name))
        for label, file_name in csv_files.items()
    }

    print("Dataset shapes:")
    for label, frame in frames.items():
        print(f"{label}: {frame.shape}")

    return frames['Train'], frames['Test'], frames['Stores'], frames['Features']

# Load Data

In [None]:
# Read all four tables, then show structure and the first rows of the two
# frames that carry the forecasting rows.
train_df, test_df, stores_df, features_df = load_data()

for label, frame in (("Train", train_df), ("Test", test_df)):
    print(f"\n{label} dataset info:")
    print(frame.info())
    print(f"\n{label} dataset head:\n{frame.head()}")

# MLflow Run: Data Cleaning and Preprocessing


In [None]:
with mlflow.start_run(run_name="TFT_Data_Cleaning"):
    print("Starting data cleaning and preprocessing...")

    # Log parameters
    mlflow.log_param("train_shape", train_df.shape)
    mlflow.log_param("test_shape", test_df.shape)

    # Data cleaning function
    def clean_data(df):
        """Parse ``Date`` and fill missing ``Weekly_Sales`` with the median.

        The input frame is copied first, so the caller's object is never
        modified.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame with a ``Date`` column; ``Weekly_Sales`` is optional and is
            only filled when present.

        Returns
        -------
        tuple
            ``(cleaned_df, missing_before, missing_after)`` where the two
            counts are the total number of null cells before and after the
            fill.
        """
        df = df.copy()

        # Convert Date to datetime
        df['Date'] = pd.to_datetime(df['Date'])

        missing_before = int(df.isnull().sum().sum())

        if 'Weekly_Sales' in df.columns:
            # Assign the result back instead of fillna(inplace=True) on the
            # selected column: the in-place form acts on an intermediate
            # object and does not update the frame under copy-on-write.
            df['Weekly_Sales'] = df['Weekly_Sales'].fillna(df['Weekly_Sales'].median())

        missing_after = int(df.isnull().sum().sum())

        print(f"Missing values before cleaning: {missing_before}")
        print(f"Missing values after cleaning: {missing_after}")

        return df, missing_before, missing_after

    # Clean training data
    train_df, missing_before_train, missing_after_train = clean_data(train_df)

    # Clean test data
    test_df, missing_before_test, missing_after_test = clean_data(test_df)

    # Log cleaning metrics
    mlflow.log_metric("missing_before_train", missing_before_train)
    mlflow.log_metric("missing_after_train", missing_after_train)
    mlflow.log_metric("missing_before_test", missing_before_test)
    mlflow.log_metric("missing_after_test", missing_after_test)

    print("Data cleaning completed!")



# Feature Engineering


In [None]:
with mlflow.start_run(run_name="TFT_Feature_Engineering"):
    print("Starting feature engineering...")

    def engineer_features(df):
        """Add calendar features derived from ``Date`` and sort the rows.

        Works on a copy of ``df``. ``Date`` must already be a datetime column.

        Returns
        -------
        pandas.DataFrame
            Copy with ``Year``, ``Month``, ``Week``, ``Day``, ``DayOfWeek`` and
            ``Quarter`` added, ``IsHoliday`` cast to int, sorted by ``Store``
            then ``Date`` with a fresh index.
        """
        df = df.copy()
        dates = df['Date'].dt

        # Time-based features
        df['Year'] = dates.year
        df['Month'] = dates.month
        # Series.dt.week was removed in pandas 2.0; isocalendar().week is its
        # documented replacement (ISO week number).
        df['Week'] = dates.isocalendar().week.astype(int)
        df['Day'] = dates.day
        df['DayOfWeek'] = dates.dayofweek
        df['Quarter'] = dates.quarter

        # Holiday indicator (simple approach)
        df['IsHoliday'] = df['IsHoliday'].astype(int)

        # Sort by Store and Date
        return df.sort_values(['Store', 'Date']).reset_index(drop=True)

    def merge_new_columns(left, right, keys):
        """Left-merge ``right`` onto ``left``, adding only columns ``left`` lacks.

        Dropping the overlapping non-key columns first avoids ``_x``/``_y``
        suffixed duplicates (which would hide columns such as ``IsHoliday``
        from later cells) and keeps the cell safe to re-run.
        """
        already_present = [
            col for col in right.columns
            if col in left.columns and col not in keys
        ]
        return left.merge(right.drop(columns=already_present), on=keys, how='left')

    # Engineer features
    train_df = engineer_features(train_df)
    test_df = engineer_features(test_df)

    # Merge with stores data
    train_df = merge_new_columns(train_df, stores_df, ['Store'])
    test_df = merge_new_columns(test_df, stores_df, ['Store'])

    # Merge with features data. features_df comes straight from read_csv, so
    # its Date must be parsed to match the datetime Date of train/test;
    # merging datetime keys against string keys raises.
    features_dated = features_df.copy()
    features_dated['Date'] = pd.to_datetime(features_dated['Date'])
    train_df = merge_new_columns(train_df, features_dated, ['Store', 'Date'])
    test_df = merge_new_columns(test_df, features_dated, ['Store', 'Date'])

    # Handle categorical variables
    le_type = LabelEncoder()
    train_df['Type_encoded'] = le_type.fit_transform(train_df['Type'])
    test_df['Type_encoded'] = le_type.transform(test_df['Type'])

    # Fill remaining missing values with each frame's own column medians.
    numeric_cols = train_df.select_dtypes(include=[np.number]).columns
    train_df[numeric_cols] = train_df[numeric_cols].fillna(train_df[numeric_cols].median())
    # The training frame has numeric columns (e.g. the target) that the test
    # frame does not; restrict to the shared ones to avoid a KeyError.
    test_numeric_cols = [col for col in numeric_cols if col in test_df.columns]
    test_df[test_numeric_cols] = test_df[test_numeric_cols].fillna(
        test_df[test_numeric_cols].median()
    )

    # Log feature engineering metrics
    mlflow.log_param("features_after_engineering", len(train_df.columns))
    mlflow.log_param("time_features_added", 6)

    print(f"Feature engineering completed! New shape: {train_df.shape}")

In [None]:
# Prepare data for TFT
with mlflow.start_run(run_name="TFT_Data_Preparation"):
    print("Preparing data for TFT...")

    # Create the time index in whole weeks since the first training date.
    # The horizon and encoder lengths below are expressed in weeks, so the
    # index has to advance by 1 per week; a day-based index would turn
    # "12 weeks ahead" into 12 days and leave 6 empty steps between rows.
    DAYS_PER_TIME_STEP = 7
    first_train_date = train_df['Date'].min()
    train_df['time_idx'] = (train_df['Date'] - first_train_date).dt.days // DAYS_PER_TIME_STEP
    test_df['time_idx'] = (test_df['Date'] - first_train_date).dt.days // DAYS_PER_TIME_STEP

    # Define the features for TFT
    static_categoricals = ['Store', 'Type_encoded']
    static_reals = ['Size']
    time_varying_known_categoricals = ['IsHoliday', 'Month', 'Quarter', 'DayOfWeek']
    time_varying_known_reals = ['time_idx']
    time_varying_unknown_reals = ['Weekly_Sales']

    # pytorch-forecasting rejects numeric categorical columns ("use a string
    # type / categorified string"), so cast every categorical to str.
    for frame in (train_df, test_df):
        for col in static_categoricals + time_varying_known_categoricals:
            if col in frame.columns:
                frame[col] = frame[col].astype(str)

    # Add external features if available
    external_features = ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment']
    available_external = [col for col in external_features if col in train_df.columns]
    time_varying_known_reals.extend(available_external)

    # Create target variable
    target = 'Weekly_Sales'

    # Split data for validation
    max_prediction_length = 12  # 12 weeks ahead
    max_encoder_length = 52     # Use 52 weeks of history

    # Everything after the cutoff (the last max_prediction_length weeks) is
    # held out for validation.
    cutoff = train_df['time_idx'].max() - max_prediction_length

    # Create training and validation sets
    training_data = train_df[train_df['time_idx'] <= cutoff]
    validation_data = train_df[train_df['time_idx'] > cutoff]

    print(f"Training data shape: {training_data.shape}")
    print(f"Validation data shape: {validation_data.shape}")

    # Log data preparation parameters
    mlflow.log_param("max_prediction_length", max_prediction_length)
    mlflow.log_param("max_encoder_length", max_encoder_length)
    mlflow.log_param("training_samples", len(training_data))
    mlflow.log_param("validation_samples", len(validation_data))

    print("Data preparation completed!")

# Create TFT Dataset


In [None]:
with mlflow.start_run(run_name="TFT_Dataset_Creation"):
    print("Creating TFT dataset...")

    # One series per group: time_idx must be unique inside a group. When the
    # frame carries a Dept column there is one row per Store/Dept/date, so
    # Store alone would yield duplicate time steps within a group.
    series_ids = ['Store', 'Dept'] if 'Dept' in train_df.columns else ['Store']

    # The softplus transformation applies an inverse softplus to the target,
    # which is undefined for values <= 0; only use it on a strictly positive
    # target and otherwise normalise without a transformation.
    target_transformation = 'softplus' if (train_df[target] > 0).all() else None

    # Create the dataset
    training_dataset = TimeSeriesDataSet(
        train_df[train_df['time_idx'] <= cutoff],
        time_idx='time_idx',
        target=target,
        group_ids=series_ids,
        min_encoder_length=max_encoder_length // 2,
        max_encoder_length=max_encoder_length,
        min_prediction_length=1,
        max_prediction_length=max_prediction_length,
        static_categoricals=static_categoricals,
        static_reals=static_reals,
        time_varying_known_categoricals=time_varying_known_categoricals,
        time_varying_known_reals=time_varying_known_reals,
        time_varying_unknown_reals=time_varying_unknown_reals,
        target_normalizer=GroupNormalizer(
            groups=series_ids, transformation=target_transformation
        ),
        add_relative_time_idx=True,
        add_target_scales=True,
        add_encoder_length=True,
        allow_missing_timesteps=True,
    )

    # Validation dataset: same configuration, built from the full frame with
    # predict=True so each series contributes its final prediction window.
    validation_dataset = TimeSeriesDataSet.from_dataset(
        training_dataset,
        train_df,
        predict=True,
        stop_randomization=True
    )

    # Create dataloaders
    batch_size = 128
    train_dataloader = training_dataset.to_dataloader(
        train=True,
        batch_size=batch_size,
        num_workers=0
    )
    val_dataloader = validation_dataset.to_dataloader(
        train=False,
        batch_size=batch_size,
        num_workers=0
    )

    print(f"Training dataset size: {len(training_dataset)}")
    print(f"Validation dataset size: {len(validation_dataset)}")

    # Log dataset parameters
    mlflow.log_param("batch_size", batch_size)
    mlflow.log_param("train_dataset_size", len(training_dataset))
    mlflow.log_param("val_dataset_size", len(validation_dataset))

    print("Dataset creation completed!")



# Model Training


In [None]:

with mlflow.start_run(run_name="TFT_Model_Training"):
    print("Starting TFT model training...")

    # Enable MLflow auto-logging for PyTorch Lightning
    mlflow.pytorch.autolog()

    # Create MLflow logger
    mlflow_logger = MLFlowLogger(
        experiment_name=experiment_name,
        tracking_uri=mlflow.get_tracking_uri()
    )

    # Model configuration
    model_config = {
        "hidden_size": 64,
        "lstm_layers": 2,
        "dropout": 0.1,
        "attention_head_size": 4,
        "learning_rate": 0.001,
        "reduce_on_plateau_patience": 3,
        "optimizer": "Adam"
    }

    # Create the model.
    # SMAPE is a point-forecast loss, so the network must emit exactly one
    # value per time step; output_size=7 only fits a 7-quantile QuantileLoss
    # and makes the point metric fail its single-output assertion.
    tft = TemporalFusionTransformer.from_dataset(
        training_dataset,
        hidden_size=model_config["hidden_size"],
        lstm_layers=model_config["lstm_layers"],
        dropout=model_config["dropout"],
        attention_head_size=model_config["attention_head_size"],
        output_size=1,
        loss=SMAPE(),
        learning_rate=model_config["learning_rate"],
        reduce_on_plateau_patience=model_config["reduce_on_plateau_patience"],
        optimizer=model_config["optimizer"],
    )

    # Log model configuration
    for key, value in model_config.items():
        mlflow.log_param(key, value)

    # Setup callbacks: stop after 10 epochs without val_loss improvement and
    # keep only the single best checkpoint.
    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=10,
        verbose=True,
        mode='min'
    )

    checkpoint_callback = ModelCheckpoint(
        monitor='val_loss',
        mode='min',
        save_top_k=1,
        filename='best_tft_model'
    )

    # Create trainer
    trainer = pl.Trainer(
        max_epochs=50,
        accelerator='cpu',  # Change to 'gpu' if available
        callbacks=[early_stopping, checkpoint_callback],
        logger=mlflow_logger,
        enable_progress_bar=True,
        deterministic=True
    )

    # Train the model
    trainer.fit(
        tft,
        train_dataloaders=train_dataloader,
        val_dataloaders=val_dataloader
    )

    # Load best model
    best_model = TemporalFusionTransformer.load_from_checkpoint(
        checkpoint_callback.best_model_path
    )

    print("Model training completed!")


# Model Evaluation

In [None]:
with mlflow.start_run(run_name="TFT_Model_Evaluation"):
    print("Starting model evaluation...")

    # Make predictions on validation set
    predictions = best_model.predict(val_dataloader, return_y=True)

    # Calculate metrics (the metric classes accept predictions.y as returned)
    mae = MAE()(predictions.output, predictions.y).item()
    smape = SMAPE()(predictions.output, predictions.y).item()
    rmse = RMSE()(predictions.output, predictions.y).item()

    # Log evaluation metrics
    mlflow.log_metric("val_mae", mae)
    mlflow.log_metric("val_smape", smape)
    mlflow.log_metric("val_rmse", rmse)

    print(f"Validation MAE: {mae:.4f}")
    print(f"Validation SMAPE: {smape:.4f}")
    print(f"Validation RMSE: {rmse:.4f}")

    # With return_y=True the targets come back as a (target, weight) pair,
    # which has no .cpu() and would be indexed by position rather than by
    # sample; unwrap the target tensor before converting to NumPy. A bare
    # tensor is still accepted.
    if isinstance(predictions.y, (tuple, list)):
        target_tensor = predictions.y[0]
    else:
        target_tensor = predictions.y
    actual_values = target_tensor.detach().cpu().numpy()
    predicted_values = predictions.output.detach().cpu().numpy()

    # Create prediction plots
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # Plot 1: Actual vs Predicted
    actual = actual_values.flatten()
    predicted = predicted_values.flatten()

    axes[0, 0].scatter(actual, predicted, alpha=0.5)
    axes[0, 0].plot([actual.min(), actual.max()], [actual.min(), actual.max()], 'r--', lw=2)
    axes[0, 0].set_xlabel('Actual')
    axes[0, 0].set_ylabel('Predicted')
    axes[0, 0].set_title('Actual vs Predicted')

    # Plot 2: Residuals
    residuals = actual - predicted
    axes[0, 1].scatter(predicted, residuals, alpha=0.5)
    axes[0, 1].axhline(y=0, color='r', linestyle='--')
    axes[0, 1].set_xlabel('Predicted')
    axes[0, 1].set_ylabel('Residuals')
    axes[0, 1].set_title('Residual Plot')

    # Plot 3: Residuals histogram
    axes[1, 0].hist(residuals, bins=50, alpha=0.7)
    axes[1, 0].set_xlabel('Residuals')
    axes[1, 0].set_ylabel('Frequency')
    axes[1, 0].set_title('Residuals Distribution')

    # Plot 4: Time series example (first validation sample)
    example_idx = 0
    example_prediction = predicted_values[example_idx].reshape(-1)
    example_actual = actual_values[example_idx].reshape(-1)

    axes[1, 1].plot(range(len(example_actual)), example_actual, 'b-', label='Actual', linewidth=2)
    axes[1, 1].plot(range(len(example_prediction)), example_prediction, 'r--', label='Predicted', linewidth=2)
    axes[1, 1].set_xlabel('Time Steps')
    axes[1, 1].set_ylabel('Weekly Sales')
    axes[1, 1].set_title('Example Prediction')
    axes[1, 1].legend()

    plt.tight_layout()
    plt.savefig('tft_evaluation_plots.png', dpi=300, bbox_inches='tight')
    mlflow.log_artifact('tft_evaluation_plots.png')
    plt.show()

    print("Model evaluation completed!")



In [None]:
# Hyperparameter Tuning (Optional)
with mlflow.start_run(run_name="TFT_Hyperparameter_Tuning"):
    print("Starting hyperparameter tuning...")

    # Define hyperparameter ranges.
    # optimize_hyperparameters has no lstm_layers_range argument: unknown
    # keyword arguments are forwarded to the model constructor, which rejects
    # them, so the LSTM depth is not part of this search.
    study = optimize_hyperparameters(
        train_dataloader,
        val_dataloader,
        model_path="optuna_test",
        n_trials=10,  # Reduce for faster execution
        max_epochs=20,
        gradient_clip_val_range=(0.01, 1.0),
        hidden_size_range=(32, 128),
        dropout_range=(0.1, 0.3),
        attention_head_size_range=(1, 8),
        learning_rate_range=(0.001, 0.1),
        use_learning_rate_finder=False,
    )

    # Log best parameters
    best_params = study.best_params
    for key, value in best_params.items():
        mlflow.log_param(f"best_{key}", value)

    mlflow.log_metric("best_trial_value", study.best_value)

    print(f"Best trial value: {study.best_value}")
    print(f"Best parameters: {best_params}")

# Final Model Training with Best Parameters
with mlflow.start_run(run_name="TFT_Final_Model_Training"):
    print("Training final model with best parameters...")

    # Use the tuned values when a tuning study exists in the notebook
    # namespace; otherwise fall back to the defaults used for the first model.
    tuning_study = globals().get("study")
    tuned_params = dict(tuning_study.best_params) if tuning_study is not None else {}

    final_model_kwargs = {
        "hidden_size": tuned_params.get("hidden_size", 64),
        "lstm_layers": 2,
        "dropout": tuned_params.get("dropout", 0.1),
        "attention_head_size": tuned_params.get("attention_head_size", 4),
        "learning_rate": tuned_params.get("learning_rate", 0.001),
    }
    # Only override the library default when the study actually tuned it.
    if "hidden_continuous_size" in tuned_params:
        final_model_kwargs["hidden_continuous_size"] = tuned_params["hidden_continuous_size"]

    # Create final model. SMAPE is a point loss, so output_size must be 1.
    final_tft = TemporalFusionTransformer.from_dataset(
        training_dataset,
        output_size=1,
        loss=SMAPE(),
        reduce_on_plateau_patience=3,
        optimizer="Adam",
        **final_model_kwargs,
    )

    # Fresh callbacks for this fit. The instances from the first training run
    # keep their state (best score, patience counter, best checkpoint path),
    # so reusing them could stop this run early or make best_model_path point
    # at the earlier model's checkpoint.
    final_early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=10,
        verbose=True,
        mode='min'
    )
    final_checkpoint_callback = ModelCheckpoint(
        monitor='val_loss',
        mode='min',
        save_top_k=1,
        filename='best_final_tft_model'
    )

    # Create final trainer
    final_trainer = pl.Trainer(
        max_epochs=100,
        accelerator='cpu',
        callbacks=[final_early_stopping, final_checkpoint_callback],
        logger=mlflow_logger,
        enable_progress_bar=True,
        deterministic=True,
        # None (no clipping) unless the study tuned a clipping value.
        gradient_clip_val=tuned_params.get("gradient_clip_val"),
    )

    # Train final model
    final_trainer.fit(
        final_tft,
        train_dataloaders=train_dataloader,
        val_dataloaders=val_dataloader
    )

    # Load best final model
    final_best_model = TemporalFusionTransformer.load_from_checkpoint(
        final_checkpoint_callback.best_model_path
    )

    print("Final model training completed!")

# Create Pipeline and Save Model


In [None]:
with mlflow.start_run(run_name="TFT_Pipeline_Creation"):
    print("Creating TFT pipeline...")

    # Create a pipeline class for TFT
    class TFTPipeline:
        """Bundle a trained TFT model with the preprocessing it needs.

        Parameters
        ----------
        model
            Trained model exposing ``predict(dataloader)``.
        dataset_config
            Training ``TimeSeriesDataSet``; used as the template for
            prediction datasets.
        preprocessing_params : dict
            Must contain ``'min_date'`` (origin of the time index). May contain
            ``'days_per_time_step'`` (days per ``time_idx`` step, default 1).
        label_encoders : dict, optional
            Fitted encoders keyed by source column name, e.g.
            ``{'Type': fitted_label_encoder}``. Stored on the instance so a
            pipeline loaded from disk does not depend on notebook globals.
        """

        def __init__(self, model, dataset_config, preprocessing_params, label_encoders=None):
            self.model = model
            self.dataset_config = dataset_config
            self.preprocessing_params = preprocessing_params
            self.label_encoders = dict(label_encoders) if label_encoders else {}

        def preprocess(self, data):
            """Return a copy of ``data`` with the training-time features added.

            Raises
            ------
            ValueError
                If ``data`` has a ``Type`` column but no ``'Type'`` encoder was
                supplied to the pipeline.
            """
            data = data.copy()

            # Convert Date to datetime
            data['Date'] = pd.to_datetime(data['Date'])
            dates = data['Date'].dt

            # Engineer features
            data['Year'] = dates.year
            data['Month'] = dates.month
            # Series.dt.week was removed in pandas 2.0; isocalendar().week is
            # its documented replacement (ISO week number).
            data['Week'] = dates.isocalendar().week.astype(int)
            data['Day'] = dates.day
            data['DayOfWeek'] = dates.dayofweek
            data['Quarter'] = dates.quarter
            data['IsHoliday'] = data['IsHoliday'].astype(int)

            # Create time index at the same resolution as the training index.
            days_per_step = self.preprocessing_params.get('days_per_time_step', 1)
            elapsed_days = (data['Date'] - self.preprocessing_params['min_date']).dt.days
            data['time_idx'] = elapsed_days // days_per_step

            # Handle categorical encoding with the encoder stored on the
            # pipeline, not a notebook-level variable.
            if 'Type' in data.columns:
                type_encoder = self.label_encoders.get('Type')
                if type_encoder is None:
                    raise ValueError(
                        "TFTPipeline needs label_encoders={'Type': <fitted encoder>} "
                        "to encode the 'Type' column"
                    )
                data['Type_encoded'] = type_encoder.transform(data['Type'])

            # Fill missing values
            numeric_cols = data.select_dtypes(include=[np.number]).columns
            data[numeric_cols] = data[numeric_cols].fillna(data[numeric_cols].median())

            # pytorch-forecasting rejects numeric categorical columns, so any
            # column the training dataset treats as categorical is cast to str.
            # NOTE(review): assumes training used plain astype(str) on the same
            # integer-valued columns - confirm against the data-prep cell.
            for col in getattr(self.dataset_config, 'categoricals', []):
                if col in data.columns and pd.api.types.is_numeric_dtype(data[col]):
                    data[col] = data[col].astype(str)

            return data

        def predict(self, data):
            """Preprocess ``data`` and return the model's predictions for it."""
            processed_data = self.preprocess(data)

            # Create dataset for prediction from the training template
            prediction_dataset = TimeSeriesDataSet.from_dataset(
                self.dataset_config,
                processed_data,
                predict=True,
                stop_randomization=True
            )

            # Create dataloader
            prediction_dataloader = prediction_dataset.to_dataloader(
                train=False,
                batch_size=128,
                num_workers=0
            )

            # Make predictions
            return self.model.predict(prediction_dataloader)

    # Derive how many days one time_idx step spans from the training frame
    # itself, so the pipeline matches whatever resolution the index was built
    # with (1 for a day-based index, 7 for a week-based one).
    elapsed_train_days = (train_df['Date'].max() - train_df['Date'].min()).days
    max_train_time_idx = int(train_df['time_idx'].max())
    if max_train_time_idx > 0:
        days_per_time_step = max(1, round(elapsed_train_days / max_train_time_idx))
    else:
        days_per_time_step = 1

    # Create pipeline
    preprocessing_params = {
        'min_date': train_df['Date'].min(),
        'max_date': train_df['Date'].max(),
        'features': list(train_df.columns),
        'days_per_time_step': days_per_time_step,
    }

    tft_pipeline = TFTPipeline(
        model=final_best_model,
        dataset_config=training_dataset,
        preprocessing_params=preprocessing_params,
        label_encoders={'Type': le_type},
    )

    # Save pipeline
    pipeline_path = "tft_pipeline.pkl"
    joblib.dump(tft_pipeline, pipeline_path)

    # Log pipeline
    mlflow.log_artifact(pipeline_path)

    # Save additional components
    joblib.dump(le_type, "label_encoder_type.pkl")
    mlflow.log_artifact("label_encoder_type.pkl")

    print("Pipeline creation completed!")

# Model Registration


In [None]:
# Log the pipeline object to MLflow and register it in the model registry
# under a fixed name.
with mlflow.start_run(run_name="TFT_Model_Registration"):
    print("Registering model...")
    
    # Create model signature
    # The input example is the first 100 training rows. The output example is
    # random data of shape (100, max_prediction_length), used only as a
    # stand-in for infer_signature - it is not a real model prediction.
    sample_input = train_df.head(100)
    sample_output = np.random.randn(100, max_prediction_length)
    signature = infer_signature(sample_input, sample_output)
    
    # Register model
    model_name = "TFT_Walmart_Sales_Forecast"
    
    # NOTE(review): tft_pipeline is a custom class, not a scikit-learn
    # estimator, yet it is logged through the sklearn flavor - confirm that
    # loading and serving behave as expected, or switch to a pyfunc model.
    mlflow.sklearn.log_model(
        sk_model=tft_pipeline,
        artifact_path="tft_model",
        signature=signature,
        registered_model_name=model_name
    )
    
    print(f"Model registered as '{model_name}'")

In [None]:
# Closing summary for the reader.
for closing_message in (
    "TFT experiment completed successfully!",
    "All artifacts and models have been logged to MLflow",
    "Check your MLflow UI to view the experiments and model registry",
):
    print(closing_message)