## Sequence Sampling Forecasting on the Multichannel Dataset

This notebook implements the sequence sampling forecasting pipeline introduced in the accompanying thesis. It is designed to operate on individual customer histories by segmenting their transaction records into overlapping input-output windows, thereby increasing temporal density and mitigating sparsity in the training data.

The workflow proceeds as follows:
1. **Environment Setup**: Load essential libraries and define utility functions.
2. **Data Preparation**: Construct input-output pairs via sliding window sampling for each customer within the defined cohort and time ranges.
3. **Model Definition**: Specify the Transformer-based forecasting model, incorporating customer-level embeddings and temporal features.
4. **Training & Evaluation**: Train the model on the sampled sequences, validate performance using held-out segments, and visualize training dynamics.
5. **Execution Flow**: A main execution block coordinates data preparation, model initialization, training, and evaluation.

All key configurations—including sampling stride, window size, forecasting horizon, and model hyperparameters—are adjustable.


## 1. Setup and Imports

In [None]:
# Standard libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import gc
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings('ignore')

from torch.cuda.amp import autocast, GradScaler

# PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.distributions as dist
from torch.optim.lr_scheduler import OneCycleLR

# For metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error

# For metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error
from matplotlib.colors import LinearSegmentedColormap


# Pick the compute device once: CUDA when PyTorch can see a GPU, CPU otherwise
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


## 2. Data Loading and Preprocessing

In [None]:
class SlidingWindowDataset(Dataset):
    """Map-style dataset over sliding-window samples.

    Sample ``i`` is the triple ``(X[i], y[i], customer_indices[i])`` and the
    dataset length is ``len(X)``. The three containers are stored exactly as
    passed in (no copy, no length validation).
    """

    def __init__(self, X, y, customer_indices):
        self.X, self.y, self.customer_indices = X, y, customer_indices

    def __len__(self):
        # The inputs container defines how many samples exist
        return len(self.X)

    def __getitem__(self, idx):
        # Look the three parts up in the same order they are returned
        inputs = self.X[idx]
        target = self.y[idx]
        customer = self.customer_indices[idx]
        return inputs, target, customer

def create_dataloaders(split_data, batch_size=32):
    """
    Wrap the train/val/test splits in DataLoaders.

    Parameters:
    -----------
    split_data : dict
        Must provide the keys 'train', 'val' and 'test'; each maps to a dict
        with 'X', 'y' and 'customer_indices'
    batch_size : int
        Batch size shared by all three DataLoaders

    Returns:
    --------
    dict
        {'train': ..., 'val': ..., 'test': ...} DataLoaders; only the
        training loader shuffles
    """
    loaders = {}
    for split_name in ('train', 'val', 'test'):
        part = split_data[split_name]
        dataset = SlidingWindowDataset(part['X'], part['y'], part['customer_indices'])
        # Shuffle only the training split so val/test iteration order is stable
        loaders[split_name] = DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=(split_name == 'train'),
        )
    return loaders

In [None]:
def train_val_test_split(data, train_ratio=0.7, val_ratio=0.15, random_seed=42):
    """
    Randomly partition sequences into training, validation and test subsets.

    Parameters:
    -----------
    data : dict
        Holds 'X', 'y' and 'customer_indices'; each value must support
        indexing with an integer index array
    train_ratio : float
        Share of sequences assigned to the training subset
    val_ratio : float
        Share of sequences assigned to the validation subset
    random_seed : int
        Seed passed to ``np.random.seed`` (this reseeds NumPy's global RNG)

    Returns:
    --------
    dict
        'train', 'val' and 'test' sub-dictionaries (same three keys as
        ``data``) plus a 'metadata' entry recording the ratios and the seed;
        whatever remains after train and val goes to test
    """
    # Reseed the global generator so the shuffle below is reproducible
    np.random.seed(random_seed)

    n_sequences = len(data['X'])
    shuffled = np.random.permutation(n_sequences)

    # Boundaries of the three consecutive slices of the shuffled order
    first_cut = int(n_sequences * train_ratio)
    second_cut = int(n_sequences * (train_ratio + val_ratio))

    train_indices = shuffled[:first_cut]
    val_indices = shuffled[first_cut:second_cut]
    test_indices = shuffled[second_cut:]

    def _subset(selected):
        # Apply the same index array to every field so samples stay aligned
        return {field: data[field][selected] for field in ('X', 'y', 'customer_indices')}

    result = {
        'train': _subset(train_indices),
        'val': _subset(val_indices),
        'test': _subset(test_indices),
    }

    print(f"Data split: {len(train_indices)} train, {len(val_indices)} validation, {len(test_indices)} test sequences")

    result['metadata'] = {
        'train_ratio': train_ratio,
        'val_ratio': val_ratio,
        'test_ratio': 1 - train_ratio - val_ratio,
        'random_seed': random_seed
    }
    return result


In [None]:
def _cut_windows(series, window_size, step_size):
    """Return every full `window_size`-long slice of `series`, advancing `step_size` per slice."""
    return [series[start:start + window_size]
            for start in range(0, len(series) - window_size + 1, step_size)]


def _windows_to_array(windows, width):
    """Stack equally long windows into a 2-D float32 array; shape is (0, width) when there are none."""
    if not windows:
        return np.empty((0, width), dtype=np.float32)
    return np.asarray(windows, dtype=np.float32)


def fixed_timeframe_sliding_window(df, training_input_period, training_target_period,
                                  prediction_input_period, prediction_target_period,
                                  window_size=16, step_size=4,
                                  customer_field='CUSTNO', date_field='Date',
                                  cohort_field=None, cohort_range=None):
    """
    Prepare sequence data using sliding window approach with fixed time frames.

    For every customer the transactions are counted per week. Each
    `window_size`-week slice of the input period (advancing `step_size`
    weeks at a time) becomes one input sequence, and every such sequence is
    paired with the weekly counts of the *entire* target period.

    The input DataFrame is never modified. Rows whose customer ID is missing
    are ignored.

    Parameters:
    -----------
    df : DataFrame
        Transaction data, one row per transaction
    training_input_period : tuple
        (start_date, end_date) for training input period
    training_target_period : tuple
        (start_date, end_date) for training target period
    prediction_input_period : tuple
        (start_date, end_date) for prediction input period
    prediction_target_period : tuple
        (start_date, end_date) for prediction target period
    window_size : int
        Size of input sequence window (weeks), must be >= 1
    step_size : int
        Step size for sliding window (controls overlap), must be >= 1
    customer_field : str
        Column name for customer ID
    date_field : str
        Column name for transaction date
    cohort_field : str or None
        Column name for cohort information
    cohort_range : tuple or None
        Range of cohorts to include (min_cohort, max_cohort), both inclusive.
        Only applied when `cohort_field` is given as well.

    Returns:
    --------
    dict
        'X_train' / 'X_pred' : float32 tensors (n_sequences, window_size, 1)
        'y_train' / 'y_pred' : float32 tensors (n_sequences, n_target_weeks)
        'train_customer_indices' / 'pred_customer_indices' : long tensors
            with the customer index of each sequence
        'customer_mapping' : dict customer ID -> customer index
        'num_customers' : number of entries in 'customer_mapping'
        'customer_cohorts' : list with one cohort value per *training*
            sequence (0 when `cohort_field` is None)
        'window_size' and the four weekly date ranges used

    Raises:
    -------
    ValueError
        If `window_size` or `step_size` is smaller than 1
    """
    if window_size < 1 or step_size < 1:
        raise ValueError("window_size and step_size must both be >= 1")

    # Convert date periods to datetime
    training_input_start, training_input_end = pd.to_datetime(training_input_period)
    training_target_start, training_target_end = pd.to_datetime(training_target_period)
    prediction_input_start, prediction_input_end = pd.to_datetime(prediction_input_period)
    prediction_target_start, prediction_target_end = pd.to_datetime(prediction_target_period)

    print(f"Training input: {training_input_start} to {training_input_end}")
    print(f"Training target: {training_target_start} to {training_target_end}")
    print(f"Prediction input: {prediction_input_start} to {prediction_input_end}")
    print(f"Prediction target: {prediction_target_start} to {prediction_target_end}")

    # Weekly grids; the same 'W' frequency is used for the per-customer
    # aggregation below so the bin labels line up with these grids
    training_input_dates = pd.date_range(training_input_start, training_input_end, freq='W')
    training_target_dates = pd.date_range(training_target_start, training_target_end, freq='W')
    prediction_input_dates = pd.date_range(prediction_input_start, prediction_input_end, freq='W')
    prediction_target_dates = pd.date_range(prediction_target_start, prediction_target_end, freq='W')

    # Apply cohort filtering if specified
    if cohort_range is not None and cohort_field is not None:
        min_cohort, max_cohort = cohort_range

        # Compare against a local view of the cohort column so that any type
        # coercion below never writes back into the caller's DataFrame
        cohort_values = df[cohort_field]
        cohort_dtype = cohort_values.dtype

        if cohort_dtype == 'object' or cohort_dtype == 'string':
            # String-like cohorts: compare as strings
            min_cohort_val = str(min_cohort)
            max_cohort_val = str(max_cohort)
        elif 'datetime' in str(cohort_dtype):
            # Datetime cohorts: compare as timestamps
            min_cohort_val = pd.to_datetime(min_cohort)
            max_cohort_val = pd.to_datetime(max_cohort)
        elif isinstance(min_cohort, str):
            # Bounds given as strings for a non-string column: compare numerically
            cohort_values = pd.to_numeric(cohort_values, errors='coerce')
            min_cohort_val = float(min_cohort)
            max_cohort_val = float(max_cohort)
        else:
            # Otherwise use the values as-is
            min_cohort_val = min_cohort
            max_cohort_val = max_cohort

        df_filtered = df[(cohort_values >= min_cohort_val) & (cohort_values <= max_cohort_val)]
        print(f"Filtered to cohorts {min_cohort}-{max_cohort}: {len(df_filtered[customer_field].unique())} customers")
    else:
        df_filtered = df
        print(f"Using all cohorts: {len(df_filtered[customer_field].unique())} customers")

    # Customer index = order of first appearance in the filtered data
    customer_ids = df_filtered[customer_field].dropna().unique()
    customer_to_idx = {cid: idx for idx, cid in enumerate(customer_ids)}

    # Accumulators
    X_train_windows = []
    y_train_windows = []
    X_pred_windows = []
    y_pred_windows = []
    customer_indices_train = []
    customer_indices_pred = []
    customer_cohorts = []

    train_valid_customers = 0
    pred_valid_customers = 0

    print(f"Processing {len(customer_ids)} customers with sliding window approach...")

    # One pass over the data instead of one full boolean scan per customer.
    # sort=False keeps groups in order of first appearance (same order as
    # customer_ids); observed=True avoids empty groups for categorical IDs.
    per_customer = df_filtered.groupby(customer_field, sort=False, observed=True)

    for customer_id, customer_df in tqdm(per_customer, total=len(customer_ids)):
        customer_idx = customer_to_idx[customer_id]

        # Get customer cohort if available
        if cohort_field is not None:
            customer_cohort = customer_df[cohort_field].iloc[0]
        else:
            customer_cohort = 0

        # Transactions per week for this customer
        weekly_counts = customer_df.groupby(pd.Grouper(key=date_field, freq='W')).size()

        # Align onto each fixed weekly grid; weeks without transactions count as 0
        train_input_data = weekly_counts.reindex(training_input_dates, fill_value=0).to_numpy()
        train_target_data = weekly_counts.reindex(training_target_dates, fill_value=0).to_numpy()
        pred_input_data = weekly_counts.reindex(prediction_input_dates, fill_value=0).to_numpy()
        pred_target_data = weekly_counts.reindex(prediction_target_dates, fill_value=0).to_numpy()

        # Training data - every window is paired with the entire target period
        train_windows = _cut_windows(train_input_data, window_size, step_size)
        if train_windows:
            train_valid_customers += 1
            X_train_windows.extend(train_windows)
            y_train_windows.extend([train_target_data] * len(train_windows))
            customer_indices_train.extend([customer_idx] * len(train_windows))
            customer_cohorts.extend([customer_cohort] * len(train_windows))

        # Prediction data - same scheme on the prediction periods
        pred_windows = _cut_windows(pred_input_data, window_size, step_size)
        if pred_windows:
            pred_valid_customers += 1
            X_pred_windows.extend(pred_windows)
            y_pred_windows.extend([pred_target_data] * len(pred_windows))
            customer_indices_pred.extend([customer_idx] * len(pred_windows))

    total_train_sequences = len(X_train_windows)
    total_pred_sequences = len(X_pred_windows)

    print(f"Created {total_train_sequences} train sequences from {train_valid_customers} customers")
    print(f"Created {total_pred_sequences} prediction sequences from {pred_valid_customers} customers")

    # Convert to tensors; the helper keeps a well-formed 2-D shape even when
    # no sequence could be built (e.g. input period shorter than the window)
    X_train_tensor = torch.tensor(
        _windows_to_array(X_train_windows, window_size), dtype=torch.float32).unsqueeze(-1)
    y_train_tensor = torch.tensor(
        _windows_to_array(y_train_windows, len(training_target_dates)), dtype=torch.float32)
    X_pred_tensor = torch.tensor(
        _windows_to_array(X_pred_windows, window_size), dtype=torch.float32).unsqueeze(-1)
    y_pred_tensor = torch.tensor(
        _windows_to_array(y_pred_windows, len(prediction_target_dates)), dtype=torch.float32)

    train_customer_indices_tensor = torch.tensor(customer_indices_train, dtype=torch.long)
    pred_customer_indices_tensor = torch.tensor(customer_indices_pred, dtype=torch.long)

    return {
        'X_train': X_train_tensor,
        'y_train': y_train_tensor,
        'X_pred': X_pred_tensor,
        'y_pred': y_pred_tensor,
        'train_customer_indices': train_customer_indices_tensor,
        'pred_customer_indices': pred_customer_indices_tensor,
        'customer_mapping': customer_to_idx,
        'num_customers': len(customer_to_idx),
        'customer_cohorts': customer_cohorts,
        'window_size': window_size,
        'training_input_dates': training_input_dates,
        'training_target_dates': training_target_dates,
        'prediction_input_dates': prediction_input_dates,
        'prediction_target_dates': prediction_target_dates
    }

## Architecture

In [None]:
class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding added to a batch-first sequence.

    A table of shape (max_len, d_model) is precomputed: even feature columns
    hold sines and odd feature columns hold cosines of the position scaled by
    geometrically spaced frequencies. Both even and odd `d_model` are
    supported.

    Parameters
    ----------
    d_model : int
        Size of the feature dimension of the tensors passed to `forward`
    max_len : int
        Number of positions precomputed; sequences passed to `forward` must
        not be longer than this
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even columns, so
        # drop the last frequency for the cosine half (no-op when d_model is even)
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        # Buffer: follows the module across devices but is not a trainable parameter
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return `x` plus the encoding of its first `x.size(1)` positions.

        `x` is indexed as (batch, seq_len, d_model): dimension 1 selects how
        many table rows are used and the last dimension must match `d_model`.
        """
        return x + self.pe[:x.size(1), :].to(x.device)


class SlidingWindowTransformer(nn.Module):
    """Transformer encoder that maps an input window to a non-negative forecast vector.

    Each time step is projected to `embed_dim`, enriched with a positional
    encoding, an optional per-customer embedding, a "transaction happened"
    signal and an optional week-of-year signal. The encoded sequence is
    mean-pooled over time and projected to `output_dim` values, which pass
    through Softplus and a learnable global scale.

    Parameters
    ----------
    input_dim : int
        Number of features per time step. The transaction mask is multiplied
        with an `embed_dim`-wide signal, so this presumably has to be 1 (or
        equal to `embed_dim`) for the shapes to broadcast -- TODO confirm
    embed_dim : int
        Width of the token embeddings / transformer model dimension
    hidden_dim : int
        Width of the transformer feed-forward layers and of the output head
    num_layers : int
        Number of transformer encoder layers
    num_heads : int
        Number of attention heads
    output_dim : int
        Number of forecast values produced per sequence
    num_customers : int or None
        Number of distinct customers; an embedding table with
        `num_customers + 1` rows is created. With None no customer embedding
        is used and `customer_ids` is ignored in `forward`.
    dropout : float
        Dropout rate for the encoder layers and the output head
    """

    def __init__(self, input_dim, embed_dim, hidden_dim, num_layers, num_heads,
                output_dim, num_customers=None, dropout=0.1):
        super(SlidingWindowTransformer, self).__init__()

        # Standard components
        self.input_projection = nn.Linear(input_dim, embed_dim)
        self.pos_encoding = PositionalEncoding(embed_dim)
        if num_customers is None:
            # No customer information available: run without the embedding
            self.customer_embedding = None
        else:
            self.customer_embedding = nn.Embedding(num_customers+1, embed_dim)

        # Transaction event detector
        self.transaction_detector = nn.Sequential(
            nn.Linear(input_dim, embed_dim),
            nn.LayerNorm(embed_dim),
            nn.GELU()
        )

        # Transformer encoder
        # NOTE(review): nn.TransformerEncoder appears to clone the layer it is
        # given, which would leave this attribute's own weights unused in
        # forward -- confirm. It stays registered so state_dict keys of
        # existing checkpoints keep matching.
        self.encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=num_heads,
            dim_feedforward=hidden_dim,
            dropout=dropout,
            batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layers)

        # Output projection
        self.output_projection = nn.Sequential(
            nn.Linear(embed_dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, output_dim),
            nn.Softplus()  # Ensure non-negative predictions
        )

        # Scale factor for prediction (learnable, stored in log space so the
        # effective multiplier exp(log_output_scale) is always positive)
        self.log_output_scale = nn.Parameter(torch.tensor(0.0))

    def forward(self, x, customer_ids, time_indices=None):
        """Compute forecasts for a batch of input windows.

        Parameters
        ----------
        x : Tensor
            (batch, seq_len, input_dim), or (batch, seq_len) which is
            expanded with a trailing feature dimension
        customer_ids : LongTensor
            (batch,) indices into the customer embedding; ignored when the
            model was built with `num_customers=None`
        time_indices : Tensor or None
            Week indices, either (seq_len,) shared by the whole batch or
            (batch, seq_len). A 1-D tensor of a different length is truncated
            or padded by repeating its last value.

        Returns
        -------
        Tensor
            (batch, output_dim) non-negative forecasts
        """
        # Ensure correct dimensions
        if x.dim() == 2:
            x = x.unsqueeze(-1)

        batch_size, seq_len = x.size(0), x.size(1)

        # Transaction mask - highlight where transactions occurred
        transaction_mask = (x > 0).float()

        # Process input
        x_embed = self.input_projection(x)
        x_embed = self.pos_encoding(x_embed)

        # Transaction signal, zeroed on time steps without any transaction
        trans_signal = self.transaction_detector(x) * transaction_mask

        if self.customer_embedding is not None:
            # Same customer vector broadcast to every time step
            cust_embed = self.customer_embedding(customer_ids).unsqueeze(1).expand(-1, seq_len, -1)
            x_embed = x_embed + cust_embed + trans_signal * 0.5
        else:
            x_embed = x_embed + trans_signal * 0.5

        # Add time information if available
        if time_indices is not None:
            # Handle time indices shape
            if time_indices.dim() == 1:
                if time_indices.size(0) != seq_len:
                    time_indices = time_indices[:seq_len] if time_indices.size(0) > seq_len else torch.cat([
                        time_indices,
                        time_indices[-1].repeat(seq_len - time_indices.size(0))
                    ])
                time_indices = time_indices.unsqueeze(0).expand(batch_size, -1)

            # Use week of year as a feature (1-52)
            week_of_year = (time_indices.to(x_embed.device) % 52) + 1
            week_of_year = week_of_year[:batch_size, :seq_len]

            # Constant offset per time step, computed for the whole batch at
            # once (no per-element Python loop / host sync): +0.2 for the
            # holiday season (weeks 50-52), +0.1 for mid-year (weeks 25-27).
            # The two ranges are disjoint, so the sum picks exactly one value.
            holiday_weeks = week_of_year >= 50
            mid_year_weeks = (week_of_year >= 25) & (week_of_year <= 27)
            week_signal = 0.2 * holiday_weeks.float() + 0.1 * mid_year_weeks.float()

            # Broadcast the per-step offset over the embedding dimension
            x_embed = x_embed + week_signal.unsqueeze(-1)

        # Apply transformer
        x_embed = self.transformer_encoder(x_embed)

        # Use the entire context for prediction
        # (changed from using only the last token in original model)
        final_hidden = x_embed.mean(dim=1)  # Average pooling over sequence length

        # Generate and scale output
        output = self.output_projection(final_hidden)
        output = output * torch.exp(self.log_output_scale)

        return output

## Training Functions

In [None]:
def enhanced_sliding_window_train(model, train_loader, val_loader, num_epochs=50, patience=10,
                        learning_rate=0.001, weight_decay=0.001, device=None):
    """
    Train a model with a transaction-weighted loss, a one-cycle learning-rate
    schedule, gradient clipping and early stopping.

    Mixed precision (autocast + gradient scaling) is used only when training
    on a CUDA device. After training, the weights of the epoch with the best
    validation metric (validation loss + 0.1 * transaction-only RMSE) are
    restored.

    Parameters:
    -----------
    model : nn.Module
        Model to train; called as ``model(X_batch, customer_batch)``
    train_loader : DataLoader
        DataLoader for training data, yielding (X, y, customer_indices)
    val_loader : DataLoader
        DataLoader for validation data, same batch layout
    num_epochs : int
        Maximum number of epochs to train
    patience : int
        Number of epochs to wait for improvement before early stopping
    learning_rate : float
        Learning rate handed to the AdamW optimizer
    weight_decay : float
        Weight decay for optimizer
    device : torch.device
        Device to use for training (defaults to CUDA when available)

    Returns:
    --------
    model : nn.Module
        Trained model with the best recorded weights loaded
    dict
        Training history with per-epoch lists 'train_loss', 'val_loss',
        'val_rmse', 'val_mae', 'scale_factors' and 'transaction_rmse'
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = model.to(device)

    # Gradient scaling / CUDA autocast only make sense for CUDA tensors
    use_amp = torch.device(device).type == 'cuda'

    optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    # NOTE(review): OneCycleLR derives the schedule from max_lr, so the
    # optimizer's `learning_rate` likely has no lasting effect -- confirm
    # against the scheduler documentation before relying on that argument.
    scheduler = OneCycleLR(
        optimizer,
        max_lr=0.005,  # Lower max learning rate
        total_steps=num_epochs * len(train_loader),
        pct_start=0.15  # Longer warmup
    )

    # For early stopping
    best_val_loss = float('inf')
    best_model = None
    patience_counter = 0

    # For mixed precision training (a disabled scaler is a transparent no-op)
    scaler = GradScaler(enabled=use_amp)

    # Training history
    history = {
        'train_loss': [],
        'val_loss': [],
        'val_rmse': [],
        'val_mae': [],
        'scale_factors': [],
        'transaction_rmse': []  # Track transaction-only RMSE
    }

    # Enhanced loss function with better balancing
    def enhanced_transaction_loss(y_pred, y_true, alpha=3.0, beta=1.0, zero_penalty_weight=10.0):
        """Weighted MSE + transaction-only MSE + volume penalty + near-zero penalty.

        Predictions are first rescaled so that their batch mean matches the
        batch mean of the targets (ratio clamped to [0.5, 500]).
        """
        # Scale correction - clamped to prevent extreme values
        batch_scale_factor = torch.mean(y_true) / (torch.mean(y_pred) + 1e-8)
        batch_scale_factor = torch.clamp(batch_scale_factor, 0.5, 500.0)
        y_pred_scaled = y_pred * batch_scale_factor

        # Element-wise squared error, weighted below
        base_loss = F.mse_loss(y_pred_scaled, y_true, reduction='none')

        # Find where actual transactions occurred
        nonzero_mask = (y_true > 0)
        zero_mask = ~nonzero_mask

        # Apply higher weights to non-zero targets
        weights = torch.ones_like(y_true, device=y_true.device)
        if nonzero_mask.sum() > 0:
            # Calculate proportion of zeros vs non-zeros for dynamic weighting
            zero_ratio = zero_mask.sum().float() / nonzero_mask.sum().float()
            # Cap the ratio to prevent extreme weights
            zero_ratio = torch.clamp(zero_ratio, 1.0, 100.0)
            weights[nonzero_mask] = alpha * zero_ratio

        # Weighted MSE loss
        weighted_mse = (base_loss * weights).mean()

        # Transaction specific loss
        if nonzero_mask.sum() > 0:
            # Calculate loss for transactions
            transaction_loss = F.mse_loss(y_pred_scaled[nonzero_mask], y_true[nonzero_mask])

            # Additional penalty for predicting near-zero when there are transactions
            zero_pred_mask = (y_pred_scaled[nonzero_mask] < 0.05)
            if zero_pred_mask.sum() > 0:
                zero_penalty = torch.mean((0.2 - y_pred_scaled[nonzero_mask][zero_pred_mask])**2)
                zero_penalty = zero_penalty * zero_penalty_weight
            else:
                zero_penalty = torch.tensor(0.0, device=y_pred.device)
        else:
            transaction_loss = torch.tensor(0.0, device=y_pred.device)
            zero_penalty = torch.tensor(0.0, device=y_pred.device)

        # Volume consistency: per-sequence totals should match
        pred_total = y_pred_scaled.sum(dim=1)
        true_total = y_true.sum(dim=1)
        volume_penalty = beta * torch.mean(torch.abs(pred_total - true_total))

        # Combined loss
        return weighted_mse + transaction_loss + volume_penalty + zero_penalty

    print(f"Starting enhanced training on {device}...")

    # Track persistently high scale factors
    high_scale_epochs = 0

    for epoch in range(num_epochs):
        # Training
        model.train()
        train_loss = 0.0
        epoch_scale_factors = []

        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")
        for X_batch, y_batch, customer_batch in progress_bar:
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)
            customer_batch = customer_batch.to(device)

            optimizer.zero_grad()

            # Forward pass with autocast (mixed precision, CUDA only)
            with autocast(enabled=use_amp):
                predictions = model(X_batch, customer_batch)

                # Calculate batch scale factor for monitoring
                batch_mean_true = torch.mean(y_batch).item()
                batch_mean_pred = torch.mean(predictions).item()
                if batch_mean_pred > 1e-10:
                    curr_scale = batch_mean_true / batch_mean_pred
                    curr_scale = min(max(curr_scale, 0.5), 500.0)
                    epoch_scale_factors.append(curr_scale)

                # Calculate loss
                loss = enhanced_transaction_loss(predictions, y_batch)

            # Backward pass with scaling
            scaler.scale(loss).backward()

            # Unscale before gradient clipping so the norm is measured on real gradients
            scaler.unscale_(optimizer)
            # Lower max norm to prevent large updates
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.5)

            # Update weights with scaler
            scaler.step(optimizer)
            scaler.update()

            # Update scheduler (stepped once per batch)
            scheduler.step()

            train_loss += loss.item()

            # Update progress bar
            progress_bar.set_postfix({'loss': loss.item()})

        avg_train_loss = train_loss / len(train_loader)
        history['train_loss'].append(avg_train_loss)

        # Store average scale factor for this epoch
        if epoch_scale_factors:
            avg_scale_factor = np.mean(epoch_scale_factors)
            history['scale_factors'].append(avg_scale_factor)

            # Check for persistently high scale factors
            if avg_scale_factor > 100 and epoch > 5:
                high_scale_epochs += 1

                # Apply direct scaling to output layer if scale factor remains high
                if high_scale_epochs >= 3:
                    print(f"  Applying direct scale adjustment to output layer")
                    for name, param in model.named_parameters():
                        if 'output' in name and 'weight' in name:
                            # Scale up the output layer weights to increase predictions
                            param.data *= 2.0
                    high_scale_epochs = 0  # Reset counter after adjustment
            else:
                high_scale_epochs = 0  # Reset if scale factor improves

        # Scale applied to validation predictions for the reported metrics:
        # this epoch's average training scale factor, sanitised and capped at 100
        if epoch_scale_factors:
            avg_scale_factor = np.nan_to_num(avg_scale_factor, nan=1.0, posinf=1.0, neginf=1.0)
            metric_scale = min(avg_scale_factor, 100)
        else:
            metric_scale = None

        # Validation
        model.eval()
        val_loss = 0.0
        val_pred_list = []
        val_target_list = []

        with torch.no_grad():
            for X_batch, y_batch, customer_batch in val_loader:
                X_batch = X_batch.to(device)
                y_batch = y_batch.to(device)
                customer_batch = customer_batch.to(device)

                val_preds = model(X_batch, customer_batch)

                if metric_scale is not None:
                    val_preds_scaled = val_preds * metric_scale
                else:
                    val_preds_scaled = val_preds

                # The loss is computed on the raw predictions (it rescales internally)
                batch_loss = enhanced_transaction_loss(val_preds, y_batch).item()
                val_loss += batch_loss

                # Store predictions and targets for metrics
                val_pred_list.append(val_preds_scaled.cpu().numpy())
                val_target_list.append(y_batch.cpu().numpy())

        # Concatenate all batches
        val_predictions = np.vstack(val_pred_list)
        val_targets = np.vstack(val_target_list)

        # Calculate metrics
        val_mse = mean_squared_error(val_targets, val_predictions)
        val_rmse = np.sqrt(val_mse)
        val_mae = mean_absolute_error(val_targets, val_predictions)

        # Calculate transaction-only RMSE (for non-zero targets)
        nonzero_mask = val_targets > 0
        if nonzero_mask.sum() > 0:
            tx_mse = mean_squared_error(
                val_targets[nonzero_mask],
                val_predictions[nonzero_mask]
            )
            tx_rmse = np.sqrt(tx_mse)
        else:
            tx_rmse = 0.0

        # Average metrics
        val_loss /= len(val_loader)
        history['val_loss'].append(val_loss)
        history['val_rmse'].append(val_rmse)
        history['val_mae'].append(val_mae)
        history['transaction_rmse'].append(tx_rmse)

        # Print progress
        print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {avg_train_loss:.6f}, Val Loss: {val_loss:.6f}")
        print(f"  Val RMSE: {val_rmse:.6f}, Val MAE: {val_mae:.6f}, Tx RMSE: {tx_rmse:.6f}")
        if epoch_scale_factors:
            print(f"  Avg Scale Factor: {avg_scale_factor:.2f}")

        # Check for improvement with a combined metric that considers transaction accuracy
        val_metric = val_loss + 0.1 * tx_rmse
        if val_metric < best_val_loss:
            best_val_loss = val_metric
            # clone() is essential: on a CPU model .cpu() returns the live
            # parameter storage, so without a copy the "best" snapshot would
            # silently track every later weight update
            best_model = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
            patience_counter = 0
            print(f"  New best model! Val Metric: {val_metric:.6f}")
        else:
            patience_counter += 1

        # Early stopping
        if patience_counter >= patience:
            print(f"Early stopping after {epoch+1} epochs")
            break

    # Load best model (none exists if no epoch ran or every metric was NaN)
    if best_model is not None:
        model.load_state_dict(best_model)
    else:
        print("  No improving validation metric was recorded; keeping the current weights")

    return model, history

## Model Estimation and Visualization

In [None]:
def evaluate_model(model, test_loader, device=None):
    """
    Run the model over a test DataLoader and report error metrics.

    Parameters:
    -----------
    model : nn.Module
        Trained model; called as ``model(X_batch, customer_batch)``
    test_loader : DataLoader
        Yields (X, y, customer_indices) batches
    device : torch.device
        Evaluation device; CUDA is chosen when available if omitted

    Returns:
    --------
    dict
        'overall' (mse, rmse, mae), 'transaction_only' (rmse, mae restricted
        to targets > 0), 'by_customer' (rmse, mae, num_sequences keyed by
        integer customer index) and the raw 'predictions', 'targets' and
        'customers' arrays
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = model.to(device)
    model.eval()

    def _errors(targets, preds):
        # MSE first, then its root, then MAE
        mse = mean_squared_error(targets, preds)
        return mse, np.sqrt(mse), mean_absolute_error(targets, preds)

    pred_chunks, target_chunks, customer_chunks = [], [], []

    with torch.no_grad():
        for X_batch, y_batch, customer_batch in tqdm(test_loader, desc="Evaluating"):
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)
            customer_batch = customer_batch.to(device)

            batch_preds = model(X_batch, customer_batch)

            # Collect everything on the host as NumPy arrays
            pred_chunks.append(batch_preds.cpu().numpy())
            target_chunks.append(y_batch.cpu().numpy())
            customer_chunks.append(customer_batch.cpu().numpy())

    # One row per test sequence
    test_predictions = np.vstack(pred_chunks)
    test_targets = np.vstack(target_chunks)
    test_customers = np.concatenate(customer_chunks)

    # Metrics over all sequences
    test_mse, test_rmse, test_mae = _errors(test_targets, test_predictions)

    # Metrics per customer (every unique index owns at least one row)
    customer_metrics = {}
    for cust_id in np.unique(test_customers):
        rows = test_customers == cust_id
        cust_preds = test_predictions[rows]
        _, cust_rmse, cust_mae = _errors(test_targets[rows], cust_preds)
        customer_metrics[int(cust_id)] = {
            'rmse': cust_rmse,
            'mae': cust_mae,
            'num_sequences': len(cust_preds)
        }

    # Metrics restricted to target entries with at least one transaction
    has_transaction = test_targets > 0
    if has_transaction.sum() > 0:
        _, trans_rmse, trans_mae = _errors(
            test_targets[has_transaction], test_predictions[has_transaction])
    else:
        trans_rmse = 0
        trans_mae = 0

    print(f"Test RMSE: {test_rmse:.6f}, Test MAE: {test_mae:.6f}")
    print(f"Transaction-only RMSE: {trans_rmse:.6f}, MAE: {trans_mae:.6f}")

    overall = {'mse': test_mse, 'rmse': test_rmse, 'mae': test_mae}
    transaction_only = {'rmse': trans_rmse, 'mae': trans_mae}

    return {
        'overall': overall,
        'transaction_only': transaction_only,
        'by_customer': customer_metrics,
        'predictions': test_predictions,
        'targets': test_targets,
        'customers': test_customers
    }

In [None]:
# After running the pipeline and training the model:

# 1. Visualize training history
def plot_training_history(history):
    """
    Plot training/validation loss and, when recorded, validation metrics and scale factors.

    Draws a 1x3 figure: loss curves on the left, validation RMSE/MAE in the
    middle and the scale-factor curve on the right. A panel whose data is
    missing from ``history`` is hidden rather than left as empty axes.

    Args:
        history: Mapping holding sequences under 'train_loss' and 'val_loss'
            (required) and optionally 'val_rmse', 'val_mae' and
            'scale_factors'. Values are plotted against their position,
            labelled as epochs.
    """
    # Create figure with subplots
    fig, axs = plt.subplots(1, 3, figsize=(18, 5))
    loss_ax, metric_ax, scale_ax = axs

    # Plot training and validation loss
    loss_ax.plot(history['train_loss'], label='Training Loss')
    loss_ax.plot(history['val_loss'], label='Validation Loss')
    loss_ax.set_xlabel('Epoch')
    loss_ax.set_ylabel('Loss')
    loss_ax.set_title('Training and Validation Loss')
    loss_ax.legend()
    loss_ax.grid(alpha=0.3)

    # Plot whichever validation metrics were recorded; each key is checked on
    # its own so a history with only one of them does not raise KeyError.
    recorded_metrics = [
        (label, history[key])
        for key, label in (('val_rmse', 'RMSE'), ('val_mae', 'MAE'))
        if key in history
    ]
    if recorded_metrics:
        for label, values in recorded_metrics:
            metric_ax.plot(values, label=label)
        metric_ax.set_xlabel('Epoch')
        metric_ax.set_ylabel('Error')
        metric_ax.set_title('Validation Metrics')
        metric_ax.legend()
        metric_ax.grid(alpha=0.3)
    else:
        metric_ax.set_visible(False)

    # Plot scale factors if available. len() is used instead of truthiness so
    # that a NumPy array is accepted as well as a list.
    scale_factors = history['scale_factors'] if 'scale_factors' in history else None
    if scale_factors is not None and len(scale_factors) > 0:
        scale_ax.plot(scale_factors)
        scale_ax.set_xlabel('Epoch')
        scale_ax.set_ylabel('Scale Factor')
        scale_ax.set_title('Scale Factor Evolution')
        scale_ax.grid(alpha=0.3)
    else:
        scale_ax.set_visible(False)

    plt.tight_layout()
    plt.show()

# 2. Visualize predictions
def visualize_predictions_fixed_timeframe(window_size, input_sequences, true_future, predictions,
                                         num_samples=5, customer_ids=None, start_idx=0):
    """
    Plot input windows with their true and predicted futures for a few samples.

    One subplot is drawn per selected sample: the flattened input sequence at
    x = 0..window_size-1, followed by the flattened true and predicted target
    sequences starting at x = window_size.

    Sample selection:
      * customer_ids is None: num_samples consecutive samples starting at
        start_idx (the range is shifted/clipped to stay inside the data).
      * customer_ids given: the first sample of each distinct customer, in
        sorted ID order, topped up with randomly chosen other samples when
        there are fewer distinct customers than num_samples.

    Args:
        window_size: Number of points in each flattened input sequence.
        input_sequences: Indexable of arrays; sample i is input_sequences[i].
        true_future: Ground-truth targets, aligned with input_sequences.
        predictions: Model outputs, aligned with input_sequences.
        num_samples: Maximum number of samples to plot.
        customer_ids: Optional per-sample customer identifiers (array-like,
            assumed one-dimensional).
        start_idx: First sample index when customer_ids is None.

    Returns:
        None. When no sample can be selected a message is printed and nothing
        is plotted (a zero-height figure would be rejected by matplotlib).
    """
    total = len(input_sequences)

    if customer_ids is None:
        # If no customer IDs provided, use a contiguous range of samples
        if start_idx + num_samples > total:
            start_idx = max(0, total - num_samples)
            num_samples = min(num_samples, total - start_idx)

        indices = list(range(start_idx, start_idx + num_samples))
    else:
        # Accept plain lists too: indexing and np.unique below need an ndarray
        customer_ids = np.asarray(customer_ids)

        # return_index gives the position of the first occurrence of every
        # distinct ID, i.e. the first window of each customer
        _, first_seen = np.unique(customer_ids, return_index=True)
        indices = list(first_seen[:num_samples])

        # If we need more samples to reach num_samples, add randomly
        if len(indices) < num_samples:
            available = np.setdiff1d(np.arange(total), indices)
            if len(available) > 0:
                additional = np.random.choice(
                    available,
                    size=min(num_samples - len(indices), len(available)),
                    replace=False)
                indices.extend(additional)

    if not indices:
        print("No samples available to visualize.")
        return

    # Create figure
    plt.figure(figsize=(15, 4 * len(indices)))

    for i, idx in enumerate(indices):
        plt.subplot(len(indices), 1, i+1)

        # Get data
        input_seq = input_sequences[idx].flatten()
        true_seq = true_future[idx].flatten()
        pred_seq = predictions[idx].flatten()

        # Time indices
        input_weeks = np.arange(window_size)
        forecast_weeks = np.arange(len(true_seq))

        # Plot input sequence
        plt.plot(input_weeks, input_seq, 'o-', color='blue',
                 label='Input Sequence', alpha=0.7, markersize=3)

        # Plot forecast window with offset
        plt.plot(forecast_weeks + window_size, true_seq, 'o-', color='green',
                 label='True Future', alpha=0.7, markersize=3)
        plt.plot(forecast_weeks + window_size, pred_seq, 'x--', color='red',
                 label='Prediction', alpha=0.7, markersize=4)

        # Add vertical line to separate input and forecast
        plt.axvline(x=window_size-1, color='gray', linestyle='--', alpha=0.5)

        # Add customer ID if available
        if customer_ids is not None:
            # Use the global idx_to_customer mapping, when one exists, to show
            # the original customer ID instead of the internal index
            if 'idx_to_customer' in globals():
                original_id = idx_to_customer.get(customer_ids[idx], f"Index {customer_ids[idx]}")
                title = f'Sample {i} (Original Customer ID: {original_id})'
            else:
                title = f'Sample {i} (Customer {customer_ids[idx]})'
        else:
            title = f'Sample {i}'

        plt.title(title)
        plt.ylabel('Transactions')
        plt.legend()
        plt.grid(True, alpha=0.3)

    plt.suptitle('Transaction Sequence Predictions', fontsize=16)
    plt.tight_layout(rect=[0, 0, 1, 0.97])
    plt.show()

# 3. Visualize aggregated results
def visualize_aggregated_results_fixed_timeframe(window_size, input_sequences, true_future, predictions):
    """
    Plot the sample-averaged input, target and predicted sequences on one axis.

    Each array is averaged over its first axis and flattened. The averaged
    input is drawn at x = 0..window_size-1; the averaged target and prediction
    follow, starting at x = window_size.
    """
    # Average over the first axis, then collapse to 1-D for plotting
    avg_input, avg_true, avg_pred = (
        np.mean(batch, axis=0).flatten()
        for batch in (input_sequences, true_future, predictions)
    )

    # x positions: input first, forecast shifted to start right after it
    history_x = np.arange(window_size)
    forecast_x = np.arange(len(avg_true)) + window_size

    plt.figure(figsize=(12, 6))
    curves = (
        (history_x, avg_input, 'o-', 'Mean Input Sequence', 'blue'),
        (forecast_x, avg_true, 'o-', 'Mean True Future', 'green'),
        (forecast_x, avg_pred, 'x--', 'Mean Prediction', 'red'),
    )
    for xs, ys, fmt, label, color in curves:
        plt.plot(xs, ys, fmt, label=label, color=color)

    # Dashed marker at the last input position
    plt.axvline(x=window_size-1, color='gray', linestyle='--', alpha=0.5)
    plt.title('Aggregated Transaction Predictions')
    plt.xlabel('Week')
    plt.ylabel('Average Transactions')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

## Key Implementation


In [None]:
# Load the transaction table and parse ORDER_DATE into a datetime 'Date' column
df = pd.read_csv('15-transactions_allCohorts.csv').assign(
    Date=lambda frame: pd.to_datetime(frame['ORDER_DATE'])
)

In [None]:
# Fixed time periods, each a (start, end) pair of date strings
TRAINING_INPUT_PERIOD = ('2005-01-01', '2006-12-30')
TRAINING_TARGET_PERIOD = ('2007-01-02', '2009-01-02')
PREDICTION_INPUT_PERIOD = ('2007-01-01', '2009-01-01')
PREDICTION_TARGET_PERIOD = ('2009-01-02', '2011-01-01')


## 10 cohort

pipeline_data = fixed_timeframe_sliding_window_pipeline(
    df,
    # Column names in df
    customer_field='CUSTNO',
    date_field='Date',
    cohort_field='COHORT_NUMBER',
    # Cohort selection
    cohort_range=(1, 10),
    # Time periods
    training_input_period=TRAINING_INPUT_PERIOD,
    training_target_period=TRAINING_TARGET_PERIOD,
    prediction_input_period=PREDICTION_INPUT_PERIOD,
    prediction_target_period=PREDICTION_TARGET_PERIOD,
    # Sliding-window sampling and batching
    window_size=16,
    step_size=4,
    batch_size=64,
)

# Pieces of the pipeline output used for model training
sequence_data, dataloaders = pipeline_data['sequence_data'], pipeline_data['dataloaders']

In [None]:
# Model hyperparameters
model_params = dict(
    input_dim=1,
    embed_dim=128,
    hidden_dim=512,
    num_layers=2,
    num_heads=8,
    # One output per entry of the training target dates
    output_dim=len(sequence_data['training_target_dates']),
    num_customers=sequence_data['num_customers'],
    dropout=0.1,
)

# Build the model and move it to the selected device
model = SlidingWindowTransformer(**model_params).to(device)

# Train on the training loader, validating on the validation loader
trained_model, history = enhanced_sliding_window_train(
    model, dataloaders['train'], dataloaders['val'],
    num_epochs=50, patience=10,
)

# Evaluate on prediction data
pred_metrics = evaluate_model(trained_model, dataloaders['pred'], device=device)

In [None]:
# After training the model and evaluating on prediction data:

# 1. Visualize training history
plot_training_history(history)

# 2. Extract prediction data for visualization
pred_data = []
pred_targets = []
pred_outputs = []
pred_customer_ids = []

# Put the model in evaluation mode before inference: it is built with
# dropout=0.1, and this cell should not depend on an earlier cell having
# already switched the mode.
trained_model.eval()

# Collect the first few batches from the prediction dataloader
for inputs, targets, customer_ids in dataloaders['pred']:
    # Get model predictions (no gradients are needed here)
    with torch.no_grad():
        outputs = trained_model(inputs.to(device), customer_ids.to(device)).cpu()

    # Store data for visualization as NumPy arrays
    pred_data.append(inputs.cpu().numpy())
    pred_targets.append(targets.cpu().numpy())
    pred_outputs.append(outputs.numpy())
    pred_customer_ids.append(customer_ids.numpy())

    # Only process a few batches for visualization
    if len(pred_data) >= 10:
        break

# Concatenate batches
pred_data = np.vstack(pred_data)
pred_targets = np.vstack(pred_targets)
pred_outputs = np.vstack(pred_outputs)
pred_customer_ids = np.concatenate(pred_customer_ids)

# 3. Visualize individual predictions
visualize_predictions_fixed_timeframe(
    window_size=pipeline_data['config']['window_size'],
    input_sequences=pred_data,
    true_future=pred_targets,
    predictions=pred_outputs,
    num_samples=5,
    customer_ids=pred_customer_ids
)

# 4. Visualize aggregated results
visualize_aggregated_results_fixed_timeframe(
    window_size=pipeline_data['config']['window_size'],
    input_sequences=pred_data,
    true_future=pred_targets,
    predictions=pred_outputs
)

## Equal Length Group

From this section onward, the two other temporal settings devised in the thesis are provided.

In [None]:
# Fixed time periods, each a (start, end) pair of date strings
TRAINING_INPUT_PERIOD = ('2005-01-01', '2006-12-30')
TRAINING_TARGET_PERIOD = ('2007-01-02', '2009-01-02')
PREDICTION_INPUT_PERIOD = ('2007-01-01', '2009-01-01')
PREDICTION_TARGET_PERIOD = ('2009-01-02', '2011-01-01')


## 15 cohort

pipeline_data = fixed_timeframe_sliding_window_pipeline(
    df,
    # Column names in df
    customer_field='CUSTNO',
    date_field='Date',
    cohort_field='COHORT_NUMBER',
    # Cohort selection
    # NOTE(review): the other runs in this notebook section start their cohort
    # range at 1 (the later "cohort 15" run uses (1, 15)) - confirm that
    # (2, 16) is intended here.
    cohort_range=(2, 16),
    # Time periods
    training_input_period=TRAINING_INPUT_PERIOD,
    training_target_period=TRAINING_TARGET_PERIOD,
    prediction_input_period=PREDICTION_INPUT_PERIOD,
    prediction_target_period=PREDICTION_TARGET_PERIOD,
    # Sliding-window sampling and batching
    window_size=16,
    step_size=4,
    batch_size=64,
)

# Pieces of the pipeline output used for model training
sequence_data, dataloaders = pipeline_data['sequence_data'], pipeline_data['dataloaders']


In [None]:
# Model hyperparameters
model_params = dict(
    input_dim=1,
    embed_dim=128,
    hidden_dim=512,
    num_layers=2,
    num_heads=8,
    # One output per entry of the training target dates
    output_dim=len(sequence_data['training_target_dates']),
    num_customers=sequence_data['num_customers'],
    dropout=0.1,
)

# Build the model and move it to the selected device
model = SlidingWindowTransformer(**model_params).to(device)

# Train on the training loader, validating on the validation loader
trained_model, history = enhanced_sliding_window_train(
    model, dataloaders['train'], dataloaders['val'],
    num_epochs=50, patience=10,
)

# Evaluate on prediction data
pred_metrics = evaluate_model(trained_model, dataloaders['pred'], device=device)

In [None]:
# After training the model and evaluating on prediction data:

# 1. Visualize training history
plot_training_history(history)

# 2. Extract prediction data for visualization
pred_data = []
pred_targets = []
pred_outputs = []
pred_customer_ids = []

# Put the model in evaluation mode before inference: it is built with
# dropout=0.1, and this cell should not depend on an earlier cell having
# already switched the mode.
trained_model.eval()

# Collect the first few batches from the prediction dataloader
for inputs, targets, customer_ids in dataloaders['pred']:
    # Get model predictions (no gradients are needed here)
    with torch.no_grad():
        outputs = trained_model(inputs.to(device), customer_ids.to(device)).cpu()

    # Store data for visualization as NumPy arrays
    pred_data.append(inputs.cpu().numpy())
    pred_targets.append(targets.cpu().numpy())
    pred_outputs.append(outputs.numpy())
    pred_customer_ids.append(customer_ids.numpy())

    # Only process a few batches for visualization
    if len(pred_data) >= 10:
        break

# Concatenate batches
pred_data = np.vstack(pred_data)
pred_targets = np.vstack(pred_targets)
pred_outputs = np.vstack(pred_outputs)
pred_customer_ids = np.concatenate(pred_customer_ids)

# 3. Visualize individual predictions
visualize_predictions_fixed_timeframe(
    window_size=pipeline_data['config']['window_size'],
    input_sequences=pred_data,
    true_future=pred_targets,
    predictions=pred_outputs,
    num_samples=5,
    customer_ids=pred_customer_ids
)

# 4. Visualize aggregated results
visualize_aggregated_results_fixed_timeframe(
    window_size=pipeline_data['config']['window_size'],
    input_sequences=pred_data,
    true_future=pred_targets,
    predictions=pred_outputs
)

In [None]:
## 20 cohort

pipeline_data = fixed_timeframe_sliding_window_pipeline(
    df,
    # Column names in df
    customer_field='CUSTNO',
    date_field='Date',
    cohort_field='COHORT_NUMBER',
    # Cohort selection
    cohort_range=(1, 20),
    # Time periods
    training_input_period=TRAINING_INPUT_PERIOD,
    training_target_period=TRAINING_TARGET_PERIOD,
    prediction_input_period=PREDICTION_INPUT_PERIOD,
    prediction_target_period=PREDICTION_TARGET_PERIOD,
    # Sliding-window sampling and batching
    window_size=16,
    step_size=4,
    batch_size=64,
)

# Pieces of the pipeline output used for model training
sequence_data, dataloaders = pipeline_data['sequence_data'], pipeline_data['dataloaders']


In [None]:
# Model hyperparameters
model_params = dict(
    input_dim=1,
    embed_dim=128,
    hidden_dim=512,
    num_layers=2,
    num_heads=8,
    # One output per entry of the training target dates
    output_dim=len(sequence_data['training_target_dates']),
    num_customers=sequence_data['num_customers'],
    dropout=0.1,
)

# Build the model and move it to the selected device
model = SlidingWindowTransformer(**model_params).to(device)

# Train on the training loader, validating on the validation loader
trained_model, history = enhanced_sliding_window_train(
    model, dataloaders['train'], dataloaders['val'],
    num_epochs=50, patience=10,
)

# Evaluate on prediction data
pred_metrics = evaluate_model(trained_model, dataloaders['pred'], device=device)

In [None]:
# After training the model and evaluating on prediction data:

# 1. Visualize training history
plot_training_history(history)

# 2. Extract prediction data for visualization
pred_data = []
pred_targets = []
pred_outputs = []
pred_customer_ids = []

# Put the model in evaluation mode before inference: it is built with
# dropout=0.1, and this cell should not depend on an earlier cell having
# already switched the mode.
trained_model.eval()

# Collect the first few batches from the prediction dataloader
for inputs, targets, customer_ids in dataloaders['pred']:
    # Get model predictions (no gradients are needed here)
    with torch.no_grad():
        outputs = trained_model(inputs.to(device), customer_ids.to(device)).cpu()

    # Store data for visualization as NumPy arrays
    pred_data.append(inputs.cpu().numpy())
    pred_targets.append(targets.cpu().numpy())
    pred_outputs.append(outputs.numpy())
    pred_customer_ids.append(customer_ids.numpy())

    # Only process a few batches for visualization
    if len(pred_data) >= 10:
        break

# Concatenate batches
pred_data = np.vstack(pred_data)
pred_targets = np.vstack(pred_targets)
pred_outputs = np.vstack(pred_outputs)
pred_customer_ids = np.concatenate(pred_customer_ids)

# 3. Visualize individual predictions
visualize_predictions_fixed_timeframe(
    window_size=pipeline_data['config']['window_size'],
    input_sequences=pred_data,
    true_future=pred_targets,
    predictions=pred_outputs,
    num_samples=5,
    customer_ids=pred_customer_ids
)

# 4. Visualize aggregated results
visualize_aggregated_results_fixed_timeframe(
    window_size=pipeline_data['config']['window_size'],
    input_sequences=pred_data,
    true_future=pred_targets,
    predictions=pred_outputs
)

## Long Output Group

In [None]:
# Define fixed time periods
# Each period is a (start, end) pair of 'YYYY-MM-DD' date strings.
# NOTE(review): the training target period (through 2009-12-31) overlaps the
# prediction target period (from 2008-01-02), and the prediction input period
# starts inside the training input period - confirm this overlap is intended.
TRAINING_INPUT_PERIOD = ('2005-01-01', '2006-12-30')
TRAINING_TARGET_PERIOD = ('2007-01-02', '2009-12-31')
PREDICTION_INPUT_PERIOD = ('2006-01-01', '2008-01-01')
PREDICTION_TARGET_PERIOD = ('2008-01-02', '2011-01-01')

In [None]:
# prompt: calculate how many weeks for each period

# Calculate the number of weeks for each period
def calculate_weeks(start_date_str, end_date_str):
    """
    Return the number of whole weeks between two 'YYYY-MM-DD' date strings.

    The day difference (end minus start) is floor-divided by 7, so a partial
    week is dropped and an end date before the start date gives a negative
    result.
    """
    date_format = '%Y-%m-%d'
    first_day, last_day = (
        datetime.datetime.strptime(text, date_format).date()
        for text in (start_date_str, end_date_str)
    )
    return (last_day - first_day).days // 7

# Whole-week length of each configured (start, end) period
training_input_weeks = calculate_weeks(*TRAINING_INPUT_PERIOD)
training_target_weeks = calculate_weeks(*TRAINING_TARGET_PERIOD)
prediction_input_weeks = calculate_weeks(*PREDICTION_INPUT_PERIOD)
prediction_target_weeks = calculate_weeks(*PREDICTION_TARGET_PERIOD)

# Report the four lengths
print(f"Training Input Weeks: {training_input_weeks}")
print(f"Training Target Weeks: {training_target_weeks}")
print(f"Prediction Input Weeks: {prediction_input_weeks}")
print(f"Prediction Target Weeks: {prediction_target_weeks}")


In [None]:
# cohort 10

# Run the pipeline
pipeline_data = fixed_timeframe_sliding_window_pipeline(
    df,
    # Column names in df
    customer_field='CUSTNO',
    date_field='Date',
    cohort_field='COHORT_NUMBER',
    # Cohort selection
    cohort_range=(1, 10),
    # Time periods
    training_input_period=TRAINING_INPUT_PERIOD,
    training_target_period=TRAINING_TARGET_PERIOD,
    prediction_input_period=PREDICTION_INPUT_PERIOD,
    prediction_target_period=PREDICTION_TARGET_PERIOD,
    # Sliding-window sampling and batching
    window_size=16,
    step_size=4,
    batch_size=64,
)

# Pieces of the pipeline output used for model training
sequence_data, dataloaders = pipeline_data['sequence_data'], pipeline_data['dataloaders']


In [None]:
# Model hyperparameters
model_params = dict(
    input_dim=1,
    embed_dim=128,
    hidden_dim=512,
    num_layers=2,
    num_heads=8,
    # One output per entry of the training target dates
    output_dim=len(sequence_data['training_target_dates']),
    num_customers=sequence_data['num_customers'],
    dropout=0.1,
)

# Build the model and move it to the selected device
model = SlidingWindowTransformer(**model_params).to(device)

# Train on the training loader, validating on the validation loader
trained_model, history = enhanced_sliding_window_train(
    model, dataloaders['train'], dataloaders['val'],
    num_epochs=50, patience=10,
)

# Evaluate on prediction data
pred_metrics = evaluate_model(trained_model, dataloaders['pred'], device=device)

In [None]:
# After training the model and evaluating on prediction data:

# 1. Visualize training history
plot_training_history(history)

# 2. Extract prediction data for visualization
pred_data = []
pred_targets = []
pred_outputs = []
pred_customer_ids = []

# Put the model in evaluation mode before inference: it is built with
# dropout=0.1, and this cell should not depend on an earlier cell having
# already switched the mode.
trained_model.eval()

# Collect the first few batches from the prediction dataloader
for inputs, targets, customer_ids in dataloaders['pred']:
    # Get model predictions (no gradients are needed here)
    with torch.no_grad():
        outputs = trained_model(inputs.to(device), customer_ids.to(device)).cpu()

    # Store data for visualization as NumPy arrays
    pred_data.append(inputs.cpu().numpy())
    pred_targets.append(targets.cpu().numpy())
    pred_outputs.append(outputs.numpy())
    pred_customer_ids.append(customer_ids.numpy())

    # Only process a few batches for visualization
    if len(pred_data) >= 10:
        break

# Concatenate batches
pred_data = np.vstack(pred_data)
pred_targets = np.vstack(pred_targets)
pred_outputs = np.vstack(pred_outputs)
pred_customer_ids = np.concatenate(pred_customer_ids)

# 3. Visualize individual predictions
visualize_predictions_fixed_timeframe(
    window_size=pipeline_data['config']['window_size'],
    input_sequences=pred_data,
    true_future=pred_targets,
    predictions=pred_outputs,
    num_samples=5,
    customer_ids=pred_customer_ids
)

# 4. Visualize aggregated results
visualize_aggregated_results_fixed_timeframe(
    window_size=pipeline_data['config']['window_size'],
    input_sequences=pred_data,
    true_future=pred_targets,
    predictions=pred_outputs
)

In [None]:
# cohort 15

# Run the pipeline
pipeline_data = fixed_timeframe_sliding_window_pipeline(
    df,
    # Column names in df
    customer_field='CUSTNO',
    date_field='Date',
    cohort_field='COHORT_NUMBER',
    # Cohort selection
    cohort_range=(1, 15),
    # Time periods
    training_input_period=TRAINING_INPUT_PERIOD,
    training_target_period=TRAINING_TARGET_PERIOD,
    prediction_input_period=PREDICTION_INPUT_PERIOD,
    prediction_target_period=PREDICTION_TARGET_PERIOD,
    # Sliding-window sampling and batching
    window_size=16,
    step_size=4,
    batch_size=64,
)

# Pieces of the pipeline output used for model training
sequence_data, dataloaders = pipeline_data['sequence_data'], pipeline_data['dataloaders']


In [None]:
# Model hyperparameters
model_params = dict(
    input_dim=1,
    embed_dim=128,
    hidden_dim=512,
    num_layers=2,
    num_heads=8,
    # One output per entry of the training target dates
    output_dim=len(sequence_data['training_target_dates']),
    num_customers=sequence_data['num_customers'],
    dropout=0.1,
)

# Build the model and move it to the selected device
model = SlidingWindowTransformer(**model_params).to(device)

# Train on the training loader, validating on the validation loader
trained_model, history = enhanced_sliding_window_train(
    model, dataloaders['train'], dataloaders['val'],
    num_epochs=50, patience=10,
)

# Evaluate on prediction data
pred_metrics = evaluate_model(trained_model, dataloaders['pred'], device=device)

In [None]:
# After training the model and evaluating on prediction data:

# 1. Visualize training history
plot_training_history(history)

# 2. Extract prediction data for visualization
pred_data = []
pred_targets = []
pred_outputs = []
pred_customer_ids = []

# Put the model in evaluation mode before inference: it is built with
# dropout=0.1, and this cell should not depend on an earlier cell having
# already switched the mode.
trained_model.eval()

# Collect the first few batches from the prediction dataloader
for inputs, targets, customer_ids in dataloaders['pred']:
    # Get model predictions (no gradients are needed here)
    with torch.no_grad():
        outputs = trained_model(inputs.to(device), customer_ids.to(device)).cpu()

    # Store data for visualization as NumPy arrays
    pred_data.append(inputs.cpu().numpy())
    pred_targets.append(targets.cpu().numpy())
    pred_outputs.append(outputs.numpy())
    pred_customer_ids.append(customer_ids.numpy())

    # Only process a few batches for visualization
    if len(pred_data) >= 10:
        break

# Concatenate batches
pred_data = np.vstack(pred_data)
pred_targets = np.vstack(pred_targets)
pred_outputs = np.vstack(pred_outputs)
pred_customer_ids = np.concatenate(pred_customer_ids)

# 3. Visualize individual predictions
visualize_predictions_fixed_timeframe(
    window_size=pipeline_data['config']['window_size'],
    input_sequences=pred_data,
    true_future=pred_targets,
    predictions=pred_outputs,
    num_samples=5,
    customer_ids=pred_customer_ids
)

# 4. Visualize aggregated results
visualize_aggregated_results_fixed_timeframe(
    window_size=pipeline_data['config']['window_size'],
    input_sequences=pred_data,
    true_future=pred_targets,
    predictions=pred_outputs
)

In [None]:
# cohort 20

# Run the pipeline
pipeline_data = fixed_timeframe_sliding_window_pipeline(
    df,
    # Column names in df
    customer_field='CUSTNO',
    date_field='Date',
    cohort_field='COHORT_NUMBER',
    # Cohort selection
    cohort_range=(1, 20),
    # Time periods
    training_input_period=TRAINING_INPUT_PERIOD,
    training_target_period=TRAINING_TARGET_PERIOD,
    prediction_input_period=PREDICTION_INPUT_PERIOD,
    prediction_target_period=PREDICTION_TARGET_PERIOD,
    # Sliding-window sampling and batching
    window_size=16,
    step_size=4,
    batch_size=64,
)

# Pieces of the pipeline output used for model training
sequence_data, dataloaders = pipeline_data['sequence_data'], pipeline_data['dataloaders']


In [None]:
# Model hyperparameters
model_params = dict(
    input_dim=1,
    embed_dim=128,
    hidden_dim=512,
    num_layers=2,
    num_heads=8,
    # One output per entry of the training target dates
    output_dim=len(sequence_data['training_target_dates']),
    num_customers=sequence_data['num_customers'],
    dropout=0.1,
)

# Build the model and move it to the selected device
model = SlidingWindowTransformer(**model_params).to(device)

# Train on the training loader, validating on the validation loader
trained_model, history = enhanced_sliding_window_train(
    model, dataloaders['train'], dataloaders['val'],
    num_epochs=50, patience=10,
)

# Evaluate on prediction data
pred_metrics = evaluate_model(trained_model, dataloaders['pred'], device=device)

In [None]:
# After training the model and evaluating on prediction data:

# 1. Visualize training history
plot_training_history(history)

# 2. Extract prediction data for visualization
pred_data = []
pred_targets = []
pred_outputs = []
pred_customer_ids = []

# Put the model in evaluation mode before inference: it is built with
# dropout=0.1, and this cell should not depend on an earlier cell having
# already switched the mode.
trained_model.eval()

# Collect the first few batches from the prediction dataloader
for inputs, targets, customer_ids in dataloaders['pred']:
    # Get model predictions (no gradients are needed here)
    with torch.no_grad():
        outputs = trained_model(inputs.to(device), customer_ids.to(device)).cpu()

    # Store data for visualization as NumPy arrays
    pred_data.append(inputs.cpu().numpy())
    pred_targets.append(targets.cpu().numpy())
    pred_outputs.append(outputs.numpy())
    pred_customer_ids.append(customer_ids.numpy())

    # Only process a few batches for visualization
    if len(pred_data) >= 10:
        break

# Concatenate batches
pred_data = np.vstack(pred_data)
pred_targets = np.vstack(pred_targets)
pred_outputs = np.vstack(pred_outputs)
pred_customer_ids = np.concatenate(pred_customer_ids)

# 3. Visualize individual predictions
visualize_predictions_fixed_timeframe(
    window_size=pipeline_data['config']['window_size'],
    input_sequences=pred_data,
    true_future=pred_targets,
    predictions=pred_outputs,
    num_samples=5,
    customer_ids=pred_customer_ids
)

# 4. Visualize aggregated results
visualize_aggregated_results_fixed_timeframe(
    window_size=pipeline_data['config']['window_size'],
    input_sequences=pred_data,
    true_future=pred_targets,
    predictions=pred_outputs
)