# Improving Stability: Gradient Clipping and Learning Rate Scheduling


## Step 1: Initial Setup


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

In [None]:
# --- Synthetic dataset configuration ---
num_samples = 100   # how many (input, label) pairs to generate
num_features = 784  # length of each input vector (a flattened 28x28 image)
num_classes = 10    # labels are integers in [0, num_classes)
batch_size = 10     # samples per mini-batch

# Inputs are standard-normal noise and labels are random class indices, so
# there is no real signal to learn here -- the data only exercises the
# training machinery. Inputs are drawn before labels; keep that order so the
# sequence of random draws stays the same.
dummy_inputs = torch.randn(num_samples, num_features)
dummy_labels = torch.randint(low=0, high=num_classes, size=(num_samples,))

# Pair each input row with its label.
dummy_dataset = TensorDataset(dummy_inputs, dummy_labels)

# NOTE: both loaders wrap the *same* dataset, so the "validation" metrics
# reported from val_loader measure fit to the training data, not
# generalization. Only the training loader reshuffles between epochs.
train_loader = DataLoader(dataset=dummy_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=dummy_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# Fully connected classifier: num_features -> 128 -> 64 -> num_classes,
# with a ReLU non-linearity after each of the two hidden layers.
# The final layer has no activation, so the model outputs one raw score per class.
model = nn.Sequential(
    nn.Linear(in_features=num_features, out_features=128),
    nn.ReLU(),
    nn.Linear(in_features=128, out_features=64),
    nn.ReLU(),
    nn.Linear(in_features=64, out_features=num_classes),
)

## Step 2: Gradient Clipping in Action


In [None]:
# Loss: cross-entropy, the standard objective for multi-class classification.
criterion = nn.CrossEntropyLoss()

# Optimizer: Adam (adaptive per-parameter learning rates) over all model
# parameters, starting from a base learning rate of 0.02.
optimizer = optim.Adam(params=model.parameters(), lr=0.02)

def training_loop_with_clipping(loader, model_instance, criterion_func, optimizer_instance, max_clip_norm=1.0):
    """
    Performs one epoch of training, clipping the gradient norm before every optimizer step.

    Args:
        loader (Iterable): Yields ``(inputs, labels)`` batches. Usually a DataLoader,
            but any iterable works; it does not need to support ``len()``.
        model_instance (nn.Module): The neural network model. It is put into training mode.
        criterion_func (callable): The loss function, called as ``criterion_func(outputs, labels)``.
        optimizer_instance (optim.Optimizer): The optimizer that updates the model parameters.
        max_clip_norm (float): The maximum norm for gradient clipping.
    Returns:
        float: The mean of the per-batch loss values for the epoch,
            or 0.0 if the loader yielded no batches.
    """
    model_instance.train()  # Set the model to training mode
    total_loss = 0.0
    # Count batches while iterating instead of calling len(loader), which
    # raises TypeError for loaders that have no defined length.
    num_batches = 0
    for inputs, labels in loader:
        optimizer_instance.zero_grad()  # Clear gradients left over from the previous step
        outputs = model_instance(inputs)  # Forward pass
        loss = criterion_func(outputs, labels)
        loss.backward()  # Backpropagation: compute gradients

        # Gradient clipping: rescale all gradients in place so that their
        # combined norm does not exceed max_clip_norm. This must happen after
        # backward() (gradients exist) and before step() (gradients are used).
        torch.nn.utils.clip_grad_norm_(model_instance.parameters(), max_norm=max_clip_norm)

        optimizer_instance.step()  # Update model parameters
        total_loss += loss.item()
        num_batches += 1
    return total_loss / num_batches if num_batches > 0 else 0.0


## Step 3: Validation Metrics for Evaluating Scheduled Training


In [None]:
def calculate_validation_metrics(loader, model_instance, criterion_func):
    """
    Calculates the average loss and accuracy over a dataset (e.g., validation set).

    The model is switched to evaluation mode for the duration of the call and
    is returned to whichever mode (training or evaluation) it was in before.

    Args:
        loader (Iterable): Yields ``(inputs, labels)`` batches. Usually a DataLoader,
            but any iterable works; it does not need to support ``len()``.
        model_instance (nn.Module): The neural network model.
        criterion_func (callable): The loss function, called as ``criterion_func(outputs, labels)``.
    Returns:
        tuple: (average_loss, accuracy_percentage). ``average_loss`` is the mean of
            the per-batch loss values; ``accuracy_percentage`` is computed over all
            samples seen. Both are 0.0 if the loader yielded no batches.
    """
    # Remember the caller's mode so evaluation does not silently flip a model
    # that was deliberately left in eval mode back into training mode.
    was_training = model_instance.training
    model_instance.eval()  # Evaluation mode (affects layers such as dropout / BatchNorm)
    total_loss = 0.0
    num_batches = 0  # counted while iterating so loaders without len() work
    correct_predictions = 0
    total_samples = 0
    try:
        with torch.no_grad():  # No gradients are needed for evaluation
            for inputs_val, labels_val in loader:
                outputs_val = model_instance(inputs_val)
                total_loss += criterion_func(outputs_val, labels_val).item()
                num_batches += 1

                # Predicted class = index of the highest score along dim 1.
                predicted = outputs_val.argmax(dim=1)
                total_samples += labels_val.size(0)
                correct_predictions += (predicted == labels_val).sum().item()
    finally:
        # Restore the original mode even if evaluation raised.
        model_instance.train(was_training)

    avg_loss = total_loss / num_batches if num_batches > 0 else 0.0
    accuracy = (correct_predictions / total_samples) * 100 if total_samples > 0 else 0.0
    return avg_loss, accuracy

## Step 4: Integrating Techniques for Better Training


In [None]:
print("--- Demonstrating Gradient Clipping in a Single Epoch ---")
# Warm-up: one clipped training epoch, run before any scheduler is attached.
initial_train_loss = training_loop_with_clipping(
    train_loader, model, criterion, optimizer, max_clip_norm=1.0
)
print(f"One epoch training with clipping complete. Initial Training Loss: {initial_train_loss:.4f}\n")

# Learning rate scheduler:
# StepLR multiplies the learning rate by `gamma` once every `step_size`
# calls to scheduler.step() (here: halve it every 5 epochs).
# NOTE: the run below lasts exactly 5 epochs and the LR is read *before*
# scheduler.step(), so the halving only takes effect after the last epoch and
# every printed LR equals the initial learning rate. Use a smaller step_size
# or more epochs to see the decay in the log.
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)

print("--- Full Training Run with Gradient Clipping and Learning Rate Scheduling ---")
num_epochs_full_run = 5  # Total epochs for this demonstration

for epoch in range(num_epochs_full_run):
    # One clipped training pass (the clipping itself happens inside the helper).
    train_loss = training_loop_with_clipping(
        train_loader, model, criterion, optimizer, max_clip_norm=1.0
    )

    # The learning rate that was in effect for the epoch just trained.
    current_lr = scheduler.get_last_lr()[0]

    # Evaluate on the validation loader.
    val_loss, val_accuracy = calculate_validation_metrics(val_loader, model, criterion)

    # Advance the schedule; any change applies from the next epoch on.
    scheduler.step()

    # Per-epoch summary line.
    print(
        f"Epoch {epoch+1:2d}: LR: {current_lr:.6f}, Train Loss: {train_loss:.4f}, "
        f"Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.2f}%"
    )

print("\n--- Training Complete ---")

# Re-evaluate once after the loop and report the final validation metrics.
final_val_loss, final_val_accuracy = calculate_validation_metrics(val_loader, model, criterion)
print(f"Final Validation Loss: {final_val_loss:.4f}, Final Validation Accuracy: {final_val_accuracy:.2f}%")

--- Demonstrating Gradient Clipping in a Single Epoch ---
One epoch training with clipping complete. Initial Training Loss: 2.4618

--- Full Training Run with Gradient Clipping and Learning Rate Scheduling ---
Epoch  1: LR: 0.020000, Train Loss: 1.9440, Val Loss: 0.6252, Val Acc: 79.00%
Epoch  2: LR: 0.020000, Train Loss: 0.4794, Val Loss: 0.0837, Val Acc: 98.00%
Epoch  3: LR: 0.020000, Train Loss: 0.1606, Val Loss: 0.1785, Val Acc: 96.00%
Epoch  4: LR: 0.020000, Train Loss: 0.5755, Val Loss: 0.0063, Val Acc: 100.00%
Epoch  5: LR: 0.020000, Train Loss: 0.2132, Val Loss: 0.0073, Val Acc: 100.00%

--- Training Complete ---
Final Validation Loss: 0.0073, Final Validation Accuracy: 100.00%
