# Training Faster: Mixed Precision with torch.cuda.amp

## Step 1: Setting Up the PyTorch Environment for Mixed Precision


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torch.cuda.amp import autocast, GradScaler
import time

# Configuration
NUM_EPOCHS = 5
BATCH_SIZE = 64 # For dummy data
NUM_CLASSES = 10 # Example number of classes
INPUT_CHANNELS = 3
IMAGE_SIZE = 32 # Example image size (height and width)
NUM_SAMPLES = BATCH_SIZE * 10 # Number of dummy samples

# Check for CUDA availability and set device
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("CUDA is available. Training on GPU.")
else:
    device = torch.device("cpu")
    print("CUDA not available. Training on CPU. Mixed precision (AMP) is a CUDA feature and will be disabled.")

# AMP is only available on CUDA
# This flag will control whether AMP is used.
# For the demonstration, we'll run once with AMP and once without.
# use_amp_globally = torch.cuda.is_available() # Global flag, will be set per run in evaluation

CUDA is available. Training on GPU.


## Step 2: Defining the Model and Preparing Data


In [None]:
# Define a simple CNN model
class SimpleCNN(nn.Module):
    def __init__(self, num_classes=10, input_channels=3, image_size=32):
        super(SimpleCNN, self).__init__()

        # Convolutional layer
        self.conv1 = nn.Conv2d(input_channels, 64, kernel_size=3, stride=1, padding=1)
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)

        # Calculate flattened feature size after conv and pool
        pooled_size = image_size // 2  # Pooling halves the spatial dimensions
        self.fc1_input_features = 64 * pooled_size * pooled_size
        self.fc1 = nn.Linear(self.fc1_input_features, num_classes)

    def forward(self, x):
        x = self.conv1(x)
        x = self.relu(x)
        x = self.pool(x)
        x = x.view(-1, self.fc1_input_features)  # Flatten tensor
        x = self.fc1(x)
        return x

# Create dummy data and DataLoader
dummy_images = torch.randn(NUM_SAMPLES, INPUT_CHANNELS, IMAGE_SIZE, IMAGE_SIZE)  # (N, C, H, W)
dummy_labels = torch.randint(0, NUM_CLASSES, (NUM_SAMPLES,))
train_dataset = TensorDataset(dummy_images, dummy_labels)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

## Step 3: Training Loop with and without Mixed Precision


In [None]:
def train_model(model, train_loader, optimizer, criterion, device, num_epochs, use_amp):
    """
    A generic training function.
    If use_amp is True and device is CUDA, mixed precision will be used.
    """
    model.train()
    model.to(device)

    # Initialize GradScaler only if AMP is used and on CUDA
    # Pass enabled=False to effectively disable scaler if use_amp is False or not on CUDA
    scaler = GradScaler(enabled=use_amp and device.type == 'cuda')

    epoch_times = []

    print(f"\nTraining with AMP: {'Enabled' if use_amp and device.type == 'cuda' else 'Disabled'}")

    for epoch in range(num_epochs):
        start_time_epoch = time.time()
        total_loss = 0
        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)

            optimizer.zero_grad()

            # Autocast context manager
            # enabled=False effectively makes autocast a no-op
            with autocast(enabled=use_amp and device.type == 'cuda'):
                outputs = model(images)
                loss = criterion(outputs, labels)

            if use_amp and device.type == 'cuda':
                # Scale the gradients and perform backward pass
                scaler.scale(loss).backward()
                # Unscale gradients and step optimizer
                scaler.step(optimizer)
                # Update the scale for next iteration
                scaler.update()
            else:
                # Standard backward pass and optimizer step if not using AMP
                loss.backward()
                optimizer.step()

            total_loss += loss.item()

        end_time_epoch = time.time()
        epoch_duration = end_time_epoch - start_time_epoch
        epoch_times.append(epoch_duration)
        avg_loss = total_loss / len(train_loader)
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}, Duration: {epoch_duration:.2f}s")

    total_training_time = sum(epoch_times)
    print(f"Total training time: {total_training_time:.2f}s")
    if len(epoch_times) > 1: # Print average excluding potential first epoch overhead
        avg_epoch_time = sum(epoch_times[1:]) / len(epoch_times[1:]) if len(epoch_times) > 1 else epoch_times[0]
        print(f"Average epoch time (excluding first): {avg_epoch_time:.2f}s")
    elif epoch_times:
        print(f"Average epoch time: {epoch_times[0]:.2f}s")

    return total_training_time

# --- Training without Mixed Precision ---
print("--- Simulating training WITHOUT mixed precision ---")
model_no_amp = SimpleCNN(num_classes=NUM_CLASSES, input_channels=INPUT_CHANNELS, image_size=IMAGE_SIZE)
optimizer_no_amp = optim.Adam(model_no_amp.parameters(), lr=0.001)
criterion_no_amp = nn.CrossEntropyLoss()

# Ensure use_amp is False for this run if on CUDA, otherwise it's already effectively disabled
time_without_mixed_precision = train_model(
    model_no_amp, train_loader, optimizer_no_amp, criterion_no_amp,
    device, NUM_EPOCHS, use_amp=False
)

# --- Training with Mixed Precision (if CUDA is available) ---
if device.type == 'cuda':
    print("\n--- Simulating training WITH mixed precision ---")
    model_amp = SimpleCNN(num_classes=NUM_CLASSES, input_channels=INPUT_CHANNELS, image_size=IMAGE_SIZE)
    optimizer_amp = optim.Adam(model_amp.parameters(), lr=0.001)
    criterion_amp = nn.CrossEntropyLoss()

    time_with_mixed_precision = train_model(
        model_amp, train_loader, optimizer_amp, criterion_amp,
        device, NUM_EPOCHS, use_amp=True
    )
else:
    print("\nSkipping mixed precision training simulation as CUDA is not available.")
    time_with_mixed_precision = float('inf') # Indicate AMP wasn't run

--- Simulating training WITHOUT mixed precision ---


  scaler = GradScaler(enabled=use_amp and device.type == 'cuda')
  with autocast(enabled=use_amp and device.type == 'cuda'):



Training with AMP: Disabled
Epoch [1/8], Loss: 10.2376, Duration: 1.24s
Epoch [2/8], Loss: 7.1103, Duration: 0.04s
Epoch [3/8], Loss: 4.4700, Duration: 0.04s
Epoch [4/8], Loss: 2.5604, Duration: 0.04s
Epoch [5/8], Loss: 1.8413, Duration: 0.04s
Epoch [6/8], Loss: 1.5037, Duration: 0.04s
Epoch [7/8], Loss: 1.2639, Duration: 0.04s
Epoch [8/8], Loss: 1.0519, Duration: 0.04s
Total training time: 1.51s
Average epoch time (excluding first): 0.04s

--- Simulating training WITH mixed precision ---

Training with AMP: Enabled
Epoch [1/8], Loss: 8.6714, Duration: 0.41s
Epoch [2/8], Loss: 5.5980, Duration: 0.04s
Epoch [3/8], Loss: 3.0717, Duration: 0.04s
Epoch [4/8], Loss: 2.0788, Duration: 0.04s
Epoch [5/8], Loss: 1.5985, Duration: 0.04s
Epoch [6/8], Loss: 1.3114, Duration: 0.04s
Epoch [7/8], Loss: 1.0572, Duration: 0.04s
Epoch [8/8], Loss: 0.8335, Duration: 0.04s
Total training time: 0.70s
Average epoch time (excluding first): 0.04s


## Step 4: Evaluating Results with Mixed Precision


In [None]:
print("\n--- Evaluation Summary ---")
if device.type == 'cuda':
    print(f"Time taken without mixed precision: {time_without_mixed_precision:.2f} seconds")
    print(f"Time taken with mixed precision: {time_with_mixed_precision:.2f} seconds")
    if time_with_mixed_precision < time_without_mixed_precision:
        speedup = (time_without_mixed_precision / time_with_mixed_precision)
        print(f"Mixed precision training was {speedup:.2f}x faster.")
        print("Training completed more efficiently with mixed precision (if GPU supports it well and model/batch size are large enough).")
    elif time_with_mixed_precision > time_without_mixed_precision :
        print("Mixed precision training was slower. This can happen for very small models/batches or certain GPU architectures.")
    else:
        print("No significant speed difference observed.")
else:
    print(f"Time taken (on CPU, no mixed precision): {time_without_mixed_precision:.2f} seconds")
    print("Mixed precision is a CUDA feature and was not run.")

print("\nNote: Actual speedup from mixed precision depends on the GPU architecture, model complexity, and batch size.")
print("For small models or small batches like in this demo, the overhead of AMP might sometimes outweigh benefits,")
print("or the difference might not be substantial. Benefits are more pronounced in larger, more complex scenarios.")


--- Evaluation Summary ---
Time taken without mixed precision: 1.51 seconds
Time taken with mixed precision: 0.70 seconds
Mixed precision training was 2.17x faster.
Training completed more efficiently with mixed precision (if GPU supports it well and model/batch size are large enough).

Note: Actual speedup from mixed precision depends on the GPU architecture, model complexity, and batch size.
For small models or small batches like in this demo, the overhead of AMP might sometimes outweigh benefits,
or the difference might not be substantial. Benefits are more pronounced in larger, more complex scenarios.
