# Lecture 12: Efficient Training

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gaurav-redhat/transformer_problems/blob/efficientml-course/efficientml_course/12_efficient_training/demo.ipynb)

Mixed precision, gradient checkpointing, and memory-efficient training.


In [None]:
!pip install torch -q
import torch
import torch.nn as nn
from torch.utils.checkpoint import checkpoint

# Gradient Checkpointing Demo
class BigBlock(nn.Module):
    """Two-layer feed-forward block: widen to ``4 * dim``, ReLU, project back.

    Args:
        dim: Size of the last dimension of both the input and the output.
    """

    def __init__(self, dim):
        super().__init__()
        hidden_dim = dim * 4
        # The attribute names double as state_dict keys, so they stay fc1/fc2.
        self.fc1 = nn.Linear(in_features=dim, out_features=hidden_dim)
        self.fc2 = nn.Linear(in_features=hidden_dim, out_features=dim)

    def forward(self, x):
        """Apply ``fc1 -> ReLU -> fc2`` to ``x`` and return the result."""
        widened = self.fc1(x)
        activated = torch.relu(widened)
        return self.fc2(activated)

# Model with many layers
class Model(nn.Module):
    """Stack of ``BigBlock`` layers with optional gradient checkpointing.

    Args:
        use_checkpoint: When true, every block is invoked through
            ``torch.utils.checkpoint.checkpoint`` instead of being called
            directly.
        dim: Feature width handed to each ``BigBlock``. Defaults to 512.
        num_layers: Number of blocks in the stack. Defaults to 10.
    """

    def __init__(self, use_checkpoint=False, *, dim=512, num_layers=10):
        super().__init__()
        self.blocks = nn.ModuleList([BigBlock(dim) for _ in range(num_layers)])
        self.use_checkpoint = use_checkpoint

    def forward(self, x):
        """Feed ``x`` through the blocks in order and return the final output."""
        for block in self.blocks:
            if self.use_checkpoint:
                # use_reentrant is passed explicitly; the non-reentrant
                # variant is the one the PyTorch documentation recommends.
                x = checkpoint(block, x, use_reentrant=False)
            else:
                x = block(x)
        return x

# Compare memory usage
# torch.cuda's peak-memory counters only track CUDA allocations, so the data
# and the models are placed on the GPU whenever one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
x = torch.randn(32, 100, 512, device=device, requires_grad=True)


def _run_and_measure(model, inputs):
    """Run one forward/backward pass and measure its peak CUDA memory.

    Returns:
        ``(output, peak_mib)`` where ``peak_mib`` is the peak allocation above
        what was already allocated before the pass, in MiB, or ``None`` when
        ``inputs`` is not a CUDA tensor.
    """
    # Drop gradients left over from an earlier run so they do not accumulate.
    inputs.grad = None
    measure = inputs.is_cuda
    if measure:
        torch.cuda.synchronize(inputs.device)
        torch.cuda.reset_peak_memory_stats(inputs.device)
        # Subtracting the baseline keeps memory held by other live objects
        # (e.g. a previously measured model) out of the reported figure.
        baseline = torch.cuda.memory_allocated(inputs.device)
    output = model(inputs)
    output.sum().backward()
    if not measure:
        return output, None
    torch.cuda.synchronize(inputs.device)
    peak = torch.cuda.max_memory_allocated(inputs.device) - baseline
    return output, peak / 2**20


# Without checkpointing
model_normal = Model(use_checkpoint=False).to(device)
out, peak_normal_mib = _run_and_measure(model_normal, x)

# With checkpointing
model_ckpt = Model(use_checkpoint=True).to(device)
out, peak_ckpt_mib = _run_and_measure(model_ckpt, x)

print("Gradient Checkpointing:")
print("  Without: Stores ALL intermediate activations")
print("  With: Recomputes activations during backward")
if peak_normal_mib is not None and peak_ckpt_mib is not None:
    print(
        f"\n  Measured peak (forward + backward): "
        f"{peak_normal_mib:.1f} MiB without, {peak_ckpt_mib:.1f} MiB with"
    )
else:
    print("\n  (No CUDA device available: peak memory was not measured)")
print("\n  Trade-off: ~30% more compute, ~50% less memory")
# Written as an escape so the emoji survives a file re-encoding.
print("\n\U0001F3AF Essential for training large models on limited GPU!")
