In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchvision
import torchvision.transforms as transforms

# Check PyTorch version
print(f"PyTorch version: {torch.__version__}")

# Check CUDA availability
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")

PyTorch version: 2.9.1+cpu
CUDA available: False
CUDA version: None


# PyTorch: Complete Comprehensive Guide

## Table of Contents
1. [Introduction to PyTorch](#1-introduction-to-pytorch)
2. [Core Concepts: Tensors](#2-core-concepts-tensors)
3. [Autograd: Automatic Differentiation](#3-autograd-automatic-differentiation)
4. [Neural Networks with torch.nn](#4-neural-networks-with-torchnn)
5. [Optimization](#5-optimization)
6. [Data Loading and Processing](#6-data-loading-and-processing)
7. [Training Pipeline](#7-training-pipeline)
8. [Advanced Features](#8-advanced-features)
9. [PyTorch Variants and Ecosystem](#9-pytorch-variants-and-ecosystem)
10. [Best Practices and Tips](#10-best-practices-and-tips)

---

## 1. Introduction to PyTorch

### What is PyTorch?

PyTorch is an open-source machine learning framework originally developed by Meta AI (formerly Facebook AI Research) and now governed by the PyTorch Foundation. It provides:
- Dynamic computational graphs (define-by-run)
- Automatic differentiation
- GPU acceleration
- Pythonic interface
- Rich ecosystem of tools

### Key Advantages

1. **Dynamic Computation Graphs**: Build and modify networks on the fly
2. **Eager Execution**: Immediate operation execution (easier debugging)
3. **Pythonic**: Feels like native Python, easy to learn
4. **Strong Community**: Extensive resources and libraries
5. **Research-Friendly**: Rapid prototyping and experimentation

### Installation

```bash
# CPU only
# (on Linux the default PyPI wheel bundles CUDA, so request the CPU index explicitly)
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu

# CUDA 11.8
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

# CUDA 12.1
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

# For Apple Silicon (M1/M2)
pip install torch torchvision torchaudio
```


## 2. Core Concepts: Tensors

### What are Tensors?

Tensors are multi-dimensional arrays, similar to NumPy arrays but with GPU support and automatic differentiation.

### Tensor Creation

```python
# From Python lists
tensor_from_list = torch.tensor([1, 2, 3, 4])
tensor_2d = torch.tensor([[1, 2], [3, 4]])

# Zeros and ones
zeros = torch.zeros(3, 4)  # 3x4 tensor of zeros
ones = torch.ones(2, 3, 4)  # 2x3x4 tensor of ones

# Random tensors
rand_tensor = torch.rand(3, 4)  # Uniform [0, 1)
randn_tensor = torch.randn(3, 4)  # Normal distribution N(0,1)

# Range tensors
arange_tensor = torch.arange(0, 10, 2)  # [0, 2, 4, 6, 8]
linspace_tensor = torch.linspace(0, 10, 5)  # 5 points from 0 to 10

# Identity matrix
identity = torch.eye(4)

# Like operations (same shape as another tensor)
x = torch.rand(3, 4)
zeros_like = torch.zeros_like(x)
ones_like = torch.ones_like(x)

# Full tensors
full_tensor = torch.full((3, 4), 7)  # Fill with value 7
```

### Tensor Attributes

```python
tensor = torch.rand(3, 4, 5)

print(f"Shape: {tensor.shape}")  # torch.Size([3, 4, 5])
print(f"Size: {tensor.size()}")  # Same as shape
print(f"Dtype: {tensor.dtype}")  # torch.float32
print(f"Device: {tensor.device}")  # cpu or cuda:0
print(f"Number of dimensions: {tensor.ndim}")  # 3
print(f"Total elements: {tensor.numel()}")  # 60
```

### Data Types

```python
# Common data types
float_tensor = torch.tensor([1, 2, 3], dtype=torch.float32)  # float32
double_tensor = torch.tensor([1, 2, 3], dtype=torch.float64)  # float64
int_tensor = torch.tensor([1, 2, 3], dtype=torch.int32)  # int32
long_tensor = torch.tensor([1, 2, 3], dtype=torch.long)  # int64
bool_tensor = torch.tensor([True, False], dtype=torch.bool)

# Type conversion
x = torch.tensor([1, 2, 3])
x_float = x.float()  # to float32
x_double = x.double()  # to float64
x_int = x_float.int()  # to int32
x_long = x.long()  # to int64
```

### Tensor Operations

#### Basic Arithmetic

```python
a = torch.tensor([1.0, 2.0, 3.0])
b = torch.tensor([4.0, 5.0, 6.0])

# Element-wise operations (each result is assigned to c; a and b are not modified)
c = a + b  # [5, 7, 9]
c = a - b  # [-3, -3, -3]
c = a * b  # [4, 10, 18]
c = a / b  # [0.25, 0.4, 0.5]

# In-place operations (trailing underscore; modify tensor in place)
a.add_(b)  # a = a + b  -> [5, 7, 9]
a.mul_(2)  # a = a * 2  -> [10, 14, 18]
a.sub_(1)  # a = a - 1  -> [9, 13, 17]

# Power and exponentials
# NOTE: a was changed in place above, so these operate on [9, 13, 17],
# not on the original [1, 2, 3]
c = torch.pow(a, 2)  # a^2
c = torch.exp(a)  # e^a
c = torch.log(a)  # ln(a)
c = torch.sqrt(a)  # sqrt(a)
```

#### Matrix Operations

```python
# Matrix multiplication
A = torch.rand(3, 4)
B = torch.rand(4, 5)
C = torch.matmul(A, B)  # or A @ B, shape: (3, 5)

# Batch matrix multiplication
batch_A = torch.rand(10, 3, 4)  # 10 matrices of size 3x4
batch_B = torch.rand(10, 4, 5)  # 10 matrices of size 4x5
batch_C = torch.bmm(batch_A, batch_B)  # shape: (10, 3, 5)

# Transpose
A_T = A.t()  # 2D transpose
A_T = A.transpose(0, 1)  # Transpose dimensions 0 and 1

# Permute (multi-dimensional transpose)
x = torch.rand(2, 3, 4, 5)
x_perm = x.permute(0, 3, 1, 2)  # shape: (2, 5, 3, 4)
```

#### Reshaping Operations

```python
x = torch.rand(2, 3, 4)

# View (must be contiguous in memory)
x_view = x.view(6, 4)  # shape: (6, 4)
x_view = x.view(-1, 4)  # -1 infers the dimension

# Reshape (works with non-contiguous tensors)
x_reshape = x.reshape(2, 12)

# Squeeze and unsqueeze
x = torch.rand(1, 3, 1, 4)
x_squeezed = x.squeeze()  # Remove all dimensions of size 1: (3, 4)
x_squeezed_dim = x.squeeze(0)  # Remove dimension 0: (3, 1, 4)

x = torch.rand(3, 4)
x_unsqueezed = x.unsqueeze(0)  # Add dimension at position 0: (1, 3, 4)
x_unsqueezed = x.unsqueeze(-1)  # Add at last position: (3, 4, 1)

# Flatten
x = torch.rand(2, 3, 4)
x_flat = x.flatten()  # shape: (24,)
x_flat = x.flatten(start_dim=1)  # shape: (2, 12)
```

#### Concatenation and Stacking

```python
a = torch.rand(2, 3)
b = torch.rand(2, 3)
c = torch.rand(2, 3)

# Concatenate along existing dimension
concat_0 = torch.cat([a, b, c], dim=0)  # shape: (6, 3)
concat_1 = torch.cat([a, b, c], dim=1)  # shape: (2, 9)

# Stack (creates new dimension)
stack_0 = torch.stack([a, b, c], dim=0)  # shape: (3, 2, 3)
stack_1 = torch.stack([a, b, c], dim=1)  # shape: (2, 3, 3)
```

#### Indexing and Slicing

```python
x = torch.rand(4, 5, 6)

# Basic indexing
element = x[0, 1, 2]  # Single element
row = x[0]  # First matrix: shape (5, 6)
column = x[:, :, 0]  # First column of all matrices: shape (4, 5)

# Slicing
slice_x = x[1:3, :, 2:5]  # shape: (2, 5, 3)

# Boolean indexing
mask = x > 0.5
selected = x[mask]  # All elements > 0.5

# Advanced indexing
indices = torch.tensor([0, 2, 3])
selected_rows = x[indices]  # Select rows 0, 2, 3
```

#### Reduction Operations

```python
x = torch.rand(3, 4, 5)

# Sum
total_sum = x.sum()  # Sum all elements
sum_dim0 = x.sum(dim=0)  # Sum over dimension 0: shape (4, 5)
sum_keepdim = x.sum(dim=1, keepdim=True)  # shape: (3, 1, 5)

# Mean
mean = x.mean()
mean_dim = x.mean(dim=2)

# Max and Min
max_val = x.max()
max_dim, max_indices = x.max(dim=1)  # Returns values and indices

min_val = x.min()
min_dim, min_indices = x.min(dim=1)

# Standard deviation and variance
std = x.std()
var = x.var()

# Argmax and Argmin
argmax = x.argmax()  # Index of maximum element
argmax_dim = x.argmax(dim=0)  # Indices along dimension 0
```

### GPU Operations

```python
# Pick the device once; falling back to CPU keeps the rest of the snippet
# runnable on machines without a GPU
if torch.cuda.is_available():
    device = torch.device('cuda:0')  # GPU 0
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    device = torch.device('cpu')
    print("Using CPU")

# Move tensors to the selected device (device-agnostic, preferred)
x_cpu = torch.rand(3, 4)
x_gpu = x_cpu.to(device)  # Copy to GPU when device is CUDA
# or the CUDA-only shorthand; .cuda() raises when CUDA is unavailable,
# so it must be guarded
if torch.cuda.is_available():
    x_gpu = x_cpu.cuda()

# Move back to CPU
x_back = x_gpu.cpu()

# Create directly on the selected device
x_gpu = torch.rand(3, 4, device=device)

# Multiple GPUs
if torch.cuda.device_count() > 1:
    x_gpu1 = torch.rand(3, 4, device='cuda:1')

---

## 3. Autograd: Automatic Differentiation

### What is Autograd?

Autograd is PyTorch's automatic differentiation engine. It records operations on tensors and computes gradients automatically using backpropagation.

### Basic Gradient Computation

```python
# Enable gradient tracking
x = torch.tensor([2.0, 3.0], requires_grad=True)
print(f"Requires grad: {x.requires_grad}")

# Perform operations
y = x ** 2  # y = x^2
z = y.sum()  # z = sum(y)

print(f"y requires grad: {y.requires_grad}")  # True (inherited)
print(f"z requires grad: {z.requires_grad}")  # True

# Compute gradients
z.backward()  # Computes dz/dx

print(f"Gradient: {x.grad}")  # [4.0, 6.0] since dz/dx = 2x
```

### Gradient Flow

```python
# Example: f(x) = (x^2 + 3x)^2
x = torch.tensor([2.0], requires_grad=True)

# Forward pass (each intermediate becomes a node in the autograd graph)
a = x ** 2  # a = x^2        -> 4
b = 3 * x  # b = 3x          -> 6
c = a + b  # c = x^2 + 3x    -> 10
y = c ** 2  # y = (x^2 + 3x)^2 -> 100

# Backward pass
y.backward()

# Chain rule: dy/dx = 2(x^2 + 3x)(2x + 3) = 2 * 10 * 7 = 140 at x = 2
print(f"dy/dx = {x.grad}")  # Computed using chain rule
```

### Controlling Gradient Computation

```python
# Detach from computational graph
x = torch.tensor([1.0, 2.0], requires_grad=True)
y = x ** 2
z = y.detach()  # z is now a regular tensor, no gradient tracking

# Context manager: no gradient tracking
x = torch.tensor([1.0, 2.0], requires_grad=True)
with torch.no_grad():
    y = x ** 2  # No gradient will be computed
    print(y.requires_grad)  # False

# Inference mode (more efficient than no_grad)
with torch.inference_mode():
    y = x ** 2

# Enable/disable gradient tracking
x = torch.rand(3, 4)
x.requires_grad_(True)  # Enable
x.requires_grad_(False)  # Disable
```

### Computing Higher-Order Derivatives

```python
# Second derivative
x = torch.tensor([2.0], requires_grad=True)
y = x ** 3  # y = x^3

# First derivative
dy_dx = torch.autograd.grad(y, x, create_graph=True)[0]
print(f"dy/dx = {dy_dx}")  # 3x^2 = 12

# Second derivative
d2y_dx2 = torch.autograd.grad(dy_dx, x)[0]
print(f"d²y/dx² = {d2y_dx2}")  # 6x = 12
```

### Custom Gradient Functions

```python
class CustomFunction(torch.autograd.Function):
    """ReLU written as a custom autograd function with an explicit backward."""

    @staticmethod
    def forward(ctx, input):
        """Return ``input`` clamped at zero and stash ``input`` for backward."""
        ctx.save_for_backward(input)
        return input.clamp(min=0)  # ReLU

    @staticmethod
    def backward(ctx, grad_output):
        """Pass the incoming gradient through, zeroed where the input was negative."""
        (saved_input,) = ctx.saved_tensors
        negative = saved_input < 0
        # masked_fill is out-of-place, so grad_output itself is left untouched
        return grad_output.masked_fill(negative, 0)

# Usage
custom_relu = CustomFunction.apply
x = torch.randn(5, requires_grad=True)
y = custom_relu(x)
y.sum().backward()
```

### Gradient Accumulation

```python
# Gradients accumulate by default: every backward() call adds into x.grad
x = torch.tensor([2.0], requires_grad=True)

for i in range(3):
    y = x ** 2
    y.backward()  # d(x^2)/dx = 2x = 4 is added to x.grad on each pass
    print(f"Iteration {i+1}, gradient: {x.grad}")  # 4, then 8, then 12

# Reset gradients
x.grad.zero_()  # or x.grad = None
```

---

## 4. Neural Networks with torch.nn

### Building Blocks

#### Linear Layer (Fully Connected)

```python
import torch.nn as nn

# Linear layer: y = xW^T + b
linear = nn.Linear(in_features=10, out_features=5)

# Parameters
print(f"Weight shape: {linear.weight.shape}")  # (5, 10)
print(f"Bias shape: {linear.bias.shape}")  # (5,)

# Forward pass
x = torch.rand(3, 10)  # Batch of 3 samples
output = linear(x)  # shape: (3, 5)
```

#### Activation Functions

```python
# ReLU: max(0, x)
relu = nn.ReLU()
x = torch.tensor([-1.0, 0.0, 1.0])
output = relu(x)  # [0.0, 0.0, 1.0]

# Leaky ReLU: max(0.01x, x)
leaky_relu = nn.LeakyReLU(negative_slope=0.01)

# Sigmoid: 1 / (1 + e^(-x))
sigmoid = nn.Sigmoid()

# Tanh: (e^x - e^(-x)) / (e^x + e^(-x))
tanh = nn.Tanh()

# Softmax: e^xi / sum(e^xj)
softmax = nn.Softmax(dim=1)
x = torch.rand(2, 5)
output = softmax(x)  # Each row sums to 1

# GELU (used in transformers)
gelu = nn.GELU()

# Swish/SiLU: x * sigmoid(x)
silu = nn.SiLU()
```

#### Convolutional Layers

```python
# 2D Convolution
conv2d = nn.Conv2d(
    in_channels=3,      # RGB input
    out_channels=64,    # 64 filters
    kernel_size=3,      # 3x3 kernel
    stride=1,           # Stride
    padding=1,          # Padding
    bias=True
)

# Input: (batch, channels, height, width)
x = torch.rand(8, 3, 32, 32)  # 8 RGB images of 32x32
output = conv2d(x)  # shape: (8, 64, 32, 32)

# 1D Convolution (for sequences)
conv1d = nn.Conv1d(
    in_channels=10,
    out_channels=20,
    kernel_size=3
)

# 3D Convolution (for video/volumetric data)
conv3d = nn.Conv3d(
    in_channels=1,
    out_channels=32,
    kernel_size=3
)

# Transposed Convolution (upsampling)
deconv = nn.ConvTranspose2d(
    in_channels=64,
    out_channels=3,
    kernel_size=4,
    stride=2,
    padding=1
)
```

#### Pooling Layers

```python
# Max Pooling
maxpool = nn.MaxPool2d(kernel_size=2, stride=2)
x = torch.rand(1, 1, 4, 4)
output = maxpool(x)  # shape: (1, 1, 2, 2)

# Average Pooling
avgpool = nn.AvgPool2d(kernel_size=2)

# Adaptive Pooling (output size specified)
adaptive_maxpool = nn.AdaptiveMaxPool2d(output_size=(7, 7))
x = torch.rand(1, 512, 14, 14)
output = adaptive_maxpool(x)  # shape: (1, 512, 7, 7)

# Global Average Pooling
gap = nn.AdaptiveAvgPool2d(output_size=(1, 1))
x = torch.rand(1, 512, 7, 7)
output = gap(x)  # shape: (1, 512, 1, 1)
```

#### Normalization Layers

```python
# Batch Normalization
batch_norm = nn.BatchNorm2d(num_features=64)
x = torch.rand(8, 64, 32, 32)
output = batch_norm(x)

# Layer Normalization
layer_norm = nn.LayerNorm(normalized_shape=[64, 32, 32])

# Instance Normalization
instance_norm = nn.InstanceNorm2d(num_features=64)

# Group Normalization
group_norm = nn.GroupNorm(num_groups=8, num_channels=64)
```

#### Dropout

```python
# Regular Dropout
dropout = nn.Dropout(p=0.5)  # Drop 50% of neurons
x = torch.rand(10, 100)
output = dropout(x)

# 2D Dropout (for convolutional layers)
dropout2d = nn.Dropout2d(p=0.5)

# Alpha Dropout (for SELU activation)
alpha_dropout = nn.AlphaDropout(p=0.5)
```

#### Recurrent Layers

```python
# LSTM
lstm = nn.LSTM(
    input_size=10,      # Input feature size
    hidden_size=20,     # Hidden state size
    num_layers=2,       # Number of stacked LSTMs
    batch_first=True,   # Input shape: (batch, seq, features)
    dropout=0.5,        # Dropout between layers
    bidirectional=False
)

# Input: (batch, sequence_length, features)
x = torch.rand(5, 30, 10)  # 5 sequences of length 30
output, (h_n, c_n) = lstm(x)
print(f"Output shape: {output.shape}")  # (5, 30, 20)
print(f"Hidden state shape: {h_n.shape}")  # (2, 5, 20)

# GRU (Gated Recurrent Unit)
gru = nn.GRU(
    input_size=10,
    hidden_size=20,
    num_layers=2,
    batch_first=True
)

# Simple RNN
rnn = nn.RNN(
    input_size=10,
    hidden_size=20,
    num_layers=2,
    batch_first=True
)
```

#### Attention Mechanisms

```python
# Multi-Head Attention
multihead_attn = nn.MultiheadAttention(
    embed_dim=512,      # Embedding dimension
    num_heads=8,        # Number of attention heads
    dropout=0.1,
    batch_first=True
)

# Query, Key, Value
Q = torch.rand(10, 32, 512)  # (batch, seq_len, embed_dim)
K = torch.rand(10, 32, 512)
V = torch.rand(10, 32, 512)

attn_output, attn_weights = multihead_attn(Q, K, V)
print(f"Output shape: {attn_output.shape}")  # (10, 32, 512)
```

#### Transformer Components

```python
# Transformer Encoder Layer
encoder_layer = nn.TransformerEncoderLayer(
    d_model=512,        # Embedding dimension
    nhead=8,            # Number of heads
    dim_feedforward=2048,
    dropout=0.1,
    batch_first=True
)

# Transformer Encoder (stack of encoder layers)
transformer_encoder = nn.TransformerEncoder(
    encoder_layer,
    num_layers=6
)

# Input
src = torch.rand(10, 32, 512)  # (batch, seq_len, d_model)
output = transformer_encoder(src)

# Transformer Decoder Layer
decoder_layer = nn.TransformerDecoderLayer(
    d_model=512,
    nhead=8,
    dim_feedforward=2048,
    batch_first=True
)

# Complete Transformer
transformer = nn.Transformer(
    d_model=512,
    nhead=8,
    num_encoder_layers=6,
    num_decoder_layers=6,
    batch_first=True
)
```

### Building Custom Networks

#### Method 1: Sequential

```python
# Simple feedforward network
model = nn.Sequential(
    nn.Linear(784, 256),
    nn.ReLU(),
    nn.Dropout(0.5),
    nn.Linear(256, 128),
    nn.ReLU(),
    nn.Dropout(0.5),
    nn.Linear(128, 10)
)

# Forward pass
x = torch.rand(32, 784)
output = model(x)
```

#### Method 2: nn.Module (Recommended)

```python
class CustomNet(nn.Module):
    """Fully connected network: two hidden stages (Linear, ReLU, Dropout) and a Linear head.

    Args:
        input_size: Number of features in each input sample.
        hidden_size: Width of both hidden layers.
        num_classes: Number of outputs produced by the final layer.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()

        # First hidden stage
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(0.5)

        # Second hidden stage
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(0.5)

        # Output head (no activation applied here)
        self.fc3 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Run ``x`` through both hidden stages and return the output of ``fc3``."""
        hidden = self.dropout1(self.relu1(self.fc1(x)))
        hidden = self.dropout2(self.relu2(self.fc2(hidden)))
        return self.fc3(hidden)

# Instantiate
model = CustomNet(input_size=784, hidden_size=256, num_classes=10)

# Forward pass
x = torch.rand(32, 784)
output = model(x)
```

#### Complex Architecture Example: ResNet Block

```python
class ResidualBlock(nn.Module):
    """Two 3x3 conv + batch-norm stages with a skip connection added before the final ReLU.

    Args:
        in_channels: Channels of the input feature map.
        out_channels: Channels produced by both convolutions.
        stride: Stride of the first convolution (and of the projection shortcut).
    """

    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()

        # Main path: only the first convolution uses `stride`
        self.conv1 = nn.Conv2d(
            in_channels, out_channels, kernel_size=3, stride=stride, padding=1
        )
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)

        self.conv2 = nn.Conv2d(
            out_channels, out_channels, kernel_size=3, stride=1, padding=1
        )
        self.bn2 = nn.BatchNorm2d(out_channels)

        # Skip path: an empty Sequential (passes the input through) unless the
        # stride or channel count changes, in which case a 1x1 conv + batch norm
        # projects the input
        needs_projection = stride != 1 or in_channels != out_channels
        if needs_projection:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride),
                nn.BatchNorm2d(out_channels),
            )
        else:
            self.shortcut = nn.Sequential()

    def forward(self, x):
        """Return ``relu(main_path(x) + shortcut(x))``."""
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))

        # Add the skip path in place, then apply the final activation
        out += self.shortcut(x)
        return self.relu(out)
```

### Model Inspection

```python
model = CustomNet(784, 256, 10)

# Print model architecture
print(model)

# Count parameters
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total parameters: {total_params}")
print(f"Trainable parameters: {trainable_params}")

# Access specific layers
print(model.fc1.weight.shape)
print(model.fc1.bias.shape)

# Iterate over parameters
for name, param in model.named_parameters():
    print(f"{name}: {param.shape}")

# Iterate over modules
for name, module in model.named_modules():
    print(f"{name}: {type(module)}")
```

---

## 5. Optimization

### Optimizers

```python
model = CustomNet(784, 256, 10)

# SGD (Stochastic Gradient Descent)
optimizer = optim.SGD(
    model.parameters(),
    lr=0.01,            # Learning rate
    momentum=0.9,       # Momentum
    weight_decay=1e-4,  # L2 regularization
    nesterov=True       # Nesterov momentum
)

# Adam (Adaptive Moment Estimation)
optimizer = optim.Adam(
    model.parameters(),
    lr=0.001,
    betas=(0.9, 0.999),
    eps=1e-8,
    weight_decay=0
)

# AdamW (Adam with decoupled weight decay)
optimizer = optim.AdamW(
    model.parameters(),
    lr=0.001,
    betas=(0.9, 0.999),
    weight_decay=0.01
)

# RMSprop
optimizer = optim.RMSprop(
    model.parameters(),
    lr=0.01,
    alpha=0.99,
    momentum=0
)

# Adagrad
optimizer = optim.Adagrad(
    model.parameters(),
    lr=0.01
)

# Adadelta
optimizer = optim.Adadelta(
    model.parameters(),
    lr=1.0,
    rho=0.9
)
```

### Learning Rate Schedulers

```python
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Step decay
scheduler = optim.lr_scheduler.StepLR(
    optimizer,
    step_size=30,  # Decay every 30 epochs
    gamma=0.1      # Multiply lr by 0.1
)

# Multi-step decay
scheduler = optim.lr_scheduler.MultiStepLR(
    optimizer,
    milestones=[30, 80, 120],
    gamma=0.1
)

# Exponential decay
scheduler = optim.lr_scheduler.ExponentialLR(
    optimizer,
    gamma=0.95
)

# Cosine annealing
scheduler = optim.lr_scheduler.CosineAnnealingLR(
    optimizer,
    T_max=100,  # Maximum number of iterations
    eta_min=0   # Minimum learning rate
)

# Reduce on plateau
# NOTE: the `verbose` argument is deprecated since PyTorch 2.2 and is no longer
# accepted by recent releases, so it is omitted here. To log the learning rate,
# read it explicitly, e.g. print(scheduler.get_last_lr()) after each step.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',       # min for loss, max for accuracy
    factor=0.5,       # Multiply lr by 0.5
    patience=10,      # Wait 10 epochs
)

# Cosine annealing with warm restarts
scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(
    optimizer,
    T_0=10,    # Initial restart period
    T_mult=2,  # Multiply period by 2 after each restart
    eta_min=0
)

# One Cycle Policy (