In [None]:
# learning how to build models for multiclass predictions



import torch
import torch.nn as nn

class MulticlassClassifier(nn.Module):
    """Fully connected classifier with two hidden layers that returns raw logits.

    Architecture: Linear -> ReLU -> Dropout -> Linear -> ReLU -> Dropout -> Linear.
    No softmax is applied to the output; the network is meant to be paired
    with ``nn.CrossEntropyLoss``, which includes it.

    Args:
        input_size: Number of features per example (last dimension of the input).
        hidden_size: Width of both hidden layers.
        num_classes: Number of output classes (size of the logit vector).
        dropout_p: Dropout probability used after each hidden layer.
            Defaults to 0.5, the value that was previously hard-coded.
    """

    def __init__(self, input_size, hidden_size, num_classes, dropout_p=0.5):
        super().__init__()

        # Layer 1: Input → Hidden
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(dropout_p)

        # Layer 2: Hidden → Hidden
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(dropout_p)

        # Layer 3: Hidden → Output
        # No softmax here! CrossEntropyLoss includes it
        self.fc3 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Map a batch of feature vectors to per-class logits.

        Args:
            x: Tensor whose last dimension equals ``input_size``.

        Returns:
            Tensor of raw logits whose last dimension equals ``num_classes``
            (not probabilities).
        """
        hidden = self.dropout1(self.relu1(self.fc1(x)))
        hidden = self.dropout2(self.relu2(self.fc2(hidden)))
        return self.fc3(hidden)  # Raw logits, not probabilities yet

# Build the classifier: 3072 input features (a flattened 32x32x3 image),
# 128 units per hidden layer, and 3 output classes (cat, dog, bird).
model = MulticlassClassifier(
    input_size=3072,
    hidden_size=128,
    num_classes=3,
)

In [None]:
# Adam over every trainable parameter of the model, learning rate 1e-3.
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

Too high (0.1): Loss jumps around, never converges, might diverge

Too low (0.00001): Training takes forever, gets stuck in local minima

Just right (0.001-0.0001): Steady improvement, reaches good solution

How to choose:

Start with 0.001 (Adam) or 0.01 (SGD)

If loss oscillates wildly → decrease by 10x

If loss barely moves after 10 epochs → increase by 2-3x

Use learning rate scheduling: Start high, decrease over time

In [None]:
# Plateau-based learning rate schedule.
# The value passed to scheduler.step() is treated as "lower is better"
# (mode="min"). Once it has gone more than 5 epochs without improving,
# the learning rate is multiplied by 0.5 (i.e. reduced by 50%).
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode="min",
    factor=0.5,
    patience=5,
)

Epoch 1-20: LR = 0.001 → Loss drops from 2.5 to 0.8

Epoch 21-40: LR = 0.0005 → Loss drops from 0.8 to 0.4

Epoch 41+: LR = 0.00025 → Loss fine-tunes to 0.3


2. Batch Size - Memory vs Stability Trade-off

What it is: Number of examples processed before updating weights

```python
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
```

**Why it matters**:
- **Small batch (8-16)**: 
  - ✅ More updates per epoch (faster learning)
  - ✅ Better generalization (noise helps escape local minima)
  - ❌ Slower per epoch (more iterations)
  - ❌ Noisy gradients (jumpy training)

- **Large batch (128-256)**:
  - ✅ Faster per epoch (GPU efficient)
  - ✅ Stable gradients (smooth training curve)
  - ❌ Fewer updates per epoch
  - ❌ Can overfit to training data

**How to choose**:
1. Start with **32** (good default)
2. Increase until GPU memory is full (use `nvidia-smi` to check)
3. If you increase batch size by 2x, increase learning rate by a factor of √2

**Memory calculation**:
```
GPU Memory = Batch Size × Model Size × Gradient Size
If batch 32 uses 4GB → batch 64 uses ~8GB
```

3. Number of Epochs

Too few (10): Model hasn't learned enough (underfitting)
Too many (500): Model memorizes training data (overfitting)

How to choose: Use early stopping

In [None]:
# Early stopping: keep the best checkpoint and give up once validation loss
# has failed to improve for `patience` consecutive epochs.
best_val_loss = float('inf')
patience = 10
patience_counter = 0

for epoch in range(num_epochs):
    train_loss = train_one_epoch()
    val_loss = validate()

    if not (val_loss < best_val_loss):
        # No improvement this epoch: count it and stop when out of patience.
        patience_counter += 1
        if patience_counter >= patience:
            print(f"Early stopping at epoch {epoch}")
            break
        continue

    # New best validation loss: checkpoint the weights and reset the counter.
    best_val_loss = val_loss
    torch.save(model.state_dict(), 'best_model.pth')
    patience_counter = 0





In [None]:
# Training loss: Keeps decreasing
# Validation loss: Decreases then plateaus or increases
# Stop when validation loss stops improving!

In [None]:
# What it is: Depth (layers) and width (neurons per layer)

# NOTE: illustrative excerpt written as it would appear inside a module's
# __init__. `self`, `input_size` and `num_classes` are not defined at notebook
# top level in the visible cells, so this cell is not runnable on its own.
# 2 hidden layers, 128 neurons each
self.fc1 = nn.Linear(input_size, 128)  # input features -> first hidden layer (width 128)
self.fc2 = nn.Linear(128, 128)  # first hidden layer -> second hidden layer (width 128)
self.fc3 = nn.Linear(128, num_classes)  # second hidden layer -> one output per class

More layers: Can learn hierarchical features (edges → shapes → objects)

More neurons: More representational capacity

Guidelines:

Simple data (tabular): 2-3 layers, 64-256 neurons

Images: Use CNNs instead (10-100+ layers)

Text: Use Transformers (12-96 layers)

In [None]:
# If training accuracy is low (< 80%), your model lacks capacity
# → Add more neurons or layers

# If training accuracy is high (> 95%) but validation is low (< 70%)
# → Model is too large, reduce capacity or add regularization

# Dropout - Preventing Overfitting
What it is: Randomly "turns off" neurons during training

In [None]:
# NOTE: illustrative excerpt; it assigns to `self`, so it belongs inside a
# module's __init__ and is not runnable at notebook top level.
# p=0.5: each activation is zeroed with probability 0.5 while the module is in
# training mode (model.train()); dropout is disabled under model.eval().
self.dropout = nn.Dropout(0.5)  # 50% of neurons dropped

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

# === SETUP ===
# Seed PyTorch's global RNG so weight initialisation, dropout masks and
# DataLoader shuffling are repeatable across Restart & Run All.
SEED = 42
torch.manual_seed(SEED)

model = MulticlassClassifier(input_size=3072, hidden_size=128, num_classes=3)
criterion = nn.CrossEntropyLoss()  # takes raw logits and integer class labels
optimizer = optim.Adam(model.parameters(), lr=0.001)
# mode and factor are spelled out to match the scheduler cell earlier in this
# notebook (halve the LR on a validation-loss plateau). Leaving `factor`
# implicit would fall back to the library default of 0.1, i.e. a 10x cut.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.5, patience=5
)

# NOTE(review): train_dataset / val_dataset are not defined in the visible
# cells; they must exist before this cell runs.
# Shuffle only the training data; validation order does not matter.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

# === TRAINING LOOP ===
# Each epoch: train on train_loader, evaluate on val_loader, step the plateau
# scheduler on the validation loss, checkpoint the best model, and stop early
# when validation loss has not improved for `early_stop_patience` epochs.
num_epochs = 100
best_val_loss = float('inf')
early_stop_patience = 10         # epochs to wait for a better validation loss
epochs_without_improvement = 0

for epoch in range(num_epochs):
    # === TRAINING PHASE ===
    model.train()  # Enable dropout
    train_loss = 0.0
    train_correct = 0
    train_total = 0

    for inputs, targets in train_loader:
        # Expected shapes: inputs [batch, 3072], targets [batch] with labels
        # 0, 1 or 2. The final batch may be smaller than batch_size.
        batch_size = targets.size(0)

        # === FORWARD PASS ===
        outputs = model(inputs)  # [batch, 3] - 3 scores per image
        loss = criterion(outputs, targets)

        # === BACKWARD PASS ===
        optimizer.zero_grad()  # Clear old gradients
        loss.backward()        # Compute new gradients
        optimizer.step()       # Update weights

        # === TRACK METRICS ===
        # criterion returns the batch mean; scale by batch size so the epoch
        # figure is a true per-sample average even with a short final batch.
        train_loss += loss.item() * batch_size
        predicted = outputs.argmax(dim=1)  # Get class with highest score
        train_total += batch_size
        train_correct += (predicted == targets).sum().item()

    # === VALIDATION PHASE ===
    model.eval()  # Disable dropout
    val_loss = 0.0
    val_correct = 0
    val_total = 0

    with torch.no_grad():  # Don't compute gradients (saves memory)
        for inputs, targets in val_loader:
            batch_size = targets.size(0)
            outputs = model(inputs)
            loss = criterion(outputs, targets)

            val_loss += loss.item() * batch_size
            predicted = outputs.argmax(dim=1)
            val_total += batch_size
            val_correct += (predicted == targets).sum().item()

    # === COMPUTE AVERAGES ===
    train_loss /= train_total
    val_loss /= val_total
    train_acc = 100 * train_correct / train_total
    val_acc = 100 * val_correct / val_total

    # === LEARNING RATE SCHEDULING ===
    scheduler.step(val_loss)

    # === CHECKPOINTING ===
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        epochs_without_improvement = 0
        torch.save(model.state_dict(), 'best_model.pth')
        print(f'✓ Model saved at epoch {epoch+1}')
    else:
        epochs_without_improvement += 1

    # === LOGGING ===
    print(f'Epoch [{epoch+1}/{num_epochs}]')
    print(f'  Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.2f}%')
    print(f'  Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.2f}%')
    print(f'  LR: {optimizer.param_groups[0]["lr"]:.6f}')

    # === EARLY STOPPING ===
    # Checked after logging so the final epoch's metrics are still reported.
    if epochs_without_improvement >= early_stop_patience:
        print(f'Early stopping at epoch {epoch+1}')
        break
---

## **📈 What You'll See During Training**

**Healthy Training**:
```
Epoch [1/100]
  Train Loss: 1.0986 | Train Acc: 33.52%  (random guessing)
  Val Loss: 1.0891 | Val Acc: 34.20%

Epoch [10/100]
  Train Loss: 0.6523 | Train Acc: 72.80%  (learning!)
  Val Loss: 0.7124 | Val Acc: 69.50%

Epoch [50/100]
  Train Loss: 0.2341 | Train Acc: 91.20%  (good fit)
  Val Loss: 0.3892 | Val Acc: 86.10%

Epoch [100/100]
  Train Loss: 0.1123 | Train Acc: 96.40%  (converged)
  Val Loss: 0.3654 | Val Acc: 87.30%
```

**Overfitting**:
```
Epoch [100/100]
  Train Loss: 0.0234 | Train Acc: 99.80%  ← Perfect on training
  Val Loss: 1.2341 | Val Acc: 72.10%      ← Bad on validation
  
→ Model memorized training data!
→ Solutions: Add dropout, reduce model size, get more data
```

**Underfitting**:
```
Epoch [100/100]
  Train Loss: 0.8234 | Train Acc: 68.20%  ← Can't even fit training data
  Val Loss: 0.8456 | Val Acc: 67.10%
  
→ Model too simple or learning rate too low
→ Solutions: Add layers/neurons, increase LR, train longer