# Lesson 29: PyTorch training loop activity - SOLUTION

This notebook contains the solution for adding batching and validation to the PyTorch training loop.

## Notebook set-up

### Imports

In [None]:
# Third party imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

# Seed PyTorch and NumPy with one shared value so reruns are repeatable
seed = 315
torch.manual_seed(seed)
np.random.seed(seed)

# Prefer CUDA when it is usable, otherwise fall back to the CPU
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print(f'Using device: {device}')

## 1. Load preprocessed data

In [None]:
# Location of the preprocessed housing data bundle
DATA_URL = 'https://gperdrizet.github.io/FSA_devops/assets/data/unit4/preprocessed_housing_data.pkl'

# SECURITY NOTE: unpickling can execute arbitrary code embedded in the file.
# Only run this cell if you trust the host serving DATA_URL, and never point
# DATA_URL at an untrusted source.
data = pd.read_pickle(DATA_URL)

# Unpack the bundle: two dataframes plus the column selectors used with them
training_df = data['training_df']
testing_df = data['testing_df']
features = data['features']
label = data['label']

print(f'Training samples: {len(training_df)}')
print(f'Testing samples: {len(testing_df)}')
print(f'Features: {features}')
print(f'Label: {label}')

## 2. Prepare PyTorch tensors and DataLoaders - SOLUTION

This section creates tensors, splits training data into train/validation sets, and creates DataLoaders for batching.

In [None]:
# Hold-out fraction for validation and the mini-batch size
val_fraction = 0.2
batch_size = 32


def _as_float_tensor(array):
    '''Copies an array into a float32 tensor placed on the active device.'''
    return torch.tensor(array, dtype=torch.float32).to(device)


# Features stay as-is; labels get a trailing dimension via unsqueeze(1)
X_train_full = _as_float_tensor(training_df[features].values)
y_train_full = _as_float_tensor(training_df[label].values).unsqueeze(1)
X_test = _as_float_tensor(testing_df[features].values)
y_test = _as_float_tensor(testing_df[label].values).unsqueeze(1)

# Work out how many rows go to each side of the train/validation split
n_samples = len(X_train_full)
n_val = int(n_samples * val_fraction)
n_train = n_samples - n_val

# Random permutation of row positions: the head trains, the tail validates
indices = torch.randperm(n_samples)
train_indices, val_indices = indices[:n_train], indices[n_train:]

X_train, y_train = X_train_full[train_indices], y_train_full[train_indices]
X_val, y_val = X_train_full[val_indices], y_train_full[val_indices]

# Wrap each split in a dataset; only the training loader reshuffles per epoch
train_dataset = TensorDataset(X_train, y_train)
val_dataset = TensorDataset(X_val, y_val)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

print(f'Training samples: {n_train}')
print(f'Validation samples: {n_val}')
print(f'Batch size: {batch_size}')
print(f'Training batches per epoch: {len(train_loader)}')

## 3. Build model

In [None]:
# Size the input layer from the feature matrix instead of hard-coding the
# column count, so the model keeps working if the feature list changes
n_features = X_train.shape[1]

# Two hidden layers with ReLU and dropout, ending in a single regression output
model = nn.Sequential(
    nn.Linear(n_features, 64),
    nn.ReLU(),
    nn.Dropout(0.2),
    nn.Linear(64, 32),
    nn.ReLU(),
    nn.Dropout(0.2),
    nn.Linear(32, 1)
).to(device)

# Define loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

print(model)

## 4. Training function - SOLUTION

This solution accepts DataLoaders and iterates over batches, computing validation metrics after each epoch.

In [None]:
def _evaluate(
    model: nn.Module,
    loader: DataLoader,
    criterion: nn.Module
) -> tuple[float, float]:
    '''Computes loss and R² over every sample served by a DataLoader.

    Batch outputs are concatenated before scoring, so both metrics describe
    the whole dataset rather than an average of per-batch values. The caller
    is expected to have put the model in eval mode and disabled gradients.

    Args:
        model: PyTorch model used to make predictions
        loader: DataLoader yielding (features, labels) batches
        criterion: Loss function applied to the concatenated predictions

    Returns:
        Tuple of (loss, R²) as Python floats

    Raises:
        ValueError: If the DataLoader yields no batches
    '''

    batch_preds = []
    batch_labels = []

    for X_batch, y_batch in loader:
        batch_preds.append(model(X_batch))
        batch_labels.append(y_batch)

    if not batch_preds:
        raise ValueError('DataLoader yielded no batches; cannot compute metrics.')

    preds = torch.cat(batch_preds)
    labels = torch.cat(batch_labels)

    loss = criterion(preds, labels).item()

    # R² = 1 - (residual sum of squares / total sum of squares)
    ss_res = torch.sum((labels - preds) ** 2)
    ss_tot = torch.sum((labels - torch.mean(labels)) ** 2)
    r2 = (1 - (ss_res / ss_tot)).item()

    return loss, r2


def train_model(
    model: nn.Module,
    train_loader: DataLoader,
    val_loader: DataLoader,
    criterion: nn.Module,
    optimizer: optim.Optimizer,
    epochs: int = 50,
    print_every: int = 5
) -> dict[str, list[float]]:
    '''Training loop with batching and validation.

    Each epoch makes one optimisation pass over the training batches, then
    re-scores the full training and validation sets in eval mode with
    gradients disabled, so the recorded training and validation metrics are
    computed the same way.

    Args:
        model: PyTorch model to train
        train_loader: DataLoader for training batches
        val_loader: DataLoader for validation data
        criterion: Loss function
        optimizer: Optimizer
        epochs: Number of training epochs
        print_every: Print progress every N epochs (the first epoch is always
            printed); values <= 0 disable progress output

    Returns:
        Dictionary containing training history with keys 'loss', 'r2',
        'val_loss' and 'val_r2', each holding one value per epoch

    Raises:
        ValueError: If either DataLoader yields no batches
    '''

    history = {
        'loss': [],
        'r2': [],
        'val_loss': [],
        'val_r2': []
    }

    for epoch in range(epochs):
        # Training phase: one optimizer step per batch
        model.train()

        for X_batch, y_batch in train_loader:
            # Zero the gradients
            optimizer.zero_grad()

            # Forward pass
            predictions = model(X_batch)

            # Calculate loss
            loss = criterion(predictions, y_batch)

            # Backward pass
            loss.backward()

            # Update weights
            optimizer.step()

        # Evaluation phase: score both splits with the model in eval mode
        model.eval()
        with torch.no_grad():
            train_loss, train_r2 = _evaluate(model, train_loader, criterion)
            val_loss, val_r2 = _evaluate(model, val_loader, criterion)

        # Record metrics
        history['loss'].append(train_loss)
        history['r2'].append(train_r2)
        history['val_loss'].append(val_loss)
        history['val_r2'].append(val_r2)

        # Print progress; the print_every guard avoids a modulo-by-zero
        if print_every > 0 and ((epoch + 1) % print_every == 0 or epoch == 0):
            print(f'Epoch {epoch+1}/{epochs} - '
                  f'loss: {train_loss:.4f} - R²: {train_r2:.4f} - '
                  f'val_loss: {val_loss:.4f} - val_R²: {val_r2:.4f}')

    print('\nTraining complete.')
    return history

## 5. Train model

In [None]:
# Run length and logging cadence for this training run
training_schedule = {'epochs': 100, 'print_every': 10}

# Train the model and keep the per-epoch metrics for the learning curves
history = train_model(
    model,
    train_loader,
    val_loader,
    criterion,
    optimizer,
    **training_schedule
)

## 6. Learning curves

The plots below show both training and validation metrics for each epoch.

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(10, 4))

# One row per panel: (title, training history key, validation history key, y label)
panels = [
    ('Loss', 'loss', 'val_loss', 'Loss (MSE)'),
    ('R²', 'r2', 'val_r2', 'R²'),
]

# Draw the training and validation curves for each metric side by side
for ax, (title, train_key, val_key, y_label) in zip(axes, panels):
    ax.set_title(title)
    ax.plot(history[train_key], label='Training')
    ax.plot(history[val_key], label='Validation')
    ax.set_xlabel('Epoch')
    ax.set_ylabel(y_label)
    ax.legend()

plt.tight_layout()
plt.show()

## 7. Test set evaluation

In [None]:
# Put the model in evaluation mode before scoring the held-out data
model.eval()

# Inference only, so gradient tracking is switched off for the forward pass
with torch.no_grad():
    test_output = model(X_test)

# Bring the predictions back to the CPU as a flat NumPy array
predictions = test_output.cpu().numpy().flatten()

# R² = 1 - (residual sum of squares / total sum of squares)
y_true = testing_df[label].values
ss_res = np.sum(np.square(y_true - predictions))
ss_tot = np.sum(np.square(y_true - y_true.mean()))
rsquared = 1 - ss_res / ss_tot

print(f'Model R² on test set: {rsquared:.4f}')

## 8. Performance analysis

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(8, 4))
ax_pred, ax_resid = axes

# Shared inputs for both panels
true_values = testing_df[label]
marker_style = {'c': 'black', 's': 0.5, 'alpha': 0.5}
reference_style = {'color': 'red', 'linestyle': '--'}

# Left panel: predictions against true values, with the y = x reference line
ax_pred.set_title('Model predictions')
ax_pred.scatter(true_values, predictions, **marker_style)
value_range = [true_values.min(), true_values.max()]
ax_pred.plot(value_range, value_range, **reference_style)
ax_pred.set_xlabel('True values (standardized)')
ax_pred.set_ylabel('Predicted values (standardized)')

# Right panel: residuals against predictions, with a zero reference line
ax_resid.set_title('Residuals vs predicted values')
ax_resid.scatter(predictions, true_values - predictions, **marker_style)
ax_resid.axhline(0, **reference_style)
ax_resid.set_xlabel('Predicted values (standardized)')
ax_resid.set_ylabel('Residuals (standardized)')

plt.tight_layout()
plt.show()