# Using L2 Regularization with Weight Decay

## Step 1: Setting Up the Environment


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Fix the global RNG seed so weight init and the dummy data are repeatable
torch.manual_seed(42)

# Pick the compute device: use the GPU when CUDA is usable, else the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cpu


## Step 2: Defining a Neural Network


In [None]:
class SimpleNN(nn.Module):
    """Fully connected classifier with two hidden ReLU layers.

    The network maps a batch of inputs to raw class scores (logits):
    ``input_size -> hidden_size1 -> hidden_size2 -> num_classes``.

    Args:
        input_size: Number of features per sample after flattening.
        hidden_size1: Width of the first hidden layer.
        hidden_size2: Width of the second hidden layer.
        num_classes: Number of output logits per sample.
    """

    def __init__(self, input_size=784, hidden_size1=128, hidden_size2=64, num_classes=10):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size1)
        self.fc2 = nn.Linear(hidden_size1, hidden_size2)
        self.fc3 = nn.Linear(hidden_size2, num_classes)

    def forward(self, x):
        """Return logits of shape ``(batch_size, num_classes)`` for batch ``x``.

        ``x`` may be already flat, e.g. ``(batch_size, 784)``, or image-shaped,
        e.g. ``(batch_size, 28, 28)`` or ``(batch_size, 1, 28, 28)``; every
        dimension after the first is collapsed into one feature axis.
        """
        # torch.flatten (unlike Tensor.view) also accepts non-contiguous
        # tensors such as permuted/transposed batches, copying only when
        # a view is not possible. Already-flat input passes through as-is.
        x = torch.flatten(x, start_dim=1)

        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        # No softmax here: the outputs are logits, and nn.CrossEntropyLoss
        # expects unnormalized scores.
        return self.fc3(x)

## Step 3: Preparing Dummy Data


In [None]:
# Size and shape of the synthetic dataset
num_samples = 1000
batch_size = 64
input_features = 784  # a flattened 28x28 image
num_classes = 10

# Random features and random integer class labels, allocated directly on
# the selected device so no per-batch transfer is needed later
dummy_inputs = torch.randn((num_samples, input_features), device=device)
dummy_labels = torch.randint(low=0, high=num_classes, size=(num_samples,), device=device)

# Wrap the tensors in a dataset and serve them in shuffled mini-batches
train_dataset = TensorDataset(dummy_inputs, dummy_labels)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

## Step 4: Incorporating Weight Decay into the Optimizer


In [None]:
# --- Hyperparameters ---
learning_rate = 0.01
weight_decay_lambda = 1e-4  # strength of the L2 penalty

# Build the network, then place its parameters on the chosen device
model = SimpleNN(input_size=input_features, num_classes=num_classes)
model = model.to(device)

# Adam's `weight_decay` argument is an L2 penalty: the optimizer adds
# weight_decay * param to each parameter's gradient before its update.
optimizer = optim.Adam(
    params=model.parameters(),
    lr=learning_rate,
    weight_decay=weight_decay_lambda,
)

## Step 5: Training the Model with L2 Regularization


In [None]:
num_epochs = 10
# Default reduction is "mean": each call returns the average loss of one batch
criterion = nn.CrossEntropyLoss()

model.train()  # Set the model to training mode

for epoch in range(num_epochs):
    epoch_loss = 0.0   # running sum of per-sample losses for this epoch
    samples_seen = 0   # number of samples that contributed to epoch_loss
    for inputs, labels in train_loader:
        # A no-op when the batch already lives on `device`; keeps the loop
        # correct if the loader is later swapped for one that yields CPU data.
        inputs, labels = inputs.to(device), labels.to(device)

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward pass and optimization: clear stale gradients first, since
        # backward() accumulates into .grad
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Weight the batch-mean loss by the batch size so that the final,
        # smaller batch does not count as much as a full one.
        batch_len = labels.size(0)
        epoch_loss += loss.item() * batch_len
        samples_seen += batch_len

    # Per-sample average loss over the whole epoch
    avg_epoch_loss = epoch_loss / samples_seen
    print(f'Epoch [{epoch+1}/{num_epochs}]: Loss with weight decay - {avg_epoch_loss:.4f}')

# (Optional) To compare, you would train another model instance
# with an optimizer that has weight_decay=0 (or not set).
# model_no_wd = SimpleNN(input_size=input_features, num_classes=num_classes).to(device)
# optimizer_no_wd = optim.Adam(model_no_wd.parameters(), lr=learning_rate)
# Then train model_no_wd similarly and compare validation performance.

Epoch [1/10]: Loss with weight decay - 2.3302
Epoch [2/10]: Loss with weight decay - 1.5571
Epoch [3/10]: Loss with weight decay - 0.5145
Epoch [4/10]: Loss with weight decay - 0.1600
Epoch [5/10]: Loss with weight decay - 0.0765
Epoch [6/10]: Loss with weight decay - 0.0334
Epoch [7/10]: Loss with weight decay - 0.0420
Epoch [8/10]: Loss with weight decay - 0.0286
Epoch [9/10]: Loss with weight decay - 0.0212
Epoch [10/10]: Loss with weight decay - 0.0266
