In [3]:
import os
import torch
from torch import nn
from torchvision.datasets import MNIST
from torch.utils.data import DataLoader
from torchvision import transforms

In [4]:
class MLP(nn.Module):
    '''
    Fully connected classifier: the input is flattened to 784 features and
    passed through hidden layers of 64 and 32 units (ReLU) to 10 outputs.
    '''
    def __init__(self):
        super().__init__()
        # 28 * 28 * 1 input features per sample once flattened
        n_inputs = 28 * 28 * 1
        # Modules are created in network order and stored under `layers`.
        stack = [
            nn.Flatten(),
            nn.Linear(n_inputs, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 10),
        ]
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        '''Run x through the layer stack and return the 10 raw output scores.'''
        scores = self.layers(x)
        return scores

    def compute_l2_loss(self, w):
        '''Return the sum of the squared entries of tensor w as a scalar tensor.'''
        squared = w * w
        return squared.sum()

**L2 Regularization**, also called **Ridge Regularization**, involves adding the squared value of all weights to the loss value.

Implementing L2 Regularization with PyTorch is also easy. In this case we don't take the absolute values of the weights (as L1 Regularization does), but rather their squares. In other words, we add $\sum_{i=1}^{n} w_i^2$ (scaled by a weighting factor) to the loss. In the example below, you can find how L2 Regularization can be used with PyTorch:

In [6]:
if __name__ == '__main__':
    # Train the MLP on MNIST with cross-entropy loss plus an explicit L2
    # penalty that is computed by hand and added to the loss each step.

    # Strength of the explicit L2 penalty. It is constant, so it is defined
    # once here instead of being re-assigned on every mini-batch.
    # NOTE(review): 1.0 is a very strong penalty. The recorded run plateaus
    # at a loss of about 2.302 (close to ln 10 for 10 classes), which suggests
    # the weights are shrunk towards zero. Consider a much smaller value.
    l2_weight = 1.0

    # Set fixed random number seed
    torch.manual_seed(42)

    # Prepare MNIST dataset (downloaded into the current working directory)
    dataset = MNIST(os.getcwd(), download=True, transform=transforms.ToTensor())
    trainloader = DataLoader(dataset, batch_size=64, shuffle=True, num_workers=6)

    # Initialize the MLP
    mlp = MLP()

    # Define the loss function and optimizer
    # NOTE(review): AdamW applies its own decoupled weight decay on top of the
    # explicit penalty below (PyTorch documents a non-zero default). Pass
    # weight_decay=0 to study the explicit L2 term in isolation. TODO confirm.
    loss_function = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(mlp.parameters(), lr=1e-4)

    # Run the training loop
    for epoch in range(5):  # 5 epochs at maximum

        print(f'Starting epoch {epoch+1}')

        # Iterate over the DataLoader for training data
        for i, (inputs, targets) in enumerate(trainloader):

            # Zero the gradients
            optimizer.zero_grad()

            # Perform forward pass
            outputs = mlp(inputs)

            # Compute the data loss
            loss = loss_function(outputs, targets)

            # Compute l2 loss component over all parameters (weights and
            # biases), flattened into one vector.
            flat_parameters = torch.cat([p.view(-1) for p in mlp.parameters()])
            l2 = l2_weight * mlp.compute_l2_loss(flat_parameters)

            # Add L2 loss component
            loss = loss + l2

            # Perform backward pass
            loss.backward()

            # Perform optimization
            optimizer.step()

            # Print statistics every 500 mini-batches; the tensors are only
            # converted to Python floats when they are actually reported.
            if i % 500 == 499:
                print('Loss after mini-batch %5d: %.5f (of which %.5f l2 loss)' % (i + 1, loss.item(), l2.item()))

    # Process is complete.
    print('Training process has finished.')

Starting epoch 1
Loss after mini-batch   500: 6.91637 (of which 4.61870 l2 loss)
Starting epoch 2
Loss after mini-batch   500: 2.65406 (of which 0.34957 l2 loss)
Starting epoch 3
Loss after mini-batch   500: 2.31669 (of which 0.01494 l2 loss)
Starting epoch 4
Loss after mini-batch   500: 2.30337 (of which 0.00030 l2 loss)
Starting epoch 5
Loss after mini-batch   500: 2.30208 (of which 0.00007 l2 loss)
Training process has finished.


### Different way of adding L2 loss

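L2-based weight decay can also be applied by passing a non-zero `weight_decay` value to the optimizer instead of adding the penalty to the loss by hand. Note that `AdamW` applies *decoupled* weight decay, which is not exactly the same as adding an L2 term to the loss; with optimizers such as `torch.optim.SGD` or `torch.optim.Adam`, `weight_decay` acts as an L2 penalty on the gradient.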

> weight_decay (float, optional) – weight decay (L2 penalty) (default: 0)

For example: 
```
optimizer = torch.optim.AdamW(mlp.parameters(), lr=1e-4, weight_decay=1.0)
```