In [3]:
import torch
import torch.nn as nn

**A deep neural network with five layers, each consisting of a
Linear layer and a GELU activation function**

In [4]:
class DeepNeuralNetwork(nn.Module):
    """Stack of ``Linear`` + ``GELU`` layers with optional shortcut connections.

    Args:
        layer_sizes: Sequence of feature sizes. Each consecutive pair
            ``(layer_sizes[i], layer_sizes[i + 1])`` gives the input and
            output width of layer ``i``, so ``len(layer_sizes) - 1`` layers
            are built (six sizes produce a five-layer network).
        use_shortcut: If true, a layer's input is added to its output
            whenever the two tensors have the same shape.

    Raises:
        ValueError: If ``layer_sizes`` has fewer than two entries.
    """

    def __init__(self, layer_sizes, use_shortcut):
        super().__init__()
        sizes = list(layer_sizes)
        if len(sizes) < 2:
            raise ValueError(
                "layer_sizes must contain at least two entries "
                "(an input size and an output size)"
            )
        self.use_shortcut = use_shortcut
        # Layers are created in order, so for a fixed seed the initial
        # weights match a hand-written list of the same layers.
        self.layers = nn.ModuleList([
            nn.Sequential(nn.Linear(in_features, out_features), nn.GELU())
            for in_features, out_features in zip(sizes, sizes[1:])
        ])

    def forward(self, x):
        """Pass ``x`` through every layer, adding shortcuts when enabled.

        A shortcut is only applied when the layer's output has the same
        shape as its input; otherwise the output replaces ``x`` directly.
        """
        for layer in self.layers:
            layer_output = layer(x)
            if self.use_shortcut and x.shape == layer_output.shape:
                x = x + layer_output
            else:
                x = layer_output
        return x

**Initialize neural network without shortcut connections**

In [5]:
# Seed before building the model so its random initial weights are reproducible.
torch.manual_seed(123)

# Six sizes -> five layers: four 3-to-3 layers followed by one 3-to-1 layer.
layer_sizes = [3, 3, 3, 3, 3, 1]
sample_input = torch.tensor([[1.0, 0.0, -1.0]])
model_without_shortcut = DeepNeuralNetwork(layer_sizes, use_shortcut=False)

**Compute the gradients in the model's backward pass**

In [6]:
def print_gradients(model, x):
    """Print the mean absolute gradient of every weight parameter in ``model``.

    Runs one forward pass on ``x``, computes the MSE loss against an all-zero
    target with the same shape as the output, backpropagates, and prints one
    line per parameter whose name contains ``'weight'``.

    Args:
        model: An ``nn.Module`` to inspect.
        x: Input tensor passed to ``model``.
    """
    # Clear gradients left over from earlier backward passes; otherwise
    # repeated calls on the same model would report accumulated values.
    model.zero_grad()

    output = model(x)  # Forward pass
    target = torch.zeros_like(output)

    loss_fn = nn.MSELoss()
    loss = loss_fn(output, target)  # How far the output is from the target

    loss.backward()  # Backward pass to calculate the gradients

    for name, param in model.named_parameters():
        # Parameters that did not take part in the backward pass have no grad.
        if 'weight' in name and param.grad is not None:
            print(f"{name} has gradient mean of {param.grad.abs().mean().item()}")

**The gradients become smaller as we progress from layer 4 to layer 0.
This is the vanishing gradient problem.**

In [7]:
print_gradients(model_without_shortcut, sample_input)

layers.0.0.weight has gradient mean of 0.00020174118981231004
layers.1.0.weight has gradient mean of 0.00012011769285891205
layers.2.0.weight has gradient mean of 0.0007152436301112175
layers.3.0.weight has gradient mean of 0.00139885104727
layers.4.0.weight has gradient mean of 0.005049602594226599


**Instantiating the model with skip connections.**
**The gradient no longer shrinks to a vanishingly small value.**



In [8]:
torch.manual_seed(123)
model_with_shortcut = DeepNeuralNetwork(
 layer_sizes, use_shortcut=True
)
print_gradients(model_with_shortcut, sample_input)

layers.0.0.weight has gradient mean of 0.22186800837516785
layers.1.0.weight has gradient mean of 0.20709273219108582
layers.2.0.weight has gradient mean of 0.3292388319969177
layers.3.0.weight has gradient mean of 0.2667772173881531
layers.4.0.weight has gradient mean of 1.3268063068389893


Shortcut connections are important for overcoming the limitations
posed by the vanishing gradient problem in deep neural networks.