In [3]:
import torch
import torch.nn as nn
import numpy as np

# Define a simple RNN model
class SimpleRNN(nn.Module):
    """Single-layer ``nn.RNN`` followed by a linear readout at every time step.

    Args:
        input_size: Number of features in each input time step.
        hidden_size: Width of the recurrent hidden state.
        output_size: Number of features emitted for each time step.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        # batch_first=True: batched tensors are laid out (batch, seq, feature).
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Apply the RNN to ``x`` and project every hidden state.

        Args:
            x: Input sequence(s); for batched input the layout is
                (batch, seq_len, input_size) because of ``batch_first=True``.

        Returns:
            The linear layer applied to the RNN output of each time step,
            i.e. (batch, seq_len, output_size) for batched input. The final
            hidden state returned by the RNN is discarded.
        """
        per_step_hidden, _final_hidden = self.rnn(x)
        return self.fc(per_step_hidden)

# Parameters
input_size = 1
hidden_size = 10  # Small hidden size to make the effect more pronounced
output_size = 1
sequence_length = 50  # Try varying this to see the effect on vanishing gradients
num_epochs = 200  # Used for both the loop bound and the progress message
log_interval = 20  # Print diagnostics once every `log_interval` epochs

# Initialize the model
model = SimpleRNN(input_size, hidden_size, output_size)

# Loss and optimizer
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Synthetic dataset: a single sequence of zeros with a one at the last time
# step. The target is an exact copy of the input, so this setup requires
# output_size == input_size.
inputs = torch.zeros(1, sequence_length, input_size)
inputs[:, -1, 0] = 1  # Set the last input to one
targets = inputs.clone()  # Target is the same as input

# Training loop: one full-batch gradient step per epoch.
for epoch in range(num_epochs):
    optimizer.zero_grad()
    outputs = model(inputs)
    loss = criterion(outputs, targets)
    loss.backward()
    optimizer.step()

    # Periodically report the loss and the gradient norm of the RNN's
    # input-to-hidden weights. optimizer.step() does not clear .grad, so the
    # value read here is still the one produced by loss.backward() above.
    # NOTE(review): this norm aggregates contributions from every time step
    # and also shrinks as the loss converges, so a falling value on its own
    # does not isolate vanishing gradients through time.
    if epoch % log_interval == 0:
        first_layer_gradients_norm = model.rnn.weight_ih_l0.grad.norm().item()
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}, First layer gradient norm: {first_layer_gradients_norm:.4f}')



Epoch [1/200], Loss: 0.1934, First layer gradient norm: 0.0293
Epoch [21/200], Loss: 0.0172, First layer gradient norm: 0.0154
Epoch [41/200], Loss: 0.0088, First layer gradient norm: 0.0105
Epoch [61/200], Loss: 0.0034, First layer gradient norm: 0.0064
Epoch [81/200], Loss: 0.0007, First layer gradient norm: 0.0026
Epoch [101/200], Loss: 0.0002, First layer gradient norm: 0.0004
Epoch [121/200], Loss: 0.0001, First layer gradient norm: 0.0002
Epoch [141/200], Loss: 0.0001, First layer gradient norm: 0.0001
Epoch [161/200], Loss: 0.0000, First layer gradient norm: 0.0000
Epoch [181/200], Loss: 0.0000, First layer gradient norm: 0.0000
