In [1]:
import numpy as np
import matplotlib.pyplot as plt
from utils import save_plot
from dl_from_scratch.neural_network_numpy import NeuralNetwork

In [2]:
# Utility: Track gradients across layers
def _weight_layer_index(key):
    """Return the trailing integer of a gradient key ("dW10" -> 10), or None if absent."""
    if not isinstance(key, str):
        return None
    suffix = key[len(key.rstrip("0123456789")):]
    return int(suffix) if suffix else None


def track_gradients(nn, X, Y, epochs=100, activation="sigmoid"):
    """Train `nn` for `epochs` full-batch steps and record weight-gradient norms.

    Each epoch runs ``nn.forward(X)``, ``nn.backward(cache, Y)`` and
    ``nn.update(grads)``; the norm of every gradient whose key contains
    ``"dW"`` is recorded *before* the update is applied.

    Args:
        nn: Object exposing ``forward(X)``, ``backward(cache, Y)`` (returning a
            mapping of gradient name -> array) and ``update(grads)``.
        X: Inputs, passed to ``nn.forward`` unchanged.
        Y: Targets, passed to ``nn.backward`` unchanged.
        epochs: Number of forward/backward/update iterations.
        activation: Unused; kept so existing callers that pass it keep working.

    Returns:
        2-D ``np.ndarray`` of shape ``(epochs, n_weight_gradients)``. Column
        ``i`` holds the norm history of the ``i``-th weight gradient, ordered
        by the numeric suffix of its key when every key has one (assumes keys
        look like ``"dW<layer index>"`` -- verify against ``nn.backward``);
        otherwise the mapping's own iteration order is kept. When no epoch is
        run the result has shape ``(0, 0)``.
    """
    grad_norms = []
    for _ in range(epochs):
        cache = nn.forward(X)
        grads = nn.backward(cache, Y)
        weight_keys = [key for key in grads if "dW" in key]  # Only weight gradients
        # A backward pass may fill the mapping last-layer-first; sort by layer
        # number so column i always refers to the same layer (2 before 10).
        if all(_weight_layer_index(key) is not None for key in weight_keys):
            weight_keys.sort(key=_weight_layer_index)
        grad_norms.append([np.linalg.norm(grads[key]) for key in weight_keys])
        nn.update(grads)
    if not grad_norms:
        # Keep the result 2-D so callers can still read `.shape[1]`.
        return np.empty((0, 0))
    return np.array(grad_norms)

In [3]:
# Synthetic binary classification task: a sample is labelled 1 when the sum
# of its features is positive, 0 otherwise.
np.random.seed(42)
X = np.random.randn(10, 500)  # columns are samples: 10 features x 500 samples
Y = (X.sum(axis=0) > 0).astype(int)[np.newaxis, :]  # row vector of 0/1 labels
Y_onehot = np.eye(2)[Y.ravel()].T  # one-hot labels, one column per sample

In [4]:
# Deep, narrow architecture: 10 inputs -> ten hidden layers of 32 units -> 2 outputs
layer_sizes = [10, *([32] * 10), 2]

In [5]:
# Sigmoid variant: build the network, then record its per-layer weight-gradient
# norms over 50 passes on the full dataset.
nn_sigmoid = NeuralNetwork(
    layer_sizes=layer_sizes,
    activation="sigmoid",
    output_activation="softmax",
    loss="cross_entropy",
    optimizer="gd",
    lr=0.01,
)
grad_sigmoid = track_gradients(nn_sigmoid, X, Y_onehot, epochs=50)

In [6]:
# ReLU variant: identical architecture and training settings, only the
# `activation` argument differs, so the gradient histories are comparable.
nn_relu = NeuralNetwork(
    layer_sizes=layer_sizes,
    activation="relu",
    output_activation="softmax",
    loss="cross_entropy",
    optimizer="gd",
    lr=0.01,
)
grad_relu = track_gradients(nn_relu, X, Y_onehot, epochs=50)

In [7]:
# Plot gradient norms
fig, ax = plt.subplots(figsize=(10, 5))
# One curve per tracked weight gradient: solid for sigmoid, dashed for ReLU.
for layer_no, norms in enumerate(grad_sigmoid.T, start=1):
    ax.plot(norms, label=f"Layer {layer_no} (sigmoid)", alpha=0.6)
for layer_no, norms in enumerate(grad_relu.T, start=1):
    ax.plot(norms, label=f"Layer {layer_no} (ReLU)", linestyle="--", alpha=0.6)
ax.set(
    title="Gradient Norms per Layer (Sigmoid vs ReLU)",
    xlabel="Epochs",
    ylabel="Gradient Norm",
)
ax.legend(ncol=2, fontsize=8)
# Log scale keeps curves that differ by orders of magnitude readable together.
ax.set_yscale("log")
save_plot(fig, "vanishing_gradients.png")

[INFO] Saved plot to results\vanishing_gradients.png
