# Numerical Gradient Checking

We would highly recommend looking at `neural_networks.grad_check.check_gradients` and making sure you understand how numerical gradient checking is being carried out. This function is used in the notebook to check the gradients of the neural network layers you write. Make sure to check the gradient of a layer after finishing its implementation.

The function returns the relative error of the numerical gradient (approximated using finite differences) with respect to the analytical gradient (computed via backpropagation). Correct implementations should get very small errors, usually less than `1e-8` for 64-bit float matrices (the default).

In [None]:
%load_ext autoreload
%autoreload 2

import numpy as np
from neural_networks.utils import check_gradients
from neural_networks.layers import FullyConnected
from neural_networks.activations import Linear, ReLU, SoftMax

## Gradient Checks for Activation Functions

### Linear Activation

In [None]:
X = np.random.randn(2, 3)
dLdY = np.random.randn(2, 3)

# initialize a linear activation
# and perform a forward and backward pass
linear_activation = Linear()
_ = linear_activation.forward(X)
grad = linear_activation.backward(X, dLdY)

# check the gradient w.r.t. the input X: this cell compares `grad` against a
# numerical estimate taken with respect to X, not any layer parameter
print(
    "Relative error for linear activation:",
    check_gradients(
        fn=linear_activation.forward,  # the function we are checking
        grad=grad,  # the analytically computed gradient
        x=X,        # the variable w.r.t. which we are taking the gradient
        dLdf=dLdY,  # upstream gradient, the same one passed to backward
    )
)

### ReLU Activation

In [None]:
X = np.random.randn(2, 3)
dLdY = np.random.randn(2, 3)

# initialize a ReLU activation
# and perform a forward and backward pass
relu_activation = ReLU()
out = relu_activation.forward(X)
grad = relu_activation.backward(X, dLdY)

# check the gradient w.r.t. the input X: this cell compares `grad` against a
# numerical estimate taken with respect to X, not any layer parameter
print(
    "Relative error for relu activation:",
    check_gradients(
        fn=relu_activation.forward,  # the function we are checking
        grad=grad,  # the analytically computed gradient
        x=X,        # the variable w.r.t. which we are taking the gradient
        dLdf=dLdY,  # upstream gradient, the same one passed to backward
    )
)

### Softmax Activation

In [None]:
X = np.random.randn(2, 3)
dLdY = np.random.randn(2, 3)

# initialize a softmax activation
# and perform a forward and backward pass
softmax_activation = SoftMax()
_ = softmax_activation.forward(X)
grad = softmax_activation.backward(X, dLdY)

# check the gradient w.r.t. the input X: this cell compares `grad` against a
# numerical estimate taken with respect to X, not any layer parameter
print(
    "Relative error for softmax activation:",
    check_gradients(
        fn=softmax_activation.forward,  # the function we are checking
        grad=grad,  # the analytically computed gradient
        x=X,        # the variable w.r.t. which we are taking the gradient
        dLdf=dLdY,  # upstream gradient, the same one passed to backward
    )
)

## Gradient Checks for Full Layers (Linear Activations)

### Fully Connected Layer

In [None]:
X = np.random.randn(2, 3)
dLdY = np.random.randn(2, 4)

# Build a fully connected layer (4 outputs, linear activation) and run one
# forward and one backward pass before reading `fc_layer.gradients` below.
fc_layer = FullyConnected(n_out=4, activation="linear")
_ = fc_layer.forward(X)
_ = fc_layer.backward(dLdY)

# Gradient-check the layer once per entry in `fc_layer.parameters`.
for param in fc_layer.parameters:
    rel_error = check_gradients(
        # the function we are checking
        fn=fc_layer.forward_with_param(param, X),
        # the analytically computed gradient
        grad=fc_layer.gradients[param],
        # the variable w.r.t. which we are taking the gradient
        x=fc_layer.parameters[param],
        # upstream gradient, the same one passed to backward
        dLdf=dLdY,
    )
    print(f"Relative error for {param}:", rel_error)