In [1]:
import numpy as np
import torch

In [2]:
# Seed PyTorch's global RNG so the layer initialisation below (and the random
# batch drawn with torch.rand in a later cell) is the same on every
# Restart & Run All.
SEED = 0
torch.manual_seed(SEED)

# Reference model: one fully connected layer mapping 10 features to 2 outputs.
net = torch.nn.Sequential(torch.nn.Linear(10, 2))
# Reference loss used to validate the NumPy implementation further down.
loss_fn = torch.nn.MSELoss()

In [3]:
# Random mini-batch of 8 samples with 10 features each. Gradient tracking is
# enabled on the inputs so that the gradient of the loss w.r.t. the inputs can
# be inspected after the backward pass.
inputs = torch.rand((8, 10), requires_grad=True)
# Random regression targets: one 2-vector per sample.
labels = torch.rand((8, 2))

In [4]:
# Forward pass of the PyTorch reference model on the random batch.
pred = net(inputs)

In [5]:
# Display the reference predictions (one row per sample, one column per output).
pred

tensor([[ 0.4475,  0.0480],
        [ 0.4079,  0.1346],
        [ 0.3011, -0.2467],
        [ 0.5357, -0.3975],
        [ 0.3444, -0.3308],
        [ 0.2294,  0.0210],
        [ 0.3813, -0.4777],
        [ 0.2035, -0.0367]], grad_fn=<AddmmBackward0>)

In [6]:
# Evaluate the reference loss and back-propagate. After this call the `.grad`
# attributes of the layer parameters and of `inputs` (created with
# requires_grad=True) hold the reference gradients inspected below.
loss = loss_fn(pred, labels)
loss.backward()

In [7]:
# Display the scalar reference loss.
loss

tensor(0.4651, grad_fn=<MseLossBackward0>)

In [8]:
class NeuralLayer:
    """Fully connected layer with an optional element-wise activation (NumPy).

    The layer computes ``activation(data @ weights.T + bias)``.

    Attributes:
        weights: array of shape ``(n_outputs, n_inputs)``.
        bias: array of shape ``(n_outputs,)``.
        weights_gradient: gradient of the loss w.r.t. ``weights``; written by
            :meth:`backward`.
        bias_gradient: gradient of the loss w.r.t. ``bias``; written by
            :meth:`backward`.
        input_buffer: the ``data`` passed to the most recent forward call
            (``None`` before the first call).
        output_buffer: the pre-activation output of the most recent forward
            call (``None`` before the first call); needed to evaluate the
            activation derivative in :meth:`backward`.
    """

    @staticmethod
    def get_activation_function(name):
        """Return the ``(function, gradient)`` pair for an activation name.

        Args:
            name: ``"ReLU"``, or ``"None"`` / ``None`` for the identity.

        Returns:
            A 2-tuple of callables. Both take the pre-activation values; the
            second returns the element-wise derivative (the scalar ``1`` for
            the identity).

        Raises:
            ValueError: if ``name`` is not a supported activation.
        """
        if name == "ReLU":
            return (
                # np.maximum avoids the -0.0 / nan artefacts that the product
                # form x * (x > 0) produces for negative or -inf inputs.
                lambda x: np.maximum(x, 0),  # function
                lambda x: 1 * (x > 0),  # gradient
            )
        if name is None or name == "None":
            return (lambda x: x, lambda x: 1)
        raise ValueError(f"Invalid Activation Function: {name}")

    def __init__(self, shape, activation_function="ReLU"):
        """Create a layer with uniformly random weights and bias in [0, 1).

        Args:
            shape: ``(n_inputs, n_outputs)``.
            activation_function: name accepted by
                :meth:`get_activation_function`.

        Raises:
            ValueError: if ``shape`` does not have exactly two entries or the
                activation name is unknown.
        """
        if len(shape) != 2:
            raise ValueError(f"shape must be (n_inputs, n_outputs), got {shape!r}")
        n_inputs, n_outputs = shape

        self.weights = np.random.rand(n_outputs, n_inputs)
        self.bias = np.random.rand(n_outputs)
        self.activation, self.activation_gradient = NeuralLayer.get_activation_function(
            activation_function
        )
        # Start from zeros rather than uninitialised memory so that reading a
        # gradient before the first backward pass never yields garbage.
        self.weights_gradient = np.zeros((n_outputs, n_inputs))
        self.bias_gradient = np.zeros(n_outputs)
        # Forward-pass caches consumed by backward().
        self.input_buffer = None
        self.output_buffer = None
        # TODO: momentum and regularization are not implemented yet.

    def __call__(self, data):
        """Forward pass.

        Args:
            data: array whose last dimension is ``n_inputs``; :meth:`backward`
                additionally expects it to be 2-D ``(batch, n_inputs)``.

        Returns:
            The activated output, ``activation(data @ weights.T + bias)``.
        """
        self.input_buffer = data
        # Cache the pre-activation: the activation derivative is evaluated on
        # it during the backward pass.
        self.output_buffer = data @ self.weights.T + self.bias
        return self.activation(self.output_buffer)

    def backward(self, output_gradient):
        """Backward pass for the most recent forward call.

        Overwrites (does not accumulate into) ``bias_gradient`` and
        ``weights_gradient``.

        Args:
            output_gradient: gradient of the loss w.r.t. the layer's activated
                output, shape ``(batch, n_outputs)``.

        Returns:
            Gradient of the loss w.r.t. the layer input, shape
            ``(batch, n_inputs)``.

        Raises:
            RuntimeError: if no forward pass has been performed yet.
        """
        if self.input_buffer is None or self.output_buffer is None:
            raise RuntimeError("backward() called before a forward pass")
        # Chain rule through the activation: gradient w.r.t. the pre-activation.
        preactivation_gradient = output_gradient * self.activation_gradient(
            self.output_buffer
        )
        self.bias_gradient[:] = preactivation_gradient.sum(axis=0)
        self.weights_gradient[:] = preactivation_gradient.T @ self.input_buffer
        input_gradient = preactivation_gradient @ self.weights
        return input_gradient

    def update_weights(self):
        """Placeholder: no parameter update rule is implemented yet."""
        pass

    def zero_gradient(self):
        """Reset both parameter gradients to zero.

        The forward-pass caches are left untouched so that a backward pass can
        still follow a forward pass even if gradients are zeroed in between.
        """
        self.bias_gradient[:] = 0
        self.weights_gradient[:] = 0

In [9]:
# NumPy layer with the same 10 -> 2 shape as the PyTorch layer and no
# activation, so the two implementations compute the same function.
mynet = NeuralLayer((10, 2), activation_function="None")

In [10]:
# Copy the PyTorch layer's parameters into the NumPy layer so both start from
# identical weights and bias.
mynet.weights[:] = net[0].weight.detach().numpy()
mynet.bias[:] = net[0].bias.detach().numpy()
# NumPy versions of the same batch and targets.
myinputs = inputs.detach().numpy()
mylabels = labels.numpy()


def myloss_fn(pred, labels):
    """Mean squared error averaged over every element of ``pred``.

    This is the quantity ``torch.nn.MSELoss()`` computes with its default
    reduction, for any output width (not only the 2-output case used here).
    """
    return np.mean((labels - pred) ** 2)


def myloss_grad(pred, labels):
    """Gradient of :func:`myloss_fn` with respect to ``pred`` (a NumPy array)."""
    return 2 / pred.size * (pred - labels)

In [11]:
# Forward pass of the NumPy layer on the same batch.
mypred = mynet(myinputs)

In [12]:
# Backward pass of the NumPy layer, seeded with the loss gradient w.r.t. the
# predictions. The return value is the gradient w.r.t. the layer inputs; the
# parameter gradients are stored on `mynet`.
mybackward = mynet.backward(myloss_grad(mypred, mylabels))

### $\frac{\partial L}{\partial b^L}$

In [13]:
# Reference (PyTorch) gradient of the loss w.r.t. the bias.
net[0].bias.grad

tensor([ 0.0743, -0.8501])

In [14]:
# Bias gradient computed by the NumPy layer.
mynet.bias_gradient

array([ 0.07430577, -0.85012491])

In [15]:
# Check that the two bias gradients agree within np.allclose's default tolerances.
np.allclose(net[0].bias.grad, mynet.bias_gradient)

True

### $\frac{\partial L}{\partial W^L}$

In [16]:
# Reference (PyTorch) gradient of the loss w.r.t. the weights.
net[0].weight.grad

tensor([[ 0.0148, -0.0133,  0.0219,  0.0349,  0.0056,  0.0392,  0.0823,  0.0590,
          0.0573,  0.0581],
        [-0.3811, -0.3896, -0.1694, -0.5736, -0.3210, -0.5264, -0.4577, -0.5850,
         -0.5833, -0.3715]])

In [17]:
# Weight gradient computed by the NumPy layer.
mynet.weights_gradient

array([[ 0.01478988, -0.01328581,  0.02185604,  0.03489794,  0.00556204,
         0.03922273,  0.0822954 ,  0.05895532,  0.05734129,  0.058073  ],
       [-0.38112705, -0.38959436, -0.16942462, -0.57362658, -0.32099691,
        -0.52636526, -0.45766078, -0.58502152, -0.58333836, -0.37150585]])

In [18]:
# Check that the two weight gradients agree within np.allclose's default tolerances.
np.allclose(net[0].weight.grad, mynet.weights_gradient)

True

### $\frac{\partial L}{\partial o^{L-1}}$

In [19]:
# Reference (PyTorch) gradient of the loss w.r.t. the layer inputs.
inputs.grad

tensor([[ 3.3835e-02, -3.0327e-02, -1.3513e-02, -9.9635e-03,  1.4540e-02,
          2.4095e-02,  1.7294e-03, -1.6324e-03,  3.2671e-02, -3.3544e-02],
        [ 3.0534e-02, -2.6590e-02, -1.1586e-02, -6.9888e-03,  1.2372e-02,
          1.8936e-02,  3.4274e-03, -1.8047e-03,  2.7572e-02, -2.8463e-02],
        [ 2.7289e-02, -3.0793e-02, -1.5853e-02, -2.4326e-02,  1.7820e-02,
          4.2274e-02, -1.3788e-02,  1.3797e-03,  4.1903e-02, -4.1766e-02],
        [ 1.1901e-02, -1.4558e-02, -7.7970e-03, -1.3514e-02,  8.8579e-03,
          2.2509e-02, -8.7205e-03,  1.0825e-03,  2.1047e-02, -2.0838e-02],
        [ 3.1265e-02, -3.2798e-02, -1.6222e-02, -2.1489e-02,  1.8029e-02,
          3.9486e-02, -9.8492e-03,  5.2450e-04,  4.1915e-02, -4.2088e-02],
        [-1.4568e-02,  4.9667e-03, -5.1166e-04, -1.6524e-02,  1.5249e-03,
          1.8809e-02, -2.0144e-02,  4.1479e-03,  5.8038e-03, -4.3545e-03],
        [ 4.1031e-02, -4.1469e-02, -2.0058e-02, -2.4151e-02,  2.2146e-02,
          4.6141e-02, -9.1509e-0

In [20]:
# Input gradient returned by the NumPy layer's backward pass.
mybackward

array([[ 3.38347787e-02, -3.03271337e-02, -1.35132706e-02,
        -9.96354602e-03,  1.45398845e-02,  2.40947459e-02,
         1.72939233e-03, -1.63240071e-03,  3.26714022e-02,
        -3.35436006e-02],
       [ 3.05344453e-02, -2.65903691e-02, -1.15860420e-02,
        -6.98883274e-03,  1.23724881e-02,  1.89362825e-02,
         3.42736876e-03, -1.80467297e-03,  2.75724619e-02,
        -2.84629042e-02],
       [ 2.72892517e-02, -3.07927155e-02, -1.58532281e-02,
        -2.43261306e-02,  1.78201623e-02,  4.22739179e-02,
        -1.37876645e-02,  1.37965361e-03,  4.19028534e-02,
        -4.17659656e-02],
       [ 1.19004995e-02, -1.45577665e-02, -7.79700408e-03,
        -1.35137671e-02,  8.85789426e-03,  2.25088701e-02,
        -8.72052803e-03,  1.08254532e-03,  2.10470970e-02,
        -2.08375190e-02],
       [ 3.12646330e-02, -3.27979900e-02, -1.62220906e-02,
        -2.14889723e-02,  1.80294386e-02,  3.94855224e-02,
        -9.84915527e-03,  5.24502185e-04,  4.19153412e-02,
        -4.

In [21]:
# Check that the two input gradients agree within np.allclose's default tolerances.
np.allclose(inputs.grad, mybackward)

True