In [1]:
import numpy as np

In [2]:
# Two independent (12, 2) arrays of uniform samples from [0, 1), used below to
# compare a column-by-column sum of squares with a single vectorised one.
a = np.random.random_sample((12, 2))
b = np.random.random_sample((12, 2))

In [3]:
# Sum of squared differences, computed separately for each of the two columns
# and then added together.
np.sum((a[:,0] - b[:,0])**2) + np.sum((a[:,1] - b[:,1])**2)

4.82375722592738

In [4]:
# The same quantity as a single reduction over every element of (a - b).
np.sum((a - b)**2)

4.82375722592738

In [5]:
import torch

In [6]:
# Reference model: one fully connected layer mapping 10 features to 2 outputs.
net = torch.nn.Sequential(torch.nn.Linear(in_features=10, out_features=2))
# Mean-reduced squared error; "mean" is the default reduction, spelled out here.
loss_fn = torch.nn.MSELoss(reduction="mean")

In [7]:
# Batch of 8 samples with 10 features each. Gradient tracking is switched on
# for the inputs so the gradient of the loss w.r.t. them can be read back later.
inputs = torch.rand(8, 10).requires_grad_()
# Matching batch of 8 two-dimensional regression targets (no gradient needed).
labels = torch.rand((8, 2))

In [8]:
# Forward pass of the torch reference model.
pred = net(inputs)

In [9]:
# Inspect the reference predictions (one row per sample, one column per output).
pred

tensor([[ 0.0797, -0.2756],
        [-0.0101, -0.1831],
        [ 0.0669, -0.0967],
        [-0.1646, -0.5744],
        [-0.0902, -0.3358],
        [-0.3111, -0.5636],
        [-0.2012, -0.3271],
        [-0.3644, -0.1183]], grad_fn=<AddmmBackward0>)

In [10]:
# Reference loss, then backpropagate so that `.grad` is populated on the layer
# parameters and on `inputs` (which tracks gradients).
# NOTE: re-running this cell without re-running the forward pass will fail or
# accumulate gradients, because autograd frees the graph after backward().
loss = loss_fn(pred, labels)
loss.backward()

In [11]:
# Scalar loss value reported by torch.
loss

tensor(0.7818, grad_fn=<MseLossBackward0>)

In [14]:
class NeuralLayer:
    """A single fully connected layer implemented with NumPy.

    The forward pass computes ``activation(data @ weights.T + bias)`` and
    caches the input and the pre-activation so that :meth:`backward` can
    return the gradient of a loss with respect to the layer input while
    storing the gradients with respect to the weights and the bias.

    Attributes:
        weights: Array of shape ``(n_outputs, n_inputs)``.
        bias: Array of shape ``(n_outputs,)``.
        weights_gradient: Gradient w.r.t. ``weights``; same shape, starts at 0.
        bias_gradient: Gradient w.r.t. ``bias``; same shape, starts at 0.
        input_buffer: Input of the most recent forward pass (``None`` before).
        preactivation_buffer: ``data @ weights.T + bias`` of the most recent
            forward pass (``None`` before).
        output_buffer: Legacy attribute reset by :meth:`zero_gradient`; nothing
            in this class reads it.
    """

    @staticmethod
    def get_activation_function(name):
        """Return the ``(function, gradient)`` pair for an activation name.

        Args:
            name: ``"ReLU"`` or ``"None"`` (the string) for the identity.

        Returns:
            A tuple of two callables. The first maps a pre-activation array to
            the layer output; the second returns the derivative of the first,
            evaluated at the pre-activation (the scalar ``1`` for identity).

        Raises:
            ValueError: If ``name`` is not a known activation.
        """
        if name == "ReLU":
            return (
                lambda x: x * (x > 0),  # function
                lambda x: 1 * (x > 0),  # gradient: 1 where x > 0, else 0
            )
        if name == "None":
            return (
                lambda x: x,
                lambda x: 1,
            )
        raise ValueError(f"Invalid Activation Function: {name}")

    def __init__(
        self,
        shape,
        activation_function="ReLU"
    ):
        """Create a layer with uniformly random parameters in [0, 1).

        Args:
            shape: ``(n_inputs, n_outputs)``.
            activation_function: Name understood by
                :meth:`get_activation_function`.

        Raises:
            ValueError: If ``activation_function`` is not recognised.
        """
        # Parameters are stored as (n_outputs, n_inputs), hence the reversal.
        self.weights = np.random.rand(*shape[::-1])
        self.bias = np.random.rand(shape[1])
        self.activation, self.activation_gradient = NeuralLayer.get_activation_function(activation_function)
        # Zero-initialised (not np.empty) so that reading a gradient before the
        # first backward pass never exposes uninitialised memory.
        self.weights_gradient = np.zeros(shape[::-1])
        self.bias_gradient = np.zeros(shape[1])
        # Forward-pass caches consumed by backward().
        self.input_buffer = None
        self.preactivation_buffer = None
        self.output_buffer = None
        # TODO: momentum and regularization hooks are planned but not implemented.

    def __call__(self, data):
        """Run the forward pass and cache what :meth:`backward` needs.

        Args:
            data: Array of shape ``(batch, n_inputs)``.

        Returns:
            Array of shape ``(batch, n_outputs)`` after the activation.
        """
        self.input_buffer = data
        # The activation derivative is evaluated at the pre-activation, so it
        # has to be kept until the backward pass.
        self.preactivation_buffer = data @ self.weights.T + self.bias
        return self.activation(self.preactivation_buffer)

    def backward(self, output_gradient):
        """Backpropagate a gradient through the layer.

        Stores the gradients w.r.t. the bias and the weights in
        ``bias_gradient`` / ``weights_gradient`` (overwriting their previous
        contents) and returns the gradient w.r.t. the layer input.

        Args:
            output_gradient: Gradient of the loss w.r.t. the layer output,
                shape ``(batch, n_outputs)``.

        Returns:
            Gradient of the loss w.r.t. the layer input, shape
            ``(batch, n_inputs)``.

        Raises:
            RuntimeError: If no forward pass has been run yet.
        """
        if self.input_buffer is None or self.preactivation_buffer is None:
            raise RuntimeError("backward() called before a forward pass")
        # Chain rule through the activation: without this factor the gradients
        # are only correct for the identity activation.
        delta = output_gradient * self.activation_gradient(self.preactivation_buffer)
        self.bias_gradient[:] = delta.sum(axis=0)
        self.weights_gradient[:] = delta.T @ self.input_buffer
        input_gradient = delta @ self.weights
        return input_gradient

    def update_weights(self):
        """Placeholder for the parameter update step; currently does nothing."""
        pass

    def zero_gradient(self):
        """Reset the stored parameter gradients to zero.

        The forward-pass caches are deliberately left untouched so that the
        order forward -> zero_gradient -> backward keeps working.
        """
        self.bias_gradient[:] = 0
        self.weights_gradient[:] = 0
        self.output_buffer = None

In [15]:
# NumPy layer with the same 10 -> 2 shape as the torch Linear layer and no
# activation, so its outputs and gradients are directly comparable.
mynet = NeuralLayer((10,2), activation_function="None")

In [16]:
# NumPy views of the torch batch, so both models see exactly the same data.
myinputs = inputs.detach().numpy()
mylabels = labels.numpy()

# Copy the torch layer's parameters into the NumPy layer.
mynet.bias[:] = net[0].bias.detach().numpy()
mynet.weights[:] = net[0].weight.detach().numpy()


def myloss_fn(pred, labels):
    """Half the summed squared error, averaged over the batch.

    Computes ``1/(2N) * sum((labels - pred)**2)`` with ``N = pred.shape[0]``.
    With two output columns this equals torch's mean-reduced MSE, which
    divides by ``N * 2``; for any other output width the two differ.
    """
    scale = 1/(2*pred.shape[0])
    return scale * np.sum((labels - pred)**2)


def myloss_grad(pred, labels):
    """Gradient of :func:`myloss_fn` with respect to ``pred``.

    Returns ``1/N * (pred - labels)`` with ``N = pred.shape[0]``, an array of
    the same shape as ``pred``.
    """
    scale = 1/pred.shape[0]
    return scale * (pred - labels)

In [17]:
# Forward pass of the NumPy layer on the same inputs as the torch model.
mypred = mynet(myinputs)

In [18]:
# Backward pass: feed the loss gradient w.r.t. the predictions into the layer.
# This fills mynet.bias_gradient / mynet.weights_gradient; the return value is
# the gradient w.r.t. the layer input.
mybackward = mynet.backward(myloss_grad(mypred, mylabels))

### Bias gradient: $\frac{\partial L}{\partial b^{[L]}}$

In [19]:
# Reference: bias gradient computed by torch autograd.
net[0].bias.grad

tensor([-0.7730, -0.8788])

In [20]:
# Bias gradient computed by the NumPy layer.
mynet.bias_gradient

array([-0.77296116, -0.87880203])

In [21]:
# Both bias gradients should agree up to floating-point tolerance.
np.allclose(net[0].bias.grad, mynet.bias_gradient)

True

### Weight gradient: $\frac{\partial L}{\partial W^{[L]}}$

In [22]:
# Reference: weight gradient computed by torch autograd.
net[0].weight.grad

tensor([[-0.2469, -0.4180, -0.4212, -0.4119, -0.2387, -0.5068, -0.4721, -0.4150,
         -0.3045, -0.4195],
        [-0.2739, -0.4735, -0.5399, -0.4461, -0.3524, -0.5359, -0.4754, -0.4382,
         -0.4070, -0.4753]])

In [23]:
# Weight gradient computed by the NumPy layer.
mynet.weights_gradient

array([[-0.24693766, -0.41796969, -0.42115039, -0.41191583, -0.23870429,
        -0.50677284, -0.47206002, -0.41498892, -0.3045326 , -0.41948812],
       [-0.27385473, -0.47349682, -0.53985129, -0.44611956, -0.35236976,
        -0.5359183 , -0.47536654, -0.43817902, -0.40701409, -0.47530964]])

In [24]:
# Both weight gradients should agree up to floating-point tolerance.
np.allclose(net[0].weight.grad, mynet.weights_gradient)

True

### Input gradient: $\frac{\partial L}{\partial o^{[L-1]}}$

In [25]:
# Reference: gradient of the loss w.r.t. the layer input, from torch autograd.
inputs.grad

tensor([[ 0.0472,  0.0237, -0.0030,  0.0475,  0.0265, -0.0684,  0.0672, -0.0168,
          0.0556, -0.0270],
        [ 0.0344,  0.0145, -0.0062,  0.0351,  0.0161, -0.0499,  0.0494, -0.0066,
          0.0416, -0.0234],
        [ 0.0188,  0.0055, -0.0070,  0.0195,  0.0058, -0.0273,  0.0273,  0.0015,
          0.0237, -0.0161],
        [ 0.0322,  0.0281,  0.0154,  0.0306,  0.0323, -0.0467,  0.0442, -0.0361,
          0.0333, -0.0023],
        [ 0.0511,  0.0232, -0.0067,  0.0518,  0.0258, -0.0741,  0.0731, -0.0132,
          0.0612, -0.0324],
        [ 0.0500,  0.0156, -0.0169,  0.0518,  0.0169, -0.0725,  0.0725,  0.0017,
          0.0626, -0.0413],
        [ 0.0376,  0.0239,  0.0049,  0.0371,  0.0270, -0.0545,  0.0529, -0.0237,
          0.0424, -0.0148],
        [ 0.0386,  0.0047, -0.0239,  0.0411,  0.0043, -0.0560,  0.0571,  0.0165,
          0.0512, -0.0418]])

In [26]:
# Gradient w.r.t. the layer input, as returned by the NumPy layer's backward().
mybackward

array([[ 0.0471791 ,  0.0236846 , -0.00297638,  0.04752408,  0.02650434,
        -0.06843189,  0.06721985, -0.01677932,  0.05562619, -0.02695742],
       [ 0.03441968,  0.01453626, -0.00617188,  0.03509367,  0.01609052,
        -0.04993307,  0.04942417, -0.00659239,  0.04164364, -0.02335442],
       [ 0.0187929 ,  0.0054538 , -0.00699101,  0.01954318,  0.0058472 ,
        -0.02727065,  0.02733257,  0.00151409,  0.02369787, -0.01608931],
       [ 0.03219908,  0.02813908,  0.01543325,  0.03059085,  0.03225896,
        -0.04666741,  0.04420155, -0.0361133 ,  0.03333062, -0.00229955],
       [ 0.05107765,  0.0232364 , -0.00673042,  0.05182148,  0.02584814,
        -0.07409395,  0.0731109 , -0.01321209,  0.06115348, -0.03241868],
       [ 0.04999102,  0.01564844, -0.01693301,  0.05181118,  0.01690407,
        -0.07253922,  0.07254781,  0.00167819,  0.0625973 , -0.04126553],
       [ 0.03760526,  0.0238728 ,  0.00491175,  0.03711128,  0.02703602,
        -0.05453014,  0.0528806 , -0.02366027

In [27]:
# Both input gradients should agree up to floating-point tolerance.
np.allclose(inputs.grad, mybackward)

True