In [1]:
import numpy as np
import torch

In [2]:
# Reference model: one fully connected layer mapping 10 input features to 2 outputs.
net = torch.nn.Sequential(torch.nn.Linear(10, 2))
# Reference criterion: mean squared error with PyTorch's default settings.
loss_fn = torch.nn.MSELoss()

In [3]:
# Random batch of shape (8, 10); requires_grad=True so that the gradient with
# respect to the inputs is available as `inputs.grad` after backward().
inputs = torch.rand(8, 10, requires_grad=True)
# Random regression targets of shape (8, 2), matching the network's output width.
labels = torch.rand(8, 2)

In [4]:
# Forward pass of the PyTorch reference network on the random batch.
pred = net(inputs)

In [5]:
pred

tensor([[ 3.8142e-02, -2.4399e-01],
        [-1.5580e-01, -4.1225e-01],
        [ 3.2097e-01, -5.1974e-01],
        [-1.6614e-02, -2.4831e-01],
        [-5.6091e-04, -5.4096e-01],
        [ 7.7815e-02, -6.8261e-01],
        [-5.6924e-03, -2.7929e-01],
        [-9.4297e-02, -3.3948e-01]], grad_fn=<AddmmBackward0>)

In [6]:
# Reference loss between the PyTorch predictions and the labels.
loss = loss_fn(pred, labels)
# Backpropagate: this populates `.grad` on the layer's weight and bias and on
# `inputs`, which are the values compared against the NumPy layer below.
loss.backward()

In [7]:
loss

tensor(0.6876, grad_fn=<MseLossBackward0>)

In [8]:
class NeuralLayer:
    """A single fully connected layer implemented with NumPy.

    The layer computes ``activation(data @ weights.T + bias)`` in the forward
    pass and caches what it needs so that ``backward`` can compute the
    gradients with respect to the weights, the bias and the layer input.

    Attributes:
        weights: array of shape ``(shape[1], shape[0])`` (outputs x inputs).
        bias: array of shape ``(shape[1],)``.
        weights_gradient: gradient buffer with the same shape as ``weights``.
        bias_gradient: gradient buffer with the same shape as ``bias``.
        input_buffer: input of the most recent forward pass, or ``None``.
        output_buffer: pre-activation output of the most recent forward pass,
            or ``None``.
    """

    @staticmethod
    def get_activation_function(name):
        """Return the ``(function, gradient)`` pair for the named activation.

        Args:
            name: ``"ReLU"`` or ``"None"`` (the identity).

        Returns:
            A tuple of two callables: the activation itself and its derivative
            with respect to its argument.

        Raises:
            ValueError: if ``name`` is not a supported activation.
        """
        if name == "ReLU":
            return (
                lambda x: x * (x > 0),  # function
                lambda x: 1 * (x > 0),  # gradient
            )
        if name == "None":
            return (
                lambda x: x,  # function
                lambda x: 1,  # gradient
            )
        raise ValueError(f"Invalid Activation Function: {name}")

    def __init__(
        self,
        shape,
        activation_function="ReLU"
    ):
        """Create a layer with uniformly random weights and bias.

        Args:
            shape: ``(in_features, out_features)``.
            activation_function: name understood by
                ``get_activation_function``.

        Raises:
            ValueError: if ``shape`` does not have exactly two entries or the
                activation name is unknown.
        """
        in_features, out_features = shape
        # Weights are stored as (out_features, in_features); the weights are
        # drawn before the bias so the random stream is consumed in that order.
        self.weights = np.random.rand(out_features, in_features)
        self.bias = np.random.rand(out_features)
        self.activation, self.activation_gradient = (
            NeuralLayer.get_activation_function(activation_function)
        )
        # Zero-initialised so reading a gradient before the first backward()
        # never exposes uninitialised memory.
        self.weights_gradient = np.zeros((out_features, in_features))
        self.bias_gradient = np.zeros(out_features)
        # Forward-pass caches consumed by backward().
        self.input_buffer = None
        self.output_buffer = None
        # TODO: momentum and regularization are not implemented yet.

    def __call__(self, data):
        """Run the forward pass and cache the values needed by ``backward``.

        Args:
            data: array whose last axis has ``in_features`` entries; the
                notebook passes a ``(batch, in_features)`` array.

        Returns:
            The activated layer output.
        """
        self.input_buffer = data
        # Keep the pre-activation values: the activation derivative is
        # evaluated at them during the backward pass.
        self.output_buffer = data @ self.weights.T + self.bias
        return self.activation(self.output_buffer)

    def backward(self, output_gradient):
        """Backpropagate a gradient through the layer.

        Stores the weight and bias gradients in ``weights_gradient`` and
        ``bias_gradient`` and returns the gradient with respect to the input
        of the most recent forward pass.

        Args:
            output_gradient: gradient of the loss with respect to the
                activated output, with the same shape as that output.

        Returns:
            Gradient of the loss with respect to the layer input.

        Raises:
            RuntimeError: if no forward pass has been cached.
        """
        if self.input_buffer is None or self.output_buffer is None:
            raise RuntimeError(
                "backward() called without a preceding forward pass"
            )
        # Chain rule through the activation: without this factor the
        # gradients are only correct for the identity activation.
        delta = output_gradient * self.activation_gradient(self.output_buffer)
        self.bias_gradient[:] = delta.sum(axis=0)
        self.weights_gradient[:] = delta.T @ self.input_buffer
        return delta @ self.weights

    def update_weights(self):
        """Placeholder for the parameter update step; currently a no-op."""
        pass

    def zero_gradient(self):
        """Reset both gradient buffers to zero and drop the forward caches."""
        self.bias_gradient[:] = 0
        self.weights_gradient[:] = 0
        self.input_buffer = None
        self.output_buffer = None

In [9]:
# NumPy counterpart of the torch Linear(10, 2) layer; the "None" (identity)
# activation keeps it purely linear like the reference network.
mynet = NeuralLayer((10,2), activation_function="None")

In [10]:
# Copy the PyTorch layer's parameters into the NumPy layer so both models
# start from identical weights and bias.
mynet.weights[:] = net[0].weight.detach().numpy()
mynet.bias[:] = net[0].bias.detach().numpy()
# NumPy versions of the same inputs and labels that were fed to PyTorch.
myinputs = inputs.detach().numpy()
mylabels = labels.numpy()
def myloss_fn(pred, labels):
    """Mean squared error averaged over every element of ``pred``.

    This is the same reduction as ``torch.nn.MSELoss()`` with its default
    ``reduction='mean'``, so it agrees with the reference loss for any output
    width (the former ``1/(2*N)`` factor only matched when the width was 2).

    Args:
        pred: NumPy array of predictions.
        labels: NumPy array of targets, broadcastable against ``pred``.

    Returns:
        The scalar loss.
    """
    return np.mean((labels - pred) ** 2)


def myloss_grad(pred, labels):
    """Gradient of ``myloss_fn`` with respect to ``pred``.

    Args:
        pred: NumPy array of predictions.
        labels: NumPy array of targets, broadcastable against ``pred``.

    Returns:
        Array with the shape of ``pred``.
    """
    # d/dpred mean((labels - pred)**2) = 2 * (pred - labels) / number_of_elements
    return 2 / pred.size * (pred - labels)

In [11]:
# Forward pass of the NumPy layer; this also caches the inputs inside `mynet`
# for the backward pass.
mypred = mynet(myinputs)

In [12]:
# Backpropagate the loss gradient through the NumPy layer: this fills
# `mynet.bias_gradient` and `mynet.weights_gradient` and returns the gradient
# with respect to the layer inputs.
mybackward = mynet.backward(myloss_grad(mypred, mylabels))

### $\frac{\partial L}{\partial b^L}$

In [13]:
net[0].bias.grad

tensor([-0.5990, -0.9434])

In [14]:
mynet.bias_gradient

array([-0.5989859 , -0.94335283])

In [15]:
np.allclose(net[0].bias.grad, mynet.bias_gradient)

True

### $\frac{\partial L}{\partial W^L}$

In [16]:
net[0].weight.grad

tensor([[-0.3721, -0.2844, -0.2841, -0.2399, -0.3206, -0.2928, -0.2848, -0.2797,
         -0.2468, -0.2210],
        [-0.5271, -0.3468, -0.4427, -0.3693, -0.4416, -0.5012, -0.4793, -0.4558,
         -0.3706, -0.4056]])

In [17]:
mynet.weights_gradient

array([[-0.3720902 , -0.28435787, -0.28412932, -0.23986145, -0.32058406,
        -0.29282174, -0.28483872, -0.27968048, -0.24678782, -0.22103372],
       [-0.5270703 , -0.34684077, -0.44267892, -0.36930918, -0.44159497,
        -0.50117398, -0.47931995, -0.45578168, -0.37064137, -0.40556333]])

In [18]:
np.allclose(net[0].weight.grad, mynet.weights_gradient)

True

### $\frac{\partial L}{\partial o^{[L-1]}}$

In [19]:
inputs.grad

tensor([[-0.0145,  0.0028,  0.0163,  0.0436,  0.0165, -0.0359,  0.0217,  0.0083,
         -0.0235,  0.0244],
        [ 0.0128,  0.0164,  0.0183,  0.0476,  0.0140, -0.0345,  0.0058, -0.0220,
         -0.0378, -0.0090],
        [-0.0029,  0.0072,  0.0148,  0.0390,  0.0133, -0.0305,  0.0131, -0.0035,
         -0.0253,  0.0093],
        [ 0.0077,  0.0119,  0.0146,  0.0381,  0.0115, -0.0280,  0.0063, -0.0148,
         -0.0291, -0.0040],
        [-0.0018,  0.0076,  0.0145,  0.0385,  0.0130, -0.0299,  0.0123, -0.0046,
         -0.0254,  0.0078],
        [ 0.0010,  0.0068,  0.0110,  0.0291,  0.0095, -0.0222,  0.0078, -0.0060,
         -0.0202,  0.0030],
        [ 0.0043,  0.0112,  0.0161,  0.0422,  0.0134, -0.0317,  0.0096, -0.0119,
         -0.0305,  0.0008],
        [ 0.0133,  0.0151,  0.0157,  0.0409,  0.0117, -0.0292,  0.0036, -0.0214,
         -0.0334, -0.0106]])

In [20]:
mybackward

array([[-0.01451984,  0.00277703,  0.01629723,  0.04360741,  0.0164942 ,
        -0.03593987,  0.02168799,  0.00832113, -0.02349492,  0.02439209],
       [ 0.01280766,  0.01637692,  0.01825308,  0.04759858,  0.01397548,
        -0.03447095,  0.00583379, -0.02202984, -0.03778836, -0.00903639],
       [-0.00290579,  0.00718625,  0.01475293,  0.03903777,  0.01334702,
        -0.03049788,  0.01313251, -0.00350837, -0.02530954,  0.0092766 ],
       [ 0.00767498,  0.01190685,  0.01456556,  0.0380914 ,  0.01154598,
        -0.02801333,  0.00627126, -0.01483449, -0.02914969, -0.00402767],
       [-0.00180112,  0.00757236,  0.01454838,  0.038451  ,  0.01299725,
        -0.0298633 ,  0.01227463, -0.00460757, -0.02537871,  0.00781683],
       [ 0.00096233,  0.00680622,  0.01103485,  0.02906533,  0.00949803,
        -0.02218798,  0.00783185, -0.00600587, -0.02016862,  0.00301697],
       [ 0.00428784,  0.0112255 ,  0.01606371,  0.04218778,  0.01337981,
        -0.03172538,  0.00956805, -0.01185623

In [21]:
np.allclose(inputs.grad, mybackward)

True