In [1]:
import numpy as np

In [2]:
# Two independent sets of 12 random 2-D points, each drawn uniformly from [0, 1).
a, b = (np.random.rand(12, 2) for _ in range(2))

In [3]:
# Squared distance between a and b, accumulated one coordinate column at a time.
sum(np.sum((a[:, col] - b[:, col]) ** 2) for col in range(2))

5.351521860538725

In [4]:
# The same squared distance in a single vectorized reduction over every element.
((a - b) ** 2).sum()

5.351521860538725

In [5]:
import torch

In [6]:
# Reference torch model: one affine layer (10 features in, 2 out) with no
# activation after it, paired with a mean-squared-error criterion.
net = torch.nn.Sequential(torch.nn.Linear(in_features=10, out_features=2))
loss_fn = torch.nn.MSELoss(reduction="mean")

In [7]:
# A batch of 8 random samples with matching random targets. Only `inputs`
# tracks gradients, so the loss gradient w.r.t. the input can be read later.
inputs = torch.rand((8, 10), requires_grad=True)
labels = torch.rand((8, 2))

In [8]:
# Forward pass of the whole batch through the torch reference model.
pred = net(inputs)

In [9]:
pred

tensor([[-0.0074,  0.6776],
        [ 0.5342,  0.5625],
        [ 0.3380,  0.4247],
        [ 0.4657,  0.1548],
        [ 0.2714,  0.8351],
        [ 0.1939,  0.6120],
        [ 0.4586,  0.2364],
        [ 0.2945,  0.1556]], grad_fn=<AddmmBackward0>)

In [10]:
# Evaluate the criterion on the torch predictions, then backpropagate so that
# .grad is populated on the layer's parameters and on `inputs`.
# NOTE(review): `pred` comes from an earlier cell, so re-running only this cell
# presumably fails (graph already consumed) - re-run the forward cell first.
loss = loss_fn(pred, labels)
loss.backward()

In [11]:
loss

tensor(0.0843, grad_fn=<MseLossBackward0>)

In [12]:
net[0].weight.grad.shape

torch.Size([2, 10])

In [13]:
# NOTE(review): numpy is already imported in the first cell; this repeat is redundant.
import numpy as np
# Check what type an ndarray's .shape attribute has.
type(np.random.rand(2,3).shape)

tuple

In [14]:
class NeuralLayer:
    """A single fully connected layer with a hand-written backward pass.

    For ``shape == (in_features, out_features)`` the weights are stored as an
    ``(out_features, in_features)`` array and the bias as ``(out_features,)``.
    A forward call caches its input and pre-activation so that ``backward``
    can compute the parameter gradients and the gradient w.r.t. the input.
    """

    @staticmethod
    def get_activation_function(name):
        """Return the ``(function, gradient)`` pair for the named activation.

        Args:
            name: ``"ReLU"`` or ``"None"`` (the identity).

        Returns:
            A 2-tuple of callables: the activation itself and its derivative
            evaluated at the pre-activation.

        Raises:
            ValueError: if ``name`` is not a supported activation.
        """
        if name == "ReLU":
            return (
                # np.maximum maps -inf to 0, whereas x*(x>0) would give NaN.
                lambda x: np.maximum(x, 0),  # function
                lambda x: 1 * (x > 0),  # gradient (taken as 0 at x == 0)
            )
        if name == "None":
            return (
                lambda x: x,
                lambda x: 1,
            )
        raise ValueError(f"Invalid Activation Function: {name}")

    def __init__(
        self,
        shape,
        activation_function="ReLU"
    ):
        """Create a layer with uniformly random parameters.

        Args:
            shape: ``(in_features, out_features)``.
            activation_function: name understood by
                ``get_activation_function``; defaults to ``"ReLU"``.

        Raises:
            ValueError: if ``activation_function`` is not supported.
        """
        # Resolve the activation first so an invalid name fails before any work.
        self.activation, self.activation_gradient = (
            NeuralLayer.get_activation_function(activation_function)
        )
        in_features, out_features = shape
        self.weights = np.random.rand(out_features, in_features)
        self.bias = np.random.rand(out_features)
        # Zero-initialised so the gradients are well defined before the first backward().
        self.weights_gradient = np.zeros((out_features, in_features))
        self.bias_gradient = np.zeros(out_features)
        # Forward-pass caches consumed by backward().
        self.input_buffer = None
        self.output_buffer = None
        # self.momentum = get_momentum_function(momentum)
        # self.regularization = get_regularization_function(regularization)

    def __call__(self, data):
        """Run the forward pass: ``activation(data @ W.T + b)``.

        Args:
            data: array whose last axis has ``in_features`` entries; the
                backward pass expects it to be 2-D ``(batch, in_features)``.

        Returns:
            The activated output.
        """
        self.input_buffer = data
        # Cache the pre-activation: backward() needs it for the activation derivative.
        self.output_buffer = data @ self.weights.T + self.bias
        return self.activation(self.output_buffer)

    def backward(self, output_gradient):
        """Backpropagate the gradient of the loss through this layer.

        Stores the parameter gradients in ``weights_gradient`` and
        ``bias_gradient`` (overwriting, not accumulating).

        Args:
            output_gradient: gradient of the loss w.r.t. this layer's
                activated output, shaped ``(batch, out_features)``.

        Returns:
            Gradient of the loss w.r.t. the layer input, ``(batch, in_features)``.

        Raises:
            RuntimeError: if no forward pass has been run yet.
        """
        if self.input_buffer is None or self.output_buffer is None:
            raise RuntimeError("backward() called before a forward pass")
        # Chain rule through the activation: without this factor the gradients
        # are only correct for the identity activation.
        delta = output_gradient * self.activation_gradient(self.output_buffer)
        self.bias_gradient[:] = delta.sum(axis=0)
        self.weights_gradient[:] = delta.T @ self.input_buffer
        return delta @ self.weights

    def update_weights(self):
        """Placeholder for the parameter update step; currently does nothing."""
        pass

    def zero_gradient(self):
        """Reset both parameter gradients to zero.

        The cached forward buffers are left intact so that ``backward`` still
        works when this is called between the forward and backward passes.
        """
        self.bias_gradient[:] = 0
        self.weights_gradient[:] = 0

In [15]:
# NumPy counterpart of the torch layer: 10 inputs -> 2 outputs, identity activation.
mynet = NeuralLayer((10,2), activation_function="None")

In [16]:
# Overwrite the NumPy layer's parameters in place with the torch layer's values
# so both models compute from identical weights and bias.
np.copyto(mynet.weights, net[0].weight.detach().numpy())
np.copyto(mynet.bias, net[0].bias.detach().numpy())
# NumPy versions of the same batch and targets that were fed to the torch model.
myinputs, mylabels = inputs.detach().numpy(), labels.numpy()
def myloss_fn(pred, labels):
    """Half the summed squared error, divided by the batch size ``pred.shape[0]``.

    NOTE(review): torch's ``MSELoss`` with mean reduction averages over every
    element, so the two only coincide when there are exactly 2 output columns.
    """
    scale = 1/(2*pred.shape[0])
    return scale * np.sum((labels - pred)**2)


def myloss_grad(pred, labels):
    """Gradient of ``myloss_fn`` w.r.t. ``pred``: ``(pred - labels)`` over the batch size."""
    return 1/pred.shape[0] * (pred - labels)

In [17]:
# Forward pass through the NumPy layer; this also caches the input in mynet.input_buffer.
mypred = mynet(myinputs)

### $\frac{\partial L}{\partial b^L}$

In [18]:
net[0].bias.grad

tensor([-0.0107, -0.1315])

In [19]:
# Bias gradient by hand: the loss gradient summed over the batch axis (the same
# formula NeuralLayer.backward uses). Compare with net[0].bias.grad above.
myloss_grad(mypred, mylabels).sum(axis=0)

array([-0.01073851, -0.13148743])

### $\frac{\partial L}{\partial W^L}$

In [20]:
net[0].weight.grad

tensor([[ 0.0167, -0.0202, -0.0091,  0.0231,  0.0613,  0.0456, -0.0147,  0.0204,
         -0.0194,  0.0057],
        [-0.0576, -0.0726, -0.0731,  0.0101, -0.0225, -0.0834, -0.0330, -0.0435,
         -0.0400, -0.0713]])

In [21]:
# Weight gradient by hand: (loss gradient)^T @ cached layer input (the same
# formula NeuralLayer.backward uses). Compare with net[0].weight.grad above.
myloss_grad(mypred, mylabels).T @ mynet.input_buffer

array([[ 0.01674437, -0.02023538, -0.00913341,  0.02308217,  0.06130684,
         0.04563345, -0.01467364,  0.02041105, -0.01937562,  0.0057479 ],
       [-0.05760419, -0.07262027, -0.07313376,  0.0100964 , -0.02253775,
        -0.08342972, -0.0329759 , -0.0434947 , -0.03999242, -0.07133868]])

### $\frac{\partial L}{\partial o^{[L-1]}}$

In [22]:
inputs.grad

tensor([[ 5.3356e-04, -3.9569e-03,  5.2732e-03,  2.0215e-03, -6.0722e-03,
         -4.5945e-03,  6.3452e-03, -6.6904e-03, -2.5719e-05,  1.0113e-03],
        [ 3.8793e-03,  1.9305e-02, -4.6591e-05, -1.5163e-02,  5.3236e-03,
          1.2374e-02, -1.7701e-02,  5.9868e-03, -6.9876e-03, -9.7980e-03],
        [-4.4639e-04, -4.4226e-03,  1.7629e-03,  3.1120e-03, -2.8778e-03,
         -3.5199e-03,  4.9596e-03, -3.1902e-03,  1.1154e-03,  1.9127e-03],
        [ 1.7581e-03,  1.3312e-02, -3.6649e-03, -9.7062e-03,  7.1088e-03,
          9.9532e-03, -1.4081e-02,  7.8989e-03, -3.8122e-03, -6.0684e-03],
        [-1.8511e-03, -4.8257e-03, -3.4801e-03,  4.5109e-03,  1.9735e-03,
         -1.7276e-03,  2.6221e-03,  2.1277e-03,  2.7138e-03,  3.1105e-03],
        [-1.9843e-03, -3.4129e-03, -5.1356e-03,  3.7423e-03,  3.9265e-03,
         -1.7600e-04,  4.7391e-04,  4.2805e-03,  2.6601e-03,  2.7064e-03],
        [ 1.7446e-03,  3.6840e-03,  3.9696e-03, -3.7146e-03, -2.7490e-03,
          8.0542e-04, -1.3240e-0

In [23]:
# Input gradient by hand: loss gradient @ weights (the same formula
# NeuralLayer.backward returns). Compare with inputs.grad above.
myloss_grad(mypred, mylabels) @ mynet.weights

array([[ 5.33562567e-04, -3.95687959e-03,  5.27319249e-03,
         2.02153306e-03, -6.07218158e-03, -4.59454736e-03,
         6.34518647e-03, -6.69035362e-03, -2.57197519e-05,
         1.01131409e-03],
       [ 3.87932328e-03,  1.93051496e-02, -4.65914507e-05,
        -1.51629625e-02,  5.32361932e-03,  1.23736871e-02,
        -1.77005520e-02,  5.98675137e-03, -6.98759402e-03,
        -9.79797316e-03],
       [-4.46386636e-04, -4.42259321e-03,  1.76295051e-03,
         3.11201884e-03, -2.87777472e-03, -3.51990990e-03,
         4.95955679e-03, -3.19023385e-03,  1.11543107e-03,
         1.91272672e-03],
       [ 1.75805536e-03,  1.33123008e-02, -3.66493193e-03,
        -9.70620187e-03,  7.10877199e-03,  9.95318148e-03,
        -1.40811372e-02,  7.89888590e-03, -3.81223338e-03,
        -6.06836473e-03],
       [-1.85111714e-03, -4.82568890e-03, -3.48007590e-03,
         4.51091384e-03,  1.97350412e-03, -1.72758205e-03,
         2.62209535e-03,  2.12765209e-03,  2.71382444e-03,
         3.