In [1]:
import numpy as np

In [2]:
class ActivationFunction:
    """Element-wise activation layer that caches its input for the backward pass.

    Parameters
    ----------
    fname : str
        Name of the activation to use. Only "ReLU" is available.

    Raises
    ------
    ValueError
        If ``fname`` does not name a known activation.
    """

    def __init__(self, fname="ReLU"):
        self._foward, self._backward = ActivationFunction.get_functions(fname)
        # Input of the most recent foward call; None while nothing is pending.
        self._buffer = None

    def foward(self, data):
        """Apply the activation to ``data`` and remember ``data`` for backward.

        Prints a warning when the previous foward call was never followed by
        a backward call; the stale input is then overwritten.
        """
        # Compare against None explicitly: the truth value of a multi-element
        # NumPy array is ambiguous and raises ValueError.
        if self._buffer is not None:
            print("No call to backward after previous foward call.")
        self._buffer = data
        return self._foward(data)

    def backward(self, grad):
        """Return ``grad`` multiplied by the activation derivative at the cached input.

        Raises
        ------
        RuntimeError
            If there is no cached input (foward was not called first).
        """
        if self._buffer is None:
            raise RuntimeError("backward called without a preceding foward call.")
        delta = grad * self._backward(self._buffer)
        # The cached input is consumed by exactly one backward call.
        self._buffer = None
        return delta

    @staticmethod
    def get_functions(fname):
        """Return the ``(function, gradient)`` pair of callables for ``fname``."""
        if fname == "ReLU":
            return (
                lambda x: x*(x>0), # function
                lambda x: 1*(x>0)  # gradient
            )
        else:
            raise ValueError(f"Invalid Activation Function: {fname}")

In [3]:
class LossFunction:
    """Loss layer that caches ``(pred, label)`` for the backward pass.

    Parameters
    ----------
    fname : str
        Name of the loss to use. Only "MSE" is available.

    Raises
    ------
    ValueError
        If ``fname`` does not name a known loss.
    """

    def __init__(self, fname="MSE"):
        self._foward, self._backward = LossFunction.get_functions(fname)
        # (pred, label) of the most recent foward call; None while nothing is pending.
        self._buffer = None

    def foward(self, pred, label):
        """Return the loss value for ``pred`` against ``label`` and cache both.

        Prints a warning when the previous foward call was never followed by
        a backward call; the stale pair is then overwritten.
        """
        if self._buffer is not None:
            print("No call to backward after previous foward call.")
        self._buffer = (pred, label)
        return self._foward(pred, label)

    def backward(self):
        """Return the gradient of the loss with respect to the cached prediction.

        Raises
        ------
        RuntimeError
            If there is no cached pair (foward was not called first).
        """
        if self._buffer is None:
            raise RuntimeError("backward called without a preceding foward call.")
        delta = self._backward(*self._buffer)
        # The cached pair is consumed by exactly one backward call.
        self._buffer = None
        return delta

    @staticmethod
    def get_functions(fname):
        """Return the ``(function, gradient)`` pair of callables for ``fname``.

        For "MSE" the function is the sum of squared differences divided by
        ``2 * o.shape[0]`` and the gradient is ``(o - y) / o.shape[0]``.
        """
        if fname == "MSE":
            return (
                lambda o, y: np.sum((o - y)**2) / (2*o.shape[0]), # function
                lambda o, y: (o - y) / o.shape[0]                 # gradient
            )
        else:
            # This is a loss lookup, so the message must not mention activations.
            raise ValueError(f"Invalid Loss Function: {fname}")

In [4]:
class Optimizer:
    """Builds the parameter-update rule and exposes it as ``self.optimize``.

    ``optimize(old, grad, old_delta)`` returns the update to add to a parameter:
    ``-eta * grad + alpha * old_delta - 2 * l2_coeff * old``.

    Parameters
    ----------
    eta : float
        Factor applied to the gradient.
    l2_coeff : float
        Factor applied (times two) to the current parameter value; this term
        is not scaled by ``eta``.
    alpha : float
        Factor applied to the previous update.
    """

    def __init__(self, eta=1e-3, l2_coeff=0.01, alpha=0.3):
        def optimize(old, grad, old_delta):
            # Terms are combined in the order gradient, previous update, decay.
            gradient_term = -eta * grad
            previous_update_term = alpha * old_delta
            decay_term = 2 * l2_coeff * old
            return gradient_term + previous_update_term - decay_term

        self.optimize = optimize

In [5]:
class LinearLayer:
    """Fully connected layer computing ``data @ weights.T + bias``.

    Parameters
    ----------
    shape : tuple
        ``(n_inputs, n_outputs)``. Weights are stored with the reversed shape
        ``(n_outputs, n_inputs)`` and the bias has length ``n_outputs``.
    """

    def __init__(self, shape):
        self.weights = np.random.rand(*shape[::-1])
        self.bias = np.random.rand(shape[1])
        # gradient
        self.weights_gradient = np.zeros(shape[::-1])
        self.bias_gradient = np.zeros(shape[1])
        # old update (momentum): must start at zero so that the first update
        # does not read uninitialised memory as a "previous" update.
        self.weights_delta = np.zeros(shape[::-1])
        self.bias_delta = np.zeros(shape[1])
        # Input of the most recent foward call; None while nothing is pending.
        self._buffer = None

    def foward(self, data):
        """Return the layer output for ``data`` and cache ``data`` for backward."""
        self._buffer = data
        output = data @ self.weights.T + self.bias
        return output

    def backward(self, output_gradient):
        """Store the parameter gradients and return the gradient w.r.t. the input.

        ``bias_gradient`` receives ``output_gradient`` summed over axis 0 and
        ``weights_gradient`` receives ``output_gradient.T @ cached_input``.

        Raises
        ------
        RuntimeError
            If there is no cached input (foward was not called first).
        """
        # Check before touching any gradient buffer so a misuse leaves them intact.
        if self._buffer is None:
            raise RuntimeError("backward called without a preceding foward call.")
        self.bias_gradient[:] = output_gradient.sum(axis=0)
        self.weights_gradient[:] = output_gradient.T @ self._buffer
        self._buffer = None
        input_gradient = output_gradient @ self.weights
        return input_gradient

    def update(self, weights_delta, bias_delta):
        """Add the given deltas to the parameters and remember them as the last update."""
        self.weights[:] = self.weights + weights_delta
        self.bias[:] = self.bias + bias_delta
        self.weights_delta[:] = weights_delta
        self.bias_delta[:] = bias_delta

In [6]:
class NeuralNetwork:
    """Sequence of layers evaluated in order, followed by a loss.

    Parameters
    ----------
    net : sequence
        Layers alternating LinearLayer, ActivationFunction, LinearLayer, ...
        starting with a LinearLayer (validated by ``check_network``).
    loss : LossFunction, optional
        Loss applied to the last layer's output. A fresh ``LossFunction()`` is
        created per network when omitted.
    optimizer : Optimizer, optional
        Stored on the instance; a fresh ``Optimizer()`` is created per network
        when omitted. No method of this class uses it.

    Raises
    ------
    ValueError
        If ``net`` does not follow the expected layer alternation.
    """

    def __init__(self, net, loss=None, optimizer=None):
        NeuralNetwork.check_network(net)
        self.net = net
        # Defaults are built here rather than in the signature: a default
        # expression is evaluated once, which would make every network share
        # one LossFunction and its cached (pred, label) state.
        self.loss = LossFunction() if loss is None else loss
        self.optimizer = Optimizer() if optimizer is None else optimizer
        self._buffer = None

    def foward(self, data, label):
        """Run ``data`` through every layer and return ``(loss_value, output)``."""
        # Nothing in this class assigns _buffer after construction, so this
        # warning does not fire here; the layers and the loss do their own checks.
        if self._buffer is not None:
            print("No call to backward after previous foward call.")
        out = data
        for layer in self.net:
            out = layer.foward(out)
        return self.loss.foward(out, label), out

    def backward(self):
        """Backpropagate from the loss through the layers in reverse order.

        Returns the value produced by the first layer's ``backward`` call.
        """
        grad = self.loss.backward()
        for layer in self.net[::-1]:
            grad = layer.backward(grad)
        return grad

    @staticmethod
    def check_network(net):
        """Raise ValueError unless layer types alternate, starting with LinearLayer."""
        expected_layer_type = LinearLayer
        for i, layer in enumerate(net):
            if not isinstance(layer, expected_layer_type):
                raise ValueError(f"layer #{i} is of type {type(layer)} expected type is {expected_layer_type}")
            # Flip the expectation for the next position.
            expected_layer_type = ActivationFunction if expected_layer_type is LinearLayer else LinearLayer

In [None]:
import torch

In [None]:
# Seed torch's global RNG so that the reference weights, and the random batch
# drawn after them, are the same on every Restart & Run All.
TORCH_SEED = 0
torch.manual_seed(TORCH_SEED)

# PyTorch reference model: 8 -> 16 -> 16 -> 2 with ReLU between the linear layers.
tnet = torch.nn.Sequential(
    torch.nn.Linear(8, 16),
    torch.nn.ReLU(),
    torch.nn.Linear(16, 16),
    torch.nn.ReLU(),
    torch.nn.Linear(16, 2)
)
tloss = torch.nn.MSELoss()

In [None]:
# Random batch of 10 samples with 8 features; gradients are tracked on the
# input so they can be read back from tdata.grad after the backward pass.
tdata = torch.rand((10, 8), requires_grad=True)
# Matching random targets with 2 values per sample.
tlabel = torch.rand((10, 2))
# Reference prediction and loss from the PyTorch model.
tpred = tnet(tdata)
terror = tloss(input=tpred, target=tlabel)

In [None]:
# Display the loss that tloss produced for the PyTorch prediction.
terror

In [None]:
# Display the output of the PyTorch network for the batch tdata.
tpred

In [None]:
# Backpropagate the PyTorch loss. tdata was created with requires_grad=True,
# so the gradient with respect to the input batch is then displayed from tdata.grad.
terror.backward()
tdata.grad

In [None]:
# NumPy counterpart of the PyTorch model: 8 -> 16 -> 16 -> 2 with ReLU
# activations between the linear layers.
net = NeuralNetwork(
    [
        LinearLayer((8, 16)),
        ActivationFunction(fname="ReLU"),
        LinearLayer((16, 16)),
        ActivationFunction(fname="ReLU"),
        LinearLayer((16, 2)),
    ]
)

In [None]:
# Overwrite the NumPy layers' parameters with those of the PyTorch model so
# both networks compute with identical weights and biases.
for layer, tlayer in zip(net.net, tnet):
    # Activation layers carry no parameters to copy.
    if not isinstance(layer, LinearLayer):
        continue
    layer.weights[:] = tlayer.weight.detach().numpy()
    layer.bias[:] = tlayer.bias.detach().numpy()

In [None]:
# Feed the NumPy network the same batch and targets that the PyTorch model received.
data = tdata.detach().numpy()
label = tlabel.numpy()
# foward returns the loss value together with the last layer's output.
loss, pred = net.foward(data, label)

In [None]:
# Display the loss value returned by the NumPy network's forward pass.
loss

In [None]:
# Display the NumPy network's output for the batch.
pred

In [None]:
# Backward pass of the NumPy network; the returned value is what the first
# layer's backward call produced, i.e. the gradient with respect to the input batch.
grad = net.backward()
grad

In [None]:
# Compare the NumPy input gradient with the one PyTorch autograd stored on tdata,
# using np.allclose's default tolerances.
# NOTE(review): the two paths may compute in different float precisions — confirm
# the default tolerances are appropriate.
np.allclose(grad, tdata.grad.numpy())