Before diving into implementing backpropagation through time, I decided to first implement plain backpropagation and then move on to the step where we can implement it through time.

I will be using [this](https://blog.zhaytam.com/2018/08/15/implement-neural-network-backpropagation/) tutorial and any other resources I can find on the internet.

In [1]:
import numpy as np
np.random.seed(42)

In [2]:
class Layer:
    """
    Represent one layer of our neural network.
    """

    def __init__(self, n_input, neurons, weights=None, biais=None, activation=None):
        """
        n_input    : the number of inputs fed to this layer
        neurons    : the number of neurons in this layer
        weights    : the layer weights; when None, random values of shape
                     (n_input, neurons) are drawn
        biais      : the layer bias; when None, random values of shape
                     (neurons,) are drawn
        activation : stored unchanged on the layer as ``self.activation``
        """
        # Compare against None explicitly: the truth value of a NumPy array
        # with more than one element is ambiguous and raises ValueError, so
        # `weights if weights else ...` fails as soon as an array is passed.
        self.weights = np.random.rand(n_input, neurons) if weights is None else weights
        self.biais = np.random.rand(neurons) if biais is None else biais
        self.activation = activation

In [3]:
# Build a sample layer with 3 inputs and 4 neurons; no weights or biases are
# passed, so Layer draws them at random.
hidden_layer_1 = Layer(3, 4)

In [60]:
# Inspect the element type of the sample layer's weight matrix.
hidden_layer_1.weights.dtype

dtype('float64')

In [5]:
# Display the sample layer's bias vector (one value per neuron).
hidden_layer_1.biais

array([0.83244264, 0.21233911, 0.18182497, 0.18340451])

####  Activation

Apply the activation function to the layer's weighted input:

$\sigma(X \cdot W + B)$

In [71]:
def activate(self, x):
    """
    Compute the layer output for the input x.

    x : the input value(s), multiplied with the layer weights via np.dot

    Returns a tuple (z, a) where z is the weighted input plus the bias and
    a is z passed through the layer's activation function.
    """
    weighted_input = np.dot(x, self.weights)
    pre_activation = weighted_input + self.biais
    return pre_activation, self._apply_activation(pre_activation)

In [72]:
# Attach the module-level activate function to Layer so it can be called as a
# method on layer instances.
Layer.activate = activate

In [51]:
def _apply_activation(self, normal):
    """
    Apply the layer's activation function to the value passed in parameter.

    normal : the pre-activation value(s)

    Returns np.tanh(normal) for 'tanh', the logistic function of normal for
    'sigmoid', and normal unchanged when self.activation is None or is a
    name that is not handled here.
    """
    if self.activation == 'tanh':
        return np.tanh(normal)
    if self.activation == 'sigmoid':
        # Logistic function 1 / (1 + e^(-x)). The exponent must be negated;
        # without the minus sign this computes sigmoid(-x) instead.
        return 1 / (1 + np.exp(-normal))
    # NOTE: 'relu' is not implemented yet; None and any unrecognised name
    # fall through to the identity.
    return normal
    

In [52]:
# Attach the module-level _apply_activation function to Layer; activate calls
# it through self._apply_activation.
Layer._apply_activation = _apply_activation

In [53]:
# Run a single 3-value input through the sample layer.
# NOTE(review): activate as defined in this file returns a (z, activation)
# tuple, while the recorded output is a single array -- presumably the output
# comes from an earlier version of activate; confirm by re-running.
hidden_layer_1.activate(np.array([1, 2, 3]))



array([3.32236508, 3.59926019, 1.09173962, 5.42414484])

Let us add our layers to our network and build it

In [54]:
class NeuralNetwork:
    """
    Represent a neural network as an ordered list of layers.
    """

    def __init__(self):
        # Layers in the order they were added; feed_foward walks them in
        # that same order.
        self._layers = []

    def add_layer(self, layer):
        """
        Add a layer to the network.

        layer : the layer to append after the layers already added
        """
        self._layers.append(layer)

    def feed_foward(self, X):
        """
        Propagate X through every layer and return the last activation.

        X : the network input; returned unchanged when no layer was added
        """
        for layer in self._layers:
            # activate returns (pre-activation, activation); only the
            # activation is passed on to the next layer. Keeping the whole
            # tuple would break the next layer's dot product and predict().
            _, X = layer.activate(X)
        return X

    def predict(self, X):
        """
        Predict a class, or one class per row for a 2-D output.

        Returns the index of the largest output: a single index when the
        network output is 1-D, otherwise one index per row (argmax on axis 1).
        """
        outputs = self.feed_foward(X)

        if outputs.ndim == 1:
            return np.argmax(outputs)
        return np.argmax(outputs, axis=1)

In [66]:
# Create an empty network; it holds no layers until add_layer is called.
the_network = NeuralNetwork()

In [67]:
# Stack three layers: 2 inputs -> 3 neurons (tanh) -> 3 neurons (sigmoid)
# -> 2 neurons (sigmoid). Each layer's input size matches the previous
# layer's neuron count.
the_network.add_layer(Layer(2, 3, activation='tanh'))
the_network.add_layer(Layer(3, 3, activation='sigmoid'))
the_network.add_layer(Layer(3, 2, activation='sigmoid'))

Let's see what our untrained network predicts for the four input pairs of a binary operation.

In [68]:
# Predict a class for each of the four 0/1 input pairs; the network has not
# been trained, so its weights are still the random initial ones.
the_network.predict(np.array([[0, 0], [0, 1], [1, 0], [1, 1]]))

array([1, 1, 1, 1])

In [32]:
# Check the weight matrix shape: (n_input, neurons).
hidden_layer_1.weights.shape

(3, 4)

In [42]:
# A single sample written as a 1 x 3 row matrix.
# NOTE(review): the recorded output (1, 3) is a shape, so this cell presumably
# ended with `.shape` when it was run -- confirm.
np.array([[1, 2, 3]])

(1, 3)

In [43]:
# Multiply the 1 x 3 row by the 3 x 4 weight matrix: one weighted sum per
# neuron, before the bias is added.
np.dot(np.array([[1, 2, 3]]), hidden_layer_1.weights)

array([[2.48992243, 3.38692108, 0.90991465, 5.24074033]])

### Training Phase and Backward Propagation

Let's define the derivative of the cost function, followed by the sigmoid function and its derivative.

In [69]:
def cost_derivative(self, output_activations, y):
    r"""Return the vector of partial derivatives \partial C_x / \partial a
    for the output activations.

    output_activations : the activations produced by the output layer
    y                  : the expected output

    The difference ``output_activations - y`` is the derivative of the
    quadratic cost ``0.5 * ||a - y||^2`` with respect to ``a``.
    """
    # Raw docstring above: "\p" is not a valid escape sequence in a normal
    # string literal and triggers a warning on recent Python versions.
    return output_activations - y

In [70]:
# Attach the module-level cost_derivative function to NeuralNetwork so it can
# be called as self.cost_derivative.
NeuralNetwork.cost_derivative = cost_derivative

In [73]:
def sigmoid(z):
    """The sigmoid function 1 / (1 + exp(-z)).

    z : a scalar or NumPy array; the function is applied element-wise.
    """
    # 1 / (1 + exp(-z)) == exp(-log(1 + exp(-z))). logaddexp(0, -z) computes
    # log(1 + exp(-z)) without forming exp(-z) directly, so large negative z
    # no longer overflows the way np.exp(-z) does.
    return np.exp(-np.logaddexp(0.0, -z))


def sigmoid_prime(z):
    """Derivative of the sigmoid function: sigmoid(z) * (1 - sigmoid(z))."""
    # Evaluate the sigmoid once and reuse it for both factors.
    s = sigmoid(z)
    return s * (1.0 - s)

This implementation follows chapter 2 of the book *Neural Networks and Deep Learning*: http://neuralnetworksanddeeplearning.com/chap2.html

In [74]:
def back_propagate(self, x, y):
    """
    Return the tuple (nabla_b, nabla_w) representing the gradient of the cost
    function layer by layer, i.e. the derivatives of the cost function with
    respect to the biases and the weights of each layer.

    x : the network input, either one sample of shape (n_input,) or a batch
        of shape (batch, n_input); a single sample is treated as a batch of 1
    y : the expected output, broadcastable against the network output

    nabla_b[i] has the shape of layer i's bias and nabla_w[i] the shape of
    layer i's weights. For a batch, the gradients are summed over the samples.

    Raises ValueError when the network has no layers.

    NOTE(review): the derivative used for every layer is sigmoid_prime, as in
    the reference implementation, so the result is only the true gradient
    when every layer uses a sigmoid activation -- confirm this is intended
    for networks that mix in 'tanh' or no activation.
    """
    layers = self._layers
    if not layers:
        raise ValueError("cannot back-propagate through a network without layers")

    # Feedforward, storing every weighted input z and every activation.
    # Rows are samples: Layer computes np.dot(x, weights) + biais with weights
    # of shape (n_input, neurons), so a lone sample becomes a 1-row batch.
    activation = np.atleast_2d(x)
    activations = [activation]
    z_s = []  # will store z layer by layer
    for layer in layers:
        # Each layer consumes the previous layer's activation, not x.
        z, activation = layer.activate(activation)
        z_s.append(z)
        activations.append(activation)

    nabla_b = [np.zeros(layer.biais.shape) for layer in layers]
    nabla_w = [np.zeros(layer.weights.shape) for layer in layers]

    # Output layer error: dC/da * sigma'(z).
    delta = self.cost_derivative(activations[-1], y) * sigmoid_prime(z_s[-1])
    nabla_b[-1] = delta.sum(axis=0)
    # (n_prev, batch) x (batch, n_out) -> (n_prev, n_out), the weights' shape.
    nabla_w[-1] = np.dot(activations[-2].transpose(), delta)

    # Back propagate the error, from the second-to-last layer to the first.
    # `depth` counts layers from the end: depth == 2 is the second-to-last.
    for depth in range(2, len(layers) + 1):
        sp = sigmoid_prime(z_s[-depth])
        # Pull the error back through the weights of the layer that follows.
        following_weights = layers[-depth + 1].weights
        delta = np.dot(delta, following_weights.transpose()) * sp
        nabla_b[-depth] = delta.sum(axis=0)
        nabla_w[-depth] = np.dot(activations[-depth - 1].transpose(), delta)
    return (nabla_b, nabla_w)