## Neural network implementation

In [2]:
import numpy as np
from sklearn import datasets
from copy import deepcopy

### ReLU Layer

In [3]:
class ReLULayer(object):
    """Parameter-free rectified-linear activation layer.

    The layer caches the array passed to `forward` so that `backward` can
    later decide, element by element, whether the gradient is let through.
    """

    def __init__(self):
        # nothing to configure: the layer has no parameters
        pass

    def forward(self, input_):
        """Apply ReLU element-wise.

        inputs:  input_ -> array -> preactivations produced by the preceding layer
        outputs: array with every negative entry of input_ replaced by 0
        """
        # keep a reference to the input; backward() needs it
        self.input = input_
        return np.maximum(input_, 0)

    def backward(self, upstream_gradient):
        """Propagate the loss gradient through the ReLU.

        inputs:  upstream_gradient -> array -> gradient of the loss w.r.t. the
                                               output of this layer
        outputs: array -> gradient of the loss w.r.t. the input of this layer;
                          entries whose cached input was <= 0 are zeroed
        """
        # local derivative of ReLU: 1 everywhere, except 0 where the input was <= 0
        local_gradient = np.ones_like(self.input)
        blocked = np.nonzero(self.input <= 0)
        local_gradient[blocked] = 0

        # chain rule: local derivative times the gradient arriving from above
        gradient_wrt_input = local_gradient * upstream_gradient
        print("ReLU Downstream: ", gradient_wrt_input.shape)
        return gradient_wrt_input

    def update(self, learning_rate):
        # ReLU has no parameters, so there is nothing to update
        pass

In [4]:
# Toy blob data (two centres, five features) used by the quick layer check below.
# make_blobs returns the samples first and their labels second.
data = datasets.make_blobs(centers=2, n_features=5)
X, y = data[0], data[1]

### Softmax output layer

In [5]:
# Softmax activation
class OutputLayer(object):
    """Softmax output layer combined with the cross-entropy loss.

    `forward` turns logits into class probabilities; `backward` takes those
    probabilities together with the true labels and returns the gradient of
    the cross-entropy loss w.r.t. the logits.
    """

    def __init__(self, n_classes):
        self.n_classes = n_classes
        # loss of the most recent backward() call
        self.cross_entropy_loss = 0

    def forward(self, logits):
        """Compute the row-wise softmax of the logits.

        input:  logits -> 2-D array, axis 0 is the batch dimension
        output: softmax -> array of the same shape; every row sums to 1
        """
        # remember the input for later backpropagation
        self.input = logits

        # subtract the per-sample maximum before exponentiating so that
        # np.exp cannot overflow; the softmax value is unchanged by the shift
        max_ = np.max(logits, axis=1)[:, None]
        exponentials = np.exp(logits - max_)
        sum_ = np.sum(exponentials, axis=1)[:, None]
        return exponentials / sum_

    def backward(self, predictions, y, return_loss = False):
        """Gradient of the cross-entropy loss w.r.t. the logits.

        inputs:  predictions -> array -> softmax probabilities from forward()
                 y -> integer array -> true class index of every sample
                 return_loss -> bool -> also return the mean cross-entropy loss
        outputs: downstream_gradient -> array shaped like predictions, equal to
                                        (softmax - one_hot(y)) for every sample;
                                        it is NOT averaged over the batch here
                 (and the mean cross-entropy loss when return_loss is True)

        The `predictions` array passed in is left untouched: the gradient is
        built on a copy, so callers can keep using their softmax output.
        """
        print("Crossentropy",predictions.shape)
        print("true : ",y.shape)
        if len(predictions.shape)==1:
            predictions = predictions[:,None]
        batch_size = predictions.shape[0]
        rows = np.arange(batch_size)

        # mean cross-entropy; the small constant keeps log() away from log(0)
        self.cross_entropy_loss = np.mean(-np.log(predictions[rows, y] + 1e-8))

        # softmax + cross-entropy collapse to (p - one_hot(y)).
        # Work on a copy: an in-place "-= 1" on `predictions` itself would
        # corrupt the probabilities held by the caller.
        downstream_gradient = predictions.copy()
        downstream_gradient[rows, y] -= 1

        if return_loss:
            return downstream_gradient, self.cross_entropy_loss
        return downstream_gradient

    def update(self, learning_rate):
        pass # softmax is parameter-free

In [6]:
# Quick check of the softmax layer: the first two blob samples are passed
# to forward() directly as logits, then backward() is run against their labels.
output = OutputLayer(2)
first_two = slice(None, 2)
p = output.forward(X[first_two])
print("predictions\n", p)
print("y", y[first_two])
g = output.backward(p, y[first_two])

print("g", g)


predictions
 [[1.77188792e-01 3.10450888e-04 1.54869884e-06 8.22498227e-01
  9.81435359e-07]
 [3.54743257e-03 9.22843383e-06 2.41559594e-05 9.96021613e-01
  3.97570286e-04]]
y [0 1]
Crossentropy (2, 5)
true :  (2,)
g [[-8.22811208e-01  3.10450888e-04  1.54869884e-06  8.22498227e-01
   9.81435359e-07]
 [ 3.54743257e-03 -9.99990772e-01  2.41559594e-05  9.96021613e-01
   3.97570286e-04]]


In [7]:
class LinearLayer(object):
    """Fully connected layer computing z = X B + b.

    Attributes:
        B      -> matrix (n_inputs, n_outputs) -> weights
        b      -> vector (n_outputs,)          -> intercepts
        grad_B -> matrix (n_outputs, n_inputs) -> batch-mean gradient of the loss
                                                  w.r.t. B, stored transposed
        grad_b -> vector (n_outputs,)          -> batch-mean gradient w.r.t. b
    """

    def __init__(self, n_inputs, n_outputs):
        self.n_inputs  = n_inputs
        self.n_outputs = n_outputs
        # randomly initialize weights and intercepts (np.random.uniform defaults)
        self.B = np.random.uniform(size=(n_inputs,n_outputs))
        self.b = np.random.uniform(size=(n_outputs,))
        self.grad_B = None
        self.grad_b = None

    def forward(self, input_):
        """Compute the preactivations z = X B + b.

        input:  input_ -> array -> activations of the previous layer (or the
                                   data X for the first layer); a 1-D array is
                                   turned into a column, i.e. shape (n, 1)
        output: preactivations -> array (batch, n_outputs)
        """
        # keep a private copy of the input for later backpropagation
        self.input = deepcopy(input_)
        if len(self.input.shape)==1:
            self.input = self.input[:,None]
        # these are the preactivations for the subsequent non-linear layer
        preactivations = self.input.dot(self.B)+self.b
        return preactivations

    def backward(self, upstream_gradient):
        """Backpropagate through z = X B + b.

        Stores the batch-mean gradients of the loss w.r.t. the weights and
        intercepts in self.grad_B / self.grad_b and returns the gradient
        w.r.t. the layer input.

        inputs:  upstream_gradient -> array (batch, n_outputs) -> gradient of
                                      the loss w.r.t. the preactivations
        outputs: downstream_gradient -> array (batch, n_inputs) -> gradient of
                                        the loss w.r.t. the cached input
        """
        # The batch size is read from the gradient itself. Relying on a
        # module-level `batch_size` variable would make this method fail (or
        # reshape wrongly) whenever that global is missing or different.
        batch_size = upstream_gradient.shape[0]

        # dz/db = 1, so the intercept gradient is the upstream gradient
        # averaged over the batch
        self.grad_b = np.mean(upstream_gradient, axis = 0)
        print("input shape of the linear layer : ",self.input.shape)
        print("upstream shape from softmax : ",upstream_gradient.shape)
        print("shape of the weights :",self.B.shape)

        # dz/dB: batch mean of the outer products upstream_i (x) input_i,
        # written as one matrix product. Shape is (n_outputs, n_inputs);
        # update() transposes it back to the shape of B.
        self.grad_B = upstream_gradient.T.dot(self.input) / batch_size

        # gradient of the loss w.r.t. the input (activation of the previous layer)
        downstream_gradient = upstream_gradient.dot(self.B.T)
        print("downstream gradient shape : ",downstream_gradient.shape)
        return downstream_gradient

    def update(self, learning_rate):
        """Take one gradient-descent step using the gradients from backward()."""
        print("shape of grad_B",self.grad_B.shape)
        print("shape of grad_b",self.grad_b.shape)
        # grad_B is stored as (n_outputs, n_inputs), hence the transpose
        self.B = self.B - learning_rate * self.grad_B.T
        self.b = self.b - learning_rate * self.grad_b

In [9]:
# Multi-layer perceptron
class MLP(object):
    """Multi-layer perceptron: (linear + ReLU) hidden blocks, then linear + softmax output.

    The network predicts class posteriors for a classification problem.
    """

    def __init__(self, n_features, layer_sizes):
        """Build the stack of layers.

        n_features:      number of inputs
        len(layer_sizes): number of layers
        layer_sizes[k]:  number of neurons in layer k
                         (layer_sizes[-1] is the number of classes)
        """
        self.n_layers = len(layer_sizes)
        self.layers   = []

        # hidden blocks: each one is a linear layer followed by a ReLU
        fan_in = n_features
        for width in layer_sizes[:-1]:
            self.layers += [LinearLayer(fan_in, width), ReLULayer()]
            fan_in = width

        # final linear layer feeding the output layer
        n_classes = layer_sizes[-1]
        self.layers += [LinearLayer(fan_in, n_classes), OutputLayer(n_classes)]

    def forward(self, X):
        """Run a mini-batch X through every layer and return the final result.

        All dimensions of X after the first are flattened (e.g. when the
        instances are images). Each layer stores what it needs for the
        subsequent backward pass.
        """
        activations = X.reshape(X.shape[0], -1)
        for layer in self.layers:
            activations = layer.forward(activations)
        return activations

    def backward(self, predictions, true_classes):
        """Backpropagate the loss for the latest mini-batch through all layers."""
        # the output layer yields the gradient w.r.t. its input plus the loss
        output_layer = self.layers[-1]
        gradient, current_loss = output_layer.backward(predictions, true_classes, return_loss = True)
        print("Loss: ",current_loss)

        # walk through the remaining layers from the last one to the first
        for index in reversed(range(len(self.layers) - 1)):
            print("layer: ",index)
            gradient = self.layers[index].backward(gradient)

    def update(self, X, Y, learning_rate):
        """One forward pass, one backward pass, then a parameter update of every layer."""
        self.backward(self.forward(X), Y)
        for layer in self.layers:
            layer.update(learning_rate)

    def train(self, x, y, n_epochs, batch_size, learning_rate):
        """Mini-batch training.

        Every epoch draws a fresh permutation of the data, so mini-batches are
        sampled without replacement. Only full mini-batches are used: samples
        beyond the last multiple of batch_size are skipped in that epoch.
        """
        n_samples = len(x)
        n_batches = n_samples // batch_size
        for _epoch in range(n_epochs):
            order = np.random.permutation(n_samples)
            for start in range(0, n_batches * batch_size, batch_size):
                chosen = order[start:start + batch_size]
                # one forward and backward pass plus parameter update
                self.update(x[chosen], y[chosen], learning_rate)


In [12]:
# NOTE: the layers print tensor shapes on every backward/update call, so training
# produces a long log. The prints were left in on purpose to make the shapes
# visible when checking the implementation.
if __name__=="__main__":

    # seed NumPy's global generator so that the random weight initialisation
    # and the mini-batch shuffling are repeatable from run to run
    np.random.seed(0)

    # set training/test set size
    N = 2000

    # create training and test data
    X_train, Y_train = datasets.make_moons(N, noise=0.05)
    X_test,  Y_test  = datasets.make_moons(N, noise=0.05)
    n_features = 2
    n_classes  = 2

    # min-max rescale the features with the TRAINING statistics: the training
    # set lands exactly in [-1, 1], the test set only approximately
    offset  = X_train.min(axis=0)
    scaling = X_train.max(axis=0) - offset
    X_train = ((X_train - offset) / scaling - 0.5) * 2.0
    X_test  = ((X_test  - offset) / scaling - 0.5) * 2.0

    # set hyperparameters (play with these!)
    layer_sizes = [30, 30, n_classes]
    # previously declared as 5 but ignored (train() was called with a literal 10);
    # the value actually used is kept and now passed through the variable
    n_epochs = 10
    batch_size = 100
    learning_rate = 0.05

    # create network
    network = MLP(n_features, layer_sizes)

    # train
    network.train(X_train, Y_train, n_epochs, batch_size, learning_rate)

    # test
    predicted_posteriors = network.forward(X_test)
    print(predicted_posteriors)
    # determine class predictions from posteriors by winner-takes-all rule
    predicted_classes = np.argmax(predicted_posteriors,axis=1)
    print(predicted_classes)
    # spot check of a single prediction against its label
    print(predicted_classes[1],Y_test[1])
    # error rate = fraction of test samples whose predicted class is wrong
    error_rate = np.mean(predicted_classes != Y_test)
    print("error rate:", error_rate)

Crossentropy (100, 2)
true :  (100,)
Loss:  0.7721637577281478
layer:  4
input shape of the linear layer :  (100, 30)
upstream shape from softmax :  (100, 2)
shape of the weights : (30, 2)
downstream gradient shape :  (100, 30)
layer:  3
ReLU Downstream:  (100, 30)
layer:  2
input shape of the linear layer :  (100, 30)
upstream shape from softmax :  (100, 30)
shape of the weights : (30, 30)
downstream gradient shape :  (100, 30)
layer:  1
ReLU Downstream:  (100, 30)
layer:  0
input shape of the linear layer :  (100, 2)
upstream shape from softmax :  (100, 30)
shape of the weights : (2, 30)
downstream gradient shape :  (100, 2)
shape of grad_B (30, 2)
shape of grad_b (30,)
shape of grad_B (30, 30)
shape of grad_b (30,)
shape of grad_B (2, 30)
shape of grad_b (2,)
Crossentropy (100, 2)
true :  (100,)
Loss:  7.552456837348658
layer:  4
input shape of the linear layer :  (100, 30)
upstream shape from softmax :  (100, 2)
shape of the weights : (30, 2)
downstream gradient shape :  (100, 30)


downstream gradient shape :  (100, 2)
shape of grad_B (30, 2)
shape of grad_b (30,)
shape of grad_B (30, 30)
shape of grad_b (30,)
shape of grad_B (2, 30)
shape of grad_b (2,)
Crossentropy (100, 2)
true :  (100,)
Loss:  1.6112754753487084
layer:  4
input shape of the linear layer :  (100, 30)
upstream shape from softmax :  (100, 2)
shape of the weights : (30, 2)
downstream gradient shape :  (100, 30)
layer:  3
ReLU Downstream:  (100, 30)
layer:  2
input shape of the linear layer :  (100, 30)
upstream shape from softmax :  (100, 30)
shape of the weights : (30, 30)
downstream gradient shape :  (100, 30)
layer:  1
ReLU Downstream:  (100, 30)
layer:  0
input shape of the linear layer :  (100, 2)
upstream shape from softmax :  (100, 30)
shape of the weights : (2, 30)
downstream gradient shape :  (100, 2)
shape of grad_B (30, 2)
shape of grad_b (30,)
shape of grad_B (30, 30)
shape of grad_b (30,)
shape of grad_B (2, 30)
shape of grad_b (2,)
Crossentropy (100, 2)
true :  (100,)
Loss:  0.48495

downstream gradient shape :  (100, 30)
layer:  3
ReLU Downstream:  (100, 30)
layer:  2
input shape of the linear layer :  (100, 30)
upstream shape from softmax :  (100, 30)
shape of the weights : (30, 30)
downstream gradient shape :  (100, 30)
layer:  1
ReLU Downstream:  (100, 30)
layer:  0
input shape of the linear layer :  (100, 2)
upstream shape from softmax :  (100, 30)
shape of the weights : (2, 30)
downstream gradient shape :  (100, 2)
shape of grad_B (30, 2)
shape of grad_b (30,)
shape of grad_B (30, 30)
shape of grad_b (30,)
shape of grad_B (2, 30)
shape of grad_b (2,)
Crossentropy (100, 2)
true :  (100,)
Loss:  0.2374905932737558
layer:  4
input shape of the linear layer :  (100, 30)
upstream shape from softmax :  (100, 2)
shape of the weights : (30, 2)
downstream gradient shape :  (100, 30)
layer:  3
ReLU Downstream:  (100, 30)
layer:  2
input shape of the linear layer :  (100, 30)
upstream shape from softmax :  (100, 30)
shape of the weights : (30, 30)
downstream gradient sh

ReLU Downstream:  (100, 30)
layer:  0
input shape of the linear layer :  (100, 2)
upstream shape from softmax :  (100, 30)
shape of the weights : (2, 30)
downstream gradient shape :  (100, 2)
shape of grad_B (30, 2)
shape of grad_b (30,)
shape of grad_B (30, 30)
shape of grad_b (30,)
shape of grad_B (2, 30)
shape of grad_b (2,)
Crossentropy (100, 2)
true :  (100,)
Loss:  0.21668807827885922
layer:  4
input shape of the linear layer :  (100, 30)
upstream shape from softmax :  (100, 2)
shape of the weights : (30, 2)
downstream gradient shape :  (100, 30)
layer:  3
ReLU Downstream:  (100, 30)
layer:  2
input shape of the linear layer :  (100, 30)
upstream shape from softmax :  (100, 30)
shape of the weights : (30, 30)
downstream gradient shape :  (100, 30)
layer:  1
ReLU Downstream:  (100, 30)
layer:  0
input shape of the linear layer :  (100, 2)
upstream shape from softmax :  (100, 30)
shape of the weights : (2, 30)
downstream gradient shape :  (100, 2)
shape of grad_B (30, 2)
shape of g

ReLU Downstream:  (100, 30)
layer:  0
input shape of the linear layer :  (100, 2)
upstream shape from softmax :  (100, 30)
shape of the weights : (2, 30)
downstream gradient shape :  (100, 2)
shape of grad_B (30, 2)
shape of grad_b (30,)
shape of grad_B (30, 30)
shape of grad_b (30,)
shape of grad_B (2, 30)
shape of grad_b (2,)
Crossentropy (100, 2)
true :  (100,)
Loss:  0.19146228798416282
layer:  4
input shape of the linear layer :  (100, 30)
upstream shape from softmax :  (100, 2)
shape of the weights : (30, 2)
downstream gradient shape :  (100, 30)
layer:  3
ReLU Downstream:  (100, 30)
layer:  2
input shape of the linear layer :  (100, 30)
upstream shape from softmax :  (100, 30)
shape of the weights : (30, 30)
downstream gradient shape :  (100, 30)
layer:  1
ReLU Downstream:  (100, 30)
layer:  0
input shape of the linear layer :  (100, 2)
upstream shape from softmax :  (100, 30)
shape of the weights : (2, 30)
downstream gradient shape :  (100, 2)
shape of grad_B (30, 2)
shape of g

downstream gradient shape :  (100, 30)
layer:  3
ReLU Downstream:  (100, 30)
layer:  2
input shape of the linear layer :  (100, 30)
upstream shape from softmax :  (100, 30)
shape of the weights : (30, 30)
downstream gradient shape :  (100, 30)
layer:  1
ReLU Downstream:  (100, 30)
layer:  0
input shape of the linear layer :  (100, 2)
upstream shape from softmax :  (100, 30)
shape of the weights : (2, 30)
downstream gradient shape :  (100, 2)
shape of grad_B (30, 2)
shape of grad_b (30,)
shape of grad_B (30, 30)
shape of grad_b (30,)
shape of grad_B (2, 30)
shape of grad_b (2,)
Crossentropy (100, 2)
true :  (100,)
Loss:  0.2037933315078707
layer:  4
input shape of the linear layer :  (100, 30)
upstream shape from softmax :  (100, 2)
shape of the weights : (30, 2)
downstream gradient shape :  (100, 30)
layer:  3
ReLU Downstream:  (100, 30)
layer:  2
input shape of the linear layer :  (100, 30)
upstream shape from softmax :  (100, 30)
shape of the weights : (30, 30)
downstream gradient sh

downstream gradient shape :  (100, 30)
layer:  1
ReLU Downstream:  (100, 30)
layer:  0
input shape of the linear layer :  (100, 2)
upstream shape from softmax :  (100, 30)
shape of the weights : (2, 30)
downstream gradient shape :  (100, 2)
shape of grad_B (30, 2)
shape of grad_b (30,)
shape of grad_B (30, 30)
shape of grad_b (30,)
shape of grad_B (2, 30)
shape of grad_b (2,)
Crossentropy (100, 2)
true :  (100,)
Loss:  0.18654766484459323
layer:  4
input shape of the linear layer :  (100, 30)
upstream shape from softmax :  (100, 2)
shape of the weights : (30, 2)
downstream gradient shape :  (100, 30)
layer:  3
ReLU Downstream:  (100, 30)
layer:  2
input shape of the linear layer :  (100, 30)
upstream shape from softmax :  (100, 30)
shape of the weights : (30, 30)
downstream gradient shape :  (100, 30)
layer:  1
ReLU Downstream:  (100, 30)
layer:  0
input shape of the linear layer :  (100, 2)
upstream shape from softmax :  (100, 30)
shape of the weights : (2, 30)
downstream gradient sha