## Artificial Neural Network and Backpropagation
(Understand internal workings of mlp)

In [14]:
import numpy as np
import sys

In [36]:
#functions of non-linear activations
def f_sigmoid(X, deriv=False):
    """Logistic sigmoid, applied element-wise.

    Args:
        X: scalar or numpy array of pre-activation values.
        deriv: when True, return the derivative of the sigmoid evaluated at
            X instead of the sigmoid itself.

    Returns:
        1 / (1 + exp(-X)) when ``deriv`` is False, otherwise s * (1 - s)
        where s is the sigmoid of X.
    """
    activated = 1 / (1 + np.exp(-X))
    if deriv:
        # d/dx sigmoid(x) = sigmoid(x) * (1 - sigmoid(x))
        return activated * (1 - activated)
    return activated


def f_softmax(X):
    """Row-wise softmax of a 2-D array.

    Args:
        X: array of shape (rows, columns); each row is normalised
            independently.

    Returns:
        Array of the same shape as X whose rows are non-negative and sum
        to 1.
    """
    # Subtracting each row's maximum does not change the mathematical result
    # (the common factor cancels in the ratio) but keeps exp() from
    # overflowing to inf, which would otherwise produce NaN for large inputs.
    shifted = X - np.max(X, axis=1, keepdims=True)
    exps = np.exp(shifted)
    # keepdims=True keeps the row sums as a column so they broadcast per row.
    return exps / np.sum(exps, axis=1, keepdims=True)


def f_relu(X, deriv=False):
    """Rectified linear unit, applied element-wise.

    Args:
        X: scalar or numpy array of pre-activation values.
        deriv: when True, return the slope of the ReLU at X instead of the
            ReLU value.

    Returns:
        max(0, X) when ``deriv`` is False; otherwise an integer array that is
        1 where X > 0 and 0 elsewhere (so the slope at exactly 0 is 0).
    """
    if deriv:
        return np.where(X > 0, 1, 0)
    return np.maximum(0, X)


In [16]:
def exit_with_err(err_str):
    """Write ``err_str`` to standard error and terminate with exit status 1.

    Termination happens by raising ``SystemExit(1)``, which is exactly what
    ``sys.exit(1)`` does.
    """
    print(err_str, file=sys.stderr)
    raise SystemExit(1)

In [5]:
# A single network layer; depending on its flags it acts as the input, a hidden, or the output layer
class Layer:
    """One layer of the network together with its outgoing weight matrix.

    The ``is_input`` / ``is_output`` flags decide which buffers are allocated
    and how ``forward_propagate`` behaves; a layer with neither flag set is
    treated as a hidden layer.
    """

    def __init__(self, size, batch_size, is_input=False, is_output=False,
                 activation=f_sigmoid):
        """Allocate the per-batch buffers and, if needed, the weights.

        Args:
            size: two-element sequence. ``size[0]`` is the width of this
                layer's buffers and ``size`` as a whole is the shape of the
                outgoing weight matrix (unused for an output layer).
            batch_size: number of rows in the per-batch buffers.
            is_input: True for the input layer (no S, D or Fp buffers).
            is_output: True for the output layer (no outgoing weights).
            activation: callable ``f(S)`` that must also accept
                ``deriv=True`` when the layer is a hidden layer.
        """
        n_units = size[0]
        is_hidden = not (is_input or is_output)

        self.is_input = is_input
        self.is_output = is_output
        # Externally defined activation function (with derivative support).
        self.activation = activation

        # Z: output values of this layer.
        self.Z = np.zeros((batch_size, n_units))
        # S: inputs to this layer; D: deltas of this layer.
        # The input layer has neither.
        self.S = None if is_input else np.zeros((batch_size, n_units))
        self.D = None if is_input else np.zeros((batch_size, n_units))
        # W: outgoing weights, drawn from a zero-mean normal with a small
        # standard deviation. The output layer has no outgoing weights.
        self.W = None if is_output else np.random.normal(size=size, scale=1E-4)
        # Fp: derivatives of the activation function, stored transposed.
        # Only hidden layers need them.
        self.Fp = np.zeros((n_units, batch_size)) if is_hidden else None

    def forward_propagate(self):
        """Compute this layer's output and return what it passes on.

        Returns:
            ``Z.dot(W)`` (the input of the next layer) for input and hidden
            layers; the activated output ``Z`` itself for the output layer.
        """
        if not self.is_input:
            self.Z = self.activation(self.S)
            if self.is_output:
                return self.Z

            # Hidden layer: append a column of ones so the last row of W
            # acts as the bias, and cache the transposed activation
            # derivative for the backward pass.
            bias_column = np.ones((self.Z.shape[0], 1))
            self.Z = np.concatenate((self.Z, bias_column), axis=1)
            self.Fp = self.activation(self.S, deriv=True).T

        return np.dot(self.Z, self.W)


In [65]:
class MultiLayerPerceptron:
    """Fully connected feed-forward network trained by mini-batch backpropagation.

    The network consists of an input layer, zero or more hidden layers that
    use ``activation_hidden``, and a softmax output layer. Bias terms are
    implemented by appending a constant-one unit to the input and to every
    hidden layer.
    """

    def __init__(self, layer_config, batch_size=100, activation_hidden=f_sigmoid):
        """Build the layers described by ``layer_config``.

        Args:
            layer_config: list of layer widths, e.g. ``[784, 100, 100, 10]``;
                the first entry is the input width, the last the number of
                outputs.
            batch_size: number of rows used to pre-allocate layer buffers.
            activation_hidden: activation function used by the hidden layers;
                must accept ``deriv=True``.
        """
        self.layers = []
        self.num_layers = len(layer_config)
        self.minibatch_size = batch_size

        for i in range(self.num_layers-1):
            # The "+1" adds the extra unit that carries the bias weight.
            weight_shape = [layer_config[i]+1, layer_config[i+1]]
            if i == 0:
                print("Initializing input layer with size {0}.".format(layer_config[i]))
                self.layers.append(Layer(weight_shape,
                                         batch_size,
                                         is_input=True))
            else:
                print("Initializing hidden layer with size {0}.".format(layer_config[i]))
                self.layers.append(Layer(weight_shape,
                                         batch_size,
                                         activation=activation_hidden))

        print("Initializing output layer with size {0}.".format(layer_config[-1]))
        self.layers.append(Layer([layer_config[-1], None],
                                 batch_size,
                                 is_output=True,
                                 activation=f_softmax))
        print("Done!")

    def forward_propagate(self, data):
        """Run one batch through the network and return the softmax output.

        Args:
            data: array of shape (batch, n_inputs).

        Returns:
            The output layer's activations for the batch.
        """
        # Append a column of ones so the input layer carries a bias unit.
        self.layers[0].Z = np.append(data, np.ones((data.shape[0], 1)), axis=1)

        for i in range(self.num_layers-1):
            self.layers[i+1].S = self.layers[i].forward_propagate()
        return self.layers[-1].forward_propagate()

    def backpropagate(self, yhat, labels):
        """Compute the deltas ``D`` of the output and hidden layers.

        Args:
            yhat: network output for the current batch, as returned by
                ``forward_propagate``.
            labels: target values with the same shape as ``yhat``.
        """
        # Output delta: prediction minus target, transposed to
        # (n_outputs, batch). For a softmax output trained with
        # cross-entropy this difference is the gradient of the loss with
        # respect to the output layer's inputs.
        self.layers[-1].D = (yhat - labels).T

        # Walk backwards over the hidden layers (second-to-last down to the
        # first hidden layer), propagating the error towards the input.
        for i in range(self.num_layers-2, 0, -1):
            # The bias unit (last row of W) has no incoming connections, so
            # no delta is computed for it.
            W_nobias = self.layers[i].W[0:-1, :]

            # W_nobias.dot(D_next) is the weighted sum of the next layer's
            # deltas; "*" is the element-wise product with the derivative of
            # this layer's activation function (Fp).
            self.layers[i].D = W_nobias.dot(self.layers[i+1].D) * self.layers[i].Fp

    def update_weights(self, eta):
        """Apply one gradient-descent step to every weight matrix.

        Args:
            eta: learning rate, i.e. the factor by which the gradient is
                scaled before it is subtracted from the weights.
        """
        # Every layer except the output layer owns an outgoing weight matrix.
        for i in range(0, self.num_layers-1):
            # D_next.dot(Z) multiplies the next layer's deltas with this
            # layer's outputs and sums over the batch, which gives the
            # (transposed) gradient with respect to W. The minus sign moves
            # the weights in the direction that reduces the error.
            weight_step = -eta*(self.layers[i+1].D.dot(self.layers[i].Z)).T
            self.layers[i].W += weight_step

    def _error_and_accuracy(self, data_batches, label_batches):
        """Return (error rate, accuracy in percent) over the given batches.

        Labels are expected as one-hot rows. When there are no samples both
        values are NaN instead of raising.
        """
        errs = 0
        corrects = 0
        n_samples = 0
        for b_data, b_labels in zip(data_batches, label_batches):
            output = self.forward_propagate(b_data)
            yhat = np.argmax(output, axis=1)
            # Value of the one-hot label at the predicted class:
            # 1 for a hit, 0 for a miss.
            picked = b_labels[np.arange(len(b_labels)), yhat]
            errs += np.sum(1-picked)
            corrects += np.sum(picked)
            n_samples += len(b_labels)

        if n_samples == 0:
            return float("nan"), float("nan")
        return float(errs)/n_samples, float(corrects)/n_samples*100

    def evaluate(self, train_data, train_labels, test_data, test_labels,
                 num_epochs=70, eta=0.05, eval_train=False, eval_test=True):
        """Train the network and print per-epoch metrics.

        Args:
            train_data: sequence of training batches.
            train_labels: sequence of one-hot label batches matching
                ``train_data``.
            test_data: sequence of evaluation batches; only read when
                ``eval_test`` is True.
            test_labels: sequence of one-hot label batches matching
                ``test_data``.
            num_epochs: number of passes over the training batches.
            eta: learning rate passed to ``update_weights``.
            eval_train: report error/accuracy on the training batches.
            eval_test: report error/accuracy on the test batches.
        """
        print("Training for {0} epochs...".format(num_epochs))
        for t in range(0, num_epochs):
            out_str = "[{0:4d}] ".format(t)

            # One weight update per mini-batch.
            for b_data, b_labels in zip(train_data, train_labels):
                output = self.forward_propagate(b_data)
                self.backpropagate(output, b_labels)
                self.update_weights(eta=eta)

            # Sample counts are taken from the batches actually evaluated, so
            # a split that is not evaluated may be empty.
            if eval_train:
                err_rate, accuracy = self._error_and_accuracy(train_data, train_labels)
                out_str = ("{0} Training error: {1:.5f} Training accuracy: {2:.2f}".format(out_str,
                                                           err_rate, accuracy))

            if eval_test:
                err_rate, accuracy = self._error_and_accuracy(test_data, test_labels)
                out_str = ("{0} Test error: {1:.5f}  Test accuracy: {2:.2f}").format(out_str,
                                                       err_rate, accuracy)

            print(out_str)


In [7]:
def label_to_bit_vector(labels, nbits):
    """One-hot encode integer class labels.

    Args:
        labels: 1-D array of integer class indices.
        nbits: width of each encoded row (number of classes).

    Returns:
        Float array of shape (len(labels), nbits) holding 1.0 at each row's
        label index and 0.0 elsewhere.
    """
    one_hot = np.zeros((labels.shape[0], nbits))
    for row, hot_index in enumerate(labels):
        one_hot[row, hot_index] = 1.0
    return one_hot

In [8]:
def create_batches(data, labels, batch_size, create_bit_vector=False, num_classes=10):
    """Split data and labels into consecutive, equally sized mini-batches.

    Trailing examples that do not fill a complete batch are dropped (a
    warning is printed in that case).

    Args:
        data: 2-D array with one example per row.
        labels: array of labels aligned with the rows of ``data``.
        batch_size: number of examples per batch; must be positive.
        create_bit_vector: when True, each label chunk is one-hot encoded
            with ``label_to_bit_vector``.
        num_classes: width of the one-hot rows when ``create_bit_vector`` is
            True.

    Returns:
        Tuple ``(chunked_data, chunked_labels)`` of two equally long lists.

    Raises:
        ValueError: if ``batch_size`` is not positive.
    """
    # A zero batch size would divide by zero below and a negative one would
    # never terminate, so reject both up front.
    if batch_size <= 0:
        raise ValueError(
            "batch_size must be a positive integer, got {0}".format(batch_size))

    N = data.shape[0]
    print("Batch size {0}, the number of examples {1}.".format(batch_size, N))

    if N % batch_size != 0:
        print("Warning in create_batches(): Batch size {0} does not "
              "evenly divide the number of examples {1}.".format(batch_size, N))

    chunked_data = []
    chunked_labels = []
    # Only start positions that leave room for a full batch are visited.
    for start in range(0, N - batch_size + 1, batch_size):
        stop = start + batch_size
        chunked_data.append(data[start:stop, :])
        label_chunk = labels[start:stop]
        if create_bit_vector:
            label_chunk = label_to_bit_vector(label_chunk, num_classes)
        chunked_labels.append(label_chunk)

    return chunked_data, chunked_labels


In [9]:
def prepare_for_backprop(batch_size, Train_images, Train_labels, Valid_images, Valid_labels):
    """Batch the training and validation sets with one-hot encoded labels.

    Args:
        batch_size: number of examples per mini-batch.
        Train_images, Train_labels: training examples and their labels.
        Valid_images, Valid_labels: validation examples and their labels.

    Returns:
        Tuple ``(train_data, train_labels, valid_data, valid_labels)`` where
        every element is a list of batches produced by ``create_batches``.
    """
    print("Creating data...")
    train_batches = create_batches(Train_images, Train_labels, batch_size,
                                   create_bit_vector=True)
    valid_batches = create_batches(Valid_images, Valid_labels, batch_size,
                                   create_bit_vector=True)
    print("Done!")

    # Each call returns a (data, labels) pair; flatten the two pairs.
    return (*train_batches, *valid_batches)



In [10]:
from keras.datasets import mnist

In [12]:
# Load MNIST, flatten every image into a 784-element float32 vector and
# divide the pixel values by 255.
(Xtr, Ltr), (X_test, L_test) = mnist.load_data()

Xtr = Xtr.reshape(60000, 784).astype('float32') / 255
X_test = X_test.reshape(10000, 784).astype('float32') / 255

print(Xtr.shape[0], 'train samples')
print(X_test.shape[0], 'test samples')


60000 train samples
10000 test samples


### Training with suggested configuration of the hyperparameters: number of epochs = 70 and learning rate =0.05
- Results at last epoch - Training error: 0.00000, Training accuracy: 100.00, Test error: 0.02570,  Test accuracy: 97.43

In [48]:
# Experiment: sigmoid hidden units with the default settings of evaluate()
# (num_epochs=70, eta=0.05).
batch_size = 100

train_data, train_labels, valid_data, valid_labels = prepare_for_backprop(
    batch_size, Xtr, Ltr, X_test, L_test
)

mlp = MultiLayerPerceptron(layer_config=[784, 100, 100, 10], batch_size=batch_size)
mlp.evaluate(train_data, train_labels, valid_data, valid_labels, eval_train=True)

print("Done:)\n")


Creating data...
Batch size 100, the number of examples 60000.
Batch size 100, the number of examples 10000.
Done!
Initializing input layer with size 784.
Initializing hidden layer with size 100.
Initializing hidden layer with size 100.
Initializing output layer with size 10.
Done!
Training for 70 epochs...
[   0]  Training error: 0.40842 Training accuracy: 59.16 Test error: 0.40290  Test accuracy: 59.71
[   1]  Training error: 0.07910 Training accuracy: 92.09 Test error: 0.07720  Test accuracy: 92.28
[   2]  Training error: 0.04832 Training accuracy: 95.17 Test error: 0.05200  Test accuracy: 94.80
[   3]  Training error: 0.04117 Training accuracy: 95.88 Test error: 0.04750  Test accuracy: 95.25
[   4]  Training error: 0.03438 Training accuracy: 96.56 Test error: 0.04190  Test accuracy: 95.81
[   5]  Training error: 0.02998 Training accuracy: 97.00 Test error: 0.03800  Test accuracy: 96.20
[   6]  Training error: 0.02935 Training accuracy: 97.06 Test error: 0.04110  Test accuracy: 95.8

### Training number of epochs = 70 and learning rate =0.005
- Results at last epoch - Training error: 0.00055 Training accuracy: 99.94 Test error: 0.02440  Test accuracy: 97.56
- Learning is somewhat slow at this rate. The error would likely decrease further with more epochs.

In [49]:
# Experiment: sigmoid hidden units, learning rate eta=0.005, default epochs.
batch_size = 100

train_data, train_labels, valid_data, valid_labels = prepare_for_backprop(
    batch_size, Xtr, Ltr, X_test, L_test
)

mlp = MultiLayerPerceptron(layer_config=[784, 100, 100, 10], batch_size=batch_size)
mlp.evaluate(train_data, train_labels, valid_data, valid_labels,
             eval_train=True, eta=0.005)

print("Done:)\n")


Creating data...
Batch size 100, the number of examples 60000.
Batch size 100, the number of examples 10000.
Done!
Initializing input layer with size 784.
Initializing hidden layer with size 100.
Initializing hidden layer with size 100.
Initializing output layer with size 10.
Done!
Training for 70 epochs...
[   0]  Training error: 0.70337 Training accuracy: 29.66 Test error: 0.70060  Test accuracy: 29.94
[   1]  Training error: 0.64692 Training accuracy: 35.31 Test error: 0.64320  Test accuracy: 35.68
[   2]  Training error: 0.59950 Training accuracy: 40.05 Test error: 0.59810  Test accuracy: 40.19
[   3]  Training error: 0.45178 Training accuracy: 54.82 Test error: 0.46030  Test accuracy: 53.97
[   4]  Training error: 0.20880 Training accuracy: 79.12 Test error: 0.20030  Test accuracy: 79.97
[   5]  Training error: 0.10963 Training accuracy: 89.04 Test error: 0.10770  Test accuracy: 89.23
[   6]  Training error: 0.08838 Training accuracy: 91.16 Test error: 0.08670  Test accuracy: 91.3

### Training number of epochs = 70 and learning rate =0.5
- Results at last epoch - Training error: 0.90248 Training accuracy: 9.75 Test error: 0.90260  Test accuracy: 9.74
- Since 0.5 is a high learning rate, each weight update is too large and the model fails to settle on good weights. The model therefore does not converge; instead, the loss and accuracy keep fluctuating.

In [50]:
# Experiment: sigmoid hidden units, learning rate eta=0.5, default epochs.
batch_size = 100

train_data, train_labels, valid_data, valid_labels = prepare_for_backprop(
    batch_size, Xtr, Ltr, X_test, L_test
)

mlp = MultiLayerPerceptron(layer_config=[784, 100, 100, 10], batch_size=batch_size)
mlp.evaluate(train_data, train_labels, valid_data, valid_labels,
             eval_train=True, eta=0.5)

print("Done:)\n")

Creating data...
Batch size 100, the number of examples 60000.
Batch size 100, the number of examples 10000.
Done!
Initializing input layer with size 784.
Initializing hidden layer with size 100.
Initializing hidden layer with size 100.
Initializing output layer with size 10.
Done!
Training for 70 epochs...
[   0]  Training error: 0.88763 Training accuracy: 11.24 Test error: 0.88650  Test accuracy: 11.35
[   1]  Training error: 0.89558 Training accuracy: 10.44 Test error: 0.89720  Test accuracy: 10.28
[   2]  Training error: 0.90248 Training accuracy: 9.75 Test error: 0.90260  Test accuracy: 9.74
[   3]  Training error: 0.90085 Training accuracy: 9.92 Test error: 0.89910  Test accuracy: 10.09
[   4]  Training error: 0.90965 Training accuracy: 9.04 Test error: 0.91080  Test accuracy: 8.92
[   5]  Training error: 0.89782 Training accuracy: 10.22 Test error: 0.89900  Test accuracy: 10.10
[   6]  Training error: 0.90263 Training accuracy: 9.74 Test error: 0.90180  Test accuracy: 9.82
[   7

### Training with ReLU as the activation function in the hidden layers and the default configuration: number of epochs = 70 and learning rate = 0.05


- Results at last epoch -  Training error: 0.90137 Training accuracy: 9.86 Test error: 0.90420  Test accuracy: 9.58
- The error and accuracy remain the same from the first epoch to the last epoch.
- The model does not converge.

In [55]:
# Experiment: ReLU hidden units, learning rate eta=0.05, default epochs.
batch_size = 100

train_data, train_labels, valid_data, valid_labels = prepare_for_backprop(
    batch_size, Xtr, Ltr, X_test, L_test
)

mlp = MultiLayerPerceptron(layer_config=[784, 100, 100, 10],
                           batch_size=batch_size,
                           activation_hidden=f_relu)
mlp.evaluate(train_data, train_labels, valid_data, valid_labels,
             eval_train=True, eta=0.05)

print("Done:)\n")

Creating data...
Batch size 100, the number of examples 60000.
Batch size 100, the number of examples 10000.
Done!
Initializing input layer with size 784.
Initializing hidden layer with size 100.
Initializing hidden layer with size 100.
Initializing output layer with size 10.
Done!
Training for 70 epochs...
[   0]  Training error: 0.90137 Training accuracy: 9.86 Test error: 0.90420  Test accuracy: 9.58
[   1]  Training error: 0.90137 Training accuracy: 9.86 Test error: 0.90420  Test accuracy: 9.58
[   2]  Training error: 0.90137 Training accuracy: 9.86 Test error: 0.90420  Test accuracy: 9.58
[   3]  Training error: 0.90137 Training accuracy: 9.86 Test error: 0.90420  Test accuracy: 9.58
[   4]  Training error: 0.90137 Training accuracy: 9.86 Test error: 0.90420  Test accuracy: 9.58
[   5]  Training error: 0.90137 Training accuracy: 9.86 Test error: 0.90420  Test accuracy: 9.58
[   6]  Training error: 0.90137 Training accuracy: 9.86 Test error: 0.90420  Test accuracy: 9.58
[   7]  Trai

### Training with ReLU as the activation function in the hidden layers and configuration: number of epochs = 70 and learning rate = 0.005

 
- Results at final epoch -  Training error: 0.00002 Training accuracy: 100.00 Test error: 0.02030  Test accuracy: 97.97
- With ReLU as the hidden-layer activation function and a learning rate of 0.005, we achieve accuracy comparable to that of the sigmoid activation function with a learning rate of 0.05.
- The error starts to decrease earlier, at epoch 19, compared with epoch 51 when a learning rate of 0.001 was used (see below for the tests with learning rate 0.001).

In [56]:
# Experiment: ReLU hidden units, learning rate eta=0.005, default epochs.
batch_size = 100

train_data, train_labels, valid_data, valid_labels = prepare_for_backprop(
    batch_size, Xtr, Ltr, X_test, L_test
)

mlp = MultiLayerPerceptron(layer_config=[784, 100, 100, 10],
                           batch_size=batch_size,
                           activation_hidden=f_relu)
mlp.evaluate(train_data, train_labels, valid_data, valid_labels,
             eval_train=True, eta=0.005)

print("Done:)\n")

Creating data...
Batch size 100, the number of examples 60000.
Batch size 100, the number of examples 10000.
Done!
Initializing input layer with size 784.
Initializing hidden layer with size 100.
Initializing hidden layer with size 100.
Initializing output layer with size 10.
Done!
Training for 70 epochs...
[   0]  Training error: 0.89558 Training accuracy: 10.44 Test error: 0.89720  Test accuracy: 10.28
[   1]  Training error: 0.89558 Training accuracy: 10.44 Test error: 0.89720  Test accuracy: 10.28
[   2]  Training error: 0.89558 Training accuracy: 10.44 Test error: 0.89720  Test accuracy: 10.28
[   3]  Training error: 0.89558 Training accuracy: 10.44 Test error: 0.89720  Test accuracy: 10.28
[   4]  Training error: 0.89558 Training accuracy: 10.44 Test error: 0.89720  Test accuracy: 10.28
[   5]  Training error: 0.89558 Training accuracy: 10.44 Test error: 0.89720  Test accuracy: 10.28
[   6]  Training error: 0.89558 Training accuracy: 10.44 Test error: 0.89720  Test accuracy: 10.2

### Training with ReLU as the activation function in the hidden layers and configuration: number of epochs = 70 and learning rate = 0.001


- Results at final epoch -  Training error: 0.01837 Training accuracy: 98.16 Test error: 0.03580  Test accuracy: 96.42
- The error and accuracy remain the same up to the 50th epoch.
- Only from epoch 51 onwards does the loss start to decrease.

In [54]:
# Experiment: ReLU hidden units, learning rate eta=0.001, default epochs.
batch_size = 100

train_data, train_labels, valid_data, valid_labels = prepare_for_backprop(
    batch_size, Xtr, Ltr, X_test, L_test
)

mlp = MultiLayerPerceptron(layer_config=[784, 100, 100, 10],
                           batch_size=batch_size,
                           activation_hidden=f_relu)
mlp.evaluate(train_data, train_labels, valid_data, valid_labels,
             eval_train=True, eta=0.001)

print("Done:)\n")

Creating data...
Batch size 100, the number of examples 60000.
Batch size 100, the number of examples 10000.
Done!
Initializing input layer with size 784.
Initializing hidden layer with size 100.
Initializing hidden layer with size 100.
Initializing output layer with size 10.
Done!
Training for 70 epochs...
[   0]  Training error: 0.88763 Training accuracy: 11.24 Test error: 0.88650  Test accuracy: 11.35
[   1]  Training error: 0.88763 Training accuracy: 11.24 Test error: 0.88650  Test accuracy: 11.35
[   2]  Training error: 0.88763 Training accuracy: 11.24 Test error: 0.88650  Test accuracy: 11.35
[   3]  Training error: 0.88763 Training accuracy: 11.24 Test error: 0.88650  Test accuracy: 11.35
[   4]  Training error: 0.88763 Training accuracy: 11.24 Test error: 0.88650  Test accuracy: 11.35
[   5]  Training error: 0.88763 Training accuracy: 11.24 Test error: 0.88650  Test accuracy: 11.35
[   6]  Training error: 0.88763 Training accuracy: 11.24 Test error: 0.88650  Test accuracy: 11.3