# Deep Learning Assignment 01

Submitted By: <br>
Name: G Rohit <br>
Student Id: 19233292 <br>

Importing the required libraries and defining a method to load any existing model

In [1]:
import numpy as np
import pickle, random, csv
random.seed(1026847926404610461)

In [24]:
def load(model_file):
    """
    Load a dumped neural network model.

    Parameters:
    model_file: filename of the dumped network object
    Return: the object of the network.
    """
    # Pickle streams are binary, so the file has to be opened with 'rb';
    # the context manager also guarantees the handle is closed.
    # NOTE: unpickling can execute arbitrary code - only load trusted files.
    with open(model_file, 'rb') as model_handle:
        return pickle.load(model_handle)

### Part 1: Implementation of Fully Connected Feed Forward Neural Network

The given code is an implementation of Fully Connected Feed Forward Neural Network with Back Propagation with the following given characteristics: 
- The model is implemented using the reference of class learning material for Deep Learning
- Has only one hidden layer and accepts a variable number of input nodes, hidden-layer nodes and output nodes
- The user can provide the learning rate (0.01 by default) and the hidden-layer activation function (relu by default; relu, sigmoid and tanh are supported)
- Uses He initialization to initialize the weights of the nodes
- Uses softmax as the activation function for the output layer as it returns the probability of each output class 
- The model is optimised using Stochastic Gradient Descent for convergence of loss function
- Model is saved after training 

In [3]:
class FullyConnectedFeedForwardNN(object):
    """
    Implementation of a Fully connected feed forward Neural Network.
    This implementation implements only one hidden layer.

    Samples are processed one at a time: each sample gets a forward pass,
    a backward pass and an immediate weight update. The output layer is a
    softmax and the loss reported during training is the cross-entropy.
    """
    def __init__(self, input_size, hidden_size, output_size, learning_rate=0.01, activation = 'relu'):
        """
        Constructor for the network. This function initializes the weight matrices using He initialization
        and sets up necessary variables to be used while training and prediction

        Parameters:
        input_size: the number of input neurons
        hidden_size: the number of hidden neurons
        output_size: the number of output neurons i.e. number of classes in the data
        learning_rate: the learning_rate used while training the weights. Default value = 0.01
        activation: the activation function to be used in the hidden layer of the network. Default: 'relu'

        Returns: None

        Raises:
        ValueError: if activation is not one of 'relu', 'sigmoid' or 'tanh'
        """
        self.input_size = input_size
        self.output_size = output_size
        self.hidden_size = hidden_size
        # He initialization: standard-normal weights scaled by sqrt(2 / fan_in)
        self.W_xh = np.random.randn(self.hidden_size, self.input_size) * np.sqrt(2/(self.input_size)) # Weight for input layer to hidden layer
        self.W_hy = np.random.randn(self.output_size, self.hidden_size) * np.sqrt(2/(self.hidden_size)) # Weight for hidden layer to output layer
        self.bias_h = np.zeros((self.hidden_size, 1)) # hidden layer bias
        self.bias_y = np.zeros((self.output_size, 1)) # output layer bias
        self.learning_rate = learning_rate
        self.activations = {'relu': self._relu, 'sigmoid': self._sigmoid, 'tanh': self._tanh}
        self.derivative_activations = {'relu': self._relu_derivative, 'sigmoid': self._sigmoid_derivative, 'tanh': self._tanh_derivative}
        try:
            self.activation_fn = self.activations[activation]
            self.activation_fn_derivative = self.derivative_activations[activation]
        except (KeyError, TypeError):
            # Fail loudly: without an activation function the object cannot
            # run a forward pass, so returning a half-built network only
            # postpones the error.
            raise ValueError("Supports : 'sigmoid', 'tanh', 'relu' activations only. Please use only one of these.")

    #activation functions
    def _relu(self,Z):
        return np.maximum(Z, 0)
    def _tanh(self,Z):
        return np.tanh(Z)
    def _sigmoid(self,Z):
        return 1/(1+np.exp(-Z))

    #derivative of activation functions
    # Each derivative receives the activation OUTPUT (that is what
    # _back_propagation passes in), not the pre-activation value.
    def _relu_derivative(self,Z):
        # Build a new array instead of overwriting Z so the caller's
        # activations are left untouched.
        return (Z > 0).astype(Z.dtype)
    def _tanh_derivative(self,Z):
        return (1 - Z * Z)
    def _sigmoid_derivative(self,Z):
        return Z*(1-Z)

    def _forward_propagation(self, X):
        """
        This function performs forward propagation
        Parameter:
        X: input (one sample, a flat sequence of input_size values)

        Returns: hidden layer activations values, final softmax probs
                 (both as column vectors)
        """
        input_column = np.reshape(X, (len(X), 1))
        Z = np.dot(self.W_xh, input_column) + self.bias_h
        hidden_activations = self.activation_fn(Z) # perform non-linearity on the input data
        logits = np.dot(self.W_hy, hidden_activations) + self.bias_y
        # Subtracting the largest logit leaves the softmax unchanged but keeps
        # exp() from overflowing to inf (which would give nan probabilities).
        y_s = np.exp(logits - np.max(logits))
        prob_values = y_s/np.sum(y_s) #softmax values
        return hidden_activations, prob_values

    def _back_propagation(self, X, target_class, hidden_activations, prob_values):
        """
        This function implements backward propagation i.e. calculate error in weights in each layer.

        Parameters:
        X: input
        target_class: output target class
        hidden_activations: hidden activation from forward pass
        prob_values: softmax probabilities of output from forward pass

        Returns: error in weights matrices and error in biases, in the order
                 (delta_W_xh, delta_W_hy, delta_bias_h, delta_bias_y)
        """
        #error calculation for the weight matrix of hidden layer to output layer and bias of output layer
        # softmax + cross-entropy gradient w.r.t. the logits: probs - one_hot(target)
        delta_y = np.copy(prob_values)
        delta_y[target_class] -= 1
        delta_W_hy = np.dot(delta_y, hidden_activations.T)
        delta_bias_y = delta_y

        #error calculation for the weight matrix of input layer to hidden layer and bias of hidden layer
        delta_h = np.dot(self.W_hy.T, delta_y)
        delta_h_error = self.activation_fn_derivative(hidden_activations) * delta_h
        delta_bias_h = delta_h_error
        delta_W_xh = np.dot(delta_h_error, np.reshape(X, (len(X), 1)).T)

        return delta_W_xh, delta_W_hy, delta_bias_h, delta_bias_y

    def _update_weights(self, delta_W_xh, delta_bias_h, delta_W_hy, delta_bias_y):
        """
        Update the weights and biases of the network

        Parameters:
        delta_W_xh: error for weight matrix from input layer to hidden layer
        delta_bias_h: error for bias for hidden layer
        delta_W_hy: error for weight matrix from hidden layer to output layer
        delta_bias_y: error for bias for output layer

        Returns: None
        """
        self.W_xh -= self.learning_rate * delta_W_xh
        self.bias_h -= self.learning_rate * delta_bias_h
        self.W_hy -= self.learning_rate * delta_W_hy
        self.bias_y -= self.learning_rate * delta_bias_y

    def _calc_mean_loss(self, total_loss, num_samples):
        """
        Calculate the mean loss for the current epoch

        Parameters:
        total_loss: sum of all loss calculated for one epoch
        num_samples: total number of training sample

        Returns: mean loss for an epoch
        """
        return 1./num_samples * total_loss

    def train(self, inp, targets, num_epochs,model_file = "NNModel.pkl"):
        """
        This function trains the network i.e. by doing a forward pass then a backward prop and then subsequently
        update the weights with the errors calculated in the backward pass

        Parameters:
        inp: list of input samples
        targets: list of corresponding training output classes
        num_epochs: number of epochs for training the network
        model_file: filename of the pickle to save the model after training

        Returns: None
        """
        for epoch_no in range(num_epochs):
            total_loss = 0
            for sample_no, sample in enumerate(inp): #looping through each input sample
                target = targets[sample_no]

                # forward propagation
                hidden_activations, prob_values = self._forward_propagation(sample)
                total_loss += -np.log(prob_values[target, 0])

                # backward propagation
                delta_W_xh, delta_W_hy, delta_bias_h, delta_bias_y = self._back_propagation(sample, target, hidden_activations, prob_values)

                # update the weights of the model with the error calculated in back prop
                self._update_weights(delta_W_xh, delta_bias_h, delta_W_hy, delta_bias_y)

            print("Epoch ", epoch_no, ": Loss: ", self._calc_mean_loss(total_loss, len(inp) ))

        self.save(model_file)

    def predict(self, X):
        """
        This function predicts the output class i.e. performs forward propagations and returns the class
        with maximum probability from the softmax(output layer) layer probs.

        Parameters:
        X: input to test

        Return: the predicted output class
        """
        _, prob_values = self._forward_propagation(X)
        return np.argmax(prob_values)

    def save(self, model_filename):
        """
        This function dumps the model to a file. So that it can loaded later.

        Parameters:
        model_filename: filename of the pickled model

        Returns: None
        """
        # The context manager flushes and closes the file even if dump fails.
        with open(model_filename, 'wb') as model_handle:
            pickle.dump(self, model_handle)

### Part 2: Testing the Implementation on circles dataset

The model implemented above is tested on the circles dataset in the following steps: 
- The input file is read and parsed line by line to create an array containing the input values in form (x1, x2) and the output class labels
- The model implemented above is run with 2 input nodes, one hidden layer of 4 nodes and 2 output nodes
- The learning rate and activation function used are 0.01 and relu respectively
- The data is split in 90:10 ratio as training and testing sets and the model is trained for 50 epochs

In [4]:
# Parse circles500.csv into a list of {"inp": array([x0, x1]), "out": class} rows.
circles_data = []
# The context manager closes the file once parsing is done.
with open("circles500.csv", newline="") as circles_file:
    reader = csv.reader(circles_file)
    next(reader, None)  # the first line is the column header, not data
    for row in reader:
        if not row:
            # tolerate blank lines (e.g. a trailing newline at end of file)
            continue
        x0, x1, output_class = row
        circles_data.append({
            "inp": np.asarray([float(x0), float(x1)]),
            "out": int(output_class),
        })

Train and test Neural network on circles dataset

In [5]:
# Train on the first 90% of the circles rows and evaluate on the remaining 10%.
nn = FullyConnectedFeedForwardNN(2, 4, 2)
num_epocs = 50
inputs = [row['inp'] for row in circles_data]
targets = [row['out'] for row in circles_data]
training_size = int(0.9 * len(circles_data))
nn.train(inputs[:training_size], targets[:training_size], num_epocs, model_file="Circles_NN_Model.pkl")

# Held-out rows: everything after the training slice.
test_inputs = inputs[training_size:]
test_targets = targets[training_size:]
counter = sum(
    1 for sample, expected in zip(test_inputs, test_targets)
    if nn.predict(sample) == expected
)
if test_targets:
    # Divide by the real number of test rows instead of a leaked loop index.
    print("Accuracy : ",((counter*1.0)/len(test_targets))*100)
else:
    print("Accuracy : undefined (no held-out samples)")

Epoch  0 : Loss:  0.6060232463689772
Epoch  1 : Loss:  0.5480278683347299
Epoch  2 : Loss:  0.4972651738582458
Epoch  3 : Loss:  0.44399671793666684
Epoch  4 : Loss:  0.3915368449019963
Epoch  5 : Loss:  0.345542884166262
Epoch  6 : Loss:  0.3065142274357497
Epoch  7 : Loss:  0.27397224700802353
Epoch  8 : Loss:  0.24662720399658802
Epoch  9 : Loss:  0.22405241001766987
Epoch  10 : Loss:  0.20541895035739655
Epoch  11 : Loss:  0.18971459508026461
Epoch  12 : Loss:  0.17629199866485634
Epoch  13 : Loss:  0.1648290627192087
Epoch  14 : Loss:  0.15481255480031708
Epoch  15 : Loss:  0.14596609099527788
Epoch  16 : Loss:  0.13821752393988523
Epoch  17 : Loss:  0.1312775958929311
Epoch  18 : Loss:  0.1251074803601537
Epoch  19 : Loss:  0.11952280357515518
Epoch  20 : Loss:  0.11441875514113833
Epoch  21 : Loss:  0.10984076427771625
Epoch  22 : Loss:  0.10560211721126085
Epoch  23 : Loss:  0.10174991798656308
Epoch  24 : Loss:  0.09817287067108932
Epoch  25 : Loss:  0.09486164916578625
Epoch 

### Part 3: Testing the implementation on CIFAR-10 dataset

The implementation is now tested on the CIFAR-10 data set to distinguish **cats and deer** using the following steps:
- The data has ~1000 images for a class in each batch, we will use all the 5 training batches to train the model(1000x5x2), i.e., 10000 images and will test on the test set(1000x2) of 2000 images
- All the deer and cat samples from the input and the test batches are read, converted to grayscale and normalised for image classification
- Each image is 32x32, hence there are 1024 input nodes in the neural network, 8 nodes in the hidden layer
- ReLU is used as the activation function for the hidden layer (the training cell below passes `activation = 'relu'`)
- Number of epochs used is 50

In [4]:
# Location of the extracted CIFAR-10 python batches.
folder = "cifar-10-batches-py"
# Five training batches, data_batch_1 .. data_batch_5, plus the single test batch.
train_files = ["{}/data_batch_{}".format(folder, batch_no) for batch_no in range(1, 6)]
test_file = "{}/test_batch".format(folder)
def NormalizeData(data):
    """
    Min-max scale data so its smallest value maps to 0 and its largest to 1.

    Parameters:
    data: array-like of numbers (must be non-empty)

    Returns: array of the same shape with values in [0, 1]. When every value
    is identical the range is zero, so an all-zero array is returned instead
    of dividing 0 by 0.
    """
    data = np.asarray(data)
    lowest = np.min(data)
    value_range = np.max(data) - lowest
    if value_range == 0:
        # constant input: (data - min) / 0 would produce nan everywhere
        return np.zeros(data.shape, dtype=float)
    return (data - lowest) / value_range
def Convert_rgb_to_grayscale_and_normalize(image_vector):
    """
    Convert one flattened RGB image to a normalized grayscale vector.

    Parameters:
    image_vector: flat sequence holding the red values first, then the green
                  values, then the blue values (three equally long planes)

    Returns: array with one value per pixel, computed as
             0.3 * R + 0.59 * G + 0.11 * B and then passed through NormalizeData
    """
    individual_spec_length = len(image_vector) // 3
    channels = np.asarray(image_vector, dtype=float)
    red_values = channels[:individual_spec_length]
    green_values = channels[individual_spec_length:2 * individual_spec_length]
    blue_values = channels[2 * individual_spec_length:3 * individual_spec_length]
    # New grayscale image = ( (0.3 * R) + (0.59 * G) + (0.11 * B) ).
    # Computed on whole planes at once rather than pixel by pixel in Python.
    grayscale_vector = (0.3 * red_values) + (0.59 * green_values) + (0.11 * blue_values)
    return NormalizeData(grayscale_vector)
# This function taken from the CIFAR website
def unpickle(file):
    """
    Read one pickled batch file and return the object stored in it.

    Parameters:
    file: path of the pickled batch file

    Returns: the unpickled object
    """
    import pickle
    with open(file, 'rb') as batch_handle:
        # encoding='bytes' is what the callers rely on when they index the
        # result with byte-string keys such as b'labels' and b'data'.
        batch = pickle.load(batch_handle, encoding='bytes')
    return batch

# Collect the grayscale, normalized images of the two classes of interest.
# In this notebook label 3 is treated as "cat" and label 4 as "deer".
deer_samples = {'train': [], 'test': []}
cat_samples = {'train': [], 'test': []}

for file in train_files:
    train_data = unpickle(file)
    for label, image in zip(train_data[b'labels'], train_data[b'data']):
        if label == 3:
            cat_samples['train'].append(Convert_rgb_to_grayscale_and_normalize(image.tolist()))
        elif label == 4:
            deer_samples['train'].append(Convert_rgb_to_grayscale_and_normalize(image.tolist()))

test_data = unpickle(test_file)
for label, image in zip(test_data[b'labels'], test_data[b'data']):
    if label == 3:
        cat_samples['test'].append(Convert_rgb_to_grayscale_and_normalize(image.tolist()))
    elif label == 4:
        deer_samples['test'].append(Convert_rgb_to_grayscale_and_normalize(image.tolist()))

# Cache the extracted samples; the context managers make sure both files are
# flushed and closed.
with open("cat_samples.pkl", "wb") as cat_dump:
    pickle.dump(cat_samples, cat_dump)
with open("deer_samples.pkl", "wb") as deer_dump:
    pickle.dump(deer_samples, deer_dump)

Train and test CIFAR dataset with neural network

In [9]:
# Train the single-hidden-layer network on cats (class 0) vs deer (class 1).
nn = FullyConnectedFeedForwardNN(1024, 8, 2, activation='relu')
num_epocs = 50
inputs, targets = [], []
# Interleave the classes (cat, deer, cat, deer, ...). zip stops at the shorter
# list, so unequal class sizes no longer raise IndexError.
for cat_image, deer_image in zip(cat_samples['train'], deer_samples['train']):
    inputs.extend((cat_image, deer_image))
    targets.extend((0, 1))
nn.train(inputs, targets, num_epocs)

# Score each test set on its own so neither depends on the other's length.
counter = sum(1 for image in deer_samples['test'] if nn.predict(image) == 1)
counter += sum(1 for image in cat_samples['test'] if nn.predict(image) == 0)
total_tests = len(deer_samples['test']) + len(cat_samples['test'])
print("Correctly identified test cases : ",counter)
if total_tests:
    print("Accuracy : ",((counter*1.0)/total_tests)*100)
else:
    print("Accuracy : undefined (no test samples)")

Epoch  0 : Loss:  0.6653246366807261
Epoch  1 : Loss:  0.6385096703634113
Epoch  2 : Loss:  0.6148030954550864
Epoch  3 : Loss:  0.5982069157624755
Epoch  4 : Loss:  0.5910848693283969
Epoch  5 : Loss:  0.5862203408074587
Epoch  6 : Loss:  0.5810175738748422
Epoch  7 : Loss:  0.5766272481261265
Epoch  8 : Loss:  0.5732648201258922
Epoch  9 : Loss:  0.5691692625226028
Epoch  10 : Loss:  0.5647321186504732
Epoch  11 : Loss:  0.562268680200155
Epoch  12 : Loss:  0.5597859383598162
Epoch  13 : Loss:  0.5573255726403048
Epoch  14 : Loss:  0.5553178355402721
Epoch  15 : Loss:  0.5535307179908535
Epoch  16 : Loss:  0.5505375014556831
Epoch  17 : Loss:  0.5487360964572647
Epoch  18 : Loss:  0.5469050562266669
Epoch  19 : Loss:  0.5444993439414461
Epoch  20 : Loss:  0.5425490541740479
Epoch  21 : Loss:  0.5411646665952998
Epoch  22 : Loss:  0.5391945893512248
Epoch  23 : Loss:  0.5379617289838875
Epoch  24 : Loss:  0.5367014717226369
Epoch  25 : Loss:  0.5353440257566191
Epoch  26 : Loss:  0.53

### Part 4: Enhancement Implementation - L2 regularizing

To address the problem of overfitting and increase the model's performance, L2 regularization of the weights is implemented, as more training data cannot be provided to the model. In L2 regularization, high weight values are penalised, reducing the activation values of the non-linear layer and thereby reducing the complexity of the network. <br> 
Regularization of weights is applied on 20% of the training data.<br>

For testing the CIFAR dataset the final configuration used is: (1024,16,2)

In [10]:
class FullyConnectedNNWithRegulization(object):
    """
    Implementation of a Fully connected feed forward Neural Network with L2 Regularization as enhancement.
    This implementation implements only one hidden layer.

    Samples are processed one at a time: each sample gets a forward pass,
    a backward pass and an immediate weight update. The L2 term is only added
    to the gradients of the samples passed as regularization_data to train().
    """
    def __init__(self, input_size, hidden_size, output_size, learning_rate=0.01, regularization_lambda=0.01, activation = 'relu'):
        """
        Constructor for the network. This function initializes the weight matrices using He initialization
        and sets up necessary variables to be used while training and prediction

        Parameters:
        input_size: the number of input neurons
        hidden_size: the number of hidden neurons
        output_size: the number of output neurons i.e. number of classes in the data
        learning_rate: the learning_rate used while training the weights. Default: 0.01
        regularization_lambda: the regularization lambda used for regularizing the weights. Default: 0.01
        activation: the activation function to be used in the hidden layer of the network. Default: 'relu'

        Returns: None

        Raises:
        ValueError: if activation is not one of 'relu', 'sigmoid' or 'tanh'
        """
        self.input_size = input_size
        self.output_size = output_size
        self.hidden_size = hidden_size
        # He initialization: standard-normal weights scaled by sqrt(2 / fan_in)
        self.W_xh = np.random.randn(self.hidden_size, self.input_size) * np.sqrt(2/(self.input_size)) # Weight for input layer to hidden layer
        self.W_hy = np.random.randn(self.output_size, self.hidden_size) * np.sqrt(2/(self.hidden_size)) # Weight for hidden layer to output layer
        self.bias_h = np.zeros((self.hidden_size, 1)) # hidden layer bias
        self.bias_y = np.zeros((self.output_size, 1)) # output layer bias
        self.learning_rate = learning_rate
        self.regularization_lambda = regularization_lambda
        self.activations = {'relu': self._relu, 'sigmoid': self._sigmoid, 'tanh': self._tanh}
        self.derivative_activations = {'relu': self._relu_derivative, 'sigmoid': self._sigmoid_derivative, 'tanh': self._tanh_derivative}
        try:
            self.activation_fn = self.activations[activation]
            self.activation_fn_derivative = self.derivative_activations[activation]
        except (KeyError, TypeError):
            # Fail loudly: without an activation function the object cannot
            # run a forward pass, so returning a half-built network only
            # postpones the error.
            raise ValueError("Supports : 'sigmoid', 'tanh', 'relu' activations only. Please use only one of these.")

    #activation functions
    def _relu(self,Z):
        return np.maximum(Z, 0)
    def _tanh(self,Z):
        return np.tanh(Z)
    def _sigmoid(self,Z):
        return 1/(1+np.exp(-Z))

    #derivative of activation functions
    # Each derivative receives the activation OUTPUT (that is what
    # _back_propagation passes in), not the pre-activation value.
    def _relu_derivative(self,Z):
        # Build a new array instead of overwriting Z so the caller's
        # activations are left untouched.
        return (Z > 0).astype(Z.dtype)
    def _tanh_derivative(self,Z):
        return (1 - Z * Z)
    def _sigmoid_derivative(self,Z):
        return Z*(1-Z)

    def _forward_propagation(self, X):
        """
        This function performs forward propagation
        Parameter:
        X: input (one sample, a flat sequence of input_size values)

        Returns: hidden layer activations values, final softmax probs
                 (both as column vectors)
        """
        input_column = np.reshape(X, (len(X), 1))
        Z = np.dot(self.W_xh, input_column) + self.bias_h
        hidden_activations = self.activation_fn(Z) # perform non-linearity on the input data
        logits = np.dot(self.W_hy, hidden_activations) + self.bias_y
        # Subtracting the largest logit leaves the softmax unchanged but keeps
        # exp() from overflowing to inf (which would give nan probabilities).
        y_s = np.exp(logits - np.max(logits))
        prob_values = y_s/np.sum(y_s) # softmax values
        return hidden_activations, prob_values

    def _back_propagation(self, X, target_class, hidden_activations, prob_values):
        """
        This function implements backward propagation i.e. calculate error in weights in each layer.

        Parameters:
        X: input
        target_class: output target class
        hidden_activations: hidden activation from forward pass
        prob_values: softmax probabilities of output from forward pass

        Returns: error in weights matrices and error in biases, in the order
                 (delta_W_xh, delta_W_hy, delta_bias_h, delta_bias_y)
        """
        #error calculation for the weight matrix of hidden layer to output layer and bias of output layer
        # softmax + cross-entropy gradient w.r.t. the logits: probs - one_hot(target)
        delta_y = np.copy(prob_values)
        delta_y[target_class] -= 1
        delta_W_hy = np.dot(delta_y, hidden_activations.T)
        delta_bias_y = delta_y

        #error calculation for the weight matrix of input layer to hidden layer and bias of hidden layer
        delta_h = np.dot(self.W_hy.T, delta_y)
        delta_h_error = self.activation_fn_derivative(hidden_activations) * delta_h
        delta_bias_h = delta_h_error
        delta_W_xh = np.dot(delta_h_error, np.reshape(X, (len(X), 1)).T)

        return delta_W_xh, delta_W_hy, delta_bias_h, delta_bias_y

    def _regularize_weights(self, delta_W_hy, delta_W_xh):
        """
        Add regularization terms to the weights

        Parameters:
        delta_W_hy: error for weight matrix from hidden layer to output layer
        delta_W_xh: error for weight matrix from input layer to hidden layer
        Returns: updated errors with regularization factor (the arrays passed
                 in are updated in place and returned)
        """
        # gradient of (lambda / 2) * ||W||^2 is lambda * W
        delta_W_hy += self.regularization_lambda * self.W_hy
        delta_W_xh += self.regularization_lambda * self.W_xh
        return delta_W_hy, delta_W_xh

    def _update_weights(self, delta_W_xh, delta_bias_h, delta_W_hy, delta_bias_y):
        """
        Update the weights and biases of the network

        Parameters:
        delta_W_xh: error for weight matrix from input layer to hidden layer
        delta_bias_h: error for bias for hidden layer
        delta_W_hy: error for weight matrix from hidden layer to output layer
        delta_bias_y: error for bias for output layer

        Returns: None
        """
        self.W_xh -= self.learning_rate * delta_W_xh
        self.bias_h -= self.learning_rate * delta_bias_h
        self.W_hy -= self.learning_rate * delta_W_hy
        self.bias_y -= self.learning_rate * delta_bias_y

    def _calc_loss_with_regularization(self, total_loss, num_samples, regularization_used=None):
        """
        Calculate the loss for the current epoch with regularization quotient

        Parameters:
        total_loss: sum of all loss calculated for one epoch
        num_samples: total number of training samples
        regularization_used: the type of regularization used. Supports only L2 till now. Default: None

        Returns: mean loss for an epoch with regularization quotient
        """
        if regularization_used == 'L2':
            # the penalty is added to the summed loss before averaging
            total_loss += (self.regularization_lambda/2) * (np.sum(np.square(self.W_xh)) + np.sum(np.square(self.W_hy)))
        return 1./num_samples * total_loss

    def _train_on_sample(self, sample, target, apply_l2):
        """
        Run forward pass, backward pass and weight update for one sample.

        Parameters:
        sample: one input sample
        target: its target class
        apply_l2: when True the L2 term is added to the weight gradients

        Returns: the cross-entropy loss of the sample before the update
        """
        # forward propagation
        hidden_activations, prob_values = self._forward_propagation(sample)
        sample_loss = -np.log(prob_values[target, 0])

        # backward propagation
        delta_W_xh, delta_W_hy, delta_bias_h, delta_bias_y = self._back_propagation(sample, target, hidden_activations, prob_values)

        if apply_l2:
            delta_W_hy, delta_W_xh = self._regularize_weights(delta_W_hy, delta_W_xh)

        # update the weights of the model with the error calculated in back prop
        self._update_weights(delta_W_xh, delta_bias_h, delta_W_hy, delta_bias_y)
        return sample_loss

    def train(self, inp, targets, regularization_data, num_epochs,model_file = "NNModelWithL2.pkl", regularization=None):
        """
        This function trains the network i.e. by doing a forward pass then a backward prop and then subsequently
        update the weights with the errors calculated in the backward pass

        Parameters:
        inp: list of input samples (trained without the regularization term)
        targets: list of corresponding training output classes
        regularization_data: pair (samples, target classes); these samples are
                             trained after inp in every epoch and get the
                             regularization term when regularization is 'L2'
        num_epochs: number of epochs for training the network
        model_file: filename of the pickle to save the model after training
        regularization: 'L2' to enable L2 regularization, None to disable it

        Returns: None
        """
        reg_inputs, reg_targets = regularization_data[0], regularization_data[1]
        apply_l2 = regularization == 'L2'
        for epoch_no in range(num_epochs):
            total_loss = 0
            for sample_no, sample in enumerate(inp):
                total_loss += self._train_on_sample(sample, targets[sample_no], False)

            #Regularization samples
            for sample_no, sample in enumerate(reg_inputs):
                total_loss += self._train_on_sample(sample, reg_targets[sample_no], apply_l2)

            print("Epoch ", epoch_no, ": Loss: ", self._calc_loss_with_regularization(total_loss, len(inp)+len(reg_inputs), regularization))

        self.save(model_file)


    def predict(self, X):
        """
        This function predicts the output class i.e. performs forward propagations and returns the class
        with maximum probability from the softmax(output layer) layer probs.

        Parameters:
        X: input to test

        Return: the predicted output class
        """
        _, prob_values = self._forward_propagation(X)
        return np.argmax(prob_values)

    def save(self, model_file):
        """
        This function dumps the model to a file. So that it can loaded later.

        Parameters:
        model_file: filename of the pickled model

        Returns: None
        """
        # The context manager flushes and closes the file even if dump fails.
        with open(model_file, 'wb') as model_handle:
            pickle.dump(self, model_handle)

In [13]:
# Train the L2-regularized network on cats (class 0) vs deer (class 1).
nn = FullyConnectedNNWithRegulization(1024, 16, 2, activation='sigmoid')
num_epocs = 50
inputs, targets = [], []
# Interleave the classes (cat, deer, cat, deer, ...). zip stops at the shorter
# list, so unequal class sizes no longer raise IndexError.
for cat_image, deer_image in zip(cat_samples['train'], deer_samples['train']):
    inputs.extend((cat_image, deer_image))
    targets.extend((0, 1))

# First 80% is trained without the L2 term; the remaining samples are passed
# as regularization data and get the L2 term added to their gradients.
training_size = int(0.8*len(targets))
regularization_size = int(0.2*len(targets))  # informational only; the split below uses training_size
nn.train(inputs[:training_size], targets[:training_size],(inputs[training_size:],targets[training_size:]) ,num_epocs, regularization='L2')

# Score each test set on its own so neither depends on the other's length.
counter = sum(1 for image in deer_samples['test'] if nn.predict(image) == 1)
counter += sum(1 for image in cat_samples['test'] if nn.predict(image) == 0)
total_tests = len(deer_samples['test']) + len(cat_samples['test'])
print("Correctly identified test cases : ",counter)
if total_tests:
    print("Accuracy : ",((counter*1.0)/total_tests)*100)
else:
    print("Accuracy : undefined (no test samples)")

Epoch  0 : Loss:  0.6653260932885826
Epoch  1 : Loss:  0.616173938489198
Epoch  2 : Loss:  0.599900779422258
Epoch  3 : Loss:  0.592413738673131
Epoch  4 : Loss:  0.5873619432071301
Epoch  5 : Loss:  0.5834187165307422
Epoch  6 : Loss:  0.5801078331261527
Epoch  7 : Loss:  0.5772708359207818
Epoch  8 : Loss:  0.5747560435685782
Epoch  9 : Loss:  0.5724214231239936
Epoch  10 : Loss:  0.5701917389369742
Epoch  11 : Loss:  0.567965948638451
Epoch  12 : Loss:  0.5657507682036469
Epoch  13 : Loss:  0.5635731353988305
Epoch  14 : Loss:  0.5614816446887412
Epoch  15 : Loss:  0.5595765611471618
Epoch  16 : Loss:  0.5578649923221459
Epoch  17 : Loss:  0.5563368694620691
Epoch  18 : Loss:  0.5549771568888267
Epoch  19 : Loss:  0.5537731392853983
Epoch  20 : Loss:  0.5527159811140003
Epoch  21 : Loss:  0.5517884083212761
Epoch  22 : Loss:  0.5509613498311312
Epoch  23 : Loss:  0.5501946230352354
Epoch  24 : Loss:  0.5494437431448452
Epoch  25 : Loss:  0.5487057450314732
Epoch  26 : Loss:  0.54797

### Part 4: Enhancement Implementation - Adding another hidden layer

As the network configuration was (1024,8,2), with 1024 input, 8 hidden and 2 output nodes, a good image embedding is not achieved in a single hidden layer. Adding another hidden layer might improve the embedding of the input image, thus making the model deeper. The implementation below uses two hidden layers to train and classify the images and attempts to solve the problem of underfitting.

For testing the CIFAR dataset the final configuration used is: (1024,32,8,2)

In [26]:
class FullyConnectedNNWithTwoHiddenLayer(object):
    """
    Implementation of Fully Connected Neural Network with two hidden layers.

    Data flows input -> hidden layer 1 -> hidden layer 2 -> softmax output.
    The network is trained one sample at a time (stochastic gradient descent)
    on the negative log-likelihood of the target class.
    """
    def __init__(self, input_size, hidden1_size, hidden2_size, output_size, learning_rate=0.01, activation1='relu', activation2='tanh'):
        """
        Constructor for the network. Initializes the weight matrices with scaled
        Gaussian noise, zeroes the biases and selects the activation functions.

        Each weight matrix is drawn from a standard normal distribution and scaled
        by sqrt(2 / (fan_in + fan_out)). NOTE: this is Glorot/Xavier-style scaling;
        classic He initialization would scale by sqrt(2 / fan_in) instead.

        Parameters:
        input_size: the number of input neurons
        hidden1_size: the number of neurons in hidden layer 1
        hidden2_size: the number of neurons in hidden layer 2
        output_size: the number of output neurons i.e. number of classes in the data
        learning_rate: the learning_rate used while training the weights. Default value = 0.01
        activation1: the activation function to be used in the first hidden layer of the network. Default: 'relu'
        activation2: the activation function to be used in the second hidden layer of the network. Default: 'tanh'

        Both activation names must be one of 'relu', 'sigmoid' or 'tanh';
        any other name raises KeyError.

        Returns: None
        """
        self.input_size = input_size
        self.output_size = output_size
        self.hidden1_size = hidden1_size
        self.hidden2_size = hidden2_size
        # Weights are stored as (fan_out, fan_in) so a layer is computed as W . x + b.
        # The three randn calls are kept in this order so seeded runs are reproducible.
        self.Wxh1 = np.random.randn(self.hidden1_size, self.input_size) * np.sqrt(2.0 / (self.hidden1_size + self.input_size))  # input -> hidden 1
        self.Wh1h2 = np.random.randn(self.hidden2_size, self.hidden1_size) * np.sqrt(2.0 / (self.hidden2_size + self.hidden1_size))  # hidden 1 -> hidden 2
        self.Wh2y = np.random.randn(self.output_size, self.hidden2_size) * np.sqrt(2.0 / (self.output_size + self.hidden2_size))  # hidden 2 -> output
        self.bh1 = np.zeros((self.hidden1_size, 1))  # first hidden layer bias
        self.bh2 = np.zeros((self.hidden2_size, 1))  # second hidden layer bias
        self.by = np.zeros((self.output_size, 1))  # output bias
        self.learning_rate = learning_rate
        self.activations = {'relu': self._relu, 'sigmoid': self._sigmoid, 'tanh': self._tanh}
        self.derivative_activations = {'relu': self._relu_derivative, 'sigmoid': self._sigmoid_derivative, 'tanh': self._tanh_derivative}
        self.activation1 = self.activations[activation1]
        self.activation1_derivative = self.derivative_activations[activation1]
        self.activation2 = self.activations[activation2]
        self.activation2_derivative = self.derivative_activations[activation2]

    # activation functions (applied element-wise to the pre-activation Z)
    def _relu(self, Z):
        return np.maximum(Z, 0)

    def _tanh(self, Z):
        return np.tanh(Z)

    def _sigmoid(self, Z):
        return 1 / (1 + np.exp(-Z))

    # derivatives of the activation functions.
    # Each one receives the layer's ACTIVATION OUTPUT (not the pre-activation),
    # which is how _back_propagation calls them.
    def _relu_derivative(self, Z):
        # Build a new 0/1 mask rather than overwriting Z: Z is the caller's
        # activation array and must not be modified.
        return np.where(Z > 0, 1, 0).astype(Z.dtype)

    def _tanh_derivative(self, Z):
        return (1 - Z * Z)

    def _sigmoid_derivative(self, Z):
        return Z * (1 - Z)

    def _forward_propagation(self, X):
        """
        This function performs forward propagation for a single sample.

        Parameters:
        X: input sample, a flat sequence of length input_size

        Returns: first hidden layer activations, second hidden layer activations,
        final softmax probs (column vector of shape (output_size, 1))
        """
        column_input = np.reshape(X, (len(X), 1))
        h1_a = self.activation1(np.dot(self.Wxh1, column_input) + self.bh1)  # non-linearity on the input data
        h2_a = self.activation2(np.dot(self.Wh1h2, h1_a) + self.bh2)  # non-linearity on the first hidden layer activations
        logits = np.dot(self.Wh2y, h2_a) + self.by
        # Subtracting the max logit leaves the softmax unchanged mathematically
        # but keeps np.exp from overflowing to inf (which would yield NaN probs).
        exp_scores = np.exp(logits - np.max(logits))
        probs = exp_scores / np.sum(exp_scores)
        return h1_a, h2_a, probs

    def _back_propagation(self, X, t, h1_a, h2_a, probs):
        """
        This function implements backward propagation i.e. calculates the gradient of the
        loss with respect to the weights and biases of each layer for a single sample.
        None of the arguments are modified.

        Parameters:
        X: input sample
        t: output target class (integer index into the output layer)
        h1_a: first hidden layer activations from the forward pass
        h2_a: second hidden layer activations from the forward pass
        probs: softmax probabilities of output from the forward pass

        Returns: dWxh1, dWh1h2, dWh2y, dbh1, dbh2, dby (note: weights first, then biases)
        """
        # error calculation in the last layer:
        # gradient of softmax + negative log-likelihood w.r.t. the logits is (probs - one_hot(t))
        dy = np.copy(probs)
        dy[t] -= 1
        dWh2y = np.dot(dy, h2_a.T)
        dby = np.copy(dy)

        # error calculation in second hidden layer
        dh2 = np.dot(self.Wh2y.T, dy)
        dh2raw = self.activation2_derivative(h2_a) * dh2
        dbh2 = np.copy(dh2raw)
        dWh1h2 = np.dot(dh2raw, h1_a.T)

        # error calculation in the first hidden layer
        dh1 = np.dot(self.Wh1h2.T, dh2raw)
        dh1raw = self.activation1_derivative(h1_a) * dh1
        dbh1 = np.copy(dh1raw)
        dWxh1 = np.dot(dh1raw, np.reshape(X, (len(X), 1)).T)

        return dWxh1, dWh1h2, dWh2y, dbh1, dbh2, dby

    def _update_weights(self, dWxh1, dbh1, dWh1h2, dbh2, dWh2y, dby):
        """
        Update the weights and biases of the network in place with one gradient descent step.
        Note the argument order (weight, bias per layer) differs from the order
        returned by _back_propagation.

        Parameters:
        dWxh1: error for weight matrix from input layer to first hidden layer
        dbh1: error for bias for first hidden layer
        dWh1h2: error for weight matrix from first hidden layer to second hidden layer
        dbh2: error for bias for second hidden layer
        dWh2y: error for weight matrix from second hidden layer to output layer
        dby: error for bias for output layer

        Returns: None
        """
        self.Wxh1 -= self.learning_rate * dWxh1
        self.bh1 -= self.learning_rate * dbh1
        self.Wh1h2 -= self.learning_rate * dWh1h2
        self.bh2 -= self.learning_rate * dbh2
        self.Wh2y -= self.learning_rate * dWh2y
        self.by -= self.learning_rate * dby

    def _calc_mean_loss(self, total_loss, num_samples):
        """
        Calculate the mean loss for the current epoch

        Parameters:
        total_loss: sum of all loss calculated for one epoch
        num_samples: total number of training samples

        Returns: mean loss for an epoch
        """
        return 1. / num_samples * total_loss

    def train(self, inp, targets, num_epochs, model_file='TwoLayerNNModel.pkl'):
        """
        This function trains the network i.e. for every sample it does a forward pass, then a
        backward prop, and then updates the weights with the errors calculated in the backward pass.
        The mean loss is printed after every epoch and the model is saved once training finishes.

        Parameters:
        inp: list of input samples
        targets: list of corresponding training output classes
        num_epochs: number of epochs for training the network
        model_file: filename of the pickle to save the model after training

        Returns: None
        """
        for epoch_no in range(num_epochs):  # one full pass over the training data per epoch
            total_loss = 0
            for sample, target in zip(inp, targets):
                # forward propagation
                h1_a, h2_a, probs = self._forward_propagation(sample)
                # negative log-likelihood of the correct class
                total_loss += -np.log(probs[target, 0])

                # backward propagation
                dWxh1, dWh1h2, dWh2y, dbh1, dbh2, dby = self._back_propagation(sample, target, h1_a, h2_a, probs)

                # update the weights of the model with the error calculated in back prop
                self._update_weights(dWxh1, dbh1, dWh1h2, dbh2, dWh2y, dby)

            print("Epoch ", epoch_no, ", Loss: ", self._calc_mean_loss(total_loss, len(inp)))
        self.save(model_file)

    def predict(self, X):
        """
        This function predicts the output class i.e. performs forward propagation and returns the class
        with maximum probability from the softmax (output layer) probs.

        Parameters:
        X: input to test

        Return: the predicted output class
        """
        _, _, probs = self._forward_propagation(X)
        return np.argmax(probs)

    def save(self, model_file):
        """
        This function dumps the model to a file, so that it can be loaded later.

        Parameters:
        model_file: filename of the pickled model

        Returns: None
        """
        # The context manager guarantees the file is flushed and closed.
        with open(model_file, 'wb') as model_handle:
            pickle.dump(self, model_handle)

In [21]:
# Train the two-hidden-layer network (1024, 32, 8, 2) on cat vs. deer images
# and report its accuracy on the test images.
nn = FullyConnectedNNWithTwoHiddenLayer(1024, 32, 8, 2, activation1='relu', activation2='sigmoid')
inputs, targets = [], []
num_epocs, counter = 50, 0
# Interleave the two classes so consecutive SGD updates alternate between them.
# Labels: 0 = cat, 1 = deer. zip() pairs the samples, keeping the classes balanced.
for cat_image, deer_image in zip(cat_samples['train'], deer_samples['train']):
    inputs.append(cat_image)
    targets.append(0)
    inputs.append(deer_image)
    targets.append(1)
nn.train(inputs, targets, num_epocs)
# Score each class's test set on its own, so the two sets need not be the same
# size and the total does not depend on a loop variable surviving the loop.
counter += sum(1 for deer_image in deer_samples['test'] if nn.predict(deer_image) == 1)
counter += sum(1 for cat_image in cat_samples['test'] if nn.predict(cat_image) == 0)
num_tested = len(deer_samples['test']) + len(cat_samples['test'])
# Guard against an empty test set instead of failing on the division.
accuracy = (float(counter) / num_tested) * 100 if num_tested else 0.0
print("Correctly identified test cases : ",counter)
print("Accuracy : ", accuracy)

Epoch  0 , Loss:  0.6555052825090637
Epoch  1 , Loss:  0.6048372690987214
Epoch  2 , Loss:  0.584576707112227
Epoch  3 , Loss:  0.5721347269857121
Epoch  4 , Loss:  0.5605916505223151
Epoch  5 , Loss:  0.5495321042672581
Epoch  6 , Loss:  0.5396838327807107
Epoch  7 , Loss:  0.5335342620270234
Epoch  8 , Loss:  0.5271578664233311
Epoch  9 , Loss:  0.5169966591286511
Epoch  10 , Loss:  0.5086317445540611
Epoch  11 , Loss:  0.502199243724382
Epoch  12 , Loss:  0.4955777629039728
Epoch  13 , Loss:  0.4887897001959324
Epoch  14 , Loss:  0.4819299627115247
Epoch  15 , Loss:  0.4763177836668254
Epoch  16 , Loss:  0.47254835741931184
Epoch  17 , Loss:  0.4658534117000471
Epoch  18 , Loss:  0.46147561262595804
Epoch  19 , Loss:  0.4543343540454525
Epoch  20 , Loss:  0.44640331767450625
Epoch  21 , Loss:  0.44768524116970265
Epoch  22 , Loss:  0.4420000131393422
Epoch  23 , Loss:  0.43293050895537427
Epoch  24 , Loss:  0.43592675798612684
Epoch  25 , Loss:  0.43064763994066146
Epoch  26 , Loss:

### References

- Neural network from scratch: https://towardsdatascience.com/how-to-build-your-own-neural-network-from-scratch-in-python-68998a08e4f6
- HE Initialisation: https://towardsdatascience.com/weight-initialization-techniques-in-neural-networks-26c649eb3b78
- Converting image to grayscale: https://www.tutorialspoint.com/dip/grayscale_to_rgb_conversion.htm
- L2 Regularisation: https://towardsdatascience.com/l1-and-l2-regularization-methods-ce25e7fc831c
- Improve Neural Network: https://towardsdatascience.com/how-to-improve-a-neural-network-with-regularization-8a18ecda9fe3
- Andrew Ng Notes: https://medium.com/@keonyonglee/bread-and-butter-from-deep-learning-by-andrew-ng-course-1-neural-networks-and-deep-learning-41563b8fc5d8
- Lecture Notes for Deep Learning by Prof Michael Madden, NUIG
- Lecture Notes for Machine Learning by Prof Anantharaman Narayana Iyer, PESIT