In [78]:
import numpy as np
import sys

In [79]:
# Non-linear activation functions (with their derivatives where one is provided)
def f_sigmoid(X, deriv=False):
    """Element-wise logistic sigmoid, or its derivative.

    Args:
        X: Array-like of pre-activation values (any shape).
        deriv: When True, return the derivative ``s * (1 - s)`` evaluated at
            ``X`` instead of the sigmoid ``s`` itself.

    Returns:
        numpy array with the same shape as ``X``.
    """
    X = np.asarray(X)
    # exp(-|X|) always lies in (0, 1], so it cannot overflow. The naive
    # 1 / (1 + exp(-X)) overflows inside exp() for large negative X.
    e = np.exp(-np.abs(X))
    # Both expressions equal 1 / (1 + exp(-X)); each is selected where it is
    # written in terms of the non-overflowing exponential.
    s = np.where(X >= 0, 1.0 / (1.0 + e), e / (1.0 + e))
    if deriv:
        # The sigmoid is evaluated once and reused for the derivative.
        return s * (1.0 - s)
    return s


def f_softmax(X):
    """Row-wise softmax of a 2-D array.

    Args:
        X: 2-D numpy array; the softmax is taken along axis 1, so each row is
            normalised independently.

    Returns:
        Array of the same shape as ``X`` whose rows are non-negative and sum
        to 1.
    """
    # Subtracting each row's maximum does not change the softmax value but
    # keeps every exponent <= 0, so np.exp cannot overflow to inf (which would
    # turn the division below into inf / inf = nan).
    shifted = X - np.max(X, axis=1, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=1, keepdims=True)

def f_relu(X, deriv=False):
    """Element-wise rectified linear unit, or its derivative.

    With ``deriv`` false the result is ``max(0, x)`` for every element of
    ``X``. With ``deriv`` true the result is a float array holding 1.0 where
    the element is strictly positive and 0.0 elsewhere.
    """
    if deriv:
        is_positive = X > 0
        return is_positive.astype(float)
    return np.maximum(0, X)

In [80]:
def exit_with_err(err_str):
    """Write ``err_str`` to standard error and terminate with exit status 1.

    Raises:
        SystemExit: always, via ``sys.exit(1)``.
    """
    # The Python 2 form ``print >> sys.stderr, err_str`` raises TypeError under
    # Python 3; the ``file=`` keyword is the Python 3 equivalent.
    print(err_str, file=sys.stderr)
    sys.exit(1)

In [81]:
# A single network layer; the same class is used for the input, hidden, and output layers
class Layer:
    """One layer of the network, holding its activations and outgoing weights.

    Attributes:
        is_input, is_output: Role flags; a layer with neither flag set is a
            hidden layer.
        Z: Output values of this layer, allocated as (batch_size, size[0]).
        activation: Callable applied to ``S``; for hidden layers it is also
            called with ``deriv=True``.
        W: Outgoing weight matrix of shape ``size``, drawn from a normal
            distribution with scale 1e-4. ``None`` for the output layer.
        S: Inputs to this layer. ``None`` for the input layer.
        D: Deltas for this layer. ``None`` for the input layer.
        Fp: Activation derivatives, stored transposed as (units, batch).
            Only allocated for hidden layers, ``None`` otherwise.
    """

    def __init__(self, size, batch_size, is_input=False, is_output=False,
                 activation=f_sigmoid):
        """Allocate the buffers and weights that this layer's role requires.

        Args:
            size: Two-element sequence; ``size[0]`` is the width used for the
                per-layer buffers and ``size`` as a whole is the shape of the
                outgoing weight matrix.
            batch_size: Number of rows allocated in the per-batch buffers.
            is_input: True for the input layer (no ``S``/``D`` buffers).
            is_output: True for the output layer (no outgoing weights).
            activation: Activation function stored on the layer.
        """
        n_units = size[0]
        is_hidden = not is_input and not is_output

        self.is_input = is_input
        self.is_output = is_output
        self.activation = activation

        # Every layer has an output buffer.
        self.Z = np.zeros((batch_size, n_units))
        # Only layers that receive input from a previous layer keep S and D.
        self.S = None if is_input else np.zeros((batch_size, n_units))
        self.D = None if is_input else np.zeros((batch_size, n_units))
        # Only layers that feed a following layer have outgoing weights.
        self.W = None if is_output else np.random.normal(size=size, scale=1E-4)
        # Derivatives are only stored for hidden layers.
        self.Fp = np.zeros((n_units, batch_size)) if is_hidden else None

    def forward_propagate(self):
        """Compute this layer's output and return what it passes forward.

        Returns:
            For the input layer, ``Z.dot(W)``. For the output layer, the
            activated values ``Z``. For a hidden layer, ``Z.dot(W)`` where
            ``Z`` is the activated input with a column of ones appended.
        """
        if self.is_input:
            # Z is used as-is; no activation is applied on the input layer.
            return self.Z.dot(self.W)

        activated = self.activation(self.S)
        if self.is_output:
            self.Z = activated
            return activated

        # Hidden layer: append a constant-one column so that the last row of W
        # acts as the bias weights.
        bias_column = np.ones((activated.shape[0], 1))
        self.Z = np.append(activated, bias_column, axis=1)
        # Stored transposed, i.e. (units, batch).
        self.Fp = self.activation(self.S, deriv=True).T
        return self.Z.dot(self.W)


In [82]:
class MultiLayerPerceptron:
    """Fully connected feed-forward network trained by mini-batch gradient descent.

    The network is a list of ``Layer`` objects: one input layer, zero or more
    hidden layers that use ``activation``, and an output layer that uses
    ``f_softmax``.

    Attributes:
        layers: The ``Layer`` objects, ordered from input to output.
        num_layers: ``len(layer_config)``.
        minibatch_size: The ``batch_size`` given at construction.
    """

    def __init__(self, layer_config, batch_size=100, activation=f_sigmoid):
        """Build the layers described by ``layer_config``.

        Args:
            layer_config: Sequence of layer widths, input first, output last.
            batch_size: Batch size used to pre-allocate the layer buffers.
            activation: Activation function for the hidden layers; it must
                accept a ``deriv`` keyword argument.
        """
        self.layers = []
        self.num_layers = len(layer_config)
        self.minibatch_size = batch_size

        for i in range(self.num_layers-1):
            if i == 0:
                print("Initializing input layer with size {0}.".format(layer_config[i]))
                # One extra unit is added at the input for the bias weight.
                # No activation is passed: the input layer never applies one.
                self.layers.append(Layer([layer_config[i]+1, layer_config[i+1]],
                                         batch_size,
                                         is_input=True))
            else:
                print("Initializing hidden layer with size {0}.".format(layer_config[i]))
                # One extra unit is added in each hidden layer for the bias
                # weight.
                self.layers.append(Layer([layer_config[i]+1, layer_config[i+1]],
                                         batch_size,
                                         activation=activation))

        print("Initializing output layer with size {0}.".format(layer_config[-1]))
        self.layers.append(Layer([layer_config[-1], None],
                                 batch_size,
                                 is_output=True,
                                 activation=f_softmax))
        print("Done!")

    def forward_propagate(self, data):
        """Run ``data`` through the network and return the output activations.

        Args:
            data: 2-D array with one example per row.

        Returns:
            The output layer's activations for ``data``.
        """
        # A column of ones is appended so the last row of the first weight
        # matrix acts as the bias.
        self.layers[0].Z = np.append(data, np.ones((data.shape[0], 1)), axis=1)

        for i in range(self.num_layers-1):
            self.layers[i+1].S = self.layers[i].forward_propagate()
        return self.layers[-1].forward_propagate()

    def backpropagate(self, yhat, labels):
        """Compute the deltas ``D`` of every non-input layer.

        Args:
            yhat: Network output for the current batch (one row per example).
            labels: Target values with the same shape as ``yhat``.
        """
        # Output-layer delta, stored transposed as (units, batch). No loss
        # value is computed here; (yhat - labels) is the gradient of the
        # cross-entropy loss with respect to the inputs of the softmax layer.
        self.layers[-1].D = (yhat - labels).T

        # Walk backwards over the hidden layers, turning the next layer's
        # delta into this layer's delta.
        for i in range(self.num_layers-2, 0, -1):
            # The last row of W holds the bias weights; no delta is computed
            # for the bias unit, so that row is dropped.
            W_nobias = self.layers[i].W[0:-1, :]
            self.layers[i].D = W_nobias.dot(self.layers[i+1].D) * self.layers[i].Fp

    def update_weights(self, eta):
        """Apply one gradient-descent step to every weight matrix.

        Args:
            eta: Learning rate, i.e. the step size applied to the gradient.
        """
        for i in range(0, self.num_layers-1):
            # D.dot(Z) sums the per-example gradients over the batch (it is
            # not averaged); the transpose matches the layout of W.
            W_grad = -eta*(self.layers[i+1].D.dot(self.layers[i].Z)).T
            self.layers[i].W += W_grad

    def _error_rate(self, data_batches, label_batches):
        """Return the fraction of misclassified examples over the given batches.

        The denominator is the number of examples actually evaluated. If no
        example is evaluated the result is ``nan`` rather than an exception.
        """
        errs = 0
        total = 0
        for b_data, b_labels in zip(data_batches, label_batches):
            output = self.forward_propagate(b_data)
            yhat = np.argmax(output, axis=1)
            # b_labels[row, yhat[row]] is 1 when the predicted class is the
            # labelled class, so (1 - that) counts the mistakes.
            errs += np.sum(1-b_labels[np.arange(len(b_labels)), yhat])
            total += len(b_labels)
        if total == 0:
            return float('nan')
        return float(errs)/total

    def evaluate(self, train_data, train_labels, test_data, test_labels,
                 num_epochs=70, eta=0.05, eval_train=False, eval_test=True):
        """Train the network and print per-epoch error rates.

        Args:
            train_data: Sequence of training batches (2-D arrays).
            train_labels: Sequence of label batches matching ``train_data``;
                each batch is indexed as ``[row, class]``.
            test_data: Sequence of evaluation batches.
            test_labels: Sequence of label batches matching ``test_data``.
            num_epochs: Number of passes over the training batches.
            eta: Learning rate passed to ``update_weights``.
            eval_train: When True, report the training error after each epoch.
            eval_test: When True, report the test error after each epoch.
        """
        print("Training for {0} epochs...".format(num_epochs))
        for t in range(0, num_epochs):
            out_str = "[{0:4d}] ".format(t)

            # One gradient-descent step per mini-batch: forward pass, deltas,
            # then the weight update scaled by the learning rate eta.
            for b_data, b_labels in zip(train_data, train_labels):
                output = self.forward_propagate(b_data)
                self.backpropagate(output, b_labels)
                self.update_weights(eta=eta)

            if eval_train:
                out_str = "{0} Training error: {1:.5f}".format(
                    out_str, self._error_rate(train_data, train_labels))

            if eval_test:
                out_str = "{0} Test error: {1:.5f}".format(
                    out_str, self._error_rate(test_data, test_labels))

            print(out_str)


In [83]:
def label_to_bit_vector(labels, nbits):
    """Encode integer class labels as one-hot rows.

    Args:
        labels: numpy array of integer class indices, one per example.
        nbits: Width of each encoded row (number of classes).

    Returns:
        Float array of shape (labels.shape[0], nbits) that is zero everywhere
        except for a 1.0 in column ``labels[i]`` of row ``i``.
    """
    one_hot = np.zeros((labels.shape[0], nbits))
    for row, label in enumerate(labels):
        one_hot[row, label] = 1.0
    return one_hot

In [84]:
def create_batches(data, labels, batch_size, create_bit_vector=False,
                   num_classes=10):
    """Split ``data`` and ``labels`` into consecutive, equally sized batches.

    Only full batches are produced: if ``batch_size`` does not evenly divide
    the number of examples, a warning is printed and the trailing examples are
    left out.

    Args:
        data: 2-D array with one example per row.
        labels: Array of labels aligned with the rows of ``data``.
        batch_size: Number of examples per batch; must be positive.
        create_bit_vector: When True, each label batch is converted with
            ``label_to_bit_vector``; otherwise the raw label slice is kept.
        num_classes: Width of the bit vectors when ``create_bit_vector`` is
            True. Defaults to 10.

    Returns:
        Tuple ``(chunked_data, chunked_labels)`` of two equally long lists.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        # A zero batch size would fail in the modulo below and a negative one
        # would never terminate, so both are rejected up front.
        raise ValueError(
            "batch_size must be a positive integer, got {0}".format(batch_size))

    N = data.shape[0]
    print("Batch size {0}, the number of examples {1}.".format(batch_size, N))

    if N % batch_size != 0:
        print("Warning in create_batches(): Batch size {0} does not "
              "evenly divide the number of examples {1}.".format(batch_size, N))

    chunked_data = []
    chunked_labels = []
    # The range stops early enough that every slice holds batch_size rows.
    for start in range(0, N - batch_size + 1, batch_size):
        stop = start + batch_size
        chunked_data.append(data[start:stop, :])
        batch_labels = labels[start:stop]
        if create_bit_vector:
            batch_labels = label_to_bit_vector(batch_labels, num_classes)
        chunked_labels.append(batch_labels)

    return chunked_data, chunked_labels


In [85]:
def prepare_for_backprop(batch_size, Train_images, Train_labels, Valid_images, Valid_labels):
    """Batch the training and validation sets, with labels as bit vectors.

    Both sets are passed through ``create_batches`` with
    ``create_bit_vector=True``.

    Returns:
        Tuple ``(train_data, train_labels, valid_data, valid_labels)`` of
        batch lists, in that order.
    """
    print ("Creating data...")
    train_x, train_y = create_batches(
        Train_images, Train_labels, batch_size, create_bit_vector=True)
    valid_x, valid_y = create_batches(
        Valid_images, Valid_labels, batch_size, create_bit_vector=True)
    print ("Done!")

    return train_x, train_y, valid_x, valid_y



In [86]:
def get_accuracy(model, data, labels):
    """Return the fraction of rows in ``data`` that ``model`` classifies correctly.

    Args:
        model: Object with a ``forward_propagate(data)`` method returning one
            row of class scores per example.
        data: Input examples passed to the model unchanged.
        labels: Array of integer class indices, one per example.

    Returns:
        Number of rows whose highest-scoring class equals the label, divided
        by ``labels.shape[0]``.
    """
    scores = model.forward_propagate(data)
    predicted = np.argmax(scores, axis=1)
    n_correct = np.sum(predicted == labels)
    return n_correct / float(labels.shape[0])

In [87]:
from keras.datasets import mnist

In [88]:
# Load the MNIST train/test split: images in Xtr / X_test, labels in Ltr / L_test.
(Xtr, Ltr), (X_test, L_test) = mnist.load_data()

# Flatten every image into a 784-element row, convert to float32 and divide
# by 255 to rescale the pixel values.
Xtr = Xtr.reshape(60000, 784).astype('float32') / 255
X_test = X_test.reshape(10000, 784).astype('float32') / 255

print(Xtr.shape[0], 'train samples')
print(X_test.shape[0], 'test samples')


60000 train samples
10000 test samples


In [None]:
# Baseline run: sigmoid hidden layers, learning rate 0.05. num_epochs is not
# passed, so evaluate() uses its default of 70 epochs.
batch_size=100;
eta = 0.05 # same value as the default eta of MultiLayerPerceptron.evaluate
print(f'learning rate {eta} are used for training')
# The MNIST test split is batched here and used as the per-epoch evaluation set.
train_data, train_labels, valid_data, valid_labels=prepare_for_backprop(batch_size, Xtr, Ltr, X_test, L_test)

# 784 inputs, two hidden layers of 100 units, 10 outputs.
mlp = MultiLayerPerceptron(layer_config=[784, 100, 100, 10], batch_size=batch_size)

mlp.evaluate(train_data, train_labels, valid_data, valid_labels, eval_train=True, eta=eta)

# Final accuracy is computed on the whole, unbatched test split with the raw
# integer labels.
accuarcy = get_accuracy(mlp, X_test, L_test)
print("Accuracy on test data: {0:.2f}%".format(accuarcy*100))

print("Done:)\n")


Creating data...
Batch size 100, the number of examples 60000.
Batch size 100, the number of examples 10000.
Done!
Initializing input layer with size 784.
Initializing hidden layer with size 100.
Initializing hidden layer with size 100.
Initializing output layer with size 10.
Done!
Training for 70 epochs...
[   0]  Training error: 0.57643 Test error: 0.57780
[   1]  Training error: 0.07788 Test error: 0.07740
[   2]  Training error: 0.05045 Test error: 0.05600
[   3]  Training error: 0.03900 Test error: 0.04280
[   4]  Training error: 0.03260 Test error: 0.03670
[   5]  Training error: 0.02582 Test error: 0.03560
[   6]  Training error: 0.02465 Test error: 0.03570
[   7]  Training error: 0.02427 Test error: 0.03570
[   8]  Training error: 0.02155 Test error: 0.03480
[   9]  Training error: 0.01913 Test error: 0.03400
[  10]  Training error: 0.01875 Test error: 0.03440
[  11]  Training error: 0.01422 Test error: 0.02970
[  12]  Training error: 0.02162 Test error: 0.03620
[  13]  Trainin

## Effect of the learning rate on the multi-layer perceptron
The next two runs repeat the training above with a different learning rate:
1. **Learning rate** = `0.005`
2. **Learning rate** = `0.5`

In [91]:
# Same network as the baseline run, trained with a smaller learning rate.
# batch_size is not set here; it is the value assigned in an earlier cell.
num_epochs=70
eta=0.005
print(f'{num_epochs} epochs and learning rate {eta} are used for training')

# The batches are rebuilt and a new network is created, so this run starts
# from freshly initialised weights.
train_data, train_labels, valid_data, valid_labels=prepare_for_backprop(batch_size, Xtr, Ltr, X_test, L_test)
mlp = MultiLayerPerceptron(layer_config=[784, 100, 100, 10], batch_size=batch_size)
mlp.evaluate(train_data, train_labels, valid_data, valid_labels,num_epochs=num_epochs,eta=eta ,eval_train=True)

# Accuracy on the whole, unbatched test split.
accuarcy = get_accuracy(mlp, X_test, L_test)
print("Accuracy on test data: {0:.2f}%".format(accuarcy*100))

70 epochs and learning rate 0.005 are used for training
Creating data...
Batch size 100, the number of examples 60000.
Batch size 100, the number of examples 10000.
Done!
Initializing input layer with size 784.
Initializing hidden layer with size 100.
Initializing hidden layer with size 100.
Initializing output layer with size 10.
Done!
Training for 70 epochs...
[   0]  Training error: 0.70335 Test error: 0.70070
[   1]  Training error: 0.64715 Test error: 0.64320
[   2]  Training error: 0.59943 Test error: 0.59790
[   3]  Training error: 0.45213 Test error: 0.46330
[   4]  Training error: 0.21243 Test error: 0.20410
[   5]  Training error: 0.11463 Test error: 0.11340
[   6]  Training error: 0.08957 Test error: 0.08910
[   7]  Training error: 0.07602 Test error: 0.07660
[   8]  Training error: 0.06415 Test error: 0.06490
[   9]  Training error: 0.05600 Test error: 0.05680
[  10]  Training error: 0.04903 Test error: 0.05170
[  11]  Training error: 0.04390 Test error: 0.04650
[  12]  Tra

In [92]:
# Same network as the baseline run, trained with a larger learning rate.
# batch_size is not set here; it is the value assigned in an earlier cell.
num_epochs=70
eta=0.5
print(f'{num_epochs} epochs and learning rate {eta} are used for training')

# The batches are rebuilt and a new network is created, so this run starts
# from freshly initialised weights.
train_data, train_labels, valid_data, valid_labels=prepare_for_backprop(batch_size, Xtr, Ltr, X_test, L_test)
mlp = MultiLayerPerceptron(layer_config=[784, 100, 100, 10], batch_size=batch_size)
mlp.evaluate(train_data, train_labels, valid_data, valid_labels,num_epochs=num_epochs,eta=eta ,eval_train=True)

# Accuracy on the whole, unbatched test split.
accuarcy = get_accuracy(mlp, X_test, L_test)
print("Accuracy on test data: {0:.2f}%".format(accuarcy*100))

70 epochs and learning rate 0.5 are used for training
Creating data...
Batch size 100, the number of examples 60000.
Batch size 100, the number of examples 10000.
Done!
Initializing input layer with size 784.
Initializing hidden layer with size 100.
Initializing hidden layer with size 100.
Initializing output layer with size 10.
Done!
Training for 70 epochs...
[   0]  Training error: 0.90248 Test error: 0.90260
[   1]  Training error: 0.88763 Test error: 0.88650
[   2]  Training error: 0.90137 Test error: 0.90420
[   3]  Training error: 0.90085 Test error: 0.89910
[   4]  Training error: 0.90128 Test error: 0.90200
[   5]  Training error: 0.90263 Test error: 0.90180
[   6]  Training error: 0.90085 Test error: 0.89910
[   7]  Training error: 0.90137 Test error: 0.90420
[   8]  Training error: 0.88763 Test error: 0.88650
[   9]  Training error: 0.88763 Test error: 0.88650
[  10]  Training error: 0.90137 Test error: 0.90420
[  11]  Training error: 0.90248 Test error: 0.90260
[  12]  Train

## ReLU 

In [93]:
# Same layer sizes as the earlier runs, but the hidden layers use f_relu
# instead of the default f_sigmoid; the output layer is still softmax.
# batch_size is not set here; it is the value assigned in an earlier cell.
eta = 0.005
train_data, train_labels, valid_data, valid_labels=prepare_for_backprop(batch_size, Xtr, Ltr, X_test, L_test)
mlp = MultiLayerPerceptron(layer_config=[784, 100, 100, 10], batch_size=batch_size, activation=f_relu)
# num_epochs is not passed, so evaluate() uses its default of 70 epochs.
mlp.evaluate(train_data, train_labels, valid_data, valid_labels,eval_train=True, eta=eta)

# Accuracy on the whole, unbatched test split.
accuarcy = get_accuracy(mlp, X_test, L_test)
print("Accuracy on test data: {0:.2f}%".format(accuarcy*100))

Creating data...
Batch size 100, the number of examples 60000.
Batch size 100, the number of examples 10000.
Done!
Initializing input layer with size 784.
Initializing hidden layer with size 100.
Initializing hidden layer with size 100.
Initializing output layer with size 10.
Done!
Training for 70 epochs...
[   0]  Training error: 0.89558 Test error: 0.89720
[   1]  Training error: 0.89558 Test error: 0.89720
[   2]  Training error: 0.89558 Test error: 0.89720
[   3]  Training error: 0.89558 Test error: 0.89720
[   4]  Training error: 0.89558 Test error: 0.89720
[   5]  Training error: 0.89558 Test error: 0.89720
[   6]  Training error: 0.89558 Test error: 0.89720
[   7]  Training error: 0.89558 Test error: 0.89720
[   8]  Training error: 0.89558 Test error: 0.89720
[   9]  Training error: 0.89558 Test error: 0.89720
[  10]  Training error: 0.89558 Test error: 0.89720
[  11]  Training error: 0.89558 Test error: 0.89720
[  12]  Training error: 0.89558 Test error: 0.89720
[  13]  Trainin

In [94]:
# Don't know why this is needed
# batch_size=100;


# train_data, train_labels, valid_data, valid_labels=prepare_for_backprop(batch_size, Train_images, Train_labels, Valid_images, Valid_labels)

# mlp = MultiLayerPerceptron(layer_config=[784, 100, 100, 10], batch_size=batch_size)

# mlp.evaluate(train_data, train_labels, valid_data, valid_labels,
#              eval_train=True)

# print("Done:)\n")
    


## Questions 1

### A: Core Concept of the Backpropagation Algorithm
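Backpropagation is the algorithm used to compute the gradients needed to train a neural network. It measures the difference between the predicted output and the actual target, then propagates this error backwards through the layers using the chain rule to obtain the gradient of the loss with respect to every weight. An optimizer such as gradient descent then uses these gradients to adjust the network's weights.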

### B: Purpose and Function of the Softmax Layer
The Softmax function transforms a set of raw prediction scores into probabilities. This normalization ensures that the sum of all outputs equals 1, allowing the outputs to represent the likelihood of each class.

### C: Frequently Used Non-Linear Output Functions and Their Effects

1. **ReLU (Rectified Linear Unit):**
- **Key Attribute:** Highly efficient in terms of computation.
- **Application:** Widely used in hidden layers to address issues like vanishing gradients.

2. **Sigmoid Function:**
- **Key Attribute:** Produces values between 0 and 1, making it ideal for representing probabilities.
- **Application:** Commonly used in binary classification problems.

3. **Softmax Function:**
- **Key Attribute:** Similar to Sigmoid, it outputs probabilities but does so across multiple classes, ensuring the total is always 1.
- **Application:** Typically used in the output layer of multi-class classification models to determine the predicted class.

4. **Hyperbolic Tangent (Tanh):**
- **Key Attribute:** Outputs values ranging from -1 to 1, making it effective for differentiating between negative, neutral, and positive states.
- **Application:** Often used as an activation function in hidden layers because its outputs are zero-centered, so the values passed to the next layer are not systematically shifted in one direction.
- **Benefit:** Enables faster and more effective gradient descent by ensuring a balanced distribution of gradients during training.

