# Load and prepare the data

In [1]:
import sys
import numpy as np
from keras.datasets import mnist

#load the dataset
(x_train, y_train), (x_test, y_test) = mnist.load_data()

images, labels = (x_train[0:1000].reshape(1000, 28*28) / 255, y_train[0:1000])

one_hot_labels = np.zeros((len(labels), 10))

for i, l in enumerate(labels):
    one_hot_labels[i][l] = 1
labels = one_hot_labels

test_images = x_test.reshape(len(x_test), 28*28) / 255
test_labels = np.zeros((len(y_test), 10))
for i, l in enumerate(y_test):
    test_labels[i][l] = 1

In [3]:
np.random.seed(1)
relu = lambda x:(x>=0) * x
relu2deriv = lambda x: x>=0

# v1 - overfitted

In [10]:
alpha, iterations, hidden_size, pixels_per_image, num_labels = (0.005, 350, 40, 784, 10)
weights_0_1 = 0.2*np.random.random((pixels_per_image, hidden_size)) - 0.1
weights_1_2 = 0.2*np.random.random((hidden_size, num_labels)) - 0.1

#Training
for j in range(iterations):
    error, correct_cnt = (0.0, 0)
    
    for i in range(len(images)):
        layer_0 = images[i:i+1]
        layer_1 = relu(np.dot(layer_0, weights_0_1))
        layer_2 = np.dot(layer_1, weights_1_2)
        
        error += np.sum((labels[i:i+1] - layer_2) ** 2)
        correct_cnt += int(np.argmax(layer_2) == np.argmax(labels[i:i+1]))
        
        layer_2_delta = (labels[i:i+1] - layer_2)
        layer_1_delta = layer_2_delta.dot(weights_1_2.T) * relu2deriv(layer_1)
        weights_1_2 += alpha * layer_1.T.dot(layer_2_delta)
        weights_0_1 += alpha * layer_0.T.dot(layer_1_delta)
        
    sys.stdout.write("\r" + \
                    " I: " + str(j) + \
                    " Error: " + str(error/float(len(images))) [0:5] + \
                    " Correct: " + str(correct_cnt/float(len(images)))
                    )
    
    #Testing
    if(j % 10 == 0 or j == iterations-1):
        error, correct_cnt = (0.0, 0)

        for i in range(len(test_images)):

            layer_0 = test_images[i:i+1]
            layer_1 = relu(np.dot(layer_0, weights_0_1))
            layer_2 = np.dot(layer_1, weights_1_2)

            error += np.sum((test_labels[i:i+1] - layer_2) ** 2)
            correct_cnt += int(np.argmax(layer_2) == np.argmax(test_labels[i:i+1]))
        sys.stdout.write(" Test-Err:" + str(error/float(len(test_images)))[0:5] + \
                        " Test-Acc:" + str(correct_cnt/float(len(test_images))))
        print()

 I: 0 Error: 0.722 Correct: 0.537 Test-Err:0.601 Test-Acc:0.6488
 I: 10 Error: 0.312 Correct: 0.901 Test-Err:0.420 Test-Acc:0.8114
 I: 20 Error: 0.260 Correct: 0.937 Test-Err:0.414 Test-Acc:0.8111
 I: 30 Error: 0.232 Correct: 0.946 Test-Err:0.417 Test-Acc:0.8066
 I: 40 Error: 0.215 Correct: 0.956 Test-Err:0.426 Test-Acc:0.8019
 I: 50 Error: 0.204 Correct: 0.966 Test-Err:0.437 Test-Acc:0.7982
 I: 60 Error: 0.194 Correct: 0.967 Test-Err:0.448 Test-Acc:0.7921
 I: 70 Error: 0.186 Correct: 0.975 Test-Err:0.458 Test-Acc:0.7864
 I: 80 Error: 0.179 Correct: 0.979 Test-Err:0.466 Test-Acc:0.7817
 I: 90 Error: 0.172 Correct: 0.981 Test-Err:0.474 Test-Acc:0.7758
 I: 100 Error: 0.166 Correct: 0.984 Test-Err:0.482 Test-Acc:0.7706
 I: 110 Error: 0.161 Correct: 0.984 Test-Err:0.489 Test-Acc:0.7686
 I: 120 Error: 0.157 Correct: 0.986 Test-Err:0.496 Test-Acc:0.766
 I: 130 Error: 0.153 Correct: 0.999 Test-Err:0.502 Test-Acc:0.7622
 I: 140 Error: 0.149 Correct: 0.991 Test-Err:0.508 Test-Acc:0.758
 I: 150 

# v2 - Applying dropout

In [5]:
alpha, iterations, hidden_size = (0.005, 300, 100)
pixels_per_image, num_labels = (784, 10)

weights_0_1 = 0.2 * np.random.random((pixels_per_image, hidden_size)) - 0.1
weights_1_2 = 0.2 * np.random.random((hidden_size, num_labels)) - 0.1

for j in range(iterations):
    error, correct_cnt = (0.0, 0)
    for i in range(len(images)):
        layer_0 = images[i:i+1]
        layer_1 = relu(np.dot(layer_0, weights_0_1))
        
        dropout_mask = np.random.randint(2, size=layer_1.shape) # Initialize dropout
        
        layer_1 *= dropout_mask * 2 # Apply Dropout on feedfoward 
        
        layer_2 = np.dot(layer_1, weights_1_2)
        
        error += np.sum((labels[i:i+1] - layer_2) ** 2)
        
        correct_cnt +=  int(np.argmax(layer_2) == np.argmax(labels[i:i+1]))
        
        layer_2_delta = (labels[i:i+1] - layer_2)
        
        layer_1_delta = layer_2_delta.dot(weights_1_2.T) * relu2deriv(layer_1)
        
        layer_1_delta *= dropout_mask # Apply dropout on backpropagation
        
        weights_1_2 += alpha * layer_1.T.dot(layer_2_delta)
        weights_0_1 += alpha * layer_0.T.dot(layer_1_delta)
        
    if (j % 10 == 0):
        test_error = 0.0
        test_correct_cnt = 0

        for i in range(len(test_images)):
            layer_0 = test_images[i:i+1]

            layer_1 = relu(np.dot(layer_0, weights_0_1))
            layer_2 = np.dot(layer_1, weights_1_2)

            test_error += np.sum((test_labels[i:i+1] - layer_2) ** 2)

            test_correct_cnt += int(np.argmax(layer_2) == np.argmax(test_labels[i:i+1]))

        sys.stdout.write("\n" + \
                        "I:" + str(j) + \
                         " Test-Err:" + str(test_error/ float(len(test_images))) + \
                         " Test-Acc:" + str(test_correct_cnt/ float(len(test_images))) + \
                         " Train-Err:" + str(error/ float(len(images)))[0:5] + \
                         " Train-Acc:" + str(correct_cnt/ float(len(images)))
                        )


I:0 Test-Err:0.6562969375047616 Test-Acc:0.6291 Train-Err:0.897 Train-Acc:0.401
I:10 Test-Err:0.44211270075128084 Test-Acc:0.7917 Train-Err:0.455 Train-Acc:0.785
I:20 Test-Err:0.41269537680003604 Test-Acc:0.8081 Train-Err:0.421 Train-Acc:0.819
I:30 Test-Err:0.4145106330998193 Test-Acc:0.8167 Train-Err:0.411 Train-Acc:0.839
I:40 Test-Err:0.41634566218836216 Test-Acc:0.8175 Train-Err:0.405 Train-Acc:0.814
I:50 Test-Err:0.40660989183431623 Test-Acc:0.8177 Train-Err:0.385 Train-Acc:0.849
I:60 Test-Err:0.4096084249782355 Test-Acc:0.8247 Train-Err:0.383 Train-Acc:0.853
I:70 Test-Err:0.4091331823770737 Test-Acc:0.8227 Train-Err:0.385 Train-Acc:0.855
I:80 Test-Err:0.4169613541871479 Test-Acc:0.8201 Train-Err:0.374 Train-Acc:0.869
I:90 Test-Err:0.41472911122417055 Test-Acc:0.8206 Train-Err:0.359 Train-Acc:0.882
I:100 Test-Err:0.40989789079298233 Test-Acc:0.8193 Train-Err:0.365 Train-Acc:0.865
I:110 Test-Err:0.40636002244136304 Test-Acc:0.8176 Train-Err:0.356 Train-Acc:0.874
I:120 Test-Err:0.40

# v3 - Applying batch gradient descent

In [14]:
batch_size = 100 # Define the size of the batch

alpha, iterations = (0.001, 300)
pixels_per_image, num_labels, hidden_size = (784, 10, 100)

weights_0_1 = 0.2*np.random.random((pixels_per_image, hidden_size)) - 0.1
weights_1_2 = 0.2*np.random.random((hidden_size, num_labels)) - 0.1

for j in range(iterations):
    error, correct_cnt = (0.0, 0)
    for i in range(int(len(images) / batch_size)):
        batch_start = (i * batch_size)
        batch_end = ((i + 1) * batch_size)
        
        layer_0 = images[batch_start:batch_end]
        
        layer_1 = relu(np.dot(layer_0, weights_0_1))
        
        dropout_mask = np.random.randint(2, size = layer_1.shape)
        
        layer_1 *= dropout_mask * 2
        
        layer_2 = np.dot(layer_1, weights_1_2)
        
        error += np.sum((labels[batch_start:batch_end] - layer_2) ** 2)
        
        for k in range(batch_size):
            correct_cnt += int(np.argmax(layer_2[k:k+1]) == \
                              np.argmax(labels[batch_start + k:batch_start + k + 1]))
            
            layer_2_delta = (labels[batch_start:batch_end] - layer_2) / batch_size
            
            layer_1_delta = layer_2_delta.dot(weights_1_2.T) * relu2deriv(layer_1)
            
            layer_1_delta *= dropout_mask
            
            weights_1_2 += alpha * layer_1.T.dot(layer_2_delta)
            weights_0_1 += alpha * layer_0.T.dot(layer_1_delta)
            
        if(j%10 == 0):
            test_error = 0.0
            test_correct_cnt = 0
            
            for i in range(len(test_images)):
                layer_0 = test_images[i:i+1]
                layer_1 = relu(np.dot(layer_0, weights_0_1))
                layer_2 = np.dot(layer_1, weights_1_2)

KeyboardInterrupt: 