In [1]:
import numpy as np
from matplotlib import pyplot as plt
from keras.datasets import mnist
from random import sample

# Fetch the MNIST train/test splits through the Keras dataset helper.
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Divisor applied to every pixel value (presumably raw pixels are 0-255 — confirm).
SCALE_FACTOR = 255

# Sizes of the two trailing image axes (axis 1 and axis 2 of the image array).
WIDTH, HEIGHT = x_train.shape[1:3]

# Flatten every image to a vector of WIDTH*HEIGHT values, transpose so that each
# COLUMN holds one image, then rescale by SCALE_FACTOR.
x_train = x_train.reshape(len(x_train), WIDTH * HEIGHT).T / SCALE_FACTOR
x_test = x_test.reshape(len(x_test), WIDTH * HEIGHT).T / SCALE_FACTOR

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz


# MNIST from scratch

* fast vectorised implementation using matrix operations
* simple 2 layer MLP
* 1 hidden layer, size 10, ReLU activation
* 1 output layer, size 10, softmax activation
* based on tutorial at https://www.youtube.com/watch?v=w8yWXqWQYmU
* I added stochastic (mini-batch) gradient descent by randomly sampling a subset of the training examples (columns of the input matrix) at each step, which massively speeds up convergence

In [34]:
# global run settings
ETA = 0.01       # learning rate used by back_prop
EPOCHS=1000      # default number of mini-batch update steps
SAMPLES=1000     # number of examples drawn per mini-batch
SEED = 0         # fixed seed so weight init and batch sampling are reproducible
np.random.seed(SEED)

#initialise input
# x_train is already laid out as (pixels, examples), so it is used directly
# instead of being rebuilt column by column in a Python loop.
A_0 = x_train
print(A_0.shape)

#initialise weights
n_inputs = A_0.shape[0]  # number of input features (784 for flattened MNIST)
# Scaled Gaussian initialisation; the scale factors are kept exactly as in the
# original notebook (1/784, 1/10, 1/20, 1/784).
W_1 = np.random.randn(10, n_inputs) * np.sqrt(1. / n_inputs)
b_1 = np.random.randn(10, 1) * np.sqrt(1. / 10)
W_2 = np.random.randn(10, 10) * np.sqrt(1. / 20)
b_2 = np.random.randn(10, 1) * np.sqrt(1. / 784)
np.mean(W_1) #should be ~~0

(784, 60000)


0.0004056875217560816

In [35]:
def reLU(x):
    """Rectified linear unit: element-wise maximum of ``x`` and 0."""
    rectified = np.maximum(x, 0)
    return rectified

def softmax(Z):
    """Compute a numerically stable softmax over axis 0 of ``Z``.

    Each column of ``Z`` is treated as one set of scores and is normalised
    independently, so every column of the result sums to 1.

    The maximum is subtracted per column rather than over the whole array:
    with a single global maximum, a column whose scores are all far below
    that maximum underflows to zeros and produces 0/0 = NaN.

    Args:
        Z: array of scores; normalisation runs along axis 0.

    Returns:
        Array with the same shape as ``Z`` holding the softmax values.
    """
    shifted = Z - np.max(Z, axis=0, keepdims=True)
    exp = np.exp(shifted)
    return exp / exp.sum(axis=0, keepdims=True)

def reLU_diff(x):
    """Derivative of ReLU: truthy where ``x`` is strictly positive, falsy elsewhere."""
    is_positive = x > 0
    return is_positive

# forward propagate
def forward_prop(A_0, W_1, b_1, W_2, b_2):
    """Run one forward pass through the two-layer network.

    Args:
        A_0: input activations (one example per column — as prepared by this notebook).
        W_1, b_1: weights and bias of the hidden layer.
        W_2, b_2: weights and bias of the output layer.

    Returns:
        Tuple ``(z_1, A_1, z_2, A_2)``: hidden pre-activation, hidden ReLU
        activation, output pre-activation and output softmax activation.
    """
    # hidden layer: affine transform followed by ReLU
    hidden_pre = np.dot(W_1, A_0) + b_1
    hidden_act = reLU(hidden_pre)
    # output layer: affine transform followed by softmax
    output_pre = np.dot(W_2, hidden_act) + b_2
    output_act = softmax(output_pre)
    return hidden_pre, hidden_act, output_pre, output_act

def one_hot_Y(classnum, num_classes=10):
    """Return the one-hot encoding of a class label.

    Args:
        classnum: integer class index to mark with a 1.
        num_classes: length of the encoded vector. Defaults to 10 (the MNIST
            digit classes), which matches the previous hard-coded behaviour.

    Returns:
        1-D float array of length ``num_classes`` that is 0 everywhere except
        for a 1 at position ``classnum``.

    Raises:
        IndexError: if ``classnum`` is outside the valid index range.
    """
    res = np.zeros(num_classes)
    res[classnum] = 1
    # No transpose needed: transposing a 1-D array is a no-op.
    return res

def back_prop(A_0,A_1,A_2,z_1,Y,W_1,b_1,W_2,b_2,eta=None):
    """Compute gradients for the two-layer network and take one descent step.

    Unlike the earlier version, the incoming weight/bias arrays are NOT
    modified in place; fresh arrays are returned. Bias gradients are reduced
    with ``keepdims=True`` so layer sizes other than 10 are supported.

    Args:
        A_0: input activations (one example per column).
        A_1: hidden-layer activations from the forward pass.
        A_2: output-layer (softmax) activations from the forward pass.
        z_1: hidden-layer pre-activations from the forward pass.
        Y: one-hot targets, one column per example.
        W_1, b_1: hidden-layer weights and bias.
        W_2, b_2: output-layer weights and bias.
        eta: learning rate. When ``None`` (default) the module-level ``ETA``
            is used, which matches the previous behaviour.

    Returns:
        Tuple ``(W_1, b_1, W_2, b_2)`` with the updated parameters.

    Raises:
        ZeroDivisionError: if ``Y`` contains no examples.
    """
    if eta is None:
        eta = ETA
    samp_num = Y.shape[1] #number of training samples
    inv_n = 1 / samp_num
    # ~~~~~~~~~ dC/dW_2 ~~~~~~~~~~~~
    # NOTE(review): the factor 2 is kept from the original code. For softmax
    # with cross-entropy the output error would be (A_2 - Y); here the extra
    # factor only rescales the effective learning rate.
    dz_2 = 2*(A_2 - Y)
    dW_2 = inv_n * dz_2.dot(A_1.T)
    # ~~~~~~~~~ dC/db_2 ~~~~~~~~~~~~
    db_2 = inv_n * np.sum(dz_2, axis=1, keepdims=True)
    # ~~~~~~~~~ dC/dW_1 ~~~~~~~~~~~~
    # (z_1 > 0) is the derivative of the ReLU applied to z_1
    dz_1 = W_2.T.dot(dz_2) * (z_1 > 0)
    dW_1 = inv_n * dz_1.dot(A_0.T)
    # ~~~~~~~~~ dC/db_1 ~~~~~~~~~~~~
    db_1 = inv_n * np.sum(dz_1, axis=1, keepdims=True)
    # gradient-descent step, out of place so the caller's arrays stay intact
    W_1 = W_1 - eta * dW_1
    W_2 = W_2 - eta * dW_2
    b_1 = b_1 - eta * db_1
    b_2 = b_2 - eta * db_2
    return W_1, b_1, W_2, b_2

def return_acc(A_2, Y):
    """Return the fraction of columns whose predicted class matches the target.

    The predicted/target class of a column is the index of its largest entry
    (first index on ties). The comparison is vectorised instead of looping
    over samples in Python.

    Args:
        A_2: 2-D array of network outputs, one column per example.
        Y: 2-D array of one-hot targets, one column per example.

    Returns:
        Accuracy as a Python float in [0, 1].

    Raises:
        ZeroDivisionError: if there are no examples (zero columns).
    """
    predicted = np.argmax(A_2, axis=0)
    expected = np.argmax(Y, axis=0)
    correct = int(np.count_nonzero(predicted == expected))
    return correct / predicted.size

def stochastic_gradient_descent(A_0, Y, W_1, b_1, W_2, b_2, EPOCHS=EPOCHS, batch_size=None):
    """Train the network with mini-batch stochastic gradient descent.

    Each iteration draws a random batch of columns (without replacement),
    runs a forward pass, applies one ``back_prop`` update and prints the
    accuracy on that batch (measured with the weights from before the update).

    Args:
        A_0: input matrix with one example per column.
        Y: iterable of integer class labels, one per column of ``A_0``.
        W_1, b_1, W_2, b_2: initial network parameters.
        EPOCHS: number of mini-batch update steps to run (each step sees one
            batch, not the full dataset).
        batch_size: examples per batch. ``None`` (default) uses the
            module-level ``SAMPLES``. The value is capped at the number of
            available examples, because sampling without replacement would
            otherwise raise ``ValueError``.

    Returns:
        Tuple ``(W_1, b_1, W_2, b_2)`` with the trained parameters.
    """
    # one-hot encode the labels, one column per example
    Y = np.asarray([one_hot_Y(y) for y in Y]).T
    num_examples = A_0.shape[1]
    if batch_size is None:
        batch_size = SAMPLES
    batch_size = min(batch_size, num_examples)
    for i in range(EPOCHS):
        # randomly sample a subset of examples
        idx = np.random.choice(num_examples, size=batch_size, replace=False)
        A_0_sample = A_0[:,idx]
        Y_sample = Y[:,idx]
        z_1, A_1, z_2, A_2 = forward_prop(A_0_sample, W_1, b_1, W_2, b_2)
        W_1, b_1, W_2, b_2 = back_prop(A_0_sample, A_1, A_2, z_1,
                                        Y_sample, W_1, b_1, W_2, b_2)
        acc = return_acc(A_2,Y_sample)
        print(f"Epoch {i} training accuracy: {acc}")
    return W_1, b_1, W_2, b_2

In [49]:
# train using SGD
# NOTE: only 10 update steps are run per execution of this cell. Because the
# returned weights are assigned back to the same global names, re-running the
# cell continues training from the current weights rather than starting fresh.
W_1, b_1, W_2, b_2 = stochastic_gradient_descent(A_0, y_train, W_1, b_1, W_2, b_2, 10)

# evaluate on the held-out test set
# x_test is already laid out as (pixels, examples); use it directly instead of
# rebuilding it column by column. Named `test_inputs` so the builtin `input`
# is not shadowed.
test_inputs = x_test
labels = np.asarray([one_hot_Y(y) for y in y_test]).T
# Only the final softmax activations are needed; indexing avoids assigning to
# `_`, which IPython uses for the last cell output.
preds = forward_prop(test_inputs, W_1, b_1, W_2, b_2)[-1]
acc = return_acc(preds,labels)
print("~"*30)
print("Testing Accuracy: ",acc)

Epoch 0 training accuracy: 0.495
Epoch 1 training accuracy: 0.476
Epoch 2 training accuracy: 0.488
Epoch 3 training accuracy: 0.45
Epoch 4 training accuracy: 0.462
Epoch 5 training accuracy: 0.497
Epoch 6 training accuracy: 0.458
Epoch 7 training accuracy: 0.495
Epoch 8 training accuracy: 0.506
Epoch 9 training accuracy: 0.466
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Testing Accuracy:  0.4901


Nice.