In [111]:
#imports
import numpy as np
from tensorflow.keras.datasets import mnist

In [112]:
#load data + prepare
(X_train_raw, Y_train), (X_test_raw, Y_test) = mnist.load_data()

# flatten 28x28 to 784 and transpose to (784, m)
X_train = X_train_raw.reshape(X_train_raw.shape[0], -1).T.astype("float32")   # (784, 60000)
X_test  = X_test_raw.reshape(X_test_raw.shape[0], -1).T.astype("float32")     # (784, 10000)
m,n = X_train.shape
#scaling
X_train /=255.0
X_test /=255.0

In [113]:
#helper funcitons

In [114]:
#1.one-hot encoding
# convert given column to one hot encodings
# say... convert labels like [3, 0, 1] ... into
'''[010
001
000
100
000
...
...] '''
# converts (m,) to (10,m) since we get a class(Y true) row, each example in the dataset is tied to a class, so we need to one hot encode it
def one_hot(Y):
    one_hot_matrix = np.zeros((Y.size, Y.max() + 1))
    one_hot_matrix[np.arange(Y.size), Y] = 1
    return one_hot_matrix.T

In [115]:
#2.ReLU
#max(0,x)
def ReLU(Z):
    return np.maximum(0,Z)

In [116]:
#3. derivative of ReLU
def der_ReLU(Z):
    return Z>0

In [117]:
#4. the last output layer gives us a matrix of dimension (10,m) which should be converted to probabilities of same dimensions (10,m), 
# 10 possible classes for each of the m examples
def SoftMax(Z):
    # Z2: (10, m)
    Z = Z - np.max(Z, axis=0, keepdims=True)  #numerical stability
    # If Z has large values, np.exp(Z) can overflow. Subtracting max per column doesnt change softmax, just rescales.
    exp_Z = np.exp(Z)
    return exp_Z / np.sum(exp_Z, axis=0, keepdims=True)

In [118]:
#parameter initializations

In [119]:
'''dimensions of all weights and biases:
    X - 784xm
    w1 - 10x784
    b1 - 10x1
    z1 - 10xm
    w2 - 10x10
    b2 - 10x1
    '''
def init_params():
    w1 = np.random.randn(10,784)*0.01
    b1 = np.zeros((10,1)) # np.random.randn(10,1)
    w2 = np.random.randn(10,10)*0.01
    b2 = np.zeros((10,1)) # np.random.randn(10,1)
    return w1,b1,w2,b2

In [120]:
#forwardprop
def forward_prop(w1,b1,w2,b2,X):
    z1 = np.dot(w1,X) + b1
    a1 = ReLU(z1)
    z2 = np.dot(w2,a1) + b2
    a2 = SoftMax(z2)
    return z1,a1,z2,a2

In [121]:
#backprop
def back_prop(z1,a1,z2,a2,w1,w2,X,Y):
    dz2 = a2 - one_hot(Y)
    dw2 = (1/m)*np.dot(dz2,a1.T)
    db2 = (1/m)*np.sum(dz2,axis=1,keepdims=True)
    da1 = np.dot(w2.T, dz2)
    dz1 = da1 * der_ReLU(z1)
    dw1 = (1/m) * np.dot(dz1, X.T)
    db1 = (1/m) * np.sum(dz1, axis=1, keepdims=True)
    return dw1,db1,dw2,db2

In [122]:
# updating parameters
def update_params(w1,b1,w2,b2,dw1,db1,dw2,db2,alpha):
    w1 = w1-alpha*dw1
    b1 = b1-alpha*db1
    w2 = w2-alpha*dw2
    b2 = b2-alpha*db2
    return w1,b1,w2,b2

In [123]:
# prediction + accuracy
def get_predictions(A2):
    # A2 matrix is of dim (10, m) and gives probabilites of 10 classes for samples
    return np.argmax(A2, axis=0)
def get_accuracy(predictions, Y):
    return np.sum(predictions == Y) / Y.size

In [124]:
# training loop
def gradient_descent(X,Y,alpha,iterations):
    w1,b1,w2,b2 = init_params()
    for i in range(iterations):
        z1,a1,z2,a2 = forward_prop(w1,b1,w2,b2,X)
        if i == 0:
                   dw1,db1,dw2,db2 = back_prop(z1,a1,z2,a2,w1,w2,X,Y)
        w1,b1,w2,b2 = update_params(w1,b1,w2,b2,dw1,db1,dw2,db2,alpha)
        if i % 10 == 0:
            predictions = get_predictions(a2)
            acc = get_accuracy(predictions, Y)
            print("iteration: ", i ," accuracy: ", acc)
    return w1, b1, w2, b2

In [125]:
# finally, train your neural net
X_small = X_train[:, :5000]
Y_small = Y_train[:5000]
w1,b1,w2,b2 = gradient_descent(X_small, Y_small, 0.1, 500)


iteration:  0  accuracy:  0.1498
iteration:  10  accuracy:  0.3922
iteration:  20  accuracy:  0.598
iteration:  30  accuracy:  0.468
iteration:  40  accuracy:  0.5708
iteration:  50  accuracy:  0.7512
iteration:  60  accuracy:  0.7078
iteration:  70  accuracy:  0.8104
iteration:  80  accuracy:  0.8774
iteration:  90  accuracy:  0.8354
iteration:  100  accuracy:  0.8958
iteration:  110  accuracy:  0.813
iteration:  120  accuracy:  0.912
iteration:  130  accuracy:  0.916
iteration:  140  accuracy:  0.9174
iteration:  150  accuracy:  0.9018
iteration:  160  accuracy:  0.9134
iteration:  170  accuracy:  0.919
iteration:  180  accuracy:  0.9224
iteration:  190  accuracy:  0.9278
iteration:  200  accuracy:  0.9298
iteration:  210  accuracy:  0.9318
iteration:  220  accuracy:  0.934
iteration:  230  accuracy:  0.9378
iteration:  240  accuracy:  0.9342
iteration:  250  accuracy:  0.919
iteration:  260  accuracy:  0.936
iteration:  270  accuracy:  0.9414
iteration:  280  accuracy:  0.9396
iter