In [1]:
import numpy as np
import torch
from torchvision import datasets, transforms
import matplotlib.pyplot as plt

#### Obtain and reshape training and test data

In [2]:
download_data = True

In [3]:
# Define transform to convert data to tensors
transform = transforms.Compose([transforms.ToTensor()])

# Download MNIST dataset
train_dataset = datasets.MNIST(root='./data', train=True, download=download_data, transform=transform)
test_dataset = datasets.MNIST(root='./data', train=False, download=download_data, transform=transform)

# Convert to NumPy arrays
x_train = train_dataset.data.numpy()
y_train = train_dataset.targets.numpy()
x_test = test_dataset.data.numpy()
y_test = test_dataset.targets.numpy()

# Reshape the data
x_train_flatten = x_train.reshape(x_train.shape[0],-1).T / 255.  # The "-1" makes reshape flatten the remaining dimensions
x_test_flatten = x_test.reshape(x_test.shape[0],-1).T /255. # regularize data by 1/255
y_train_flatten = y_train.reshape(y_train.T.shape[0],1).T
y_test_flatten = y_test.reshape(y_test.T.shape[0],1).T 

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 404: Not Found

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz to ./data/MNIST/raw/train-images-idx3-ubyte.gz


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9912422/9912422 [00:01<00:00, 6364361.17it/s]


Extracting ./data/MNIST/raw/train-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 404: Not Found

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz to ./data/MNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 28881/28881 [00:00<00:00, 236214.38it/s]


Extracting ./data/MNIST/raw/train-labels-idx1-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 404: Not Found

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw/t10k-images-idx3-ubyte.gz


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1648877/1648877 [00:00<00:00, 2155077.46it/s]


Extracting ./data/MNIST/raw/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 404: Not Found

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4542/4542 [00:00<00:00, 1060424.65it/s]

Extracting ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw






#### Helper functions

In [4]:
def ReLU(Z) :
    return np.maximum(0,Z)

def deriv_ReLU(Z) :
    return Z > 0
    
def softMAX(Z) :
    exp_z = np.exp(Z - np.max(Z, axis=0, keepdims=True))  # Numerical stability
    return exp_z / np.sum(exp_z, axis=0, keepdims=True)

def one_hot_(Y) :
    Y = Y.flatten()  # Convert shape (1, m) to (m,)
    one_hot_Y = np.zeros((10, Y.size))
    one_hot_Y[Y, np.arange(Y.size)] = 1  # Fix indexing order
    return one_hot_Y    

def get_predictions(A2):
    return np.argmax(A2, axis=0)

def get_accuracy(predictions,Y):
    #print(predictions,Y)
    return np.sum(predictions==Y)/Y.size

#### NN functions

In [5]:
def init_params() :
    W1 = np.random.uniform (-0.15,0.15, (n_hidden_nodes, 784))
    b1 = np.zeros((n_hidden_nodes, 1))
    W2 = np.random.uniform (-0.15,0.15, (10, n_hidden_nodes))
    b2 = np.zeros((10, 1))
    # another way of initializing : W and b taken from a normal distribution
    #W1 = np.random.randn(n_hidden_nodes,784) * 0.01
    #b1 = np.random.randn(n_hidden_nodes,1)
    #W2 = np.random.randn(10,n_hidden_nodes) * 0.01
    #b2 = np.random.randn(10,1)
    return W1, b1, W2, b2

def forward_prop(W1,b1,W2,b2,X) :
    Z1 = W1.dot(X) + b1
    A1 = ReLU(Z1)
    Z2 = W2.dot(A1) + b2
    A2 = softMAX(Z2)
    return Z1, A1, Z2, A2

def back_prop(Z1, A1, Z2, A2, W2, X, Y) :
    m = Y.size
    one_hot_Y = one_hot_(Y)
    dZ2 = A2 - one_hot_Y
    dW2 = 1 / m * dZ2.dot(A1.T)
    db2 = 1 / m * np.sum(dZ2, axis=1, keepdims=True)
    dZ1 = W2.T.dot(dZ2) * deriv_ReLU(Z1)
    dW1 = 1 / m * dZ1.dot(X.T)
    db1 = 1 / m * np.sum(dZ1, axis=1, keepdims=True)
    return dW1, db1, dW2, db2

def update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, alpha) :
    W1 -= alpha * dW1
    b1 -= alpha * np.reshape(db1,(n_hidden_nodes,1))
    W2 -= alpha * dW2
    b2 -= alpha * np.reshape(db2,(10,1))
    return W1, b1, W2, b2

def gradient_descent(X, Y, X_, Y_, iterations, alpha):
    W1, b1, W2, b2 = init_params()
    iterations+=1
    for i in range(iterations) :
        Z1, A1, Z2, A2 = forward_prop(W1, b1, W2, b2, X)
        dW1, db1, dW2, db2 = back_prop(Z1, A1, Z2, A2, W2, X, Y)
        W1, b1, W2, b2 = update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, alpha)
        # test results:
        _,  _,  _,  A2_= forward_prop(W1, b1, W2, b2, X_)
        if i % 50 == 0 :
            print("Itertation:",i,"Train Accuracy:", get_accuracy(get_predictions(A2),Y))
            print("Itertation:",i,"Test Accuracy:", get_accuracy(get_predictions(A2_),Y_))
    return W1, b1, W2, b2

#### Do actual training

In [6]:
n_hidden_nodes = 100
n_iterations = 200
learning_rate = 0.3

W1, b1, W2, b2 = gradient_descent(x_train_flatten, y_train_flatten, x_test_flatten, y_test_flatten, n_iterations, learning_rate)

Itertation: 0 Train Accuracy: 0.12498333333333334
Itertation: 0 Test Accuracy: 0.2487
Itertation: 50 Train Accuracy: 0.8055833333333333
Itertation: 50 Test Accuracy: 0.8461
Itertation: 100 Train Accuracy: 0.8979666666666667
Itertation: 100 Test Accuracy: 0.9036
Itertation: 150 Train Accuracy: 0.9086
Itertation: 150 Test Accuracy: 0.9151
Itertation: 200 Train Accuracy: 0.91675
Itertation: 200 Test Accuracy: 0.9207
