In [90]:
import os

import matplotlib.pyplot as plt
import numpy as np
from mnist import MNIST

In [91]:
# Directory holding the raw MNIST files. Set the MNIST_DIR environment
# variable to point somewhere else; the original local path is kept as the
# fallback so existing setups keep working unchanged.
MNIST_DIR = os.environ.get(
    'MNIST_DIR',
    '/Users/czh/Documents/coding/ml/digit classification/archive',
)

mndata = MNIST(MNIST_DIR)
# load_training() yields the training images and their labels, in that order.
images, labels = mndata.load_training()

In [92]:
#helper functions

def ReLU(x):
    """Rectified linear unit: element-wise maximum of 0 and ``x``."""
    floor = 0
    return np.maximum(floor, x)

def ReLU_derivative(x):
    """Gradient of ReLU: 1.0 where ``x`` is strictly positive, 0.0 elsewhere.

    The boolean comparison is cast with ``.astype(float)``, so ``x`` must be
    something whose ``x > 0`` result supports ``.astype`` (e.g. a NumPy array).
    """
    is_positive = x > 0
    return is_positive.astype(float)

def softmax(x, axis=None):
    """Numerically stable softmax of ``x`` along ``axis``.

    The maximum along ``axis`` is subtracted before exponentiating so that
    ``np.exp`` never sees a large positive argument. With the default
    ``axis=None`` the normalisation runs over every element of ``x``.
    """
    shifted = x - np.max(x, axis=axis, keepdims=True)
    exps = np.exp(shifted)
    normaliser = exps.sum(axis=axis, keepdims=True)
    return exps / normaliser

def forward_pass(x):
    """Run ``x`` through the three-layer network stored in the global ``weights``.

    Layers 1 and 2 are affine transforms followed by ReLU; layer 3 is an
    affine transform followed by softmax.

    Parameters
    ----------
    x : array
        Input whose last dimension matches the row count of ``weights['W1']``.
        A single 1-D sample is what the training loop passes; a 2-D batch with
        one sample per row also works because every operation here acts on the
        last axis.

    Returns
    -------
    tuple
        ``(z1, a1, z2, a2, z3, output)``: the pre-activations and activations
        of each layer, with ``output`` being the softmax probabilities.
    """
    z1 = x @ weights['W1'] + weights['b1']
    a1 = ReLU(z1)

    z2 = a1 @ weights['W2'] + weights['b2']
    a2 = ReLU(z2)

    z3 = a2 @ weights['W3'] + weights['b3']
    # Normalise over the class (last) axis. For a 1-D sample this equals the
    # previous axis=None behaviour; for a batch it keeps each row a separate
    # distribution instead of normalising across the whole batch.
    output = softmax(z3, axis=-1)
    return z1, a1, z2, a2, z3, output

def cross_entropy(predictions, true_label):
    """Cross-entropy loss for one sample.

    Returns the negative log of the entry of ``predictions`` indexed by
    ``true_label``.
    """
    tiny = 1e-10  # keeps the log finite when the indexed probability is 0
    p_true = predictions[true_label]
    return -np.log(p_true + tiny)

def he_init(fan_in, fan_out):
    """Draw a ``(fan_in, fan_out)`` weight matrix with He-style scaling.

    Samples come from NumPy's global ``np.random.randn`` and are multiplied
    by ``sqrt(2 / fan_in)``.
    """
    scale = np.sqrt(2.0 / fan_in)
    return np.random.randn(fan_in, fan_out) * scale


In [93]:
def backward_pass(x, true_label, output, z1, a1, z2, a2, z3):
    """Backpropagate the loss of one sample through the three layers.

    Reads ``weights['W2']`` and ``weights['W3']`` from the global ``weights``
    dict; nothing is modified here.

    Parameters
    ----------
    x : array
        The input sample that was fed to ``forward_pass``.
    true_label : int
        Index of the correct class, used to build the one-hot target.
    output : array
        Class probabilities returned by ``forward_pass`` for ``x``.
    z1, a1, z2, a2 : array
        Pre-activations and activations returned by ``forward_pass``.
    z3 : array
        Accepted so the signature mirrors ``forward_pass``'s return values;
        not used in the computation.

    Returns
    -------
    tuple
        ``(dL_dW1, dL_db1, dL_dW2, dL_db2, dL_dW3, dL_db3)``.
    """
    # One-hot target sized from the network output rather than a fixed 10,
    # so the function follows whatever width the last layer has.
    one_hot = np.zeros(len(output))
    one_hot[true_label] = 1

    # Layer 3: with a softmax output and cross-entropy loss the gradient
    # w.r.t. the logits reduces to (probabilities - target).
    dL_dz3 = output - one_hot
    dL_dW3 = np.outer(a2, dL_dz3)
    dL_db3 = dL_dz3

    # Layer 2: push the gradient back through W3, then gate it by ReLU.
    dL_da2 = dL_dz3 @ weights['W3'].T
    dL_dz2 = dL_da2 * ReLU_derivative(z2)
    dL_dW2 = np.outer(a1, dL_dz2)
    dL_db2 = dL_dz2

    # Layer 1: same pattern through W2.
    dL_da1 = dL_dz2 @ weights['W2'].T
    dL_dz1 = dL_da1 * ReLU_derivative(z1)
    dL_dW1 = np.outer(x, dL_dz1)
    dL_db1 = dL_dz1

    return dL_dW1, dL_db1, dL_dW2, dL_db2, dL_dW3, dL_db3

def update_weights(learning_rate, gradients):
    """Apply one gradient-descent step to every entry of the global ``weights``.

    ``gradients`` must unpack into exactly six values ordered as
    ``(dW1, db1, dW2, db2, dW3, db3)``. Each parameter is rebound to
    ``param - learning_rate * grad``.
    """
    grad_W1, grad_b1, grad_W2, grad_b2, grad_W3, grad_b3 = gradients

    step_for = {
        'W1': grad_W1,
        'b1': grad_b1,
        'W2': grad_W2,
        'b2': grad_b2,
        'W3': grad_W3,
        'b3': grad_b3,
    }
    for name, grad in step_for.items():
        weights[name] = weights[name] - learning_rate * grad


In [94]:
#init weights

# Seed NumPy's global RNG (the one he_init draws from) so that a fresh
# Restart & Run All produces the same initial weights and training curve.
SEED = 42
np.random.seed(SEED)

weights = {
    'W1': he_init(784, 16),
    'b1': np.zeros(16),
    'W2': he_init(16, 16),
    'b2': np.zeros(16),
    'W3': he_init(16, 10),
    'b3': np.zeros(10)
}

#training

learning_rate = 0.01
epochs = 10

# Convert and scale the pixel data once, instead of rebuilding a float32
# array from each image on every sample of every epoch.
train_images = np.asarray(images, dtype=np.float32) / 255.0
num_samples = len(train_images)

for i in range(epochs):
    correct = 0
    total_loss = 0

    # Plain SGD: one sample at a time, in dataset order.
    for j in range(num_samples):
        img = train_images[j]
        label = labels[j]

        # forward pass
        z1, a1, z2, a2, z3, output = forward_pass(img)

        # calculate loss
        loss = cross_entropy(output, label)
        total_loss += loss

        # backward pass
        gradients = backward_pass(img, label, output, z1, a1, z2, a2, z3)

        # update weights
        update_weights(learning_rate, gradients)

        # track accuracy; this uses the prediction made before the update,
        # so it is a running training accuracy rather than a held-out score
        if np.argmax(output) == label:
            correct += 1

    # print progress after each epoch
    accuracy = (correct / num_samples) * 100
    avg_loss = total_loss / num_samples
    print(f"Epoch {i+1}: Accuracy = {accuracy:.2f}%, Loss = {avg_loss:.4f}")


Epoch 1: Accuracy = 89.92%, Loss = 0.3417
Epoch 2: Accuracy = 93.28%, Loss = 0.2322
Epoch 3: Accuracy = 93.95%, Loss = 0.2059
Epoch 4: Accuracy = 94.43%, Loss = 0.1933
Epoch 5: Accuracy = 94.71%, Loss = 0.1822
Epoch 6: Accuracy = 94.95%, Loss = 0.1723
Epoch 7: Accuracy = 95.18%, Loss = 0.1665
Epoch 8: Accuracy = 95.33%, Loss = 0.1602
Epoch 9: Accuracy = 95.42%, Loss = 0.1566
Epoch 10: Accuracy = 95.59%, Loss = 0.1534
