In [68]:
import numpy as np
from keras.datasets import mnist

np.random.seed(42)

In [69]:
# load data, assign test and training set
(x_train, y_train), (x_test, y_test) = mnist.load_data()


In [91]:
# One-Hot Encode Vector

def one_hot_encode(y, num_classes):
    if y.ndim > 1 and y.shape[1] == num_classes: # Already one-hot encoded

        return y
    y = y.astype(int)
    one_hot = np.zeros((y.shape[0], num_classes))
    one_hot[np.arange(y.shape[0]), y] = 1
    return one_hot

x_train = x_train.reshape(-1, 28*28) / 255.0
x_test = x_test.reshape(-1, 28*28) / 255.0

y_train = one_hot_encode(y_train, 10)
y_test = one_hot_encode(y_test, 10)


print(f"Training samples: {x_train.shape[0]}, Test samples: {x_test.shape[0]}")


Training samples: 60000, Test samples: 10000


In [71]:
# Initialization and Forward Propagation set up

def initialize_parameters(input_size, hidden_size, output_size):
    W1 = np.random.randn(input_size, hidden_size) * 0.01
    b1 = np.zeros((1, hidden_size))

    W2 = np.random.randn(hidden_size, output_size) * 0.01
    b2 = np.zeros((1, output_size))

    return W1, b1, W2, b2

def ReLU(Z):
    return np.maximum(0, Z)

def softmax(Z):
    exp_Z = np.exp(Z - np.max(Z, axis=1, keepdims=True))
    return exp_Z / np.sum(exp_Z, axis=1, keepdims=True)

def forward_prop(X, W1, b1, W2, b2):
    Z1 = np.dot(X, W1) + b1
    A1 = ReLU(Z1)

    Z2 = np.dot(A1, W2) + b2
    A2 = softmax(Z2)

    return A1, A2



In [88]:
# Cross Entropy Loss

def compute_loss(A2, Y):
    m = Y.shape[0]
    loss = -np.sum(Y * np.log(A2)) / m
    return loss

# Backward Propagation

def backward_prop(X, Y, A1, A2, W1, W2):
    n = X.shape[0]

    # Output layer (Layer 2)
    dZ2 = A2 - Y
    dW2 = np.dot(A1.T, dZ2) / n
    db2 = np.sum(dZ2, axis=0, keepdims=True) / n

    # Hidden layer (Layer 1)
    dA1 = np.dot(dZ2, W2.T)
    dZ1 = dA1 * (A1 > 0)
    dW1 = np.dot(X.T, dZ1) / n
    db1 = np.sum(dZ1, axis=0, keepdims=True) / n

    gradients = {"dW1": dW1, "db1": db1, "dW2": dW2, "db2": db2}

    return gradients


# Gradient Descent

def update_parameters(W1, b1, W2, b2, gradients, learning_rate):
    W1 -= learning_rate * gradients["dW1"]
    b1 -= learning_rate * gradients["db1"]
    W2 -= learning_rate * gradients["dW2"]
    b2 -= learning_rate * gradients["db2"]

    return W1, b1, W2, b2

# Showdown Training Function
def train(X_train, Y_train, epochs, learning_rate, batch_size):
    input_size = X_train.shape[1]   # 784
    output_size = Y_train.shape[1]  # 10
    hidden_size = 128

    # Initialize parameters
    W1, b1, W2, b2 = initialize_parameters(input_size, hidden_size, output_size)

    for epoch in range(epochs):
        epoch_loss = 0

        # Mini-batch gradient descent
        for i in range(0, X_train.shape[0], batch_size):
            X_batch = X_train[i:i+batch_size]
            Y_batch = Y_train[i:i+batch_size]

            # Forward propagation
            A1, A2 = forward_prop(X_batch, W1, b1, W2, b2)

            # Compute loss
            loss = compute_loss(A2, Y_batch)
            epoch_loss += loss

            # Backpropagation
            gradients = backward_prop(X_batch, Y_batch, A1, A2, W1, W2)

            # Update parameters
            W1, b1, W2, b2 = update_parameters(W1, b1, W2, b2, gradients, learning_rate)

        # Print the loss after 10th cycle
        if epoch % 10 == 0:
          epoch_loss /= (X_train.shape[0] // batch_size)
          print(f"Epoch {epoch + 1}, Loss: {epoch_loss}")



    return W1, b1, W2, b2


W1, b1, W2, b2 = train(x_train, y_train, epochs=100, learning_rate=0.1, batch_size=128)



Epoch 1, Loss: 0.7621555852758589
Epoch 11, Loss: 0.1085190529670415
Epoch 21, Loss: 0.05917797491991317
Epoch 31, Loss: 0.03796906332325583
Epoch 41, Loss: 0.02588089469058749
Epoch 51, Loss: 0.018355095971199553
Epoch 61, Loss: 0.013445087197368744
Epoch 71, Loss: 0.01017335552564088
Epoch 81, Loss: 0.007903629135833482
Epoch 91, Loss: 0.006295570847423308


In [89]:
def predict(X, W1, b1, W2, b2):
    _, A2 = forward_prop(X, W1, b1, W2, b2)
    return np.argmax(A2, axis=1)

y_pred = predict(x_test, W1, b1, W2, b2)
y_true = np.argmax(y_test, axis=1)

accuracy = np.mean(y_pred == y_true)
print(f"Test Accuracy: {accuracy * 100:.2f}%")


Test Accuracy: 97.96%
