In [29]:
import numpy as np  # Import NumPy for numerical operations
from tensorflow.keras.datasets import mnist  # Import MNIST dataset from TensorFlow
from tensorflow.keras.utils import to_categorical  # Import utility for one-hot encoding

# Fetch the MNIST handwritten-digit dataset as (images, labels) pairs for the train and test splits
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Scale pixel intensities by 1/255 and flatten every image into a single 784-element row
# (28*28 values per sample), which is the input width the first weight matrix expects
x_train = (x_train / 255.0).reshape(-1, 28*28)
x_test = (x_test / 255.0).reshape(-1, 28*28)

# Replace the integer class labels with one-hot rows of length 10 (one column per digit class)
y_train, y_test = to_categorical(y_train, 10), to_categorical(y_test, 10)

# Seed a dedicated random generator so that Restart & Run All reproduces the same
# initial weights (and therefore the same training curve) every time
SEED = 42
rng = np.random.default_rng(SEED)

# Layer sizes: flattened 28x28 input, one hidden layer, one output unit per digit class
n_inputs, n_hidden, n_classes = 28*28, 128, 10

# Initialize weights with He initialization (standard normal scaled by sqrt(2 / fan_in));
# biases start at zero and are stored as row vectors so they broadcast over the batch
w1 = rng.standard_normal((n_inputs, n_hidden)) * np.sqrt(2 / n_inputs)  # First layer weights
b1 = np.zeros((1, n_hidden))  # First layer biases
w2 = rng.standard_normal((n_hidden, n_classes)) * np.sqrt(2 / n_hidden)  # Second layer weights
b2 = np.zeros((1, n_classes))  # Second layer biases

# Define hyperparameters for training
epochs = 400  # Number of passes of the training loop (one parameter update per pass)
learning_rate = 0.3  # Step size for weight updates

# Training loop: full-batch gradient descent, one parameter update per epoch
n_samples = y_train.shape[0]  # Row count of y_train; x_train is expected to have the same number of rows
leak = 0.004  # Leaky ReLU slope applied to non-positive pre-activations

for epoch in range(epochs):
    # ---- Forward pass ----
    z1 = x_train @ w1 + b1  # Hidden-layer pre-activation
    a1 = np.maximum(leak * z1, z1)  # Leaky ReLU: keeps z1 where positive, leak * z1 otherwise

    z2 = a1 @ w2 + b2  # Output-layer logits
    z2 = z2 - z2.max(axis=1, keepdims=True)  # Shift each row so its largest logit is 0 (avoids exp overflow)
    exp_z2 = np.exp(z2)  # Exponentiate once and reuse for numerator and denominator
    y_pred = exp_z2 / exp_z2.sum(axis=1, keepdims=True)  # Row-wise softmax probabilities

    # Mean categorical cross-entropy; the 1e-8 keeps log() away from log(0)
    loss = -np.sum(y_train * np.log(y_pred + 1e-8)) / n_samples

    # ---- Backward pass: output layer ----
    dz2 = y_pred - y_train  # Gradient of the loss w.r.t. the output logits (softmax + cross-entropy)
    dw2 = (a1.T @ dz2) / n_samples  # Averaged gradient for second layer weights
    db2 = dz2.sum(axis=0, keepdims=True) / n_samples  # Averaged gradient for second layer biases

    # ---- Backward pass: hidden layer ----
    leaky_grad = np.where(z1 > 0, 1.0, leak)  # Leaky ReLU derivative: 1 where z1 > 0, leak elsewhere
    dz1 = (dz2 @ w2.T) * leaky_grad  # Uses w2 before it is updated below
    dw1 = (x_train.T @ dz1) / n_samples  # Averaged gradient for first layer weights
    db1 = dz1.sum(axis=0, keepdims=True) / n_samples  # Averaged gradient for first layer biases

    # ---- Gradient-descent step (in place, so w1/b1/w2/b2 keep their identity) ----
    for param, grad in ((w1, dw1), (b1, db1), (w2, dw2), (b2, db2)):
        param -= learning_rate * grad

    # Report progress every 10 epochs; loss and accuracy come from this epoch's
    # forward pass, i.e. from the parameters as they were before the step above
    if epoch % 10 == 0:
        y_pred_labels = y_pred.argmax(axis=1)  # Most probable class per sample
        y_true_labels = y_train.argmax(axis=1)  # Class index recovered from the one-hot labels
        accuracy = (y_pred_labels == y_true_labels).mean()  # Fraction of training samples classified correctly
        print(f"Epoch {epoch}, Loss: {loss:.4f}, Accuracy: {accuracy:.4f}")


Epoch 0, Loss: 2.4231, Accuracy: 0.1176
Epoch 10, Loss: 0.8615, Accuracy: 0.8211
Epoch 20, Loss: 0.6413, Accuracy: 0.8035
Epoch 30, Loss: 0.5032, Accuracy: 0.8613
Epoch 40, Loss: 0.4280, Accuracy: 0.8855
Epoch 50, Loss: 0.3945, Accuracy: 0.8923
Epoch 60, Loss: 0.3720, Accuracy: 0.8971
Epoch 70, Loss: 0.3548, Accuracy: 0.9014
Epoch 80, Loss: 0.3410, Accuracy: 0.9044
Epoch 90, Loss: 0.3295, Accuracy: 0.9076
Epoch 100, Loss: 0.3195, Accuracy: 0.9106
Epoch 110, Loss: 0.3108, Accuracy: 0.9130
Epoch 120, Loss: 0.3029, Accuracy: 0.9151
Epoch 130, Loss: 0.2957, Accuracy: 0.9169
Epoch 140, Loss: 0.2891, Accuracy: 0.9188
Epoch 150, Loss: 0.2830, Accuracy: 0.9207
Epoch 160, Loss: 0.2773, Accuracy: 0.9229
Epoch 170, Loss: 0.2719, Accuracy: 0.9244
Epoch 180, Loss: 0.2668, Accuracy: 0.9256
Epoch 190, Loss: 0.2620, Accuracy: 0.9273
Epoch 200, Loss: 0.2574, Accuracy: 0.9284
Epoch 210, Loss: 0.2530, Accuracy: 0.9295
Epoch 220, Loss: 0.2488, Accuracy: 0.9307
Epoch 230, Loss: 0.2448, Accuracy: 0.9316
Epo

In [33]:
import numpy as np
import matplotlib.pyplot as plt

# Evaluate the trained two-layer network on the whole test set in one vectorized pass
# instead of looping over samples one at a time.
n_test = x_test.shape[0]  # Taken from the data rather than a hard-coded sample count
leaky_slope = 0.004  # Must stay equal to the Leaky ReLU slope used in the training loop

# Forward pass; reshape guarantees one flat row per sample, as the per-sample
# reshape(1, -1) did previously
test_inputs = x_test.reshape(n_test, -1)
test_pre_activation = test_inputs @ w1 + b1
test_hidden = np.maximum(leaky_slope * test_pre_activation, test_pre_activation)
test_logits = test_hidden @ w2 + b2

# Softmax is strictly increasing in each logit, so the most probable class is simply
# the largest logit. Skipping exp() avoids the overflow (inf / nan probabilities) that
# an unshifted softmax can produce for large logits.
predicted_labels = np.argmax(test_logits, axis=1)
true_labels = np.argmax(y_test, axis=1)  # Class index recovered from the one-hot labels

correct_predictions = int(np.sum(predicted_labels == true_labels))

accuracy = (correct_predictions / n_test) * 100  # Test accuracy as a percentage
print(f"Accuracy: {accuracy:.2f}%")
print(n_test)



Accuracy: 94.63%
10000
