In [1]:
import numpy as np

In [55]:
import numpy as np
import tensorflow as tf

def relu(x):
    """Apply the rectified linear unit element-wise.

    Every entry of ``x`` below zero is replaced by 0; other entries are
    returned unchanged.
    """
    floor = 0
    rectified = np.maximum(floor, x)
    return rectified

def softmax(x, axis=1):
    """Return the softmax of ``x`` taken along ``axis``.

    Args:
        x: Array (or array-like) of scores.
        axis: Axis along which the outputs sum to 1. Defaults to 1, i.e.
            one distribution per row of a 2-D batch. Pass ``axis=0`` for a
            1-D vector of scores.

    Returns:
        Array with the same shape as ``x`` whose entries along ``axis``
        are non-negative and sum to 1.
    """
    x = np.asarray(x)
    # Subtracting the per-slice maximum leaves the result unchanged but
    # keeps np.exp from overflowing on large scores.
    shifted = x - np.max(x, axis=axis, keepdims=True)
    e_x = np.exp(shifted)
    return e_x / e_x.sum(axis=axis, keepdims=True)

def relu_derivative(x):
    """Return the element-wise derivative of ReLU evaluated at ``x``.

    The result is 1.0 where ``x`` is strictly positive and 0.0 elsewhere
    (including at exactly zero).
    """
    is_active = x > 0
    return is_active.astype(np.float64)

class TwoLayerNN:
    """Two-layer classifier: linear -> ReLU -> linear -> softmax.

    Parameters are stored in ``self.params`` under the keys ``'W1'``,
    ``'b1'``, ``'W2'`` and ``'b2'``. Training is full-batch gradient
    descent on the mean cross-entropy loss.
    """

    def __init__(self, input_size, hidden_size, output_size):
        """Create the parameters.

        Weights are drawn from a standard normal distribution and scaled
        by 0.01; biases start at zero with shape ``(1, size)``.
        """
        self.params = {
            'W1': np.random.randn(input_size, hidden_size) * 0.01,
            'b1': np.zeros((1, hidden_size)),
            'W2': np.random.randn(hidden_size, output_size) * 0.01,
            'b2': np.zeros((1, output_size)),
        }

    def forward(self, X):
        """Run the network on a batch.

        Args:
            X: Input batch with one sample per row.

        Returns:
            Tuple ``(a1, a2)``: the hidden-layer activations after ReLU and
            the softmax class probabilities.
        """
        z1 = np.dot(X, self.params['W1']) + self.params['b1']
        a1 = relu(z1)
        z2 = np.dot(a1, self.params['W2']) + self.params['b2']
        a2 = softmax(z2)
        return a1, a2

    def compute_loss(self, Y, output):
        """Return the mean cross-entropy of ``output`` against labels ``Y``.

        Args:
            Y: 1-D array of integer class indices, one per row of
                ``output``.
            output: Class probabilities, one row per sample.
        """
        m = Y.shape[0]
        true_class_prob = output[np.arange(m), Y]
        # A probability that underflowed to exactly 0 would make the loss
        # infinite; clip to a tiny positive floor so it stays finite.
        true_class_prob = np.clip(true_class_prob, 1e-12, None)
        return np.sum(-np.log(true_class_prob)) / m

    def backward(self, X, Y, a1, output):
        """Compute gradients of the mean cross-entropy loss.

        Args:
            X: Input batch passed to ``forward``.
            Y: 1-D array of integer class indices.
            a1: Hidden activations returned by ``forward``.
            output: Softmax probabilities returned by ``forward``. This
                array is not modified.

        Returns:
            Dict with keys ``'W1'``, ``'b1'``, ``'W2'``, ``'b2'`` holding
            the gradient for the parameter of the same name.
        """
        m = X.shape[0]

        # Gradient w.r.t. the pre-softmax scores is (probabilities - one_hot).
        # Work on a copy so the caller's probabilities stay intact.
        delta3 = output.copy()
        delta3[np.arange(m), Y] -= 1
        dW2 = np.dot(a1.T, delta3) / m
        db2 = np.sum(delta3, axis=0) / m

        # a1 = relu(z1), so a1 > 0 exactly where the ReLU passed its input
        # through; the gradient is blocked everywhere else.
        delta2 = np.dot(delta3, self.params['W2'].T) * (a1 > 0)
        dW1 = np.dot(X.T, delta2) / m
        db1 = np.sum(delta2, axis=0) / m

        return {
            'W1': dW1, 'b1': db1,
            'W2': dW2, 'b2': db2,
        }

    def update_params(self, grads, learning_rate):
        """Take one gradient-descent step, updating ``self.params`` in place."""
        for key in self.params:
            self.params[key] -= learning_rate * grads[key]

    def train(self, X, Y, epochs, learning_rate, print_every=100):
        """Train with full-batch gradient descent.

        Args:
            X: Training inputs, one sample per row.
            Y: 1-D array of integer class indices.
            epochs: Number of passes (parameter updates) to perform.
            learning_rate: Step size used by ``update_params``.
            print_every: Print the loss whenever the epoch index is a
                multiple of this value; ``0`` or ``None`` disables printing.
        """
        for epoch in range(epochs):
            a1, output = self.forward(X)
            # The reported loss is measured before this epoch's update.
            loss = self.compute_loss(Y, output)
            grads = self.backward(X, Y, a1, output)
            self.update_params(grads, learning_rate)
            if print_every and epoch % print_every == 0:
                print("Epoch:", epoch, "Loss:", loss)

# Load and preprocess MNIST data
def load_mnist():
    """Load MNIST through ``tf.keras.datasets`` and prepare it for the network.

    Each image array is reshaped to rows of 784 values, cast to float32
    and divided by 255. Labels are returned as loaded.

    Returns:
        Tuple ``(x_train, y_train, x_test, y_test)``.
    """
    def _flatten_and_scale(images):
        # One row of 784 values per image, scaled by 1/255.
        return images.reshape(-1, 784).astype('float32') / 255

    train_split, test_split = tf.keras.datasets.mnist.load_data()
    train_images, train_labels = train_split
    test_images, test_labels = test_split

    train_features = _flatten_and_scale(train_images)
    test_features = _flatten_and_scale(test_images)
    return train_features, train_labels, test_features, test_labels

if __name__ == "__main__":
    # Layer sizes: 784 inputs (flattened 28x28 MNIST images), a hidden
    # layer of 128 units, and 10 outputs (digit classes 0-9).
    input_size, hidden_size, output_size = 784, 128, 10

    # Fetch the dataset; the test split is loaded but not used below.
    X_train, Y_train, X_test, Y_test = load_mnist()

    # Build the network and fit it on the full training set.
    nn = TwoLayerNN(
        input_size=input_size,
        hidden_size=hidden_size,
        output_size=output_size,
    )
    nn.train(X_train, Y_train, epochs=1000, learning_rate=0.1)


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
11490434/11490434 ━━━━━━━━━━━━━━━━━━━━ 1s 0us/step
Epoch: 0 Loss: 2.303128710227561
Epoch: 100 Loss: 0.9506855801160495
Epoch: 200 Loss: 0.5032085995233072
Epoch: 300 Loss: 0.4050727488695916
Epoch: 400 Loss: 0.36285651466103924
Epoch: 500 Loss: 0.3381541061379601
Epoch: 600 Loss: 0.3208366637227774
Epoch: 700 Loss: 0.307216747956839
Epoch: 800 Loss: 0.2956246972406093
Epoch: 900 Loss: 0.2852147570609682
