In [7]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/mnist-dataset/train-images.idx3-ubyte
/kaggle/input/mnist-dataset/t10k-labels.idx1-ubyte
/kaggle/input/mnist-dataset/t10k-images.idx3-ubyte
/kaggle/input/mnist-dataset/train-labels.idx1-ubyte
/kaggle/input/mnist-dataset/t10k-labels-idx1-ubyte/t10k-labels-idx1-ubyte
/kaggle/input/mnist-dataset/t10k-images-idx3-ubyte/t10k-images-idx3-ubyte
/kaggle/input/mnist-dataset/train-labels-idx1-ubyte/train-labels-idx1-ubyte
/kaggle/input/mnist-dataset/train-images-idx3-ubyte/train-images-idx3-ubyte


In [8]:
import numpy as np

def initialize_weights(layer_sizes):
    np.random.seed(42)
    weights = {}
    for i in range(len(layer_sizes) - 1):
        weights[f"W{i+1}"] = np.random.randn(layer_sizes[i], layer_sizes[i+1]) * np.sqrt(2. / layer_sizes[i])
        weights[f"b{i+1}"] = np.zeros((1, layer_sizes[i+1]))
    return weights

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def softmax(z):   #why
    exp_z = np.exp(z - np.max(z, axis=1, keepdims=True))
    return exp_z / np.sum(exp_z, axis=1, keepdims=True)

def forward_propagation(X, weights, num_layers):
    cache = {"A0": X}
    A = X
    for i in range(1, num_layers):
        Z = np.dot(A, weights[f"W{i}"]) + weights[f"b{i}"]
        A = sigmoid(Z)
        cache[f"Z{i}"] = Z
        cache[f"A{i}"] = A

    # Output layer
    Z = np.dot(A, weights[f"W{num_layers}"]) + weights[f"b{num_layers}"]
    A = softmax(Z)
    cache[f"Z{num_layers}"] = Z
    cache[f"A{num_layers}"] = A
    return A, cache

def compute_loss(y_true, y_pred):
    m = y_true.shape[0]
    log_probs = -np.log(y_pred[range(m), y_true])    # 
    loss = np.sum(log_probs) / m
    return loss

def backward_propagation(X, y, weights, cache, num_layers):
    m = X.shape[0]
    grads = {}
    y_onehot = np.zeros_like(cache[f"A{num_layers}"])
    y_onehot[np.arange(m), y] = 1

    # Output layer gradient
    dZ = cache[f"A{num_layers}"] - y_onehot
    grads[f"dW{num_layers}"] = np.dot(cache[f"A{num_layers-1}"].T, dZ) / m
    grads[f"db{num_layers}"] = np.sum(dZ, axis=0, keepdims=True) / m

    # Hidden layers gradients
    for i in range(num_layers - 1, 0, -1):
        dZ = np.dot(dZ, weights[f"W{i+1}"].T) * cache[f"A{i}"] * (1 - cache[f"A{i}"])
        grads[f"dW{i}"] = np.dot(cache[f"A{i-1}"].T, dZ) / m
        grads[f"db{i}"] = np.sum(dZ, axis=0, keepdims=True) / m

    return grads

def update_weights(weights, grads, learning_rate, num_layers):
    for i in range(1, num_layers + 1):
        weights[f"W{i}"] -= learning_rate * grads[f"dW{i}"]
        weights[f"b{i}"] -= learning_rate * grads[f"db{i}"]
    return weights

def train(X, y, layer_sizes, epochs, learning_rate):
    num_layers = len(layer_sizes) - 1
    weights = initialize_weights(layer_sizes)

    for epoch in range(epochs):
        y_pred, cache = forward_propagation(X, weights, num_layers)
        loss = compute_loss(y, y_pred)
        if epoch % 10 == 0:
            print(f"Epoch {epoch}, Loss: {loss:.4f}")

        grads = backward_propagation(X, y, weights, cache, num_layers)
        weights = update_weights(weights, grads, learning_rate, num_layers)

    return weights

def predict(X, weights, num_layers):
    y_pred, _ = forward_propagation(X, weights, num_layers)
    return np.argmax(y_pred, axis=1)

layer_sizes = [784, 128, 64, 10]

X_train = load_mnist_images('/kaggle/input/mnist-dataset/train-images.idx3-ubyte')
y_train = load_mnist_labels('/kaggle/input/mnist-dataset/train-labels.idx1-ubyte')
X_test = load_mnist_images('/kaggle/input/mnist-dataset/t10k-images.idx3-ubyte')
y_test = load_mnist_labels('/kaggle/input/mnist-dataset/t10k-labels.idx1-ubyte')

weights = train(X_train, y_train, layer_sizes, epochs=300, learning_rate=0.1)

y_pred = predict(X_test, weights, num_layers=len(layer_sizes)-1)
accuracy = np.mean(y_pred == y_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%")


Test Accuracy: 20.42%
