# Installing Libraries (Python version >= 3.8)

In [None]:
import sys

# Fail fast when the interpreter is older than the version this notebook targets.
version = sys.version_info
print(version)
# An explicit raise (instead of `assert`) still runs under `python -O` and tells
# the reader what is wrong. Tuple comparison also accepts any later major version.
if version < (3, 8):
    raise RuntimeError(
        f"Python >= 3.8 is required, but this kernel runs {version.major}.{version.minor}"
    )

In [None]:
!python -m pip install -U numpy==1.23.5 pandas==1.5.3 scikit-learn==1.2.2 matplotlib==3.7.4

# Downloading/Visualizing Dataset

In [None]:
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets, model_selection

# Fetch MNIST from OpenML and wrap features/labels in pandas containers.
dataset = datasets.fetch_openml("mnist_784", version=1, parser="auto")
X = pd.DataFrame(data=dataset.data, columns=dataset.feature_names)
y = pd.Series(data=dataset.target, name="target")

# Hold out a test split; the fixed seed keeps the split reproducible.
splits = model_selection.train_test_split(X, y, random_state=42)
X_train, X_test, y_train, y_test = splits
print(*(part.shape for part in splits))

# Preview the first sample: its 784 values are laid out as a 28x28 image.
first_image = X.iloc[0].to_numpy().reshape(28, 28)
plt.imshow(first_image, cmap="gray")
plt.show()

# Preprocessing Data

In [None]:
# Normalize pixel values: cast to float32 and divide by 255.0.
X_train = np.asarray(X_train, dtype="float32") / 255.0
X_test = np.asarray(X_test, dtype="float32") / 255.0

# One-hot encode the target values as int32 matrices.
y_train = pd.get_dummies(y_train).to_numpy(dtype="int32")
y_test = pd.get_dummies(y_test).to_numpy(dtype="int32")

# Training Model

In [None]:
from IPython.display import Image

# Illustration shown for reference; originally retrieved from
# https://scikit-learn.org/stable/modules/neural_networks_supervised.html
diagram_url = "https://github.com/esakik/machine-learning-nutsnbolts/assets/44774033/663a9570-12c7-49e4-92ad-2a15c76d9b2d"
Image(url=diagram_url)

In [None]:
class NeuralNetwork:
    """Simple Neural Network with 1 hidden layer.

    The network computes ``softmax(sigmoid(X @ W1 + b1) @ W2 + b2)`` and is trained with
    mini-batch gradient descent. Gradients are estimated numerically (central finite
    differences) instead of with backpropagation, so training is slow by design.
    """

    def __init__(self, input_size: int, hidden_size: int, output_size: int, weight_init_std: float = 0.01) -> None:
        """Initialize weights and biases.

        :param input_size: The number of input neurons
        :param hidden_size: The number of hidden neurons
        :param output_size: The number of output neurons
        :param weight_init_std: The standard deviation of the random weights
        """
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.weight_init_std = weight_init_std

        self.W1 = self.weight_init_std * np.random.randn(self.input_size, self.hidden_size)  # The first layer's weights (input_size x hidden_size)
        self.b1 = np.zeros(self.hidden_size)  # The first layer's biases (hidden_size)
        self.W2 = self.weight_init_std * np.random.randn(self.hidden_size, self.output_size)  # The second layer's weights (hidden_size x output_size)
        self.b2 = np.zeros(self.output_size)  # The second layer's biases (output_size)

        self.train_loss_history = []  # One entry per training iteration
        self.train_acc_history = []  # One entry per epoch
        self.test_acc_history = []  # One entry per epoch (only when test data is available)

    def fit(
        self,
        X: np.ndarray,
        y: np.ndarray,
        epochs: int = 1,
        batch_size: int = 100,
        learning_rate: float = 0.1,
        X_test: "np.ndarray | None" = None,
        y_test: "np.ndarray | None" = None,
    ) -> None:
        """Train the model. The model uses mini-batch Gradient Descent to update the weights and biases.
        Since the model does not use backpropagation, the training process is not efficient.

        After every epoch the accuracy on ``(X, y)`` is appended to ``train_acc_history`` and,
        when test data is available, the accuracy on it is appended to ``test_acc_history``.

        :param X: The input data (one sample per row)
        :param y: The one-hot encoded target data (one sample per row)
        :param epochs: The number of training iterations
        :param batch_size: The number of samples to use in each training iteration
        :param learning_rate: The learning rate
        :param X_test: Optional held-out input data used to report test accuracy
        :param y_test: Optional held-out one-hot targets; must be given together with ``X_test``
        :raises ValueError: If ``batch_size`` is not positive, ``X`` is empty, ``X`` and ``y``
            disagree on the number of samples, or only one of ``X_test``/``y_test`` is given
        """
        X, y = np.asarray(X), np.asarray(y)
        n_samples = X.shape[0]
        if batch_size < 1:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
        if n_samples == 0:
            raise ValueError("X must contain at least one sample")
        if y.shape[0] != n_samples:
            raise ValueError(f"X and y must have the same number of samples, got {n_samples} and {y.shape[0]}")
        if (X_test is None) != (y_test is None):
            raise ValueError("X_test and y_test must be provided together")
        if X_test is None:
            # Backward compatibility: callers that do not pass test data explicitly still get
            # test accuracy from module-level `X_test` / `y_test` when those names exist.
            X_test, y_test = globals().get("X_test"), globals().get("y_test")
        has_test_set = X_test is not None and y_test is not None

        print(f"Training the model for {epochs} epochs with a batch size of {batch_size} and a learning rate of {learning_rate}")

        iters_per_epoch = max(n_samples // batch_size, 1)
        print(f"Number of iterations per epoch: {iters_per_epoch}")

        for i_epoch in range(1, epochs + 1):
            for i_iter in range(1, iters_per_epoch + 1):
                # Mini-batch (sampled with replacement)
                indices = np.random.choice(n_samples, batch_size)
                X_batch, y_batch = X[indices], y[indices]

                # Forward pass (Prediction)
                loss = self.loss(X_batch, y_batch)
                self.train_loss_history.append(loss)
                print(f"Epoch {i_epoch}/{epochs} - Iteration {i_iter}/{iters_per_epoch} - Loss: {loss}")

                # The numerical gradient perturbs each parameter array in place, so the loss
                # only has to re-run the forward pass; its argument is intentionally unused.
                def batch_loss(_params, X_batch=X_batch, y_batch=y_batch):
                    return self.loss(X_batch, y_batch)

                # Backward pass: evaluate every gradient at the current parameters first,
                # then update, so no gradient is computed at a partially updated point.
                grad_W1 = self.gradient_descent(batch_loss, self.W1)
                grad_b1 = self.gradient_descent(batch_loss, self.b1)
                grad_W2 = self.gradient_descent(batch_loss, self.W2)
                grad_b2 = self.gradient_descent(batch_loss, self.b2)

                self.W1 = self.W1 - learning_rate * grad_W1
                self.b1 = self.b1 - learning_rate * grad_b1
                self.W2 = self.W2 - learning_rate * grad_W2
                self.b2 = self.b2 - learning_rate * grad_b2

            # Train and test accuracy (computed from the model's predictions)
            train_acc = self.accuracy(self.predict(X), y)
            self.train_acc_history.append(train_acc)
            print(f"Epoch {i_epoch}/{epochs} - Train accuracy: {train_acc}")
            if has_test_set:
                test_acc = self.accuracy(self.predict(X_test), y_test)
                self.test_acc_history.append(test_acc)
                print(f"Epoch {i_epoch}/{epochs} - Test accuracy: {test_acc}")

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict the target data.

        :param X: The input data
        :return: The predicted class probabilities (softmax output of the second layer)
        """
        a1 = np.dot(X, self.W1) + self.b1
        z1 = self.sigmoid(a1)
        a2 = np.dot(z1, self.W2) + self.b2
        z2 = self.softmax(a2)
        return z2

    def sigmoid(self, x: np.ndarray) -> np.ndarray:
        """The sigmoid function.

        :param x: The input data
        :return: The sigmoid of the input data
        """
        # 1 / (1 + exp(-x)) == exp(-log(1 + exp(-x))); logaddexp evaluates the inner log
        # without overflowing for large negative x.
        return np.exp(-np.logaddexp(0, -x))

    def softmax(self, x: np.ndarray) -> np.ndarray:
        """The softmax function, applied independently along the last axis.

        :param x: The input data (a single vector or one sample per row)
        :return: The softmax of the input data; every row sums to 1
        """
        # Subtracting the per-row maximum keeps exp() from overflowing without changing the result.
        exp_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
        return exp_x / np.sum(exp_x, axis=-1, keepdims=True)

    def loss(self, X: np.ndarray, y: np.ndarray) -> float:
        """Calculate the loss.

        :param X: The input data
        :param y: The target data
        :return: The loss
        """
        y_pred = self.predict(X)
        return self.cross_entropy_error(y_pred, y)

    def cross_entropy_error(self, y_pred: np.ndarray, y: np.ndarray) -> float:
        """Calculate the cross-entropy error, averaged over the samples.

        :param y_pred: The predicted data
        :param y: The one-hot encoded target data
        :return: The mean cross-entropy error per sample
        """
        y_pred, y = np.asarray(y_pred), np.asarray(y)
        if y_pred.ndim == 1:
            # Treat a single sample as a batch of one.
            y_pred, y = y_pred.reshape(1, -1), y.reshape(1, -1)

        delta = 1e-7  # Avoids log(0)
        return -np.sum(y * np.log(y_pred + delta)) / y_pred.shape[0]

    def accuracy(self, y_pred: np.ndarray, y: np.ndarray) -> float:
        """Calculate the accuracy.

        :param y_pred: The predicted data (one row of class scores per sample)
        :param y: The one-hot encoded target data
        :return: The fraction of rows whose argmax matches between ``y_pred`` and ``y``
        """
        return np.mean(np.argmax(y_pred, axis=1) == np.argmax(y, axis=1))

    def gradient_descent(self, f: callable, X: np.ndarray) -> np.ndarray:
        """Calculate the gradient using the finite difference method.

        Despite its name, this method only estimates the gradient; it does not update anything.
        ``X`` is perturbed in place while ``f`` is evaluated and restored afterwards.

        :param f: The function to differentiate
        :param X: The input data
        :return: The gradient, with the same shape as ``X``
        """
        if X.ndim == 1:
            return self.gradient_descent_1d(f, X)

        grad = np.zeros_like(X)
        # Each `x` is a view into X, so perturbing it perturbs X itself; recursing
        # supports arrays with more than two dimensions.
        for idx, x in enumerate(X):
            grad[idx] = self.gradient_descent(f, x)
        return grad

    def gradient_descent_1d(self, f: callable, x: np.ndarray) -> np.ndarray:
        """Calculate the gradient using the finite difference method. This method is only for 1D input data.

        :param f: The function to differentiate
        :param x: The input data
        :return: The gradient
        """
        h = 1e-4
        grad = np.zeros_like(x)

        for idx in range(x.size):
            tmp_val = x[idx]
            try:
                # f(x+h)
                x[idx] = tmp_val + h
                fxh1 = f(x)

                # f(x-h)
                x[idx] = tmp_val - h
                fxh2 = f(x)
            finally:
                # Always restore the original value, even if f raises.
                x[idx] = tmp_val

            # Derivative (central difference)
            grad[idx] = (fxh1 - fxh2) / (2 * h)

        return grad

In [None]:
# Build the network: 784 inputs, 50 hidden units, 10 outputs.
model = NeuralNetwork(input_size=784, hidden_size=50, output_size=10)

# Training hyperparameters, kept together so they are easy to tweak.
fit_options = dict(epochs=1000, batch_size=100, learning_rate=0.01)
model.fit(X_train, y_train, **fit_options)

# Plot the Training Loss

In [None]:
# Draw the loss recorded in model.train_loss_history against its index.
loss_fig, loss_ax = plt.subplots()
loss_ax.plot(model.train_loss_history)
loss_ax.set_xlabel("Iteration")
loss_ax.set_ylabel("Training Loss")
loss_ax.set_title("Training Loss")
plt.show()

# Plot the Training and Test Accuracy

In [None]:
# Side-by-side panels: training accuracy (blue, left) and test accuracy (red, right).
fig = plt.figure(figsize=(15, 5))

panels = (
    ("Training Accuracy", model.train_acc_history, "b"),
    ("Test Accuracy", model.test_acc_history, "r"),
)
for position, (label, history, color) in enumerate(panels, start=1):
    ax = fig.add_subplot(1, 2, position)
    ax.set_title(label)
    ax.set_ylabel(label)
    ax.set_xlabel("Epoch")
    ax.plot(history, color=color)

plt.show()