# Installing Libraries (Python version >= 3.8)

In [None]:
import sys
# Show the running interpreter version and stop early if it is not Python 3 with a minor version of at least 8
version = sys.version_info
print(version)
assert version.major == 3 and version.minor >= 8

In [None]:
!python -m pip install numpy==1.23.5 pandas==1.5.3 scikit-learn==1.2.2 matplotlib==3.7.4 torch==2.2.0 torchvision==0.17.0

# Implementing Neural Network from Scratch

## Downloading MNIST Dataset

In [None]:
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets, model_selection

# Fetch version 1 of the "mnist_784" dataset from OpenML and wrap features/targets in pandas containers
dataset = datasets.fetch_openml("mnist_784", version=1, parser="auto")
X = pd.DataFrame(data=dataset.data, columns=dataset.feature_names)
y = pd.Series(data=dataset.target, name="target")

# Split into train and test subsets; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
# Reshape the first sample's feature row into a 28x28 grid and show it as a grayscale image
plt.imshow(X.iloc[0].values.reshape(28, 28), cmap="gray")  # 1x784 => 28x28 image of the first sample in the dataset
plt.show()

## Preprocessing Data

In [None]:
# Normalize pixel values: cast to float32 and divide by 255
X_train = np.array(X_train, dtype="float32") / 255.0
X_test = np.array(X_test, dtype="float32") / 255.0

# One-hot encode the target values as int32 indicator matrices
y_train = np.array(pd.get_dummies(y_train), dtype="int32")
y_test = np.array(pd.get_dummies(y_test), dtype="int32")

## Training Neural Network without Backpropagation

In [None]:
from IPython.display import Image

# Display a reference figure inline; the image is loaded from the GitHub asset URL below
# Retrieved from https://scikit-learn.org/stable/modules/neural_networks_supervised.html
Image(url="https://github.com/esakik/machine-learning-nutsnbolts/assets/44774033/663a9570-12c7-49e4-92ad-2a15c76d9b2d")

In [None]:
def sigmoid(x: np.ndarray) -> np.ndarray:
    """Apply the logistic function ``1 / (1 + exp(-x))`` element-wise.

    :param x: The input data
    :return: The sigmoid of the input data
    """
    denominator = 1 + np.exp(-x)
    return 1 / denominator

def softmax(x: np.ndarray) -> np.ndarray:
    """The softmax function, applied along the last axis.

    A 1D input is treated as a single score vector. For a batched input of
    shape (batch, classes), every row is normalized independently so that each
    sample's probabilities sum to 1.

    :param x: The input data
    :return: The softmax of the input data (same shape as the input)
    """
    x = np.asarray(x)
    # Subtract the per-sample maximum so np.exp never overflows; softmax is shift-invariant
    shifted = x - np.max(x, axis=-1, keepdims=True)
    exp_x = np.exp(shifted)
    return exp_x / np.sum(exp_x, axis=-1, keepdims=True)

def cross_entropy_error(y_pred: np.ndarray, y: np.ndarray) -> np.ndarray:
    """Compute the cross-entropy between predictions and targets, summed over all elements.

    :param y_pred: The predicted data
    :param y: The target data
    :return: The cross-entropy error
    """
    epsilon = 1e-7  # Keeps the argument of the logarithm strictly positive
    log_pred = np.log(y_pred + epsilon)
    return -np.sum(y * log_pred)

def accuracy(y_pred: np.ndarray, y: np.ndarray) -> np.ndarray:
    """Compute the fraction of rows whose arg-max in the prediction matches the arg-max in the target.

    :param y_pred: The predicted data
    :param y: The target data
    :return: The accuracy
    """
    predicted_labels = np.argmax(y_pred, axis=1)
    true_labels = np.argmax(y, axis=1)
    hits = predicted_labels == true_labels
    return np.mean(hits)


class NeuralNetworkWithoutBackpropagation:
    """Simple Neural Network with 1 hidden layer, trained with numerically estimated gradients."""

    def __init__(self, input_size: int, hidden_size: int, output_size: int, weight_init_std: float = 0.01) -> None:
        """Initialize weights and biases.

        :param input_size: The number of input neurons
        :param hidden_size: The number of hidden neurons
        :param output_size: The number of output neurons
        :param weight_init_std: The standard deviation of the random weights
        """
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.weight_init_std = weight_init_std

        self.W1 = self.weight_init_std * np.random.randn(self.input_size, self.hidden_size)  # The first layer's weights (input_size x hidden_size)
        self.b1 = np.zeros(self.hidden_size)  # The first layer's biases (hidden_size)
        self.W2 = self.weight_init_std * np.random.randn(self.hidden_size, self.output_size)  # The second layer's weights (hidden_size x output_size)
        self.b2 = np.zeros(self.output_size)  # The second layer's biases (output_size)

        self.train_loss_history = []  # One loss value per mini-batch iteration
        self.train_acc_history = []  # One accuracy value per epoch

    def fit(self, X: np.ndarray, y: np.ndarray, epochs: int = 1, batch_size: int = 100, learning_rate: float = 0.1) -> None:
        """Train the model. The model uses mini-batch Gradient Descent to update the weights and biases.
        Since the model does not use backpropagation, the training process is not efficient.

        All four gradients of one iteration are estimated at the same parameter
        values before any parameter is updated.

        :param X: The input data
        :param y: The target data
        :param epochs: The number of training iterations
        :param batch_size: The number of samples to use in each training iteration
        :param learning_rate: The learning rate
        """
        print(f"Training the model for {epochs} epochs with a batch size of {batch_size} and a learning rate of {learning_rate}")

        iters_per_epoch = max(int(X.shape[0] / batch_size), 1)
        print(f"Number of iterations per epoch: {iters_per_epoch}")

        param_names = ("W1", "b1", "W2", "b2")

        for i_epoch in range(1, epochs + 1):
            for _ in range(iters_per_epoch):
                # Mini-batch (indices are drawn with replacement)
                indices = np.random.choice(X.shape[0], batch_size)
                X_batch, y_batch = X[indices], y[indices]

                # Forward pass (Prediction)
                loss = self.loss(X_batch, y_batch)
                self.train_loss_history.append(loss)

                # The callback ignores its argument on purpose: the finite-difference
                # helper perturbs the parameter arrays in place, so self.loss already
                # sees the perturbed value.
                batch_loss = lambda _: self.loss(X_batch, y_batch)  # noqa: E731

                # Estimate every gradient first, so they all refer to the same parameter point...
                grads = {name: self.gradient_descent(batch_loss, getattr(self, name)) for name in param_names}

                # ...then apply the Gradient Descent step to all parameters
                for name in param_names:
                    setattr(self, name, getattr(self, name) - learning_rate * grads[name])

            # Train accuracy (evaluated on the full input passed to fit)
            train_acc = accuracy(self.predict(X), y)
            self.train_acc_history.append(train_acc)
            print(f"Epoch {i_epoch}/{epochs} - Train accuracy: {train_acc}")

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict the target data.

        :param X: The input data
        :return: The predicted target data
        """
        a1 = np.dot(X, self.W1) + self.b1
        z1 = sigmoid(a1)
        a2 = np.dot(z1, self.W2) + self.b2
        z2 = softmax(a2)
        return z2

    def loss(self, X: np.ndarray, y: np.ndarray) -> np.ndarray:
        """Calculate the loss.

        :param X: The input data
        :param y: The target data
        :return: The loss
        """
        y_pred = self.predict(X)
        return cross_entropy_error(y_pred, y)

    def gradient_descent(self, f: callable, X: np.ndarray) -> np.ndarray:
        """Calculate the gradient using the finite difference method.

        Despite its name, this method only estimates the gradient; it does not
        update any parameter. Arrays with more than one dimension are processed
        one sub-array at a time; the sub-arrays are views, so perturbations are
        visible through ``X`` itself.

        :param f: The function to differentiate
        :param X: The input data (temporarily modified in place, restored on return)
        :return: The gradient (same shape as ``X``)
        """
        if X.ndim == 1:
            return self.gradient_descent_1d(f, X)

        grad = np.zeros_like(X)
        for idx, sub_array in enumerate(X):
            grad[idx] = self.gradient_descent(f, sub_array)
        return grad

    def gradient_descent_1d(self, f: callable, x: np.ndarray) -> np.ndarray:
        """Calculate the gradient using the finite difference method. This method is only for 1D input data.

        Each element is perturbed in place by ``+h`` and ``-h``, ``f`` is evaluated
        for both, and the element is restored afterwards (central difference).

        :param f: The function to differentiate
        :param x: The input data (temporarily modified in place, restored on return)
        :return: The gradient
        """
        h = 1e-4
        grad = np.zeros_like(x)

        for idx in range(x.size):
            tmp_val = x[idx]

            # f(x+h)
            x[idx] = tmp_val + h
            fxh1 = f(x)

            # f(x-h)
            x[idx] = tmp_val - h
            fxh2 = f(x)

            # Derivative
            grad[idx] = (fxh1 - fxh2) / (2 * h)

            # Restore the original value before moving to the next element
            x[idx] = tmp_val

        return grad

In [None]:
# The training below is left commented out: it is very slow because gradients are computed by finite differences instead of backpropagation
## model = NeuralNetworkWithoutBackpropagation(input_size=784, hidden_size=50, output_size=10)
## model.fit(X_train, y_train, epochs=1000, batch_size=100, learning_rate=0.0001)

## Training Neural Network with Backpropagation

In [None]:
class Relu:
    """ReLU Layer."""

    def __init__(self):
        # Boolean mask of the positions that were <= 0 in the last forward pass
        self.mask = None

    def forward(self, x: np.ndarray) -> np.ndarray:
        """Forward propagation.

        :param x: The input data
        :return: The output data (a new array; the input is left untouched)
        """
        self.mask = (x <= 0)
        out = x.copy()
        out[self.mask] = 0

        return out

    def backward(self, dout: np.ndarray) -> np.ndarray:
        """Backward propagation.

        The gradient is blocked wherever the forward input was <= 0. The
        result is a new array, so the caller's ``dout`` is not modified.

        :param dout: The derivative of the output data
        :return: The derivative of the input data
        """
        dx = dout.copy()  # Do not mutate the caller's upstream gradient
        dx[self.mask] = 0

        return dx


class Sigmoid:
    """Sigmoid activation layer that caches its output for the backward pass."""

    def __init__(self):
        # Output of the most recent forward pass
        self.out = None

    def forward(self, x: np.ndarray) -> np.ndarray:
        """Apply the logistic function element-wise and remember the result.

        :param x: The input data
        :return: The output data
        """
        self.out = 1 / (1 + np.exp(-x))
        return self.out

    def backward(self, dout: np.ndarray) -> np.ndarray:
        """Scale the upstream gradient by the sigmoid derivative ``(1 - out) * out``.

        :param dout: The derivative of the output data
        :return: The derivative of the input data
        """
        complement = 1.0 - self.out
        return dout * complement * self.out


class Affine:
    """Fully connected layer computing ``dot(x, W) + b``."""

    def __init__(self, W: np.ndarray, b: np.ndarray) -> None:
        """Keep references to the parameters and clear the cached state.

        :param W: The weight
        :param b: The bias
        """
        self.W, self.b = W, b
        # Filled in by forward() (x) and backward() (dW, db)
        self.x = None
        self.dW = None
        self.db = None

    def forward(self, x: np.ndarray) -> np.ndarray:
        """Compute the layer output and cache the input for the backward pass.

        :param x: The input data
        :return: The output data
        """
        self.x = x
        return np.dot(x, self.W) + self.b

    def backward(self, dout: np.ndarray) -> np.ndarray:
        """Compute the input gradient and store the parameter gradients in ``dW`` and ``db``.

        :param dout: The derivative of the output data
        :return: The derivative of the input data
        """
        input_grad = np.dot(dout, self.W.T)
        self.dW = np.dot(self.x.T, dout)
        self.db = np.sum(dout, axis=0)
        return input_grad


class SoftmaxWithLoss:
    """Softmax with Loss Layer.

    ``forward`` returns the cross-entropy averaged over the batch and
    ``backward`` returns the gradient of that same averaged loss, so the two
    passes are consistent with each other.
    """

    def __init__(self):
        self.loss = None  # Loss of the last forward pass
        self.y = None  # Softmax output of the last forward pass
        self.t = None  # Targets of the last forward pass

    def forward(self, x: np.ndarray, t: np.ndarray) -> np.ndarray:
        """Forward propagation.

        :param x: The input data
        :param t: The target data
        :return: The cross-entropy loss averaged over the batch
        """
        self.t = t
        self.y = softmax(x)
        # cross_entropy_error sums over the batch; divide so the value matches backward()
        self.loss = cross_entropy_error(self.y, self.t) / self._batch_size()

        return self.loss

    def backward(self, dout=1) -> np.ndarray:
        """Backward propagation.

        :param dout: The derivative of the output data (scales the returned gradient)
        :return: The derivative of the input data
        """
        dx = dout * (self.y - self.t) / self._batch_size()

        return dx

    def _batch_size(self) -> int:
        """Return the number of samples in the stored targets (1 for a single 1D target)."""
        return self.t.shape[0] if np.ndim(self.t) > 1 else 1


class NeuralNetwork:
    """Fully connected network with one hidden layer (Affine -> ReLU -> Affine), trained with backpropagation."""

    def __init__(self, input_size: int, hidden_size: int, output_size: int, weight_init_std: float = 0.01) -> None:
        """Create the parameters and wire them into the layer stack.

        :param input_size: The number of input neurons
        :param hidden_size: The number of hidden neurons
        :param output_size: The number of output neurons
        :param weight_init_std: The standard deviation of the random weights
        """
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.weight_init_std = weight_init_std

        # Weights are scaled random normals, biases start at zero
        first_weights = self.weight_init_std * np.random.randn(self.input_size, self.hidden_size)  # (input_size x hidden_size)
        second_weights = self.weight_init_std * np.random.randn(self.hidden_size, self.output_size)  # (hidden_size x output_size)
        self.params = {
            "W1": first_weights,
            "b1": np.zeros(self.hidden_size),
            "W2": second_weights,
            "b2": np.zeros(self.output_size),
        }

        # The Affine layers hold references to the very same arrays stored in self.params
        self.layers = {}
        self.layers["Affine1"] = Affine(self.params["W1"], self.params["b1"])
        self.layers["Relu1"] = Relu()
        self.layers["Affine2"] = Affine(self.params["W2"], self.params["b2"])
        self.last_layer = SoftmaxWithLoss()

        self.train_loss_history = []
        self.train_acc_history = []

    def fit(self, X: np.ndarray, y: np.ndarray, epochs: int = 100, batch_size: int = 100, learning_rate: float = 0.0001) -> None:
        """Run mini-batch gradient descent, recording the loss per iteration and the accuracy per epoch.

        :param X: The input data
        :param y: The target data
        :param epochs: The number of training iterations
        :param batch_size: The number of samples to use in each training iteration
        :param learning_rate: The learning rate
        """
        print(f"Training the model for {epochs} epochs with a batch size of {batch_size} and a learning rate of {learning_rate}")

        iters_per_epoch = max(1, int(X.shape[0] / batch_size))
        print(f"Number of iterations per epoch: {iters_per_epoch}")

        n_samples = X.shape[0]
        for i_epoch in range(1, epochs + 1):
            for _ in range(iters_per_epoch):
                # Draw a random mini-batch
                picked = np.random.choice(n_samples, batch_size)
                X_batch = X[picked]
                y_batch = y[picked]

                # Forward pass; this also caches the state that gradient() relies on
                batch_loss = self.loss(X_batch, y_batch)
                self.train_loss_history.append(batch_loss)

                # Backward pass, then update each parameter array in place so the
                # Affine layers (which share these arrays) see the new values
                for key, grad_value in self.gradient().items():
                    param = self.params[key]
                    param -= learning_rate * grad_value

            # Accuracy on the full input passed to fit
            train_acc = accuracy(self.predict(X), y)
            self.train_acc_history.append(train_acc)
            print(f"Epoch {i_epoch}/{epochs} - Train accuracy: {train_acc}")

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Push the input through every layer in ``self.layers`` and return the last layer's output.

        :param X: The input data
        :return: The predicted target data
        """
        activation = X.copy()
        for name in self.layers:
            activation = self.layers[name].forward(activation)
        return activation

    def loss(self, X: np.ndarray, y: np.ndarray) -> np.ndarray:
        """Run a forward pass and feed its result to the softmax-with-loss layer.

        :param X: The input data
        :param y: The target data
        :return: The loss
        """
        return self.last_layer.forward(self.predict(X), y)

    def gradient(self) -> dict:
        """Backpropagate through the layers using the state cached by the latest forward pass.

        :return: The gradients
        """
        upstream = self.last_layer.backward(1)
        for layer in reversed(list(self.layers.values())):
            upstream = layer.backward(upstream)

        first, second = self.layers['Affine1'], self.layers['Affine2']
        return {
            'W1': first.dW,
            'b1': first.db,
            'W2': second.dW,
            'b2': second.db,
        }

In [None]:
# Build the backpropagation-based network (784 inputs, 50 hidden units, 10 outputs) and train it on the training split
model = NeuralNetwork(input_size=784, hidden_size=50, output_size=10)
model.fit(X_train, y_train, epochs=1000, batch_size=100, learning_rate=0.000001)

## Visualizing Training Loss and Accuracy

In [None]:
fig = plt.figure(figsize=(15, 5))

# (title and y-label, x-label, recorded series, line color) for each side-by-side panel
panels = (
    ("Training Loss", "Iteration", model.train_loss_history, "b"),
    ("Training Accuracy", "Epoch", model.train_acc_history, "r"),
)
for position, (label, x_label, history, color) in enumerate(panels, start=1):
    ax = fig.add_subplot(1, 2, position)
    ax.set_title(label)
    ax.set_ylabel(label)
    ax.set_xlabel(x_label)
    ax.plot(history, color=color)

plt.show()

# Implementing Neural Network with PyTorch

In [None]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor
from pathlib import Path

## Downloading FashionMNIST Dataset

In [None]:
# Resolve the dataset directory relative to the notebook's working directory (two levels up, then "datasets")
work_dir = %pwd
data_dir = Path(work_dir).parents[1] / "datasets"

# Download (if needed) the FashionMNIST train and test splits; ToTensor converts each image to a tensor
train_data = datasets.FashionMNIST(root=data_dir, train=True, download=True, transform=ToTensor())
test_data = datasets.FashionMNIST(root=data_dir, train=False, download=True, transform=ToTensor())

In [None]:
batch_size = 64

# Only the training loader shuffles its samples
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

# Inspect the first test batch only, then stop iterating
for X, y in test_loader:
    print("Shape of X [N, C, H, W]: ", X.shape)  # `batch_size` samples, 1 channel, 28x28
    print("Shape of y: ", y.shape, y.dtype)  # `batch_size` labels
    break

## Building Neural Network

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

class NeuralNetwork(nn.Module):
    """Multi-layer perceptron: flatten, then three linear layers with ReLU between them."""

    def __init__(self, input_size: int, hidden_size: int, output_size: int) -> None:
        """Build the layer stack.

        :param input_size: The number of features after flattening
        :param hidden_size: The width of both hidden layers
        :param output_size: The number of output logits
        """
        super().__init__()
        self.flatten = nn.Flatten()
        stack = [
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, output_size),
        ]
        self.linear_relu_stack = nn.Sequential(*stack)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Flatten the input and return the raw logits (no softmax is applied).

        :param x: The input batch
        :return: The logits
        """
        flat = self.flatten(x)
        return self.linear_relu_stack(flat)


# Create the network for 28*28 flattened inputs and 10 output classes, move it to the selected device, and print its structure
model = NeuralNetwork(input_size=28*28, hidden_size=512, output_size=10).to(device)
print(model)

## Optimizing Neural Network

In [None]:
# Cross-entropy loss on the model's logits, optimized with plain SGD at a learning rate of 1e-3
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

In [None]:
def train(
    dataloader: DataLoader,
    model: nn.Module,
    loss_fn: nn.Module,
    optimizer: torch.optim.Optimizer,
) -> None:
    """Run one pass over ``dataloader``, updating ``model`` after every batch.

    Batches are moved to the module-level ``device``. The loss is printed for
    every 100th batch, starting with the first one.

    :param dataloader: The batches to train on
    :param model: The model to optimize
    :param loss_fn: The loss function
    :param optimizer: The optimizer that updates the model parameters
    """
    model.train()  # Switch the model to training mode
    for batch_idx, batch in enumerate(dataloader):
        inputs, targets = batch
        inputs = inputs.to(device)  # Move the data to the device that is used
        targets = targets.to(device)

        # Forward pass: compute the prediction error
        loss = loss_fn(model(inputs), targets)

        # Backward pass: backpropagate, update, then reset the gradients
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # Only report every 100th batch
        if batch_idx % 100 != 0:
            continue
        loss_value = loss.item()
        progress = f"{batch_idx * len(inputs):>5d}/{len(dataloader.dataset):>5d}"
        print(f"loss: {loss_value:>7f}  [{progress}]")

In [None]:
def test(
    dataloader: DataLoader,
    model: nn.Module,
    loss_fn: nn.Module,
) -> None:
    """Evaluate ``model`` on ``dataloader`` and print the accuracy and the average loss.

    The average loss is the mean of the per-batch losses, i.e. the accumulated
    loss divided by the number of batches (not by the batch size). Batches are
    moved to the module-level ``device``.

    :param dataloader: The batches to evaluate on
    :param model: The model to evaluate
    :param loss_fn: The loss function
    """
    num_batches = len(dataloader)
    num_samples = len(dataloader.dataset)

    model.eval()  # Set the model to evaluation mode
    test_loss, correct = 0, 0
    with torch.no_grad():  # Do not calculate the gradients
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            y_pred = model(X)
            test_loss += loss_fn(y_pred, y).item()  # One loss value per batch
            correct += (y_pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches  # One value was accumulated per batch
    correct /= num_samples
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [None]:
# Alternate one training pass and one evaluation pass per epoch
epochs = 5
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_loader, model, loss_fn, optimizer)
    test(test_loader, model, loss_fn)
print("Done!")

## Saving Model

In [None]:
model_path = Path(work_dir).parents[1] / "models" / "nn_model.pth"
# torch.save does not create missing directories, so make sure the target folder exists
model_path.parent.mkdir(parents=True, exist_ok=True)

# Persist only the parameters (state dict), not the whole module object
torch.save(model.state_dict(), model_path)
print(f"Saved PyTorch Model State to {model_path}")

## Loading Model

In [None]:
# Rebuild the architecture, then restore the saved parameters into it
model = NeuralNetwork(input_size=28*28, hidden_size=512, output_size=10).to(device)
# map_location lets a checkpoint saved on another device (e.g. GPU) load on the current one;
# weights_only restricts unpickling to tensors and plain containers, which is all a state dict needs
model.load_state_dict(torch.load(model_path, map_location=device, weights_only=True))

In [None]:
# Human-readable names, positioned so that the list index equals the class index used below
classes = [
    "T-shirt/top",
    "Trouser",
    "Pullover",
    "Dress",
    "Coat",
    "Sandal",
    "Shirt",
    "Sneaker",
    "Bag",
    "Ankle boot",
]

model.eval()
# First test sample: image tensor and its label
x, y = test_data[0]
with torch.no_grad():
    x = x.to(device)
    y_pred = model(x)
    # The highest-scoring entry of the first output row is the predicted class
    predicted = classes[y_pred[0].argmax(0)]
    actual = classes[y]
    print(f'Predicted: "{predicted}", Actual: "{actual}"')