# Fully Connected Neural Network from Scratch (NumPy Only)

PyTorch is used only to load the MNIST dataset.

## Import Libraries

In [None]:

import numpy as np
import matplotlib.pyplot as plt
import torch
import torchvision
from torchvision import transforms


## Load MNIST Dataset

In [None]:

# Every sample is converted to a tensor as it is read from the dataset.
transform = transforms.ToTensor()

# train=True is the split the model is fitted on; the train=False split
# serves as the validation set for the rest of the notebook.
train_dataset = torchvision.datasets.MNIST(
    root='./data',
    train=True,
    transform=transform,
    download=True,
)
val_dataset = torchvision.datasets.MNIST(
    root='./data',
    train=False,
    transform=transform,
    download=True,
)

# Batches of 64 samples; only the training loader shuffles.
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=64,
    shuffle=True,
)
val_loader = torch.utils.data.DataLoader(
    dataset=val_dataset,
    batch_size=64,
    shuffle=False,
)

print(len(train_dataset), len(val_dataset))


## Convert Torch Tensor to NumPy

In [None]:

def torch_to_numpy(loader):
    """Drain a loader of ``(images, labels)`` batches into two NumPy arrays.

    Args:
        loader: Iterable yielding ``(images, labels)`` pairs of torch tensors.

    Returns:
        Tuple ``(X, y)``: ``X`` stacks every batch row-wise after flattening
        each sample to a single row; ``y`` joins all label batches end to end.
    """
    features, targets = [], []
    for images, labels in loader:
        batch = images.cpu().numpy()
        # Collapse all per-sample dimensions into one feature axis.
        features.append(batch.reshape(len(batch), -1))
        targets.append(labels.cpu().numpy())
    return np.vstack(features), np.hstack(targets)

# Flatten both splits into (num_samples, num_features) arrays plus label vectors.
X_train, y_train = torch_to_numpy(train_loader)
X_val, y_val = torch_to_numpy(val_loader)

# No further rescaling here: transforms.ToTensor() has already mapped the raw
# 0-255 pixel values into [0.0, 1.0]. Dividing by 255 a second time would
# shrink every input into [0, 1/255] and starve the network of signal.

def one_hot(y, c=10):
    """Encode integer class labels as one-hot rows.

    Args:
        y: Integer class labels as any array-like. The input is flattened, so
            a list, a 1-D vector and an (N, 1) column all produce N rows.
        c: Number of classes, i.e. the width of every encoded row.

    Returns:
        Float array of shape ``(N, c)`` holding a single 1.0 per row.

    Raises:
        IndexError: If any label lies outside ``[0, c)``.
    """
    labels = np.asarray(y).ravel()
    # Check the range explicitly: NumPy indexing would wrap negative labels
    # around and silently mark the wrong class.
    if labels.size and (labels.min() < 0 or labels.max() >= c):
        raise IndexError(
            f"labels must lie in [0, {c}); got values in "
            f"[{labels.min()}, {labels.max()}]"
        )
    out = np.zeros((labels.size, c))
    out[np.arange(labels.size), labels] = 1
    return out

# One-hot targets for both splits, one row per sample.
y_train_oh, y_val_oh = one_hot(y_train), one_hot(y_val)


## Activation Functions

In [None]:

def relu(z):
    """Rectified linear unit: element-wise ``max(0, z)``."""
    return np.maximum(0, z)
def relu_derivative(z):
    """Slope of ReLU at ``z``: 1.0 where ``z > 0`` and 0.0 elsewhere."""
    is_positive = z > 0
    return is_positive.astype(float)

def sigmoid(z):
    """Logistic function ``1 / (1 + exp(-z))``, evaluated element-wise.

    Computed as ``exp(-log(1 + exp(-z)))`` through ``np.logaddexp`` so that
    large negative inputs underflow cleanly to 0 instead of overflowing
    ``exp(-z)`` to ``inf`` and emitting a RuntimeWarning.
    """
    return np.exp(-np.logaddexp(0, -z))


def sigmoid_derivative(z):
    """Derivative of the logistic function: ``s * (1 - s)`` with ``s = sigmoid(z)``."""
    s = sigmoid(z)
    return s * (1 - s)

def tanh(z):
    """Hyperbolic tangent activation, applied element-wise."""
    return np.tanh(z)
def tanh_derivative(z):
    """Derivative of tanh at ``z``: ``1 - tanh(z)**2``."""
    t = np.tanh(z)
    return 1 - t ** 2

def softmax(z):
    """Row-wise softmax of a 2-D score array.

    Each row's maximum is subtracted before exponentiating, which leaves the
    result unchanged but keeps ``exp`` from overflowing on large scores.
    """
    row_peak = np.max(z, axis=1, keepdims=True)
    exp_scores = np.exp(z - row_peak)
    row_total = np.sum(exp_scores, axis=1, keepdims=True)
    return exp_scores / row_total


## Neural Network Class

In [None]:

class NeuralNetwork:
    """Fully connected softmax classifier trained by plain gradient descent.

    All hidden layers share one activation function; the output layer is
    always a softmax. The connection from layer ``i`` to layer ``i + 1`` owns
    ``params['W<i>']`` of shape ``(layers[i], layers[i+1])`` and
    ``params['b<i>']`` of shape ``(1, layers[i+1])``.
    """

    # Hidden-layer activation names understood by activate()/activate_derivative().
    _ACTIVATIONS = ('relu', 'sigmoid', 'tanh')

    def __init__(self, layers, activation='relu', lr=0.01):
        """Validate the configuration and allocate the parameters.

        Args:
            layers: Layer widths from input to output, e.g. ``[784, 128, 10]``.
            activation: Hidden-layer activation: 'relu', 'sigmoid' or 'tanh'.
            lr: Step size used by ``update_parameters``.

        Raises:
            ValueError: If fewer than two layer sizes are given, or if
                ``activation`` is not a supported name (a typo would
                otherwise silently select tanh).
        """
        if len(layers) < 2:
            raise ValueError("layers must list at least an input and an output size")
        if activation not in self._ACTIVATIONS:
            raise ValueError(
                f"unknown activation {activation!r}; expected one of {self._ACTIVATIONS}"
            )
        self.layers = layers
        self.lr = lr
        self.activation = activation
        self.params = {}
        self.cache = {}
        # Filled by backward(); defined here so the attribute always exists.
        self.grads = {}
        self.init_params()

    def init_params(self):
        """Draw small Gaussian weights (std 0.01) and zero biases for every layer."""
        for i, (n_in, n_out) in enumerate(zip(self.layers[:-1], self.layers[1:])):
            self.params[f'W{i}'] = np.random.randn(n_in, n_out) * 0.01
            self.params[f'b{i}'] = np.zeros((1, n_out))

    def activate(self, z):
        """Apply the configured hidden-layer activation to ``z``."""
        if self.activation == 'relu':
            return relu(z)
        if self.activation == 'sigmoid':
            return sigmoid(z)
        # 'tanh' is the only other name __init__ accepts.
        return tanh(z)

    def activate_derivative(self, z):
        """Derivative of the configured hidden-layer activation at ``z``."""
        if self.activation == 'relu':
            return relu_derivative(z)
        if self.activation == 'sigmoid':
            return sigmoid_derivative(z)
        return tanh_derivative(z)

    def forward(self, X):
        """Run a forward pass and cache what ``backward`` needs.

        Args:
            X: Input batch of shape ``(batch, layers[0])``.

        Returns:
            Softmax class probabilities of shape ``(batch, layers[-1])``.
            Inputs and activations are stored as ``cache['A<k>']`` and hidden
            pre-activations as ``cache['Z<k>']``.
        """
        self.cache['A0'] = X
        L = len(self.layers) - 1

        A = X
        for i in range(L - 1):
            Z = A @ self.params[f'W{i}'] + self.params[f'b{i}']
            A = self.activate(Z)
            self.cache[f'Z{i + 1}'] = Z
            self.cache[f'A{i + 1}'] = A

        # The output layer uses softmax instead of the hidden activation.
        ZL = A @ self.params[f'W{L - 1}'] + self.params[f'b{L - 1}']
        AL = softmax(ZL)
        self.cache[f'A{L}'] = AL
        return AL

    def compute_loss(self, Y_pred, Y_true):
        """Mean cross-entropy between predicted probabilities and one-hot targets."""
        eps = 1e-9  # keeps log() finite when a predicted probability is exactly 0
        return -np.mean(np.sum(Y_true * np.log(Y_pred + eps), axis=1))

    def backward(self, Y_true):
        """Backpropagate from the most recent ``forward`` call.

        Args:
            Y_true: One-hot targets of shape ``(batch, layers[-1])`` for the
                batch that was last passed to ``forward``.

        Stores ``dW<i>`` / ``db<i>`` for every layer in ``self.grads``.
        """
        grads = {}
        L = len(self.layers) - 1
        m = Y_true.shape[0]

        # For softmax followed by cross-entropy the gradient w.r.t. the output
        # pre-activations is simply (probabilities - targets).
        dZ = self.cache[f'A{L}'] - Y_true

        for i in reversed(range(L)):
            A_prev = self.cache[f'A{i}']
            grads[f'dW{i}'] = A_prev.T @ dZ / m
            grads[f'db{i}'] = np.sum(dZ, axis=0, keepdims=True) / m

            if i > 0:
                # Push the error through W<i>, then through the activation
                # that produced A<i> from Z<i>.
                dA = dZ @ self.params[f'W{i}'].T
                dZ = dA * self.activate_derivative(self.cache[f'Z{i}'])

        self.grads = grads

    def update_parameters(self):
        """Take one gradient-descent step on every parameter, in place."""
        for k in self.params:
            self.params[k] -= self.lr * self.grads['d' + k]

    def predict(self, X):
        """Return the most probable class index for each row of ``X``."""
        return np.argmax(self.forward(X), axis=1)

    def evaluate(self, X, Y):
        """Return classification accuracy on ``X`` against one-hot targets ``Y``."""
        return np.mean(self.predict(X) == np.argmax(Y, axis=1))


## Training

In [None]:

def train(model, Xtr, Ytr, Xv, Yv, epochs=5, batch_size=128):
    """Fit ``model`` with mini-batch gradient descent and record per-epoch metrics.

    Args:
        model: Object exposing ``forward``, ``backward``, ``update_parameters``,
            ``compute_loss`` and ``evaluate``.
        Xtr: Training inputs, one sample per row.
        Ytr: Training targets, row-aligned with ``Xtr``.
        Xv: Validation inputs.
        Yv: Validation targets.
        epochs: Number of passes over the training data.
        batch_size: Number of samples per gradient step; the last batch of an
            epoch may be smaller.

    Returns:
        Dict with lists ``'loss'`` (training loss after each epoch) and
        ``'val_acc'`` (validation accuracy after each epoch).
    """
    hist = {'loss': [], 'val_acc': []}
    n = Xtr.shape[0]
    # Running sample order. Re-permuting this index vector each epoch visits
    # the same batches as reshuffling the data itself, without copying the
    # whole training set every epoch.
    order = np.arange(n)

    for e in range(epochs):
        order = order[np.random.permutation(n)]

        for start in range(0, n, batch_size):
            batch = order[start:start + batch_size]
            # forward() is called for the state it leaves in the model, which
            # backward() then consumes; its return value is not needed here.
            model.forward(Xtr[batch])
            model.backward(Ytr[batch])
            model.update_parameters()

        loss = model.compute_loss(model.forward(Xtr), Ytr)
        acc = model.evaluate(Xv, Yv)
        hist['loss'].append(loss)
        hist['val_acc'].append(acc)
        print(f"Epoch {e+1} Loss={loss:.4f} ValAcc={acc:.4f}")
    return hist

# 784 flattened input features -> hidden layers of 128 and 64 units -> 10 classes.
model = NeuralNetwork(layers=[784, 128, 64, 10], activation='relu', lr=0.1)
# Uses train()'s default schedule (epochs=5, batch_size=128).
history = train(model, Xtr=X_train, Ytr=y_train_oh, Xv=X_val, Yv=y_val_oh)


## Plot Results

In [None]:

# One figure per recorded metric, in the order they were tracked during training.
for series_key, plot_title in (
    ('loss', "Training Loss"),
    ('val_acc', "Validation Accuracy"),
):
    plt.figure()
    plt.plot(history[series_key])
    plt.title(plot_title)
    plt.show()
