In [1]:
import numpy as np
import matplotlib.pyplot as plt
import torchvision

In [2]:
# Load the dataset
train_dataset = torchvision.datasets.MNIST(root='./data', train=True, transform=torchvision.transforms.ToTensor(), download=True)
test_dataset = torchvision.datasets.MNIST(root='./data', train=False, transform=torchvision.transforms.ToTensor(), download=True)

In [3]:
class LinearLayer:
    def __init__(self, input_size, output_size):
        self.weights = np.random.randn(input_size, output_size) * np.sqrt(2.0 / input_size)
        self.bias = np.zeros(output_size)
        self.input = None
        self.output = None


    def forward(self, input):
        self.input = input
        self.output = np.dot(input, self.weights) + self.bias
        return self.output

    # Backward pass of the layer
    def backward(self, grad_output):
        grad_input = np.dot(grad_output, self.weights.T)
        grad_weights = np.dot(self.input.T, grad_output)
        grad_bias = np.sum(grad_output, axis=0)
        return grad_input, grad_weights, grad_bias

    # Update the weights and bias based on the outcome of the backward pass
    def update(self, grad_weights, grad_bias, learning_rate):
        self.weights -= learning_rate * grad_weights
        self.bias -= learning_rate * grad_bias


In [4]:
def softmax(x):
    return np.exp(x) / np.sum(np.exp(x))

def binary_cross_entropy(y_pred, y_true):
    return -np.sum(y_true * np.log(y_pred + 1e-8) + (1 - y_true) * np.log(1 - y_pred + 1e-8), axis=1, keepdims=True)

def sigmoid(x):
    """
    Sigmoid activation function
    :param x: input
    :return sigmoid(x)
    """
    return 1 / (1 + np.exp(-x))

def relu(x):
    return np.maximum(x, 0)

def relu_derivative(x):
    return (x > 0).astype(float)

def cross_entropy_loss(y_pred, y_true):
    return -np.sum(y_true * np.log(y_pred + 1e-8), axis=1, keepdims=True)

def binary_cross_entropy_derivative(y_pred, y_true):
    return -y_true / y_pred + (1 - y_true) / (1 - y_pred)

def one_hot_encode(Y):
    """
    Converts a vector of integers into a one-hot matrix.
    :param Y: A vector of shape (1, m)
    :return: A one-hot matrix of shape (m, n_classes)

    Example:
    >>> one_hot_encode(np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))
    array([ [1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
            [0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
            [0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
            ...,
            ]
    """
    one_hot = np.zeros((Y.size, Y.max() + 1))
    one_hot[np.arange(Y.size), Y] = 1
    return one_hot.T


In [5]:
class Network:
    def __init__(self, input_size, hidden_size, output_size):
        self.input_layer = LinearLayer(input_size, hidden_size)
        # todo check if important
        # self.hidden_layer = LinearLayer(hidden_size, hidden_size)

        self.output_layer = LinearLayer(hidden_size, output_size)

    def forward(self, X):
        Z1 = self.input_layer.forward(X)
        A1 = relu(Z1)

        # A2 = self.hidden_layer.forward(A1)
        # Z2 = relu()

        Z2 = self.output_layer.forward(A1)
        A2 = softmax(Z2)
        return Z1, A1, Z2, A2

    def backward(self, Z1, A1, Z2, A2, W2, X, Y):
        m = Y.shape[0]
        one_hot_Y = one_hot_encode(Y)

        dZ2 = A2 - one_hot_Y # loss function
        dW2 = 1 / m * dZ2.dot(A1.T)
        db2 = 1 / m * np.sum(dZ2)

        dZ1 = W2.T.dot(dZ2) * relu_derivative(Z1)
        dW1 = 1 / m * dZ1.dot(X.T)
        db1 = 1 / m * np.sum(dZ1)

    def update(self, learning_rate, grad_weights, grad_bias):
        self.input_layer.update(grad_weights, grad_bias, learning_rate)
        # self.hidden_layer.update(grad_weights, grad_bias, learning_rate)
        self.output_layer.update(grad_weights, grad_bias, learning_rate)

    def train(self, X, y, learning_rate, epochs):
        for epoch in range(epochs):
            y_pred = self.forward(X)
            hot_one = np.zeros((1, 10))
            hot_one[0][y] = 1
            loss = cross_entropy_loss(hot_one, y_pred)
            print(f"Epoch: {epoch}, Loss: {np.mean(loss)}")
            grad_input, grad_weights, grad_bias = self.backward(y_pred, y)
            self.update(learning_rate, grad_weights, grad_bias)


In [6]:
X_train = train_dataset.data.numpy().reshape(-1, 784) / 255
X_train = X_train.T
y_train = train_dataset.targets.numpy()

# X_train_4 which only contains 4s
X_train_4 = X_train[:, y_train == 4]
y_train_4 = y_train[y_train == 4]




X_test = test_dataset.data.numpy().reshape(-1, 28*28) / 255
X_test = X_test.T
y_test = test_dataset.targets.numpy()

In [7]:
# train
nn = Network(28*28, 128, 10)
nn.train(X_train, y_train, 0.05, 10)

ValueError: shapes (784,60000) and (784,128) not aligned: 60000 (dim 1) != 784 (dim 0)

In [None]:
# test
y_pred = nn.forward(X_test)
y_pred = np.argmax(y_pred, axis=1)
print(f"Accuracy: {np.mean(y_pred == y_test)}")