In [182]:
import numpy as np

In [183]:
training_data = np.load(f'../fashion_train.npy')
test_data = np.load(f'../fashion_test.npy')

labels = training_data[:, -1]
training_data = training_data[:,:-1] / 255
training_data = np.c_[training_data, labels]

np.unique(training_data[:,-1])

array([0., 1., 2., 3., 4.])

In [184]:
import random
random.seed(42)

In [185]:
# Helper functions
def MSE(y_pred, y):
    return 2 * np.mean([(y_i - y_pred_i)**2 for y_i, y_pred_i in zip(y, y_pred)])

def MSE_derivative(y_pred, y):
    return [(y_i - y_pred_i) for y_i, y_pred_i in zip(y, y_pred)]

def CrossEntropy(y_pred, y):
    epsilon = 1e-15
    y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
    return -np.mean(np.sum(y * np.log(y_pred), axis=1))

def CrossEntropy_derivative(y_pred, y):
    epsilon = 1e-15
    y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
    return (y_pred - y) / y.shape[0]  # Vectorized

def sigmoid(x):
  return 1 / (1 + np.exp(-x))

def sigmoid_derivative(x):
    s = sigmoid(x)
    return s * (1 - s)

def reLU(x):
    return np.maximum(0, x)

def reLU_derivative(x):
    return np.where(x > 0, 1, 0)

def leaky_reLU(x):
    return np.maximum(0.01 * x, x)

def leaky_reLU_derivative(x):
    return np.where(x > 0, 1, 0.01)

def ELU(x):
    return np.maximum(0.01 * (np.exp(x)-1), x)

def ELU_derivative(x):
    return np.where(x > 0, 1, 0.01* np.exp(x))

def softmax(x):
    exp_values = np.exp(x - np.max(x, axis=1, keepdims=True))
    probabilities = exp_values / np.sum(exp_values, axis=1, keepdims=True)
    return probabilities

def xavier_initialization(input_size, output_size):
    return np.random.randn(input_size, output_size) * np.sqrt(1 / input_size)

def he_initialization(input_size, output_size):
    return np.random.randn(input_size, output_size) * np.sqrt(2 / input_size)


In [186]:
def forward_pass(data, weights, biases, num_hidden_layers, activation_function):
    inputs, outputs = [], []
    current_input = data

    z = np.dot(current_input, weights["input"]) + biases["input"]
    inputs.append(z)
    current_input = activation_function(z)
    outputs.append(current_input)

    for i in range(num_hidden_layers):
        z = np.dot(current_input, weights["hidden"][i]) + biases["hidden"][i]
        inputs.append(z)
        current_input = activation_function(z)
        outputs.append(current_input)

    z = np.dot(current_input, weights["output"]) + biases["output"]
    inputs.append(z)
    current_input = softmax(z)
    outputs.append(current_input)

    return inputs, outputs

def backward_pass(data, labels, weights, biases, inputs, outputs, activation_function_derivative, cost_derivative, learning_rate):
    # Calculate delta for output layer
    delta = cost_derivative(outputs[-1], labels) * activation_function_derivative(inputs[-1])

    # Gradients for output layer
    gradient_weights_output = np.dot(outputs[-2].T, delta)  # Last hidden layer's output
    gradient_biases_output = np.sum(delta, axis=0, keepdims=True)

    # Update output layer weights and biases
    weights["output"] -= learning_rate * gradient_weights_output
    biases["output"] -= learning_rate * gradient_biases_output

    num_hidden_layers = len(weights["hidden"])
    # Gradients for hidden layers (backpropagate)
    for i in range(num_hidden_layers-1, -1, -1):
        if i == len(weights["hidden"])-1:
            delta = np.dot(delta, weights["output"].T) * activation_function_derivative(inputs[-2])
        else:
            delta = np.dot(delta, weights["hidden"][i+1].T) * activation_function_derivative(inputs[-2 +  i - num_hidden_layers])

        gradient_weights_hidden = np.dot(outputs[-2 +  i - num_hidden_layers].T, delta)
        gradient_biases_hidden = np.sum(delta, axis=0, keepdims=True)
        # Update hidden layer weights and biases
        weights["hidden"][i] -= learning_rate * gradient_weights_hidden
        biases["hidden"][i] -= learning_rate * gradient_biases_hidden

    delta = np.dot(delta, weights["hidden"][0].T) * activation_function_derivative(inputs[0])
    gradient_weights_input = np.dot(data.T, delta)
    gradient_biases_input = np.sum(delta, axis=0, keepdims=True)

    # Update input layer weights and biases
    weights["input"] -= learning_rate * gradient_weights_input
    biases["input"] -= learning_rate * gradient_biases_input

    return weights, biases

In [187]:
def feedforward(data, num_hidden_layers, learning_rate, epochs, activation_function):

    # Select activation function
    softmax_activation = softmax 

    if activation_function == 'sigmoid':
        initialization = xavier_initialization
        activation_function_ = sigmoid
        activation_function_derivative_ = sigmoid_derivative
        cost_function_ = MSE
        cost_derivative_ = MSE_derivative
    elif activation_function == 'relu':
        initialization = he_initialization
        activation_function_ = reLU
        activation_function_derivative_ = reLU_derivative
        cost_function_ = CrossEntropy
        cost_derivative_ = CrossEntropy_derivative
    elif activation_function == 'leaky_relu':
        initialization = he_initialization
        activation_function_ = leaky_reLU
        activation_function_derivative_ = leaky_reLU_derivative
        cost_function_ = CrossEntropy
        cost_derivative_ = CrossEntropy_derivative
    elif activation_function == 'ELU':
        initialization = he_initialization
        activation_function_ = ELU
        activation_function_derivative_ = ELU_derivative
        cost_function_ = CrossEntropy
        cost_derivative_ = CrossEntropy_derivative
    else:
        raise ValueError("Invalid activation function. Choose 'sigmoid', 'leaky_relu', 'ELU' or 'relu'.")
        
    # Weight initialization
    num_samples, num_features = data.shape[0], data.shape[1]-1
    input_layer_size = num_features
    hidden_layer_size = 16
    # num_features // num_hidden_layers

    output_layer_size = len(np.unique(data[:,-1]))

    weights = {
        "input": initialization(input_layer_size, hidden_layer_size),
        "hidden": np.array([initialization(hidden_layer_size, hidden_layer_size) for _ in range(num_hidden_layers)]),
        "output": initialization(hidden_layer_size, output_layer_size)
    }
    biases = {
        "input": np.zeros((1, hidden_layer_size)),
        "hidden": [np.zeros((1, hidden_layer_size)) for _ in range(num_hidden_layers)],
        "output": np.zeros((1, output_layer_size))
    }

    # -------------------------------------------------------------------------------------------------------------

    labels = data[:, -1]
    data_no_labels = data[:, :-1]
    num_classes = len(np.unique(labels))
    classes = np.eye(num_classes)[labels.astype(int)]

    input, output = forward_pass(data_no_labels, weights, biases, num_hidden_layers,activation_function_)

    for epoch in range(epochs):
        weights, biases = backward_pass(data_no_labels, classes, weights, biases, input, output, activation_function_derivative_, cost_derivative_, learning_rate)

        input, output = forward_pass(data_no_labels, weights, biases, num_hidden_layers, activation_function_)

        if epoch % 10 == 0:
            cost = cost_function_(output[-1], classes)
            print(f"Epoch: {epoch+1}, Cost: {cost}")
    return weights, biases, num_hidden_layers, activation_function_

In [195]:
def predict(data, weights, biases, num_hidden_layers, activation_function):
    data_labels = data[:, -1]
    data_no_labels = data[:, :-1]
    inputs, outputs = forward_pass(data_no_labels, weights, biases, num_hidden_layers, activation_function)
    pred = np.argmax(outputs[-1], axis=1)
    accuracy = (np.sum(pred == data_labels) / len(data_labels)) * 100
    confusion_matrix = np.zeros((len(np.unique(data_labels)), len(np.unique(data_labels))))
    for i in range(len(data_labels)):
        confusion_matrix[data_labels[i]][pred[i]] += 1
    return accuracy, confusion_matrix

In [193]:
weights, biases, num_hidden_layers, activation_function_ = feedforward(training_data, 2, 0.005, 1000, 'leaky_relu')

Epoch: 1, Cost: 1.6314219794817661
Epoch: 11, Cost: 1.617173046212607
Epoch: 21, Cost: 1.6164127395640224
Epoch: 31, Cost: 1.5918563242027293
Epoch: 41, Cost: 1.5590719249777707
Epoch: 51, Cost: 1.5255854275951402
Epoch: 61, Cost: 1.493439502102155
Epoch: 71, Cost: 1.463738931417949
Epoch: 81, Cost: 1.431439395716156
Epoch: 91, Cost: 1.3970394130532375
Epoch: 101, Cost: 1.35652503604993
Epoch: 111, Cost: 1.3138590499982052
Epoch: 121, Cost: 1.274496861645833
Epoch: 131, Cost: 1.2393117494016077
Epoch: 141, Cost: 1.2082263033170744
Epoch: 151, Cost: 1.1798671969760577
Epoch: 161, Cost: 1.1535376265465245
Epoch: 171, Cost: 1.128806974276146
Epoch: 181, Cost: 1.1051927616898185
Epoch: 191, Cost: 1.082729948479877
Epoch: 201, Cost: 1.0618171833619798
Epoch: 211, Cost: 1.0422939874575898
Epoch: 221, Cost: 1.0242874543849168
Epoch: 231, Cost: 1.0077272324556419
Epoch: 241, Cost: 0.9920450607359006
Epoch: 251, Cost: 0.9769865454731844
Epoch: 261, Cost: 0.9622039851425638
Epoch: 271, Cost: 0.9

In [196]:
predict(test_data, weights, biases, num_hidden_layers, activation_function_)

(77.12,
 array([[762.,   4.,  26.,  84., 124.],
        [  3., 927.,  17.,  47.,   6.],
        [  8.,  10., 818.,  15., 149.],
        [ 48.,  20.,   9., 858.,  65.],
        [192.,   3., 266.,  48., 491.]]))