This note is the record of learning Deep Learning.

The forward and optimizers parts were from [Neural Networks from Scratch in Python](https://nnfs.io).

The backpropagation was from [Prof. Hung-yi Lee @ NTU](https://www.youtube.com/watch?v=ibJpTrp5mcE&list=PLJV_el3uVTsPy9oCRY30oBPNLCo89yu49&index=12).

In [126]:
import numpy as np

# Implement of Neural Network

In [222]:
class Dense_layer:
    """
    Generate a layer of neural network. Note that the outputs 
    are before an activation function (sum(input * weight) + bias).
    """

    def __init__(self, n_inputs: int, n_nodes: int):
        self.weights = 0.01 * np.random.randn(n_inputs, n_nodes)
        self.biases = np.zeros((1, n_nodes))

    def forward(self, inputs: np.array) -> None:
        # The inputs are either data or outputs from the 
        # previous hidden layer Activation(sum(input * weight) + bias).
        self.inputs = inputs
        self.forward_pass = np.dot(inputs, self.weights) + self.biases

    def backprop(self, dvalues: np.array) -> None:
        # dvalues are derivatives from the output of next hidden layer.
        self.dweights = np.dot(self.inputs.T, dvalues)
        self.dinputs = np.dot(dvalues, self.weights.T)
        self.dbiases = np.sum(dvalues, axis=0, keepdims=True)

In [217]:
class Activation_ReLU:
    """
    Activation function for hidden layers. 
    Output = max(input, 0)
    """

    def forward(self, inputs: np.array) -> None:
        self.inputs = inputs
        self.output = np.maximum(0, inputs)

    def backprop(self, dvalues: np.array) -> None:
        self.dinputs = dvalues.copy()
        self.dinputs[self.inputs <= 0] = 0 # dReLU == 1 if ReLU > 0

In [211]:
class Softmax_loss:
    """
    should only call forward() and backprop().
    """
    def __init__(self):
        self.softmax_probabilities = None
        self.pred_labels = None
        self.accuracy = None
        self.loss = None
        self.dvalues = None
        self.y_true = None

    def y_true_check(self, y_true: np.array) -> None:
        """make sure y_true is 1D array of labels"""
        if len(y_true.shape) == 2:
            self.y_true = np.argmax(y_true, axis=1)
        else:
            self.y_true = y_true

    def softmax(self, inputs: np.array) -> None:
        # The inputs are from the previous dense layer
        # convert inputs to negative values to 0, preventing overflow
        # the output will not change due to normalization
        # the ouputs is probabilities
        inputs_exp = np.exp(inputs - np.max(inputs, axis=1, keepdims=True))
        self.softmax_probabilities = inputs_exp / np.sum(inputs_exp, axis=1, keepdims=True)

    def output_labels(self) -> None:
        self.pred_labels = np.argmax(self.softmax_probabilities, axis=1)

    def calculate_accuracy(self) -> None:
        self.accuracy = np.mean(self.pred_labels == self.y_true)

    def cross_entropy(self) -> None:
        """
        Loss = -sum(y_true x log(y_predict))
        This class simplifies the loss to -log(correct_class_condifence): Loss = -log(y_predict))

        y_pred: probabilities from softmax activation function
        y_true: 1D np.array
        """
        predicts_clip = np.clip(self.softmax_probabilities, 1e-7, 1 - 1e-7) # prevent log(0)
        self.confidences = predicts_clip[range(len(predicts_clip)), self.y_true]
        self.loss = np.mean(-np.log(self.confidences))

    def forward(self, inputs: np.array, y_true: np.array) -> None:
        self.y_true_check(y_true)
        self.softmax(inputs)
        self.output_labels()
        self.calculate_accuracy()
        self.cross_entropy()

    def backprop(self):
        # y_pred is the probabilities from softmax activation function
        self.dvalues = self.softmax_probabilities.copy()
        self.dvalues[range(len(self.y_true)), self.y_true] -= 1
        self.dvalues = self.dvalues / len(self.y_true)

In [268]:
class Optimizer_SGD:
    """
    Stochastic gradient descent optimizer
    """

    def __init__(self, learning_rate: float = 1., decay: float = 0., momentum_ratio: float = 0.):
        self.init_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.momentum_ratio = momentum_ratio

    def update_param(self, layer: Dense_layer):
        curr_learning_rate = self.init_learning_rate * (1. / ( 1. + self.decay * self.iterations))
        self.iterations += 1

        if self.momentum_ratio:
            if not hasattr(layer, "pre_w_momentums"):
                layer.pre_w_momentums = np.zeros_like(layer.weights)
                layer.pre_b_momentums = np.zeros_like(layer.biases)

            weight_updates = self.momentum_ratio * layer.pre_w_momentums - curr_learning_rate * layer.dweights
            bias_updates = self.momentum_ratio * layer.pre_b_momentums - curr_learning_rate * layer.dbiases
            # update the momentum in the layer
            layer.pre_w_momentums = weight_updates
            layer.pre_b_momentums = bias_updates
        else:
            weight_updates = -curr_learning_rate * layer.dweights
            bias_updates = -curr_learning_rate * layer.dbiases

        layer.weights += weight_updates
        layer.biases += bias_updates

In [287]:
class Optimizer_AdaGrad:
    """
    Adaptive gradient
    cache += gradient ** 2
    """

    def __init__(self, learning_rate: float=1., decay: float=1e-4, epsilon: float=1e-7):
        self.init_learning_rate = learning_rate
        self.epsilon = epsilon
        self.iterations = 0
        self.decay=decay

    def update_param(self, layer: Dense_layer):
        curr_learning_rate = self.init_learning_rate * (1. / ( 1. + self.decay * self.iterations))
        self.iterations += 1

        if not hasattr(layer, "w_cache"):
            layer.w_cache = np.zeros_like(layer.weights)
            layer.b_cache = np.zeros_like(layer.biases)
        
        layer.w_cache += layer.dweights ** 2
        layer.b_cache += layer.dbiases ** 2
        
        layer.weights -= curr_learning_rate * layer.dweights / \
                        (np.sqrt(layer.w_cache) + self.epsilon)
        layer.biases -= curr_learning_rate * layer.dbiases / \
                        (np.sqrt(layer.b_cache) + self.epsilon)

In [292]:
class Optimizer_RMSProp:
    """
    Root mean square propagation
    cache = rho * cache + (1 - rho) * gradient ** 2
    """

    def __init__(self, learning_rate: float=0.001, decay: float=1e-4, rho: float=0.9, epsilon: float=1e-7):
        self.init_learning_rate = learning_rate
        self.decay = decay
        self.rho = rho
        self.iterations = 0
        self.epsilon = epsilon

    def update_param(self, layer: Dense_layer) -> None:
        curr_learning_rate = self.init_learning_rate * (1. / ( 1. + self.decay * self.iterations))
        self.iterations += 1

        if not hasattr(layer, "w_cache"):
            layer.w_cache = np.zeros_like(layer.weights)
            layer.b_cache = np.zeros_like(layer.biases)
        
        layer.w_cache = self.rho * layer.w_cache + (1 - self.rho) * layer.dweights ** 2
        layer.b_cache = self.rho * layer.b_cache + (1 - self.rho) * layer.dbiases ** 2
        
        layer.weights -= curr_learning_rate * layer.dweights / \
                        (np.sqrt(layer.w_cache) + self.epsilon)
        layer.biases -= curr_learning_rate * layer.dbiases / \
                        (np.sqrt(layer.b_cache) + self.epsilon)

In [313]:
class Optimizer_Adam:
    """
    Adaptive momentum
    
    """

    def __init__(self, learning_rate: float=0.001, decay: float=0., 
                 epsilon: float=1e-7, beta_1: float=0.9, beta_2: float=0.999):
        self.init_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.epsilon = epsilon
        self.beta_1 = beta_1
        self.beta_2 = beta_2

    def update_param(self, layer: Dense_layer) -> None:
        curr_learning_rate = self.init_learning_rate * (1. / ( 1. + self.decay * self.iterations))
        self.iterations += 1

        if not hasattr(layer, "w_cache"):
            layer.w_cache = np.zeros_like(layer.weights)
            layer.w_momentum = np.zeros_like(layer.weights)
            layer.b_cache = np.zeros_like(layer.biases)
            layer.b_momentum = np.zeros_like(layer.biases)
        
        layer.w_momentum = self.beta_1 * layer.w_momentum + (1 - self.beta_1) * layer.dweights
        layer.b_momentum = self.beta_1 * layer.b_momentum + (1 - self.beta_1) * layer.dbiases
        
        w_momentum_corrected = layer.w_momentum / (1 - self.beta_1 ** self.iterations)
        b_momentum_corrected = layer.b_momentum / (1 - self.beta_1 ** self.iterations)

        layer.w_cache = self.beta_2 * layer.w_cache + (1 - self.beta_2) * layer.dweights ** 2
        layer.b_cache = self.beta_2 * layer.b_cache + (1 - self.beta_2) * layer.dbiases ** 2

        w_cache_corrected = layer.w_cache / (1 - self.beta_2 ** self.iterations)
        b_cache_corrected = layer.b_cache / (1 - self.beta_2 ** self.iterations)

        layer.weights -= curr_learning_rate * w_momentum_corrected / \
                        (np.sqrt(w_cache_corrected) + self.epsilon)
        layer.biases -= curr_learning_rate * b_momentum_corrected / \
                        (np.sqrt(b_cache_corrected) + self.epsilon)

In [264]:
import nnfs
from nnfs.datasets import spiral_data

In [321]:
nnfs.init()
X, y = spiral_data(samples = 100, classes = 3)

In [322]:
dense1 = Dense_layer(2 , 64)
activation1 = Activation_ReLU()
dense2 = Dense_layer(64 , 3)
output = Softmax_loss()
optimizer = Optimizer_SGD(decay=1e-3, momentum_ratio=0.9)

for epoch in range (10001):
    # Perform a forward pass
    dense1.forward(X)
    activation1.forward(dense1.forward_pass)
    dense2.forward(activation1.output)
    loss = output.forward(dense2.forward_pass, y)
    # Perform a backpropgation
    output.backprop()
    dense2.backprop(output.dvalues)
    activation1.backprop(dense2.dinputs)
    dense1.backprop(activation1.dinputs)
    # update param
    optimizer.update_param(dense1)
    optimizer.update_param(dense2)

    if epoch % 1000 == 0:
        print(f"epoch: {epoch}, acc: {output.accuracy :.3f}, loss: {output.loss :.3f}")

    # if output.accuracy >= 0.9:
    #     print(f"early terminate- epoch: {epoch}, acc: {output.accuracy :.3f}, loss: {output.loss :.3f}")
    #     break

epoch: 0, acc: 0.360, loss: 1.099
epoch: 1000, acc: 0.843, loss: 0.418
epoch: 2000, acc: 0.883, loss: 0.305
epoch: 3000, acc: 0.893, loss: 0.277
epoch: 4000, acc: 0.900, loss: 0.263
epoch: 5000, acc: 0.907, loss: 0.252
epoch: 6000, acc: 0.903, loss: 0.245
epoch: 7000, acc: 0.907, loss: 0.239
epoch: 8000, acc: 0.913, loss: 0.235
epoch: 9000, acc: 0.917, loss: 0.231
epoch: 10000, acc: 0.920, loss: 0.227


In [323]:
dense1 = Dense_layer(2 , 64)
activation1 = Activation_ReLU()
dense2 = Dense_layer(64 , 3)
output = Softmax_loss()
optimizer = Optimizer_AdaGrad()

for epoch in range (10001):
    # Perform a forward pass
    dense1.forward(X)
    activation1.forward(dense1.forward_pass)
    dense2.forward(activation1.output)
    loss = output.forward(dense2.forward_pass, y)
    # Perform a backpropgation
    output.backprop()
    dense2.backprop(output.dvalues)
    activation1.backprop(dense2.dinputs)
    dense1.backprop(activation1.dinputs)
    # update param
    optimizer.update_param(dense1)
    optimizer.update_param(dense2)

    if epoch % 1000 == 0:
        print(f"epoch: {epoch}, acc: {output.accuracy :.3f}, loss: {output.loss :.3f}")

    # if output.accuracy >= 0.9:
    #     print(f"early terminate- epoch: {epoch}, acc: {output.accuracy :.3f}, loss: {output.loss :.3f}")
    #     break

epoch: 0, acc: 0.410, loss: 1.099
epoch: 1000, acc: 0.737, loss: 0.676
epoch: 2000, acc: 0.803, loss: 0.477
epoch: 3000, acc: 0.817, loss: 0.416
epoch: 4000, acc: 0.820, loss: 0.379
epoch: 5000, acc: 0.843, loss: 0.354
epoch: 6000, acc: 0.857, loss: 0.334
epoch: 7000, acc: 0.867, loss: 0.315
epoch: 8000, acc: 0.880, loss: 0.303
epoch: 9000, acc: 0.887, loss: 0.292
epoch: 10000, acc: 0.900, loss: 0.283


In [324]:
dense1 = Dense_layer(2 , 64)
activation1 = Activation_ReLU()
dense2 = Dense_layer(64 , 3)
output = Softmax_loss()
optimizer = Optimizer_RMSProp(decay=1e-4)

for epoch in range (10001):
    # Perform a forward pass
    dense1.forward(X)
    activation1.forward(dense1.forward_pass)
    dense2.forward(activation1.output)
    loss = output.forward(dense2.forward_pass, y)
    # Perform a backpropgation
    output.backprop()
    dense2.backprop(output.dvalues)
    activation1.backprop(dense2.dinputs)
    dense1.backprop(activation1.dinputs)
    # update param
    optimizer.update_param(dense1)
    optimizer.update_param(dense2)

    if epoch % 1000 == 0:
        print(f"epoch: {epoch}, acc: {output.accuracy :.3f}, loss: {output.loss :.3f}")

    # if output.accuracy >= 0.9:
    #     print(f"early terminate- epoch: {epoch}, acc: {output.accuracy :.3f}, loss: {output.loss :.3f}")
    #     break

epoch: 0, acc: 0.270, loss: 1.099
epoch: 1000, acc: 0.567, loss: 0.953
epoch: 2000, acc: 0.613, loss: 0.848
epoch: 3000, acc: 0.670, loss: 0.780
epoch: 4000, acc: 0.683, loss: 0.735
epoch: 5000, acc: 0.693, loss: 0.701
epoch: 6000, acc: 0.710, loss: 0.673
epoch: 7000, acc: 0.717, loss: 0.649
epoch: 8000, acc: 0.740, loss: 0.626
epoch: 9000, acc: 0.750, loss: 0.605
epoch: 10000, acc: 0.770, loss: 0.584


In [325]:
dense1 = Dense_layer(2 , 64)
activation1 = Activation_ReLU()
dense2 = Dense_layer(64 , 3)
output = Softmax_loss()
optimizer = Optimizer_RMSProp(learning_rate = 0.02, decay = 1e-5, rho = 0.999)

for epoch in range (10001):
    # Perform a forward pass
    dense1.forward(X)
    activation1.forward(dense1.forward_pass)
    dense2.forward(activation1.output)
    loss = output.forward(dense2.forward_pass, y)
    # Perform a backpropgation
    output.backprop()
    dense2.backprop(output.dvalues)
    activation1.backprop(dense2.dinputs)
    dense1.backprop(activation1.dinputs)
    # update param
    optimizer.update_param(dense1)
    optimizer.update_param(dense2)

    if epoch % 1000 == 0:
        print(f"epoch: {epoch}, acc: {output.accuracy :.3f}, loss: {output.loss :.3f}")

    # if output.accuracy >= 0.9:
    #     print(f"early terminate- epoch: {epoch}, acc: {output.accuracy :.3f}, loss: {output.loss :.3f}")
    #     break

epoch: 0, acc: 0.303, loss: 1.099
epoch: 1000, acc: 0.680, loss: 0.717
epoch: 2000, acc: 0.760, loss: 0.572
epoch: 3000, acc: 0.780, loss: 0.497
epoch: 4000, acc: 0.777, loss: 0.514
epoch: 5000, acc: 0.787, loss: 0.446
epoch: 6000, acc: 0.803, loss: 0.411
epoch: 7000, acc: 0.827, loss: 0.419
epoch: 8000, acc: 0.853, loss: 0.321
epoch: 9000, acc: 0.847, loss: 0.328
epoch: 10000, acc: 0.900, loss: 0.280


In [326]:
dense1 = Dense_layer(2 , 64)
activation1 = Activation_ReLU()
dense2 = Dense_layer(64 , 3)
output = Softmax_loss()
optimizer = Optimizer_Adam(learning_rate=0.05, decay=5e-7)

for epoch in range (10001):
    # Perform a forward pass
    dense1.forward(X)
    activation1.forward(dense1.forward_pass)
    dense2.forward(activation1.output)
    loss = output.forward(dense2.forward_pass, y)
    # Perform a backpropgation
    output.backprop()
    dense2.backprop(output.dvalues)
    activation1.backprop(dense2.dinputs)
    dense1.backprop(activation1.dinputs)
    # update param
    optimizer.update_param(dense1)
    optimizer.update_param(dense2)

    if epoch % 1000 == 0:
        print(f"epoch: {epoch}, acc: {output.accuracy :.3f}, loss: {output.loss :.3f}")

    # if output.accuracy >= 0.9:
    #     print(f"early terminate- epoch: {epoch}, acc: {output.accuracy :.3f}, loss: {output.loss :.3f}")
    #     break

epoch: 0, acc: 0.307, loss: 1.099
epoch: 1000, acc: 0.880, loss: 0.293
epoch: 2000, acc: 0.967, loss: 0.121
epoch: 3000, acc: 0.967, loss: 0.102
epoch: 4000, acc: 0.967, loss: 0.090
epoch: 5000, acc: 0.967, loss: 0.084
epoch: 6000, acc: 0.967, loss: 0.080
epoch: 7000, acc: 0.967, loss: 0.075
epoch: 8000, acc: 0.967, loss: 0.071
epoch: 9000, acc: 0.970, loss: 0.067
epoch: 10000, acc: 0.970, loss: 0.063
