Learn fundamental deep learning concepts by implementing a neural network using NumPy.

The forward and optimizers parts were from [Neural Networks from Scratch in Python](https://nnfs.io).

The backpropagation was from [Prof. Hung-yi Lee @ NTU](https://www.youtube.com/watch?v=ibJpTrp5mcE&list=PLJV_el3uVTsPy9oCRY30oBPNLCo89yu49&index=12) and CS 7643 Deep Learning @ GaTech

In [33]:
import numpy as np # NumPy is needed to speed up the computation (vectorization).

# Implement of Neural Network

In [34]:
class Dense_layer:
    """
    Class to represent a dense layer (fully-connected) in a neural network.
    Each node outputs wx + b in forward mode, where w represents the weights, 
    x represents the inputs, and b is the bias.
    """

    def __init__(self, n_inputs: int, n_nodes: int):
        """
        initialize weights and biases for the nodes in this neural network layer.
    
        params:
            n_inputs: number of neural nodes in the previous layer or number of input features.
            n_nodes: number of neural nodes in this layer
        """
        self.weights = 0.01 * np.random.randn(n_inputs, n_nodes)
        # np.random.randn() generates random numbers from a standard normal distribution, 
        # meaning the mean of the numbers in the weights matrix is 0, and the variance is 1. 
        # This is done because neural networks work best when values are between -1 and 1. 
        # Multiplying the weights by 0.01 speeds up training, as the weights are now closer 
        # in magnitude to the rate of updating (learning rate × gradients)."

        self.biases = np.zeros((1, n_nodes)) # one bias per node

    def forward(self, inputs: np.array) -> None:
        """
        performs a forward pass.
        The ouput of each node is sum(input * weight) + bias.

        params:
            inputs: the inputs are either data or outputs from the 
                    previous activation layer.
        """
        self.inputs = inputs # inputs were required for backprop
        self.forward_pass = np.dot(inputs, self.weights) + self.biases

    def backprop(self, dvalues: np.array) -> None:
        """
        perform a backpropgation.

        params: 
            dvalues: derivatives from the next hidden layer or from the loss of the model.
        """
        self.grad_w = np.dot(self.inputs.T, dvalues) # weight gradient
        self.grad_b = np.sum(dvalues, axis=0, keepdims=True) # bias gradient
        self.doutputs = np.dot(dvalues, self.weights.T) # dvalues for passing to the previous layer

![weight_gradient](weight_gradient.png)

This figure shows that the weight gradients are derived from chain rule:

$
weight \ gradient = \frac{\partial loss}{\partial w_1} = 
\frac{\partial loss}{\partial z}\frac{\partial z}{\partial w_i} = dvalues \ \times \ x_i
$

In [35]:
class Activation_ReLU:
    """
    Class to represent the ReLU (rectified linear unit) activation function for hidden layers. 
    """
    
    def forward(self, inputs: np.array) -> None:
        """
        converts the inputs to max(inputs, 0).

        params:
            inputs: the inputs are sum(input * weight) + bias)
        """
        self.inputs = inputs # inputs were required for backprop
        self.output = np.maximum(0, inputs)

    def backprop(self, dvalues: np.array) -> None:
        """
        perform a backpropgation.

        params: 
            dvalues: derivatives from the next hidden layer or output of the model.
        """
        self.doutputs = dvalues.copy()
        self.doutputs[self.inputs <= 0] = 0 # dvalues for passing to the previous layer

![weight_gradient](backprop_relu.png)

$ReLU = a = max(0, z)$

$
\frac{\partial ReLU}{\partial z} = \frac{\partial a}{\partial z} = 
    \begin{cases}
            1, &         \text{if } z > 0,\\
            0, &         \text{if } z \leq 0.
    \end{cases}
$

$\frac {\partial loss}{\partial z} = dvalues \times (1 $ if z > 0 else 0)

That equals converting negative numbers in the dvalues to 0.

In [36]:
class Softmax_loss:
    """
    Class to represent the Softmax activation function and cross entropy. 
    This combination is faster than calculating derivatives of cross entropy and softmax 
    separately during backpropgation.
    """
    
    def y_true_check(self, y_true: np.array) -> None:
        """
        make sure y_true is 1D array of labels (label encoding).

        params:
            y_true: the ground truth labels. It could be label encoding (n, ) or
                    one-hot encoding (n, number of categories). n is the size of samples.
        """
        if len(y_true.shape) == 2:
            self.y_true = np.argmax(y_true, axis=1)
        else:
            self.y_true = y_true

    def softmax(self, outputs: np.array) -> None:
        """
        converts outputs to a probability distribution.

        params:
            outputs: the outputs from the previous hidden layer.
        """

        outputs_exp = np.exp(outputs - np.max(outputs, axis=1, keepdims=True))
        # converts outputs to negative values to 0, preventing overflow
        # the results will not change due to normalization
        self.softmax_prob = outputs_exp / np.sum(outputs_exp, axis=1, keepdims=True)

    def output_labels(self) -> None:
        """
        find the label of the highest chance from the probability distribution. 
        """
        
        self.pred_labels = np.argmax(self.softmax_prob, axis=1)

    def calculate_accuracy(self) -> None:
        """
        compute accuracy of the prediction.
        accuracy = number of correct predictions / number of all predictions
        """

        self.accuracy = np.mean(self.pred_labels == self.y_true)

    def cross_entropy(self) -> None:
        """
        Loss = -sum(y_true x log(y_predict))
        This class simplifies the loss to -log(correct_class_condifence): Loss = -log(y_predict))

        y_pred: probabilities from softmax activation function
        y_true: 1D np.array
        """

        predicts_clip = np.clip(self.softmax_prob, 1e-7, 1-1e-7) # prevent log(0)
        self.confidences = predicts_clip[range(len(predicts_clip)), self.y_true]
        self.loss = np.mean(-np.log(self.confidences))

    def forward(self, outputs: np.array, y_true: np.array) -> None:
        """
        compute the forward pass results. This function does the following things:
            1. makes sure the gound truth label is in label encoding format
            2. converts the output from the previous hidden layer to probability distribution
            3. extracts the predicted labels from the probability distribution
            4. calculate the accuracy of the prediction based on the extracted predicted labels
            5. calculate the loss using cross entropy

        params:
            inputs: the outputs from the previous hidden layer.
            y_true: the ground truth labels. It could be label encoding (n, ) or
                    one-hot encoding (n, number of categories). n is the size of samples.
        """

        self.y_true_check(y_true) # make sure the gounrd truth label is label encoding
        self.softmax(outputs)
        self.output_labels()
        self.calculate_accuracy()
        self.cross_entropy()

    def backprop(self):
        """
        calculate the partial derivative of the Cross-Entropy loss with respect to 
        the Softmax function inputs. The result is y_predict - y_true.
        y_pred is the probabilities from softmax activation function.
        """

        self.dvalues = self.softmax_prob.copy()
        self.dvalues[range(len(self.y_true)), self.y_true] -= 1
        self.dvalues = self.dvalues / len(self.y_true) # normalization

In [37]:
class Optimizer_SGD:
    """
    Class to represent stochastic gradient descent (SGD) optimizer with tunable 
    learning rate, decaying and momentum.
    """

    def __init__(self, learning_rate: float = 1., decay: float = 0., momentum_ratio: float = 0.):
        """
        initialize an SGD optimizer. 
        
        The vanilla SGD updates the weights by the learning rate multiplied by the gradients.

        If the SGD optimizer is initialized with decay, the learning rate decreases exponentially, 
        following the formula:
            learning rate = initial_learning rate / (1 + decay * iterations)
            
        If the SGD optimizer is initialized with momentum, the weights will be updated by a 
        combination of the preceding gradients and the current gradients.

        params:
            learning rate: learning rate of this SGD optimizer
            decay: decaying rate of this SGD. A large decay value leads to a quick decrease 
                    in the learning rate. Decay should be equal to or larger than 0."
            momentum_ratio: momentum_ratio should be between 0 and 1. A large momentum_ratio means
                            the preceding gradients take a bigger role.
        """
        
        self.init_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.momentum_ratio = momentum_ratio

    def update_param(self, layer: Dense_layer):
        """
        update the weights using the corresponding gradients.
        This function first computes the current learning rate based on the 
        number of iterations, then updates the weights.

        params:
            layer: a hidden neural network layer
        """

        curr_learning_rate = self.init_learning_rate * (1. / ( 1. + self.decay * self.iterations))
        self.iterations += 1

        if self.momentum_ratio:
            if not hasattr(layer, "pre_momentums_w"):
                layer.pre_momentums_w = np.zeros_like(layer.weights)
                layer.pre_momentums_b = np.zeros_like(layer.biases)

            weight_updates = self.momentum_ratio * layer.pre_momentums_w - curr_learning_rate * layer.grad_w
            bias_updates = self.momentum_ratio * layer.pre_momentums_b - curr_learning_rate * layer.grad_b
            # update the momentum in the layer for the next iteration
            layer.pre_momentums_w = weight_updates
            layer.pre_momentums_b = bias_updates
        else:
            weight_updates = -curr_learning_rate * layer.grad_w
            bias_updates = -curr_learning_rate * layer.grad_b

        layer.weights += weight_updates
        layer.biases += bias_updates

In [38]:
class Optimizer_AdaGrad:
    """
    Class to represent the Adaptive Gradient (AdaGrad) optimizer with a tunable 
    learning rate, decay, and epsilon. AdaGrad updates weights using per-parameter 
    learning rates rather than a global learning rate.

    AdaGrad normalizes the gradients using their previous gradient history. This 
    way, weights that changed significantly in the early iterations will change 
    less in the later iterations, and vice versa.
    """

    def __init__(self, learning_rate: float = 1., decay: float = 1e-4, epsilon: float = 1e-7):
        """
        initialize an AdaGrad optimizer. 

        The AdaGrad optimizer is initialized with decay, the learning rate decreases exponentially, 
        following the formula:
            learning rate = initial_learning rate / (1 + decay * iterations)
            
        The AdaGrad keeps the previous gradient history (history += gradient ** 2) and uses it to
        update the weights (weights -= learning_rate * gradient / (sqrt(history) + epsilon)).
        
        params:
            learning_rate: learning rate of this AdaGrad optimizer
            decay: decaying rate of this AdaGrad. A large decay value leads to a faster reduction in 
                    the learning rate. The decay should be equal to or larger than 0.
            epsilon: a small value to prevent division by 0.
        """

        self.init_learning_rate = learning_rate
        self.epsilon = epsilon
        self.iterations = 0
        self.decay = decay

    def update_param(self, layer: Dense_layer):
        """
        update the weights using the corresponding gradients.
        This function first computes the current learning rate based on the 
        number of iterations, then updates the weights.

        The gradients are normalized by the square root of their previous 
        gradients history.

        params:
            layer: a hidden neural network layer        
        """

        curr_learning_rate = self.init_learning_rate / (1. + self.decay * self.iterations)
        self.iterations += 1

        if not hasattr(layer, "grad_history_w"):
            layer.grad_history_w = np.zeros_like(layer.weights)
            layer.grad_history_b = np.zeros_like(layer.biases)
        
        layer.grad_history_w += layer.grad_w ** 2
        layer.grad_history_b += layer.grad_b ** 2
        
        layer.weights -= curr_learning_rate * layer.grad_w / (np.sqrt(layer.grad_history_w) + self.epsilon)
        layer.biases -= curr_learning_rate * layer.grad_b / (np.sqrt(layer.grad_history_b) + self.epsilon)

In [39]:
class Optimizer_RMSProp:
    """
    Class to represent the Root mean square propagation (RMSProp) optimizer with a tunable 
    learning rate, decay, rho, and epsilon. RMSProp updates weights using per-parameter 
    learning rates rather than a global learning rate.

    RMSProp normalizes the gradients using their previous gradient history and the current 
    gradient. This approach is similar to SGD with momentum and cache with the AdaGrad.
    """

    def __init__(self, learning_rate: float = 0.001, decay: float = 1e-4, 
                 rho: float = 0.9, epsilon: float = 1e-7):
        """
        initialize an RMSProp optimizer. 

        The RMSProp optimizer is initialized with decay, the learning rate decreases exponentially, 
        following the formula:
            learning rate = initial_learning rate / (1 + decay * iterations)
            
        The RMSProp keeps a moving average of the gradient history by combining the previous gradient 
        history and the current gradient (history = rho * history + (1 - rho) * gradient ** 2). Then 
        the RMSProp optimizer uses it to update the weights 
        (weights -= learning_rate * gradient / (sqrt(history) + epsilon))
        
        params:
            learning_rate: learning rate of this RMSProp optimizer
            decay: decaying rate of this RMSProp. A large decay value leads to a faster reduction in 
                    the learning rate. The decay should be equal to or larger than 0. RMSProp typically 
                    requires a smaller initial learning rate because it applies momentum to the gradient 
                    history, causing the gradient history to increase more slowly compared to AdaGrad.
            rho: cache memory (history) decay rate. A higher rho value means slower history decaying.
            epsilon: a small value to prevent division by 0.
        """

        self.init_learning_rate = learning_rate
        self.decay = decay
        self.rho = rho
        self.iterations = 0
        self.epsilon = epsilon

    def update_param(self, layer: Dense_layer) -> None:
        """
        update the weights using the corresponding gradients.
        This function first computes the current learning rate based on the 
        number of iterations, then updates the weights.

        The gradients are normalized by the square root of the moving average of the gradients.

        params:
            layer: a hidden neural network layer        
        """

        curr_learning_rate = self.init_learning_rate / (1. + self.decay * self.iterations)
        self.iterations += 1

        if not hasattr(layer, "grad_history_w"):
            layer.grad_history_w = np.zeros_like(layer.weights)
            layer.grad_history_b = np.zeros_like(layer.biases)
        
        layer.grad_history_w = self.rho * layer.grad_history_w + (1 - self.rho) * layer.grad_w ** 2
        layer.grad_history_b = self.rho * layer.grad_history_b + (1 - self.rho) * layer.grad_b ** 2
        
        layer.weights -= curr_learning_rate * layer.grad_w / (np.sqrt(layer.grad_history_w) + self.epsilon)
        layer.biases -= curr_learning_rate * layer.grad_b / (np.sqrt(layer.grad_history_b) + self.epsilon)

In [40]:
class Optimizer_Adam:
    """
    Class to represent the Adaptive Momentum (Adam) optimizer with a tunable 
    learning rate, decay, epsilon, beta_1, and beta_2. Adam optimizer is like 
    a combination of SGD with momentum and RMSProp.

    The Adam optimizer has a bias correction mechanism to speed up the training 
    in the initial stages.
    """


    def __init__(self, learning_rate: float = 0.001, decay: float = 0., 
                 epsilon: float = 1e-7, beta_1: float = 0.9, beta_2: float = 0.999):
        """
        initialize an Adam optimizer. 

        If the Adam optimizer is initialized with decay, the learning rate decreases exponentially, 
        following the formula:
            learning rate = initial_learning rate / (1 + decay * iterations)
        
        The Adam optimizer uses momentum like SGD with momentum
            momentum = beta_1 * momentum + (1 - beta_1) * grad_w
        and then corrects the momentum to speed up the training at early stages
            momentum_corrected = momentum / (1 - beta_1 ** iterations)

        The Adam optimizer also uses gradient history like RMSProp
            gradient_history = beta_2 * gradient_history + (1 - beta_2) * grad_w ** 2
        and then corrects the gradient_history to speed up the training at early stages
            gradient_history_corrected = gradient_history / (1 - beta_2 ** iterations)
        
        params:
            learning_rate: learning rate of this Adam optimizer
            decay: decaying rate of this Adam. A large decay value leads to a faster reduction in 
                    the learning rate. The decay should be equal to or larger than 0.
            epsilon: a small value to prevent division by 0.
            beta_1: momentum ratio and momentum correction.
            beta_2: cache memory (history) decay rate and cache memory (history) correction.
        """

        self.init_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.epsilon = epsilon
        self.beta_1 = beta_1
        self.beta_2 = beta_2

    def update_param(self, layer: Dense_layer) -> None:
        """
        update the weights using the corresponding gradients.
        This function first computes the current learning rate based on the 
        number of iterations, then updates the weights.

        params:
            layer: a hidden neural network layer      
        """

        curr_learning_rate = self.init_learning_rate / ( 1. + self.decay * self.iterations)
        self.iterations += 1

        if not hasattr(layer, "grad_history_w"):
            layer.grad_history_w = np.zeros_like(layer.weights)
            layer.momentum_w = np.zeros_like(layer.weights)
            layer.grad_history_b = np.zeros_like(layer.biases)
            layer.momentum_b = np.zeros_like(layer.biases)
        
        layer.momentum_w = self.beta_1 * layer.momentum_w + (1 - self.beta_1) * layer.grad_w
        layer.momentum_b = self.beta_1 * layer.momentum_b + (1 - self.beta_1) * layer.grad_b
        
        momentum_w_corrected = layer.momentum_w / (1 - self.beta_1 ** self.iterations)
        momentum_b_corrected = layer.momentum_b / (1 - self.beta_1 ** self.iterations)

        layer.grad_history_w = self.beta_2 * layer.grad_history_w + (1 - self.beta_2) * layer.grad_w ** 2
        layer.grad_history_b = self.beta_2 * layer.grad_history_b + (1 - self.beta_2) * layer.grad_b ** 2

        grad_history_w_corrected = layer.grad_history_w / (1 - self.beta_2 ** self.iterations)
        grad_history_b_corrected = layer.grad_history_b / (1 - self.beta_2 ** self.iterations)

        layer.weights -= curr_learning_rate * momentum_w_corrected / (np.sqrt(grad_history_w_corrected) + self.epsilon)
        layer.biases -= curr_learning_rate * momentum_b_corrected / (np.sqrt(grad_history_b_corrected) + self.epsilon)

In [41]:
import nnfs
from nnfs.datasets import spiral_data

In [42]:
nnfs.init()
X, y = spiral_data(samples = 100, classes = 3)

In [43]:
dense1 = Dense_layer(2 , 64)
activation1 = Activation_ReLU()
dense2 = Dense_layer(64 , 3)
output = Softmax_loss()
optimizer = Optimizer_SGD(decay=1e-3, momentum_ratio=0.9)

for epoch in range (10001):
    # Perform a forward pass
    dense1.forward(X)
    activation1.forward(dense1.forward_pass)
    dense2.forward(activation1.output)
    loss = output.forward(dense2.forward_pass, y)
    # Perform a backpropgation
    output.backprop()
    dense2.backprop(output.dvalues)
    activation1.backprop(dense2.doutputs)
    dense1.backprop(activation1.doutputs)
    # update param
    optimizer.update_param(dense1)
    optimizer.update_param(dense2)

    if epoch % 1000 == 0:
        print(f"epoch: {epoch}, acc: {output.accuracy :.3f}, loss: {output.loss :.3f}")

    # if output.accuracy >= 0.9:
    #     print(f"early terminate- epoch: {epoch}, acc: {output.accuracy :.3f}, loss: {output.loss :.3f}")
    #     break

epoch: 0, acc: 0.360, loss: 1.099
epoch: 1000, acc: 0.843, loss: 0.418
epoch: 2000, acc: 0.883, loss: 0.305
epoch: 3000, acc: 0.893, loss: 0.277
epoch: 4000, acc: 0.900, loss: 0.263
epoch: 5000, acc: 0.907, loss: 0.252
epoch: 6000, acc: 0.903, loss: 0.245
epoch: 7000, acc: 0.907, loss: 0.239
epoch: 8000, acc: 0.913, loss: 0.235
epoch: 9000, acc: 0.917, loss: 0.231
epoch: 10000, acc: 0.920, loss: 0.227


In [44]:
dense1 = Dense_layer(2 , 64)
activation1 = Activation_ReLU()
dense2 = Dense_layer(64 , 3)
output = Softmax_loss()
optimizer = Optimizer_AdaGrad()

for epoch in range (10001):
    # Perform a forward pass
    dense1.forward(X)
    activation1.forward(dense1.forward_pass)
    dense2.forward(activation1.output)
    loss = output.forward(dense2.forward_pass, y)
    # Perform a backpropgation
    output.backprop()
    dense2.backprop(output.dvalues)
    activation1.backprop(dense2.doutputs)
    dense1.backprop(activation1.doutputs)
    # update param
    optimizer.update_param(dense1)
    optimizer.update_param(dense2)

    if epoch % 1000 == 0:
        print(f"epoch: {epoch}, acc: {output.accuracy :.3f}, loss: {output.loss :.3f}")

    # if output.accuracy >= 0.9:
    #     print(f"early terminate- epoch: {epoch}, acc: {output.accuracy :.3f}, loss: {output.loss :.3f}")
    #     break

epoch: 0, acc: 0.410, loss: 1.099
epoch: 1000, acc: 0.737, loss: 0.676
epoch: 2000, acc: 0.803, loss: 0.477
epoch: 3000, acc: 0.817, loss: 0.416
epoch: 4000, acc: 0.820, loss: 0.379
epoch: 5000, acc: 0.843, loss: 0.354
epoch: 6000, acc: 0.857, loss: 0.334
epoch: 7000, acc: 0.867, loss: 0.315
epoch: 8000, acc: 0.880, loss: 0.303
epoch: 9000, acc: 0.887, loss: 0.292
epoch: 10000, acc: 0.900, loss: 0.283


In [45]:
dense1 = Dense_layer(2 , 64)
activation1 = Activation_ReLU()
dense2 = Dense_layer(64 , 3)
output = Softmax_loss()
optimizer = Optimizer_RMSProp(decay=1e-4)

for epoch in range (10001):
    # Perform a forward pass
    dense1.forward(X)
    activation1.forward(dense1.forward_pass)
    dense2.forward(activation1.output)
    loss = output.forward(dense2.forward_pass, y)
    # Perform a backpropgation
    output.backprop()
    dense2.backprop(output.dvalues)
    activation1.backprop(dense2.doutputs)
    dense1.backprop(activation1.doutputs)
    # update param
    optimizer.update_param(dense1)
    optimizer.update_param(dense2)

    if epoch % 1000 == 0:
        print(f"epoch: {epoch}, acc: {output.accuracy :.3f}, loss: {output.loss :.3f}")

    # if output.accuracy >= 0.9:
    #     print(f"early terminate- epoch: {epoch}, acc: {output.accuracy :.3f}, loss: {output.loss :.3f}")
    #     break

epoch: 0, acc: 0.270, loss: 1.099
epoch: 1000, acc: 0.577, loss: 0.946
epoch: 2000, acc: 0.637, loss: 0.818
epoch: 3000, acc: 0.667, loss: 0.755
epoch: 4000, acc: 0.680, loss: 0.707
epoch: 5000, acc: 0.727, loss: 0.667
epoch: 6000, acc: 0.750, loss: 0.627
epoch: 7000, acc: 0.780, loss: 0.586
epoch: 8000, acc: 0.800, loss: 0.549
epoch: 9000, acc: 0.810, loss: 0.520
epoch: 10000, acc: 0.813, loss: 0.495


In [46]:
dense1 = Dense_layer(2 , 64)
activation1 = Activation_ReLU()
dense2 = Dense_layer(64 , 3)
output = Softmax_loss()
optimizer = Optimizer_RMSProp(learning_rate = 0.02, decay = 1e-5, rho = 0.999)

for epoch in range (10001):
    # Perform a forward pass
    dense1.forward(X)
    activation1.forward(dense1.forward_pass)
    dense2.forward(activation1.output)
    loss = output.forward(dense2.forward_pass, y)
    # Perform a backpropgation
    output.backprop()
    dense2.backprop(output.dvalues)
    activation1.backprop(dense2.doutputs)
    dense1.backprop(activation1.doutputs)
    # update param
    optimizer.update_param(dense1)
    optimizer.update_param(dense2)

    if epoch % 1000 == 0:
        print(f"epoch: {epoch}, acc: {output.accuracy :.3f}, loss: {output.loss :.3f}")

    # if output.accuracy >= 0.9:
    #     print(f"early terminate- epoch: {epoch}, acc: {output.accuracy :.3f}, loss: {output.loss :.3f}")
    #     break

epoch: 0, acc: 0.303, loss: 1.099
epoch: 1000, acc: 0.747, loss: 0.630
epoch: 2000, acc: 0.763, loss: 0.532
epoch: 3000, acc: 0.773, loss: 0.488
epoch: 4000, acc: 0.807, loss: 0.449
epoch: 5000, acc: 0.820, loss: 0.423
epoch: 6000, acc: 0.840, loss: 0.372
epoch: 7000, acc: 0.863, loss: 0.348
epoch: 8000, acc: 0.833, loss: 0.364
epoch: 9000, acc: 0.870, loss: 0.314
epoch: 10000, acc: 0.880, loss: 0.280


In [47]:
dense1 = Dense_layer(2 , 64)
activation1 = Activation_ReLU()
dense2 = Dense_layer(64 , 3)
output = Softmax_loss()
optimizer = Optimizer_Adam(learning_rate=0.05, decay=5e-7)

for epoch in range (10001):
    # Perform a forward pass
    dense1.forward(X)
    activation1.forward(dense1.forward_pass)
    dense2.forward(activation1.output)
    loss = output.forward(dense2.forward_pass, y)
    # Perform a backpropgation
    output.backprop()
    dense2.backprop(output.dvalues)
    activation1.backprop(dense2.doutputs)
    dense1.backprop(activation1.doutputs)
    # update param
    optimizer.update_param(dense1)
    optimizer.update_param(dense2)

    if epoch % 1000 == 0:
        print(f"epoch: {epoch}, acc: {output.accuracy :.3f}, loss: {output.loss :.3f}")

    # if output.accuracy >= 0.9:
    #     print(f"early terminate- epoch: {epoch}, acc: {output.accuracy :.3f}, loss: {output.loss :.3f}")
    #     break

epoch: 0, acc: 0.307, loss: 1.099
epoch: 1000, acc: 0.880, loss: 0.293
epoch: 2000, acc: 0.967, loss: 0.121
epoch: 3000, acc: 0.967, loss: 0.102
epoch: 4000, acc: 0.967, loss: 0.090
epoch: 5000, acc: 0.967, loss: 0.084
epoch: 6000, acc: 0.967, loss: 0.080
epoch: 7000, acc: 0.967, loss: 0.075
epoch: 8000, acc: 0.967, loss: 0.071
epoch: 9000, acc: 0.970, loss: 0.067
epoch: 10000, acc: 0.970, loss: 0.063
