In [1]:
import numpy as np
import matplotlib.pyplot
import nnfs
from nnfs.datasets import spiral_data
from NNS import NeuralNetwork as NN #import neural net code from github to reduce copy/pasting

# Chapter 10

## Optimizers

### Stochastic Gradient Descent (SGD)

In [None]:
class Optimizer_SGD:
    """Plain stochastic gradient descent using one fixed learning rate."""

    def __init__(self, learning_rate=1.0):
        # Step size applied to every gradient; 1.0 is the default
        self.learning_rate = learning_rate

    def update_params(self, layer):
        """Move ``layer.weights`` and ``layer.biases`` against their gradients.

        Reads ``layer.dweights`` / ``layer.dbiases`` and modifies the
        parameter arrays in place.
        """
        step = self.learning_rate
        layer.weights -= step * layer.dweights
        layer.biases -= step * layer.dbiases


# One pass: a single forward/backward sweep followed by one SGD step
X, y = spiral_data(samples=100, classes=3)

# Model pieces: dense(2, 64) -> ReLU -> dense(64, 3) -> softmax + cross-entropy
dense1 = NN.Layer_Dense(2, 64)
activation1 = NN.Activation_ReLU()
dense2 = NN.Layer_Dense(64, 3)
loss_activation = NN.Activation_Softmax_Loss_CategoricalCrossentropy()
optimizer = Optimizer_SGD()

# Forward pass; the combined softmax/loss object returns the loss value
dense1.forward(X)
activation1.forward(dense1.output)
dense2.forward(activation1.output)
loss = loss_activation.forward(dense2.output, y)

print('loss: ', loss)

# Accuracy: predicted class is the arg-max along axis 1 of the softmax output
predictions = loss_activation.output.argmax(axis=1)
if y.ndim == 2:
    # 2-D targets are collapsed to class indices before comparing
    y = y.argmax(axis=1)
accuracy = (predictions == y).mean()
print( 'acc:' , accuracy)

# Backward pass, walking the layers in reverse order
loss_activation.backward(loss_activation.output, y)
dense2.backward(loss_activation.dinputs)
activation1.backward(dense2.dinputs)
dense1.backward(activation1.dinputs)

# Apply the gradients to both dense layers
optimizer.update_params(dense1)
optimizer.update_params(dense2)

loss:  1.0986249219226374
acc: 0.3333333333333333


In [11]:
# Multiple passes: repeat forward / backward / update for 10001 epochs
X, y = spiral_data(samples=100, classes=3)

# Model pieces: dense(2, 64) -> ReLU -> dense(64, 3) -> softmax + cross-entropy
dense1 = NN.Layer_Dense(2, 64)
activation1 = NN.Activation_ReLU()
dense2 = NN.Layer_Dense(64, 3)
loss_activation = NN.Activation_Softmax_Loss_CategoricalCrossentropy()
optimizer = Optimizer_SGD()

for epoch in range(10001):

    # Forward pass
    dense1.forward(X)
    activation1.forward(dense1.output)
    dense2.forward(activation1.output)
    loss = loss_activation.forward(dense2.output, y)

    # Accuracy against class-index targets
    predictions = loss_activation.output.argmax(axis=1)
    if y.ndim == 2:
        # 2-D targets are collapsed to class indices before comparing
        y = y.argmax(axis=1)
    accuracy = (predictions == y).mean()

    # Report progress on every 100th epoch
    if epoch % 100 == 0:
        print (f'epoch: {epoch} , '
               f'acc: {accuracy :.3f}, '
               f'loss: {loss :.3f}' )

    # Backward pass, walking the layers in reverse order
    loss_activation.backward(loss_activation.output, y)
    dense2.backward(loss_activation.dinputs)
    activation1.backward(dense2.dinputs)
    dense1.backward(activation1.dinputs)

    # Apply the gradients to both dense layers
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    
    


epoch: 0 , acc: 0.370, loss: 1.099
epoch: 100 , acc: 0.457, loss: 1.075
epoch: 200 , acc: 0.457, loss: 1.066
epoch: 300 , acc: 0.440, loss: 1.062
epoch: 400 , acc: 0.453, loss: 1.060
epoch: 500 , acc: 0.463, loss: 1.057
epoch: 600 , acc: 0.467, loss: 1.054
epoch: 700 , acc: 0.450, loss: 1.050
epoch: 800 , acc: 0.410, loss: 1.054
epoch: 900 , acc: 0.397, loss: 1.051
epoch: 1000 , acc: 0.393, loss: 1.046
epoch: 1100 , acc: 0.400, loss: 1.051
epoch: 1200 , acc: 0.393, loss: 1.069
epoch: 1300 , acc: 0.400, loss: 1.062
epoch: 1400 , acc: 0.410, loss: 1.063
epoch: 1500 , acc: 0.400, loss: 1.054
epoch: 1600 , acc: 0.407, loss: 1.058
epoch: 1700 , acc: 0.417, loss: 1.039
epoch: 1800 , acc: 0.410, loss: 1.041
epoch: 1900 , acc: 0.413, loss: 1.051
epoch: 2000 , acc: 0.400, loss: 1.072
epoch: 2100 , acc: 0.397, loss: 1.072
epoch: 2200 , acc: 0.417, loss: 1.049
epoch: 2300 , acc: 0.420, loss: 1.005
epoch: 2400 , acc: 0.440, loss: 0.999
epoch: 2500 , acc: 0.460, loss: 0.984
epoch: 2600 , acc: 0.450

### Learning Rate Decay

In [30]:
# SGD optimizer
class Optimizer_SGD:
    """SGD with optional inverse-time learning-rate decay.

    Expected call order per training step: ``pre_update_params()`` once,
    ``update_params(layer)`` for each layer, then ``post_update_params()`` once.
    """

    def __init__(self, learning_rate=1., decay=0.):
        self.learning_rate = learning_rate          # initial (undecayed) rate
        self.current_learning_rate = learning_rate  # rate actually used by updates
        self.decay = decay
        self.iterations = 0                         # completed update steps

    def pre_update_params(self):
        """Recompute the effective rate; a falsy ``decay`` leaves it unchanged."""
        if not self.decay:
            return
        shrink = 1. / (1. + self.decay * self.iterations)
        self.current_learning_rate = self.learning_rate * shrink

    def update_params(self, layer):
        """Step the layer's weights and biases in place using the current rate."""
        rate = self.current_learning_rate
        layer.weights -= rate * layer.dweights
        layer.biases -= rate * layer.dbiases

    def post_update_params(self):
        """Count one finished step; the counter drives the decay schedule."""
        self.iterations += 1
        

# Multiple passes: train with SGD plus learning-rate decay (decay=1e-2)
X, y = spiral_data(samples=100, classes=3)

# Model pieces: dense(2, 64) -> ReLU -> dense(64, 3) -> softmax + cross-entropy
dense1 = NN.Layer_Dense(2,64)
activation1 = NN.Activation_ReLU()
dense2 = NN.Layer_Dense(64,3)
loss_activation = NN.Activation_Softmax_Loss_CategoricalCrossentropy()
optimizer = Optimizer_SGD(decay=1e-2)

for epoch in range(10001):

    # Forward pass
    dense1.forward(X)
    activation1.forward(dense1.output)
    dense2.forward(activation1.output)
    loss = loss_activation.forward(dense2.output,y)

    # Accuracy: arg-max prediction compared with class-index targets
    predictions = np.argmax(loss_activation.output, axis = 1 )
    if len (y.shape) == 2 :
        y = np.argmax(y, axis = 1 )
    accuracy = np.mean(predictions == y)

    # Print every 100th epoch.
    # The lr shown is the one set by the previous pre_update_params() call,
    # because the rate for this epoch is only recomputed further down.
    if not epoch % 100 :
        print (f'epoch: {epoch} , ' +
               f'acc: {accuracy :.3f}, ' +
               f'loss: {loss :.3f}, ' +   # separator added so loss and lr do not run together
               f'lr: {optimizer.current_learning_rate}')

    # Backward pass
    loss_activation.backward(loss_activation.output, y)
    dense2.backward(loss_activation.dinputs)
    activation1.backward(dense2.dinputs)
    dense1.backward(activation1.dinputs)

    # Update weights and biases (pre/post hooks drive the decay schedule)
    optimizer.pre_update_params()
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.post_update_params()
    


epoch: 0 , acc: 0.290, loss: 1.099lr: 1.0
epoch: 100 , acc: 0.440, loss: 1.090lr: 0.5025125628140703
epoch: 200 , acc: 0.463, loss: 1.073lr: 0.33444816053511706
epoch: 300 , acc: 0.440, loss: 1.069lr: 0.2506265664160401
epoch: 400 , acc: 0.440, loss: 1.067lr: 0.2004008016032064
epoch: 500 , acc: 0.440, loss: 1.066lr: 0.1669449081803005
epoch: 600 , acc: 0.443, loss: 1.066lr: 0.14306151645207438
epoch: 700 , acc: 0.443, loss: 1.065lr: 0.1251564455569462
epoch: 800 , acc: 0.440, loss: 1.065lr: 0.11123470522803114
epoch: 900 , acc: 0.443, loss: 1.065lr: 0.10010010010010009
epoch: 1000 , acc: 0.440, loss: 1.065lr: 0.09099181073703366
epoch: 1100 , acc: 0.437, loss: 1.065lr: 0.08340283569641367
epoch: 1200 , acc: 0.437, loss: 1.065lr: 0.07698229407236336
epoch: 1300 , acc: 0.440, loss: 1.065lr: 0.07147962830593281
epoch: 1400 , acc: 0.443, loss: 1.065lr: 0.066711140760507
epoch: 1500 , acc: 0.443, loss: 1.064lr: 0.06253908692933083
epoch: 1600 , acc: 0.443, loss: 1.064lr: 0.0588581518540317

### Stochastic Gradient Descent with Momentum

In [29]:
class Optimizer_SGD:
    """SGD with optional learning-rate decay and optional momentum.

    Expected call order per training step: ``pre_update_params()`` once,
    ``update_params(layer)`` for each layer, then ``post_update_params()`` once.
    """

    def __init__(self, learning_rate=1., decay=0., momentum=0.):
        self.learning_rate = learning_rate          # initial (undecayed) rate
        self.current_learning_rate = learning_rate  # rate actually used by updates
        self.decay = decay
        self.iterations = 0                         # completed update steps
        self.momentum = momentum                    # fraction of the previous update retained

    def pre_update_params(self):
        """Recompute the effective rate; a falsy ``decay`` leaves it unchanged."""
        if not self.decay:
            return
        shrink = 1. / (1. + self.decay * self.iterations)
        self.current_learning_rate = self.learning_rate * shrink

    def update_params(self, layer):
        """Apply one vanilla or momentum step to ``layer`` in place.

        With a non-zero ``momentum`` the previous update is stored on the layer
        as ``weight_momentums`` / ``bias_momentums`` (created as zeros on first
        use) and blended into the new update.
        """
        rate = self.current_learning_rate

        if not self.momentum:
            # Vanilla SGD: the update is just the scaled negative gradient
            weight_updates = -rate * layer.dweights
            bias_updates = -rate * layer.dbiases
        else:
            # Weight and bias buffers are always created together, so
            # checking for one of them is enough
            if not hasattr(layer, 'weight_momentums'):
                layer.weight_momentums = np.zeros_like(layer.weights)
                layer.bias_momentums = np.zeros_like(layer.biases)

            # New update = retained part of the last update - current gradient step
            weight_updates = (self.momentum * layer.weight_momentums
                              - rate * layer.dweights)
            bias_updates = (self.momentum * layer.bias_momentums
                            - rate * layer.dbiases)

            # Remember these updates for the next step
            layer.weight_momentums = weight_updates
            layer.bias_momentums = bias_updates

        layer.weights += weight_updates
        layer.biases += bias_updates

    def post_update_params(self):
        """Count one finished step; the counter drives the decay schedule."""
        self.iterations += 1
        
        
# Multiple passes: train with SGD plus decay (1e-3) and momentum (0.9)
X, y = spiral_data(samples=100, classes=3)

# Model pieces: dense(2, 64) -> ReLU -> dense(64, 3) -> softmax + cross-entropy
dense1 = NN.Layer_Dense(2,64)
activation1 = NN.Activation_ReLU()
dense2 = NN.Layer_Dense(64,3)
loss_activation = NN.Activation_Softmax_Loss_CategoricalCrossentropy()
optimizer = Optimizer_SGD(decay=1e-3, momentum=0.9)

for epoch in range(10001):

    # Forward pass
    dense1.forward(X)
    activation1.forward(dense1.output)
    dense2.forward(activation1.output)
    loss = loss_activation.forward(dense2.output,y)

    # Accuracy: arg-max prediction compared with class-index targets
    predictions = np.argmax(loss_activation.output, axis = 1 )
    if len (y.shape) == 2 :
        y = np.argmax(y, axis = 1 )
    accuracy = np.mean(predictions == y)

    # Print every 100th epoch.
    # The lr shown is the one set by the previous pre_update_params() call,
    # because the rate for this epoch is only recomputed further down.
    if not epoch % 100 :
        print (f'epoch: {epoch} , ' +
               f'acc: {accuracy :.3f}, ' +
               f'loss: {loss :.3f}, ' +   # separator added so loss and lr do not run together
               f'lr: {optimizer.current_learning_rate}')

    # Backward pass
    loss_activation.backward(loss_activation.output, y)
    dense2.backward(loss_activation.dinputs)
    activation1.backward(dense2.dinputs)
    dense1.backward(activation1.dinputs)

    # Update weights and biases (pre/post hooks drive the decay schedule)
    optimizer.pre_update_params()
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.post_update_params()

epoch: 0 , acc: 0.377, loss: 1.099lr: 1.0
epoch: 100 , acc: 0.427, loss: 1.059lr: 0.9099181073703367
epoch: 200 , acc: 0.473, loss: 0.951lr: 0.8340283569641367
epoch: 300 , acc: 0.687, loss: 0.750lr: 0.7698229407236336
epoch: 400 , acc: 0.780, loss: 0.556lr: 0.7147962830593281
epoch: 500 , acc: 0.863, loss: 0.372lr: 0.66711140760507
epoch: 600 , acc: 0.887, loss: 0.307lr: 0.6253908692933083
epoch: 700 , acc: 0.917, loss: 0.247lr: 0.5885815185403178
epoch: 800 , acc: 0.923, loss: 0.217lr: 0.5558643690939411
epoch: 900 , acc: 0.923, loss: 0.187lr: 0.526592943654555
epoch: 1000 , acc: 0.930, loss: 0.175lr: 0.5002501250625312
epoch: 1100 , acc: 0.940, loss: 0.158lr: 0.4764173415912339
epoch: 1200 , acc: 0.950, loss: 0.147lr: 0.45475216007276037
epoch: 1300 , acc: 0.950, loss: 0.141lr: 0.43497172683775553
epoch: 1400 , acc: 0.957, loss: 0.134lr: 0.4168403501458941
epoch: 1500 , acc: 0.957, loss: 0.129lr: 0.4001600640256102
epoch: 1600 , acc: 0.963, loss: 0.125lr: 0.3847633705271258
epoch: 1

### AdaGrad

Adaptive gradient: uses a per-parameter learning rate rather than a globally-shared rate

In [28]:
class Optimizer_Adagrad:
    """AdaGrad: each parameter's step is scaled by its accumulated squared gradients.

    Expected call order per training step: ``pre_update_params()`` once,
    ``update_params(layer)`` for each layer, then ``post_update_params()`` once.
    """

    def __init__(self, learning_rate=1., decay=0., epsilon=1e-7):
        self.learning_rate = learning_rate          # initial (undecayed) rate
        self.current_learning_rate = learning_rate  # rate actually used by updates
        self.decay = decay
        self.iterations = 0                         # completed update steps
        self.epsilon = epsilon                      # added to the denominator of each step

    def pre_update_params(self):
        """Recompute the effective rate; a falsy ``decay`` leaves it unchanged."""
        if not self.decay:
            return
        shrink = 1. / (1. + self.decay * self.iterations)
        self.current_learning_rate = self.learning_rate * shrink

    def update_params(self, layer):
        """Accumulate squared gradients on ``layer`` and apply the normalised step.

        ``weight_cache`` / ``bias_cache`` are created as zeros on first use and
        only ever grow, since each call adds the squared gradients to them.
        """
        if not hasattr(layer, 'weight_cache'):
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_cache = np.zeros_like(layer.biases)

        # Running sum of squared gradients (in place)
        layer.weight_cache += layer.dweights ** 2
        layer.bias_cache += layer.dbiases ** 2

        # SGD step divided by the square root of the cache
        rate = self.current_learning_rate
        weight_scale = np.sqrt(layer.weight_cache) + self.epsilon
        bias_scale = np.sqrt(layer.bias_cache) + self.epsilon
        layer.weights -= rate * layer.dweights / weight_scale
        layer.biases -= rate * layer.dbiases / bias_scale

    def post_update_params(self):
        """Count one finished step; the counter drives the decay schedule."""
        self.iterations += 1
        
        
 
# Multiple passes: train with AdaGrad (decay=1e-4)
X, y = spiral_data(samples=100, classes=3)

# Model pieces: dense(2, 64) -> ReLU -> dense(64, 3) -> softmax + cross-entropy
dense1 = NN.Layer_Dense(2,64)
activation1 = NN.Activation_ReLU()
dense2 = NN.Layer_Dense(64,3)
loss_activation = NN.Activation_Softmax_Loss_CategoricalCrossentropy()
optimizer = Optimizer_Adagrad( decay = 1e-4 )

for epoch in range(10001):

    # Forward pass
    dense1.forward(X)
    activation1.forward(dense1.output)
    dense2.forward(activation1.output)
    loss = loss_activation.forward(dense2.output,y)

    # Accuracy: arg-max prediction compared with class-index targets
    predictions = np.argmax(loss_activation.output, axis = 1 )
    if len (y.shape) == 2 :
        y = np.argmax(y, axis = 1 )
    accuracy = np.mean(predictions == y)

    # Print every 100th epoch.
    # The lr shown is the one set by the previous pre_update_params() call,
    # because the rate for this epoch is only recomputed further down.
    if not epoch % 100 :
        print (f'epoch: {epoch} , ' +
               f'acc: {accuracy :.3f}, ' +
               f'loss: {loss :.3f}, ' +   # separator added so loss and lr do not run together
               f'lr: {optimizer.current_learning_rate}')

    # Backward pass
    loss_activation.backward(loss_activation.output, y)
    dense2.backward(loss_activation.dinputs)
    activation1.backward(dense2.dinputs)
    dense1.backward(activation1.dinputs)

    # Update weights and biases (pre/post hooks drive the decay schedule)
    optimizer.pre_update_params()
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.post_update_params()

epoch: 0 , acc: 0.383, loss: 1.099lr: 1.0
epoch: 100 , acc: 0.467, loss: 1.012lr: 0.9901970492127933
epoch: 200 , acc: 0.517, loss: 0.962lr: 0.9804882831650161
epoch: 300 , acc: 0.473, loss: 0.959lr: 0.9709680551509855
epoch: 400 , acc: 0.580, loss: 0.863lr: 0.9616309260505818
epoch: 500 , acc: 0.600, loss: 0.811lr: 0.9524716639679969
epoch: 600 , acc: 0.610, loss: 0.780lr: 0.9434852344560807
epoch: 700 , acc: 0.647, loss: 0.734lr: 0.9346667912889054
epoch: 800 , acc: 0.673, loss: 0.708lr: 0.9260116677470135
epoch: 900 , acc: 0.687, loss: 0.679lr: 0.9175153683824203
epoch: 1000 , acc: 0.680, loss: 0.649lr: 0.9091735612328392
epoch: 1100 , acc: 0.690, loss: 0.631lr: 0.9009820704567978
epoch: 1200 , acc: 0.697, loss: 0.606lr: 0.892936869363336
epoch: 1300 , acc: 0.697, loss: 0.603lr: 0.8850340738118416
epoch: 1400 , acc: 0.730, loss: 0.570lr: 0.8772699359592947
epoch: 1500 , acc: 0.733, loss: 0.559lr: 0.8696408383337683
epoch: 1600 , acc: 0.750, loss: 0.539lr: 0.8621432882145013
epoch: 1

### RMSProp

Root Mean Squared Propagation: another adaptive learning rate

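Instead of summing every squared gradient as AdaGrad does, it keeps a decaying moving average of squared gradients (a momentum-like mechanism controlled by `rho`), so the per-parameter learning rate changes more smoothly.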

In [27]:
# RMSprop optimizer
class Optimizer_RMSprop:
    """RMSprop: steps are normalised by a decaying average of squared gradients.

    Expected call order per training step: ``pre_update_params()`` once,
    ``update_params(layer)`` for each layer, then ``post_update_params()`` once.
    """

    def __init__(self, learning_rate=0.001, decay=0., epsilon=1e-7, rho=0.9):
        self.learning_rate = learning_rate          # initial (undecayed) rate
        self.current_learning_rate = learning_rate  # rate actually used by updates
        self.decay = decay
        self.iterations = 0                         # completed update steps
        self.epsilon = epsilon                      # added to the denominator of each step
        self.rho = rho                              # weight given to the previous cache value

    def pre_update_params(self):
        """Recompute the effective rate; a falsy ``decay`` leaves it unchanged."""
        if not self.decay:
            return
        shrink = 1. / (1. + self.decay * self.iterations)
        self.current_learning_rate = self.learning_rate * shrink

    def update_params(self, layer):
        """Refresh the squared-gradient caches on ``layer`` and apply the step.

        ``weight_cache`` / ``bias_cache`` are created as zeros on first use and
        replaced with a new array on every call.
        """
        if not hasattr(layer, 'weight_cache'):
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_cache = np.zeros_like(layer.biases)

        # cache = rho * old cache + (1 - rho) * gradient squared
        keep = self.rho
        fresh = 1 - self.rho
        layer.weight_cache = keep * layer.weight_cache + fresh * layer.dweights ** 2
        layer.bias_cache = keep * layer.bias_cache + fresh * layer.dbiases ** 2

        # SGD step divided by the square root of the cache
        rate = self.current_learning_rate
        weight_scale = np.sqrt(layer.weight_cache) + self.epsilon
        bias_scale = np.sqrt(layer.bias_cache) + self.epsilon
        layer.weights -= rate * layer.dweights / weight_scale
        layer.biases -= rate * layer.dbiases / bias_scale

    def post_update_params(self):
        """Count one finished step; the counter drives the decay schedule."""
        self.iterations += 1
        

# Multiple passes: train with RMSprop (learning_rate=0.02, decay=1e-5, rho=0.999)
X, y = spiral_data(samples=100, classes=3)

# Model pieces: dense(2, 64) -> ReLU -> dense(64, 3) -> softmax + cross-entropy
dense1 = NN.Layer_Dense(2,64)
activation1 = NN.Activation_ReLU()
dense2 = NN.Layer_Dense(64,3)
loss_activation = NN.Activation_Softmax_Loss_CategoricalCrossentropy()
optimizer = Optimizer_RMSprop(learning_rate=0.02, decay = 1e-5, rho=0.999 )

for epoch in range(10001):

    # Forward pass
    dense1.forward(X)
    activation1.forward(dense1.output)
    dense2.forward(activation1.output)
    loss = loss_activation.forward(dense2.output,y)

    # Accuracy: arg-max prediction compared with class-index targets
    predictions = np.argmax(loss_activation.output, axis = 1 )
    if len (y.shape) == 2 :
        y = np.argmax(y, axis = 1 )
    accuracy = np.mean(predictions == y)

    # Print every 100th epoch.
    # The lr shown is the one set by the previous pre_update_params() call,
    # because the rate for this epoch is only recomputed further down.
    if not epoch % 100 :
        print (f'epoch: {epoch} , ' +
               f'acc: {accuracy :.3f}, ' +
               f'loss: {loss :.3f}, ' +   # separator added so loss and lr do not run together
               f'lr: {optimizer.current_learning_rate}')

    # Backward pass
    loss_activation.backward(loss_activation.output, y)
    dense2.backward(loss_activation.dinputs)
    activation1.backward(dense2.dinputs)
    dense1.backward(activation1.dinputs)

    # Update weights and biases (pre/post hooks drive the decay schedule)
    optimizer.pre_update_params()
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.post_update_params()

epoch: 0 , acc: 0.323, loss: 1.099lr: 0.02
epoch: 100 , acc: 0.447, loss: 1.045lr: 0.01998021958261321
epoch: 200 , acc: 0.473, loss: 1.005lr: 0.019960279044701046
epoch: 300 , acc: 0.450, loss: 0.983lr: 0.019940378268975763
epoch: 400 , acc: 0.503, loss: 0.958lr: 0.01992051713662487
epoch: 500 , acc: 0.483, loss: 0.970lr: 0.01990069552930875
epoch: 600 , acc: 0.527, loss: 0.906lr: 0.019880913329158343
epoch: 700 , acc: 0.530, loss: 0.878lr: 0.019861170418772778
epoch: 800 , acc: 0.547, loss: 0.891lr: 0.019841466681217078
epoch: 900 , acc: 0.557, loss: 0.840lr: 0.01982180200001982
epoch: 1000 , acc: 0.613, loss: 0.815lr: 0.019802176259170884
epoch: 1100 , acc: 0.617, loss: 0.814lr: 0.01978258934311912
epoch: 1200 , acc: 0.607, loss: 0.791lr: 0.01976304113677013
epoch: 1300 , acc: 0.593, loss: 0.780lr: 0.019743531525483964
epoch: 1400 , acc: 0.610, loss: 0.760lr: 0.01972406039507293
epoch: 1500 , acc: 0.597, loss: 0.750lr: 0.019704627631799327
epoch: 1600 , acc: 0.667, loss: 0.721lr: 0.

### Adam

Adaptive Moment Estimation (Adam): the most widely used optimizer; it builds on RMSProp by adding a momentum term for the gradients.

In [26]:
class Optimizer_Adam :
    """Adam: momentum on the gradients plus an RMSprop-style squared-gradient cache.

    Expected call order per training step: ``pre_update_params()`` once,
    ``update_params(layer)`` for each layer, then ``post_update_params()`` once.
    """

    # Initialize optimizer - set settings
    def __init__ ( self , learning_rate = 0.001 , decay = 0. , epsilon = 1e-7 , beta_1 = 0.9 , beta_2 = 0.999 ):
        self.learning_rate = learning_rate          # initial (undecayed) rate
        self.current_learning_rate = learning_rate  # rate actually used by updates
        self.decay = decay
        self.iterations = 0                         # completed update steps
        self.epsilon = epsilon                      # added to the denominator of each step
        self.beta_1 = beta_1                        # weight of the previous momentum value
        self.beta_2 = beta_2                        # weight of the previous cache value

    # Call once before any parameter updates
    def pre_update_params ( self ):
        """Recompute the effective rate; a falsy ``decay`` leaves it unchanged."""
        if self.decay:
            self.current_learning_rate = self.learning_rate * ( 1. / ( 1. + self.decay * self.iterations))

    # Update parameters
    def update_params ( self , layer ):
        """Update the momentum and cache stored on ``layer`` and apply one step.

        Reads ``layer.dweights`` / ``layer.dbiases``; modifies ``layer.weights``
        and ``layer.biases`` in place. The four state arrays
        (``weight_momentums``, ``weight_cache``, ``bias_momentums``,
        ``bias_cache``) are kept as attributes on the layer.
        """
        # Create every state array the layer is missing, filled with zeros.
        # Each one is checked on its own: a layer that already carries
        # weight_cache / bias_cache but no momentum arrays would otherwise
        # skip initialisation and fail below with AttributeError.
        for attr, template in (
            ('weight_momentums', layer.weights),
            ('weight_cache', layer.weights),
            ('bias_momentums', layer.biases),
            ('bias_cache', layer.biases),
        ):
            if not hasattr (layer, attr):
                setattr(layer, attr, np.zeros_like(template))

        # Update momentum with current gradients
        layer.weight_momentums = self.beta_1 * \
            layer.weight_momentums + \
                ( 1 - self.beta_1) * layer.dweights

        layer.bias_momentums = self.beta_1 * \
            layer.bias_momentums + \
                ( 1 - self.beta_1) * layer.dbiases

        # Get corrected momentum
        # self.iterations is 0 at first pass
        # and we need to start with 1 here
        step = self.iterations + 1
        weight_momentums_corrected = layer.weight_momentums / \
            ( 1 - self.beta_1 ** step)

        bias_momentums_corrected = layer.bias_momentums / \
            ( 1 - self.beta_1 ** step)

        # Update cache with squared current gradients
        layer.weight_cache = self.beta_2 * layer.weight_cache + \
            ( 1 - self.beta_2) * layer.dweights ** 2

        layer.bias_cache = self.beta_2 * layer.bias_cache + \
            ( 1 - self.beta_2) * layer.dbiases ** 2

        # Get corrected cache
        weight_cache_corrected = layer.weight_cache / \
            ( 1 - self.beta_2 ** step)

        bias_cache_corrected = layer.bias_cache / \
            ( 1 - self.beta_2 ** step)

        # Vanilla SGD parameter update + normalization
        # with square rooted cache
        layer.weights += - self.current_learning_rate * \
            weight_momentums_corrected / \
                (np.sqrt(weight_cache_corrected) + self.epsilon)

        layer.biases += - self.current_learning_rate * \
            bias_momentums_corrected / \
                (np.sqrt(bias_cache_corrected) + self.epsilon)

    # Call once after any parameter updates
    def post_update_params ( self ):
        """Count one finished step; the counter drives decay and bias correction."""
        self.iterations += 1
        
        

# Multiple passes: train with Adam (learning_rate=0.05, decay=5e-7)
X, y = spiral_data(samples=100, classes=3)

# Model pieces: dense(2, 64) -> ReLU -> dense(64, 3) -> softmax + cross-entropy
dense1 = NN.Layer_Dense(2,64)
activation1 = NN.Activation_ReLU()
dense2 = NN.Layer_Dense(64,3)
loss_activation = NN.Activation_Softmax_Loss_CategoricalCrossentropy()
optimizer = Optimizer_Adam(learning_rate=0.05, decay = 5e-7)

for epoch in range(10001):

    # Forward pass
    dense1.forward(X)
    activation1.forward(dense1.output)
    dense2.forward(activation1.output)
    loss = loss_activation.forward(dense2.output,y)

    # Accuracy: arg-max prediction compared with class-index targets
    predictions = np.argmax(loss_activation.output, axis = 1 )
    if len (y.shape) == 2 :
        y = np.argmax(y, axis = 1 )
    accuracy = np.mean(predictions == y)

    # Print every 100th epoch.
    # The lr shown is the one set by the previous pre_update_params() call,
    # because the rate for this epoch is only recomputed further down.
    if not epoch % 100 :
        print (f'epoch: {epoch} , ' +
               f'acc: {accuracy :.3f}, ' +
               f'loss: {loss :.3f}, ' +   # separator added so loss and lr do not run together
               f'lr: {optimizer.current_learning_rate}')

    # Backward pass
    loss_activation.backward(loss_activation.output, y)
    dense2.backward(loss_activation.dinputs)
    activation1.backward(dense2.dinputs)
    dense1.backward(activation1.dinputs)

    # Update weights and biases (pre/post hooks drive the decay schedule)
    optimizer.pre_update_params()
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.post_update_params()

epoch: 0 , acc: 0.377, loss: 1.099lr: 0.05
epoch: 100 , acc: 0.733, loss: 0.664lr: 0.04999752512250644
epoch: 200 , acc: 0.787, loss: 0.506lr: 0.04999502549496326
epoch: 300 , acc: 0.803, loss: 0.438lr: 0.049992526117345455
epoch: 400 , acc: 0.810, loss: 0.409lr: 0.04999002698961558
epoch: 500 , acc: 0.840, loss: 0.377lr: 0.049987528111736124
epoch: 600 , acc: 0.863, loss: 0.356lr: 0.049985029483669646
epoch: 700 , acc: 0.850, loss: 0.345lr: 0.049982531105378675
epoch: 800 , acc: 0.897, loss: 0.292lr: 0.04998003297682575
epoch: 900 , acc: 0.903, loss: 0.260lr: 0.049977535097973466
epoch: 1000 , acc: 0.897, loss: 0.250lr: 0.049975037468784345
epoch: 1100 , acc: 0.917, loss: 0.236lr: 0.049972540089220974
epoch: 1200 , acc: 0.917, loss: 0.220lr: 0.04997004295924593
epoch: 1300 , acc: 0.920, loss: 0.209lr: 0.04996754607882181
epoch: 1400 , acc: 0.927, loss: 0.206lr: 0.049965049447911185
epoch: 1500 , acc: 0.913, loss: 0.197lr: 0.04996255306647668
epoch: 1600 , acc: 0.930, loss: 0.185lr: 0.