<a href="https://colab.research.google.com/github/chosh84/ml_study/blob/main/NNFS/notebooks/nnfs_modelobject_category.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install nnfs



In [None]:
import numpy as np
import nnfs
from nnfs.datasets import spiral_data

nnfs.init()

In [None]:
# Dense (fully connected) layer class
class Layer_Dense:
  """Fully connected layer computing ``inputs . weights + biases``.

  The layer also stores optional L1/L2 regularization strengths; their
  gradient contributions are added to the parameter gradients in
  ``backward``.
  """

  def __init__(self, n_inputs, n_neurons,
               weight_regularizer_l1=0, weight_regularizer_l2=0,
               bias_regularizer_l1=0, bias_regularizer_l2=0):
    """Create the parameters and remember the regularization strengths.

    Args:
      n_inputs: number of rows of the weight matrix.
      n_neurons: number of columns of the weight matrix (and of biases).
      weight_regularizer_l1, weight_regularizer_l2: penalty strengths for
        the weights; a term is only applied when its strength is > 0.
      bias_regularizer_l1, bias_regularizer_l2: same, for the biases.
    """
    # Regularization strengths
    self.weight_regularizer_l1 = weight_regularizer_l1
    self.weight_regularizer_l2 = weight_regularizer_l2
    self.bias_regularizer_l1 = bias_regularizer_l1
    self.bias_regularizer_l2 = bias_regularizer_l2
    # Small random weights, zero biases stored as a single row
    self.weights = 0.01 * np.random.randn(n_inputs, n_neurons)
    self.biases = np.zeros((1, n_neurons))

  @staticmethod
  def _l1_direction(params):
    """Return an array shaped like ``params``: -1 where negative, else 1."""
    direction = np.ones_like(params)
    direction[params < 0] = -1
    return direction

  def forward(self, inputs, training):
    """Store ``inputs`` and compute ``self.output``; ``training`` is unused."""
    # Inputs are kept because backward needs them for the weight gradient
    self.inputs = inputs
    self.output = np.dot(inputs, self.weights) + self.biases

  def backward(self, dvalues):
    """Compute ``dweights``, ``dbiases`` and ``dinputs`` from ``dvalues``."""
    # Gradients of the data loss with respect to the parameters
    self.dweights = np.dot(self.inputs.T, dvalues)
    self.dbiases = np.sum(dvalues, axis=0, keepdims=True)

    # Weight penalties: L1 first, then L2
    if self.weight_regularizer_l1 > 0:
      self.dweights += self.weight_regularizer_l1 * \
                       self._l1_direction(self.weights)
    if self.weight_regularizer_l2 > 0:
      self.dweights += 2 * self.weight_regularizer_l2 * self.weights

    # Bias penalties: L1 first, then L2
    if self.bias_regularizer_l1 > 0:
      self.dbiases += self.bias_regularizer_l1 * \
                      self._l1_direction(self.biases)
    if self.bias_regularizer_l2 > 0:
      self.dbiases += 2 * self.bias_regularizer_l2 * self.biases

    # Gradient handed to the previous object in the chain
    self.dinputs = np.dot(dvalues, self.weights.T)

In [None]:
# Input "layer"
class Layer_Input:
  """Placeholder "layer" that exposes the raw inputs as its output."""

  def forward(self, inputs, training):
    """Expose ``inputs`` unchanged as ``self.output``; ``training`` is unused."""
    self.output = inputs

In [None]:
# Dropout
class Layer_Dropout:
  """Dropout layer that masks and rescales its inputs during training."""

  def __init__(self, rate):
    """Store the keep probability derived from the dropout ``rate``."""
    # ``rate`` is the fraction to drop; what is stored is its complement,
    # e.g. a dropout of 0.1 is kept as a success rate of 0.9
    self.rate = 1 - rate

  def forward(self, inputs, training):
    """Apply a freshly sampled scaled mask when ``training`` is truthy.

    Outside of training the output is simply a copy of the inputs.
    """
    self.inputs = inputs

    if training:
      keep_probability = self.rate
      # Bernoulli mask divided by the keep probability
      self.binary_mask = np.random.binomial(
          1, keep_probability, size=inputs.shape) / keep_probability
      self.output = inputs * self.binary_mask
    else:
      self.output = inputs.copy()

  def backward(self, dvalues):
    """Route ``dvalues`` through the mask saved by the last training forward."""
    self.dinputs = dvalues * self.binary_mask

In [None]:
# ReLU activation class
class Activation_ReLU:
  """Rectified linear activation: ``max(0, x)`` element-wise."""

  def forward(self, inputs, training):
    """Store ``inputs`` and compute ``self.output``; ``training`` is unused."""
    # The inputs are needed again in backward to know where to cut gradients
    self.inputs = inputs
    self.output = np.maximum(0, inputs)

  def backward(self, dvalues):
    """Pass ``dvalues`` through, zeroed wherever the input was <= 0."""
    blocked = self.inputs <= 0
    # Work on a copy so the caller's array is left untouched
    gradient = dvalues.copy()
    gradient[blocked] = 0
    self.dinputs = gradient

  def predictions(self, outputs):
    """Return ``outputs`` unchanged."""
    return outputs

In [None]:
# Softmax activation class
# Converts the outputs into probabilities (normalized so that they sum to 1)
class Activation_Softmax:
  """Softmax activation: turns each row of scores into values summing to 1."""

  def forward(self, inputs, training):
    """Compute row-wise softmax into ``self.output``; ``training`` is unused."""
    # Subtract the row maximum before exponentiating
    shifted = inputs - np.max(inputs, axis=1, keepdims=True)
    exp_values = np.exp(shifted)
    # Normalize every sample by its own sum
    self.output = exp_values / np.sum(exp_values, axis=1, keepdims=True)

  def backward(self, dvalues):
    """Apply each sample's softmax Jacobian to its row of ``dvalues``."""
    # Uninitialized buffer, filled row by row below
    self.dinputs = gradients = np.empty_like(dvalues)
    for row, (probabilities, upstream) in enumerate(zip(self.output, dvalues)):
      # Column vector of this sample's outputs
      column = probabilities.reshape(-1, 1)
      # Jacobian of softmax for this sample: diag(s) - s . s^T
      jacobian = np.diagflat(column) - np.dot(column, column.T)
      gradients[row] = np.dot(jacobian, upstream)

  def predictions(self, outputs):
    """Return the index of the largest value in every row."""
    return np.argmax(outputs, axis=1)


In [None]:
# Sigmoid activation
class Activation_Sigmoid:
  """Sigmoid activation: ``1 / (1 + exp(-x))`` element-wise."""

  def forward(self, inputs, training):
    """Store ``inputs`` and the sigmoid of them; ``training`` is unused."""
    self.inputs = inputs
    denominator = 1 + np.exp(-inputs)
    self.output = 1 / denominator

  def backward(self, dvalues):
    """Scale ``dvalues`` by the sigmoid derivative, taken from the output."""
    activated = self.output
    self.dinputs = dvalues * (1 - activated) * activated

  def predictions(self, outputs):
    """Return 1 where an output exceeds 0.5 and 0 elsewhere."""
    above_threshold = outputs > 0.5
    return above_threshold * 1

In [None]:
# Linear Activation
class Activation_Linear:
  """Identity activation: the output is the input itself."""

  def forward(self, inputs, training):
    """Remember ``inputs`` and expose them as the output; ``training`` is unused."""
    self.inputs = self.output = inputs

  def backward(self, dvalues):
    """Hand back a copy of ``dvalues`` (the derivative of identity is 1)."""
    self.dinputs = dvalues.copy()

  def predictions(self, outputs):
    """Return ``outputs`` unchanged."""
    return outputs

In [None]:
# Common loss class
class Loss:
  """Shared loss logic; subclasses are expected to provide ``forward``."""

  def calculate(self, output, y, *, include_regularization=False):
    """Return the mean of the per-sample losses from ``self.forward``.

    With ``include_regularization`` truthy, a ``(data_loss,
    regularization_loss)`` tuple is returned instead of the bare data loss.
    """
    data_loss = np.mean(self.forward(output, y))

    if include_regularization:
      return data_loss, self.regularization_loss()
    return data_loss

  def remember_trainable_layers(self, trainable_layers):
    """Keep a reference to the layers used by ``regularization_loss``."""
    self.trainable_layers = trainable_layers

  def regularization_loss(self):
    """Sum the enabled L1/L2 penalties over all remembered layers."""
    total = 0

    for layer in self.trainable_layers:
      weights = layer.weights
      biases = layer.biases

      # Each term only counts when its strength is greater than 0
      if layer.weight_regularizer_l1 > 0:
        total += layer.weight_regularizer_l1 * np.sum(np.abs(weights))
      if layer.weight_regularizer_l2 > 0:
        total += layer.weight_regularizer_l2 * np.sum(weights * weights)
      if layer.bias_regularizer_l1 > 0:
        total += layer.bias_regularizer_l1 * np.sum(np.abs(biases))
      if layer.bias_regularizer_l2 > 0:
        total += layer.bias_regularizer_l2 * np.sum(biases * biases)

    return total

In [None]:
# Cross-entropy loss

class Loss_CategoricalCrossentropy(Loss):
  """Categorical cross-entropy loss.

  Labels may be given either as a 1-D array of class indices or as a
  2-D one-hot array.
  """

  def forward(self, y_pred, y_true):
    """Return the negative log-likelihood of the target class per sample.

    Args:
      y_pred: predicted values, one row per sample.
      y_true: 1-D class indices or 2-D one-hot labels.

    Raises:
      ValueError: if ``y_true`` is neither 1-D nor 2-D.
    """
    # Number of samples in a batch
    samples = len(y_pred)

    # Clip on both sides so log(0) never occurs and the mean is not
    # dragged towards either end
    y_pred_clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)

    label_dims = len(y_true.shape)
    if label_dims == 1:
      # Sparse labels: pick the confidence at the target index
      correct_confidences = y_pred_clipped[range(samples), y_true]
    elif label_dims == 2:
      # One-hot labels: mask out everything but the target class
      correct_confidences = np.sum(y_pred_clipped * y_true, axis=1)
    else:
      # Fail with a clear message instead of an unbound local below
      raise ValueError(
          f'y_true must be 1-D or 2-D, got {label_dims} dimensions')

    return -np.log(correct_confidences)

  def backward(self, dvalues, y_true):
    """Store the gradient of the loss w.r.t. ``dvalues`` in ``self.dinputs``.

    Args:
      dvalues: the predicted values the loss was computed on.
      y_true: 1-D class indices or 2-D one-hot labels.
    """
    # Number of samples
    samples = len(dvalues)
    # Number of labels in every sample, counted on the first sample
    labels = len(dvalues[0])

    # Sparse labels are expanded to one-hot vectors
    if len(y_true.shape) == 1:
      y_true = np.eye(labels)[y_true]

    # Gradient, normalized by the number of samples
    self.dinputs = -y_true / dvalues
    self.dinputs = self.dinputs / samples

In [None]:
# Softmax Classifier - combined Softmax activation
# and cross-entropy loss for faster backward step
class Activation_Softmax_Loss_CategoricalCrossentropy():
  """Combined Softmax + categorical cross-entropy backward step.

  Only the backward pass is provided here: it computes the gradient of
  the combined activation and loss directly from the softmax outputs,
  which is faster than chaining the two separate backward passes.
  """

  def backward(self, dvalues, y_true):
    """Store the combined gradient in ``self.dinputs``.

    Args:
      dvalues: softmax outputs, one row per sample; not modified.
      y_true: 1-D class indices or 2-D one-hot labels.
    """
    # Number of samples
    samples = len(dvalues)
    # One-hot labels are turned into discrete class indices
    if len(y_true.shape) == 2:
      y_true = np.argmax(y_true, axis=1)
    # Copy so the caller's array can be modified safely
    self.dinputs = dvalues.copy()
    # Gradient: predicted value minus 1 at the target class
    self.dinputs[range(samples), y_true] -= 1
    # Normalize by the number of samples
    self.dinputs = self.dinputs / samples
    

In [None]:
# Binary cross-entropy loss
class Loss_BinaryCrossentropy(Loss):
  """Binary cross-entropy loss, averaged over the outputs of each sample."""

  def forward(self, y_pred, y_true):
    """Return one loss value per sample (mean over axis 1)."""
    # Keep predictions away from 0 and 1 on both sides so neither log
    # blows up and the mean is not dragged towards any value
    clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)

    per_output = -(y_true * np.log(clipped) +
                   (1 - y_true) * np.log(1 - clipped))
    return np.mean(per_output, axis=1)

  def backward(self, dvalues, y_true):
    """Store the gradient of the loss w.r.t. ``dvalues`` in ``self.dinputs``."""
    samples = len(dvalues)
    # Output count is taken from the first sample
    outputs = len(dvalues[0])

    # Same two-sided clipping as in forward, here to avoid dividing by 0
    clipped = np.clip(dvalues, 1e-7, 1 - 1e-7)

    positive_term = y_true / clipped
    negative_term = (1 - y_true) / (1 - clipped)
    per_output_gradient = -(positive_term - negative_term) / outputs

    # Normalize by the number of samples
    self.dinputs = per_output_gradient / samples


In [None]:
# Mean Squared Error loss
class Loss_MeanSquaredError(Loss):  # L2 loss
  """Mean squared error, averaged over the last axis of each sample."""

  def forward(self, y_pred, y_true):
    """Return the mean squared difference per sample."""
    residual = y_true - y_pred
    return np.mean(residual**2, axis=-1)

  def backward(self, dvalues, y_true):
    """Store the gradient of the loss w.r.t. ``dvalues`` in ``self.dinputs``."""
    samples = len(dvalues)
    # Output count is taken from the first sample
    outputs = len(dvalues[0])

    residual = y_true - dvalues
    per_output_gradient = -2 * residual / outputs

    # Normalize by the number of samples
    self.dinputs = per_output_gradient / samples


In [None]:
# Mean Absolute Error loss
class Loss_MeanAbsoluteError(Loss): # L1 loss
  """Mean absolute error, averaged over the last axis of each sample."""

  def forward(self, y_pred, y_true):
    """Return the mean absolute difference per sample."""
    sample_losses = np.mean(np.abs(y_true - y_pred), axis=-1)

    # Return losses
    return sample_losses

  def backward(self, dvalues, y_true):
    """Store the gradient of the loss w.r.t. ``dvalues`` in ``self.dinputs``.

    Args:
      dvalues: the predicted values the loss was computed on.
      y_true: ground-truth values with a shape compatible with ``dvalues``.
    """
    # Number of samples
    samples = len(dvalues)
    # Number of outputs in every sample, counted on the first sample
    outputs = len(dvalues[0])

    # d|y_true - y_pred| / d y_pred = sign(y_pred - y_true): the gradient
    # must point in the direction that increases the loss, so a prediction
    # above its target gets a positive gradient (same convention as the
    # mean squared error backward pass)
    self.dinputs = np.sign(dvalues - y_true) / outputs
    # Normalize gradient
    self.dinputs = self.dinputs / samples


In [None]:
# SGD optimizer
class Optimizer_SGD:
  """Stochastic gradient descent with optional decay and momentum."""

  def __init__(self, learning_rate=1., decay=0., momentum=0.):
    """Store the settings; the iteration counter starts at 0."""
    self.learning_rate = learning_rate
    self.current_learning_rate = learning_rate
    self.decay = decay
    self.momentum = momentum
    self.iterations = 0

  def pre_update_params(self):
    """Refresh the decayed learning rate; call once before the updates."""
    if not self.decay:
      return
    self.current_learning_rate = self.learning_rate * (1. / (1. + self.decay * self.iterations))

  def update_params(self, layer):
    """Apply one update to ``layer.weights`` and ``layer.biases`` in place."""
    rate = self.current_learning_rate

    if not self.momentum:
      # Plain SGD step
      weight_step = -rate * layer.dweights
      bias_step = -rate * layer.dbiases
    else:
      # Momentum buffers are created lazily, weights and biases together
      if not hasattr(layer, 'weight_momentums'):
        layer.weight_momentums = np.zeros_like(layer.weights)
        layer.bias_momentums = np.zeros_like(layer.biases)

      # Previous step scaled by the retain factor, minus the current
      # gradient step; the result is remembered for the next call
      weight_step = self.momentum * layer.weight_momentums - rate * layer.dweights
      bias_step = self.momentum * layer.bias_momentums - rate * layer.dbiases
      layer.weight_momentums = weight_step
      layer.bias_momentums = bias_step

    layer.weights += weight_step
    layer.biases += bias_step

  def post_update_params(self):
    """Advance the iteration counter; call once after the updates."""
    self.iterations += 1

In [None]:
# Adagrad optimizer
class Optimizer_Adagrad:
  """Adagrad optimizer: per-parameter steps scaled by accumulated squares."""

  def __init__(self, learning_rate=1., decay=0., epsilon=1e-7):
    """Store the settings; the iteration counter starts at 0."""
    self.learning_rate = learning_rate
    self.current_learning_rate = learning_rate
    self.decay = decay
    self.epsilon = epsilon
    self.iterations = 0

  def pre_update_params(self):
    """Refresh the decayed learning rate; call once before the updates."""
    if not self.decay:
      return
    self.current_learning_rate = self.learning_rate * (1. / (1. + self.decay * self.iterations))

  def update_params(self, layer):
    """Apply one update to ``layer.weights`` and ``layer.biases`` in place."""
    # Cache arrays are created lazily, filled with zeros
    if not hasattr(layer, 'weight_cache'):
      layer.weight_cache = np.zeros_like(layer.weights)
      layer.bias_cache = np.zeros_like(layer.biases)

    # Accumulate the squared gradients
    layer.weight_cache += layer.dweights**2
    layer.bias_cache += layer.dbiases**2

    # SGD-style step divided by the square-rooted cache
    rate = self.current_learning_rate
    weight_scale = np.sqrt(layer.weight_cache) + self.epsilon
    bias_scale = np.sqrt(layer.bias_cache) + self.epsilon
    layer.weights += -rate * layer.dweights / weight_scale
    layer.biases += -rate * layer.dbiases / bias_scale

  def post_update_params(self):
    """Advance the iteration counter; call once after the updates."""
    self.iterations += 1

In [None]:
# RMSprop optimizer
class Optimizer_RMSprop:
  """RMSprop optimizer: steps scaled by a moving average of squared gradients."""

  def __init__(self, learning_rate=0.001, decay=0., epsilon=1e-7, rho=0.9):
    """Store the settings; the iteration counter starts at 0."""
    self.learning_rate = learning_rate
    self.current_learning_rate = learning_rate
    self.decay = decay
    self.epsilon = epsilon
    self.rho = rho
    self.iterations = 0

  def pre_update_params(self):
    """Refresh the decayed learning rate; call once before the updates."""
    if not self.decay:
      return
    self.current_learning_rate = self.learning_rate * (1. / (1. + self.decay * self.iterations))

  def update_params(self, layer):
    """Apply one update to ``layer.weights`` and ``layer.biases`` in place."""
    # Cache arrays are created lazily, filled with zeros
    if not hasattr(layer, 'weight_cache'):
      layer.weight_cache = np.zeros_like(layer.weights)
      layer.bias_cache = np.zeros_like(layer.biases)

    # Blend the old cache with the squared current gradients
    layer.weight_cache = self.rho * layer.weight_cache + (1 - self.rho) * layer.dweights**2
    layer.bias_cache = self.rho * layer.bias_cache + (1 - self.rho) * layer.dbiases**2

    # SGD-style step divided by the square-rooted cache
    rate = self.current_learning_rate
    weight_scale = np.sqrt(layer.weight_cache) + self.epsilon
    bias_scale = np.sqrt(layer.bias_cache) + self.epsilon
    layer.weights += -rate * layer.dweights / weight_scale
    layer.biases += -rate * layer.dbiases / bias_scale

  def post_update_params(self):
    """Advance the iteration counter; call once after the updates."""
    self.iterations += 1

In [None]:
# Adam optimizer
class Optimizer_Adam:
  """Adam optimizer: bias-corrected momentum and squared-gradient cache."""

  def __init__(self, learning_rate=0.001, decay=0., epsilon=1e-7, beta_1=0.9, beta_2=0.999):
    """Store the settings; the iteration counter starts at 0."""
    self.learning_rate = learning_rate
    self.current_learning_rate = learning_rate
    self.decay = decay
    self.epsilon = epsilon
    self.beta_1 = beta_1
    self.beta_2 = beta_2
    self.iterations = 0

  def pre_update_params(self):
    """Refresh the decayed learning rate; call once before the updates."""
    if not self.decay:
      return
    self.current_learning_rate = self.learning_rate * (1. / (1. + self.decay * self.iterations))

  def update_params(self, layer):
    """Apply one update to ``layer.weights`` and ``layer.biases`` in place."""
    # Momentum and cache arrays are created lazily, filled with zeros
    if not hasattr(layer, 'weight_cache'):
      layer.weight_momentums = np.zeros_like(layer.weights)
      layer.weight_cache = np.zeros_like(layer.weights)
      layer.bias_momentums = np.zeros_like(layer.biases)
      layer.bias_cache = np.zeros_like(layer.biases)

    # Moving averages of the gradients and of their squares
    layer.weight_momentums = self.beta_1 * layer.weight_momentums + (1 - self.beta_1) * layer.dweights
    layer.bias_momentums = self.beta_1 * layer.bias_momentums + (1 - self.beta_1) * layer.dbiases
    layer.weight_cache = self.beta_2 * layer.weight_cache + (1 - self.beta_2) * layer.dweights**2
    layer.bias_cache = self.beta_2 * layer.bias_cache + (1 - self.beta_2) * layer.dbiases**2

    # The iteration counter is 0 on the first pass, the correction terms
    # need a step number that starts at 1
    step = self.iterations + 1
    momentum_correction = 1 - self.beta_1 ** step
    cache_correction = 1 - self.beta_2 ** step

    weight_momentums_corrected = layer.weight_momentums / momentum_correction
    bias_momentums_corrected = layer.bias_momentums / momentum_correction
    weight_cache_corrected = layer.weight_cache / cache_correction
    bias_cache_corrected = layer.bias_cache / cache_correction

    # Corrected momentum step divided by the square-rooted corrected cache
    rate = self.current_learning_rate
    layer.weights += -rate * weight_momentums_corrected / (np.sqrt(weight_cache_corrected) + self.epsilon)
    layer.biases += -rate * bias_momentums_corrected / (np.sqrt(bias_cache_corrected) + self.epsilon)

  def post_update_params(self):
    """Advance the iteration counter; call once after the updates."""
    self.iterations += 1

In [None]:
# Common Accuracy class
class Accuracy:
  """Shared accuracy logic; subclasses are expected to provide ``compare``."""

  def calculate(self, predictions, y):
    """Return the mean of ``self.compare(predictions, y)``."""
    matches = self.compare(predictions, y)
    return np.mean(matches)

In [None]:
# Accuracy calculation for regression model
class Accuracy_Regression(Accuracy):
  """Accuracy for regression: a prediction counts when it is close enough."""

  def __init__(self):
    # Tolerance used by compare; filled in by init()
    self.precision = None

  def init(self, y, reinit=False):
    """Derive the tolerance from ``y`` if unset, or always when ``reinit``."""
    if self.precision is not None and not reinit:
      return
    self.precision = np.std(y) / 250

  def compare(self, predictions, y):
    """Return a boolean array: True where the absolute error is below the tolerance."""
    absolute_error = np.absolute(predictions - y)
    return absolute_error < self.precision

In [None]:
# Accuracy calculation for classification model
class Accuracy_Categorical(Accuracy):
  """Accuracy for classification: a prediction counts when it equals the label."""

  def __init__(self, *, binary=False):
    # In binary mode the labels are compared as given
    self.binary = binary

  def init(self, y):
    """Nothing to prepare for this accuracy type."""
    pass

  def compare(self, predictions, y):
    """Return ``predictions == y``, reducing one-hot labels to indices first.

    The one-hot reduction is skipped entirely in binary mode.
    """
    if not self.binary:
      if len(y.shape) == 2:
        y = np.argmax(y, axis=1)
    return predictions == y

In [None]:
# Model class
class Model:
  """Sequential container: holds layers, wires them up and trains them.

  Typical use: ``add`` the layers in order, ``set`` the loss, optimizer
  and accuracy objects, call ``finalize`` once, then ``train``.
  """

  def __init__(self):
    # List of network objects, in forward order
    self.layers = []
    # Combined softmax/cross-entropy object, created in finalize when
    # the last layer and the loss allow it
    self.softmax_classifier_output = None

  def add(self, layer):
    """Append ``layer`` to the end of the network."""
    self.layers.append(layer)

  def set(self, *, loss, optimizer, accuracy):
    """Store the loss, optimizer and accuracy objects (keyword-only)."""
    self.loss = loss
    self.optimizer = optimizer
    self.accuracy = accuracy

  def finalize(self):
    """Link the layers together and collect the trainable ones.

    Every layer gets ``prev`` and ``next`` attributes; the first layer's
    ``prev`` is a fresh input layer and the last layer's ``next`` is the
    loss. A model with a single layer is linked to both at once.
    """
    # Create and set the input layer
    self.input_layer = Layer_Input()

    # Count all the objects
    layer_count = len(self.layers)

    # Layers that own parameters
    self.trainable_layers = []

    for i, layer in enumerate(self.layers):
      # The first layer is fed by the input layer, every other one by
      # its predecessor
      layer.prev = self.input_layer if i == 0 else self.layers[i - 1]

      # prev and next are decided independently so that a one-layer
      # model (first and last at the same time) is handled as well
      if i < layer_count - 1:
        layer.next = self.layers[i + 1]
      else:
        # The last layer - the next object is the loss; also keep a
        # reference to it, since its output is the model's output
        layer.next = self.loss
        self.output_layer_activation = layer

      # A layer with a "weights" attribute is trainable; checking for
      # biases as well is not needed
      if hasattr(layer, 'weights'):
        self.trainable_layers.append(layer)

    # Update loss object with trainable layers
    self.loss.remember_trainable_layers(
        self.trainable_layers
    )

    # Softmax output + categorical cross-entropy loss: use the combined
    # object, which has a faster gradient calculation
    if isinstance(self.layers[-1], Activation_Softmax) and \
       isinstance(self.loss, Loss_CategoricalCrossentropy):
      self.softmax_classifier_output = \
          Activation_Softmax_Loss_CategoricalCrossentropy()

  def train(self, X, y, *, epochs=1, print_every=1, validation_data=None):
    """Run the training loop and optionally evaluate on validation data.

    Args:
      X: training inputs passed to the forward pass.
      y: training targets.
      epochs: number of full passes; epochs are numbered from 1.
      print_every: print a summary every this many epochs; a falsy
        value (0 or None) disables the per-epoch summary.
      validation_data: optional ``(X_val, y_val)`` pair evaluated once
        after the last epoch with ``training=False``.
    """
    # Initialize accuracy object
    self.accuracy.init(y)

    # Main training loop
    for epoch in range(1, epochs+1):
      # Perform the forward pass
      output = self.forward(X, training=True)

      # Calculate loss
      data_loss, regularization_loss = \
          self.loss.calculate(output, y, include_regularization=True)

      loss = data_loss + regularization_loss

      # Get predictions and calculate an accuracy
      predictions = self.output_layer_activation.predictions(
                        output)
      accuracy = self.accuracy.calculate(predictions, y)

      # Perform backward pass
      self.backward(output, y)

      # Optimize (update parameters)
      self.optimizer.pre_update_params()
      for layer in self.trainable_layers:
        self.optimizer.update_params(layer)
      self.optimizer.post_update_params()

      # Print a summary; the truthiness check keeps print_every=0 from
      # raising ZeroDivisionError in the modulo
      if print_every and not epoch % print_every:
        print(f'epoch: {epoch}, ' +
              f'acc: {accuracy:.3f}, ' +
              f'loss: {loss:.3f} (' +
              f'data_loss: {data_loss:.3f}, ' +
              f'reg_loss: {regularization_loss:.3f}), ' +
              f'lr: {self.optimizer.current_learning_rate}')

    # If there is the validation data
    if validation_data is not None:
      # For better readability
      X_val, y_val = validation_data

      # Perform the forward pass
      output = self.forward(X_val, training=False)

      # Calculate the loss (data loss only)
      loss = self.loss.calculate(output, y_val)

      # Get predictions and calculate an accuracy
      predictions = self.output_layer_activation.predictions(output)
      accuracy = self.accuracy.calculate(predictions, y_val)

      # Print a summary
      print(f'validation, ' +
            f'acc: {accuracy:.3f}, ' +
            f'loss: {loss:.3f}')

  def forward(self, X, training):
    """Run ``X`` through every layer and return the last layer's output."""
    # The input layer sets the output property that the first layer
    # reads through its "prev" object
    self.input_layer.forward(X, training)

    # Each object consumes the output of the previous one
    for layer in self.layers:
      layer.forward(layer.prev.output, training)

    # "layer" is now the last object from the list
    return layer.output

  def backward(self, output, y):
    """Propagate gradients from the loss back through every layer."""
    # If softmax classifier
    if self.softmax_classifier_output is not None:
      # The combined activation/loss object computes dinputs directly
      self.softmax_classifier_output.backward(output, y)

      # The backward method of the last layer (the Softmax activation)
      # is not called in this path, so its dinputs are set from the
      # combined object instead
      self.layers[-1].dinputs = \
            self.softmax_classifier_output.dinputs

      # All objects but the last, in reversed order
      for layer in reversed(self.layers[:-1]):
        layer.backward(layer.next.dinputs)

      return

    # The loss sets the dinputs property that the last layer reads
    self.loss.backward(output, y)

    # All objects in reversed order, each fed by its successor's dinputs
    for layer in reversed(self.layers):
      layer.backward(layer.next.dinputs)
    


In [None]:
# Create train and test dataset
X, y = spiral_data(samples=1000, classes=3)
X_test, y_test = spiral_data(samples=100, classes=3)

# The labels are used exactly as returned by spiral_data (no reshape)

# Instantiate the model
model = Model()

# L2 penalty used by the first dense layer only
l2_penalty = dict(weight_regularizer_l2=5e-4, bias_regularizer_l2=5e-4)

# Add the layers in forward order
for layer in (
    Layer_Dense(2, 512, **l2_penalty),
    Activation_ReLU(),
    Layer_Dropout(0.1),
    Layer_Dense(512, 3),
    Activation_Softmax(),
):
  model.add(layer)

# Loss, optimizer and accuracy objects
model.set(
    loss=Loss_CategoricalCrossentropy(),
    optimizer=Optimizer_Adam(learning_rate=0.05, decay=5e-5),
    accuracy=Accuracy_Categorical(),
)

# Wire everything together, then train and validate on the test split
model.finalize()
model.train(X, y, validation_data=(X_test, y_test), epochs=10000, print_every=100)

epoch: 100, acc: 0.710, loss: 0.734 (data_loss: 0.671, reg_loss: 0.063), lr: 0.04975371909050202
epoch: 200, acc: 0.776, loss: 0.625 (data_loss: 0.547, reg_loss: 0.078), lr: 0.049507401356502806
epoch: 300, acc: 0.801, loss: 0.584 (data_loss: 0.506, reg_loss: 0.078), lr: 0.0492635105177595
epoch: 400, acc: 0.805, loss: 0.569 (data_loss: 0.492, reg_loss: 0.078), lr: 0.04902201088288642
epoch: 500, acc: 0.825, loss: 0.542 (data_loss: 0.464, reg_loss: 0.078), lr: 0.048782867456949125
epoch: 600, acc: 0.836, loss: 0.533 (data_loss: 0.460, reg_loss: 0.073), lr: 0.04854604592455945
epoch: 700, acc: 0.839, loss: 0.536 (data_loss: 0.459, reg_loss: 0.077), lr: 0.048311512633460556
epoch: 800, acc: 0.838, loss: 0.496 (data_loss: 0.424, reg_loss: 0.072), lr: 0.04807923457858551
epoch: 900, acc: 0.841, loss: 0.504 (data_loss: 0.434, reg_loss: 0.070), lr: 0.04784917938657352
epoch: 1000, acc: 0.836, loss: 0.499 (data_loss: 0.432, reg_loss: 0.068), lr: 0.04762131530072861
epoch: 1100, acc: 0.807, lo