<a href="https://colab.research.google.com/github/chosh84/ml_study/blob/main/NNFS/notebooks/nnfs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install nnfs



In [None]:
import numpy as np
import nnfs
from nnfs.datasets import spiral_data

nnfs.init()

In [None]:
# Create the dense-layer (neuron) class

class Layer_Dense:
  """Fully connected layer with optional L1/L2 penalties on its parameters."""

  def __init__(self, n_inputs, n_neurons,
               weight_regularizer_l1=0, weight_regularizer_l2=0,
               bias_regularizer_l1=0, bias_regularizer_l2=0):
    """Create the parameters and remember the regularization strengths.

    Weights are an (n_inputs, n_neurons) array drawn with np.random.randn
    and scaled by 0.01; biases are a (1, n_neurons) row of zeros.
    A regularizer value of 0 switches the corresponding penalty off.
    """
    # Trainable parameters
    self.weights = 0.01 * np.random.randn(n_inputs, n_neurons)
    self.biases = np.zeros((1, n_neurons))
    # Penalty strengths, grouped by the parameter they act on
    self.weight_regularizer_l1 = weight_regularizer_l1
    self.weight_regularizer_l2 = weight_regularizer_l2
    self.bias_regularizer_l1 = bias_regularizer_l1
    self.bias_regularizer_l2 = bias_regularizer_l2

  def forward(self, inputs):
    """Compute inputs . weights + biases and store it in self.output."""
    # The inputs are kept because backward() needs them for dweights
    self.inputs = inputs
    self.output = np.dot(inputs, self.weights) + self.biases

  def backward(self, dvalues):
    """Fill self.dweights, self.dbiases and self.dinputs from dvalues."""
    # Data gradients of the parameters
    self.dweights = np.dot(self.inputs.T, dvalues)
    self.dbiases = np.sum(dvalues, axis=0, keepdims=True)

    # Penalty gradients for the weights (L1 is added before L2)
    if self.weight_regularizer_l1 > 0:
      self.dweights += self.weight_regularizer_l1 * self._l1_direction(self.weights)
    if self.weight_regularizer_l2 > 0:
      self.dweights += 2 * self.weight_regularizer_l2 * self.weights

    # Penalty gradients for the biases (L1 is added before L2)
    if self.bias_regularizer_l1 > 0:
      self.dbiases += self.bias_regularizer_l1 * self._l1_direction(self.biases)
    if self.bias_regularizer_l2 > 0:
      self.dbiases += 2 * self.bias_regularizer_l2 * self.biases

    # Gradient handed to the previous layer
    self.dinputs = np.dot(dvalues, self.weights.T)

  @staticmethod
  def _l1_direction(values):
    """Return -1 where values < 0 and +1 elsewhere, in the dtype of values."""
    return np.where(values < 0, -1, 1).astype(values.dtype)

In [None]:
# Dropout
class Layer_Dropout:
  """Dropout layer that rescales the surviving activations by 1 / keep-rate."""

  def __init__(self, rate):
    """Store the keep probability, i.e. 1 - rate (dropout 0.1 -> keep 0.9)."""
    self.rate = 1 - rate

  def forward(self, inputs):
    """Draw a fresh mask, scale it and apply it to inputs."""
    self.inputs = inputs
    # One Bernoulli draw per element: 1 keeps the value, 0 drops it
    kept = np.random.binomial(1, self.rate, size=inputs.shape)
    # Dividing by the keep probability stores the mask already scaled
    self.binary_mask = kept / self.rate
    self.output = inputs * self.binary_mask

  def backward(self, dvalues):
    """Pass gradients through the mask saved by the last forward()."""
    self.dinputs = dvalues * self.binary_mask

In [None]:
# Create the ReLU activation class

class Activation_ReLU:
  """Rectified linear activation: element-wise max(0, x)."""

  def forward(self, inputs):
    """Store inputs and write max(0, inputs) to self.output."""
    # backward() needs the inputs to know where the gradient is blocked
    self.inputs = inputs
    self.output = np.maximum(0, inputs)

  def backward(self, dvalues):
    """Copy dvalues and zero every entry whose forward input was <= 0."""
    blocked = self.inputs <= 0
    # Work on a copy so the caller's gradient array stays untouched
    gradient = dvalues.copy()
    gradient[blocked] = 0
    self.dinputs = gradient

In [None]:
# Create the Softmax activation class
# Converts the outputs into probabilities (normalized so that they sum to 1)

class Activation_Softmax:
  """Softmax activation applied row by row (axis 1)."""

  def forward(self, inputs):
    """Write the row-wise softmax of inputs to self.output."""
    # Subtracting each row's maximum before exponentiating keeps the
    # largest exponent at 0
    shifted = inputs - np.max(inputs, axis=1, keepdims=True)
    exp_values = np.exp(shifted)
    # Divide every row by its own sum so that the row adds up to 1
    row_totals = np.sum(exp_values, axis=1, keepdims=True)
    self.output = exp_values / row_totals

  def backward(self, dvalues):
    """Multiply each sample's softmax Jacobian by its incoming gradient."""
    # Filled row by row below, so no initialization is needed
    self.dinputs = np.empty_like(dvalues)
    for row, (probs, upstream) in enumerate(zip(self.output, dvalues)):
      # Column vector of this sample's probabilities
      column = probs.reshape(-1, 1)
      # Jacobian of softmax: diag(p) - p p^T
      jacobian = np.diagflat(column) - np.dot(column, column.T)
      self.dinputs[row] = np.dot(jacobian, upstream)



In [None]:
# Common loss class
class Loss:
  """Base class for losses; subclasses supply forward(output, y)."""

  def calculate(self, output, y):
    """Return the mean of the per-sample losses from self.forward.

    Only the data loss is returned here; the regularization penalty is
    obtained separately through regularization_loss().
    """
    per_sample = self.forward(output, y)
    return np.mean(per_sample)

  def regularization_loss(self, layer):
    """Return the L1/L2 penalty of one layer's weights and biases.

    Each term is added only when its strength on the layer is greater
    than 0, so the result is the plain integer 0 when nothing is enabled.
    """
    penalty = 0
    weights = layer.weights
    biases = layer.biases

    # Weights: L1 uses the absolute values, L2 the squares
    if layer.weight_regularizer_l1 > 0:
      penalty += layer.weight_regularizer_l1 * np.sum(np.abs(weights))
    if layer.weight_regularizer_l2 > 0:
      penalty += layer.weight_regularizer_l2 * np.sum(weights * weights)

    # Biases: same two terms
    if layer.bias_regularizer_l1 > 0:
      penalty += layer.bias_regularizer_l1 * np.sum(np.abs(biases))
    if layer.bias_regularizer_l2 > 0:
      penalty += layer.bias_regularizer_l2 * np.sum(biases * biases)

    return penalty

In [None]:
# Cross-entropy loss

class Loss_CategoricalCrossentropy(Loss):
  """Categorical cross-entropy loss over per-class probabilities."""

  def forward(self, y_pred, y_true):
    """Return the negative log-likelihood of the true class per sample.

    y_true may hold class indices (1-D) or one-hot rows (2-D).
    Raises ValueError for labels of any other rank.
    """
    # Number of samples in a batch
    samples = len(y_pred)

    # Clip so that log() never receives exactly 0; the upper bound is
    # clipped by the same margin so the mean is not dragged to one side
    y_pred_clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)

    label_rank = len(y_true.shape)
    if label_rank == 1:
      # Sparse labels: pick the predicted probability of the target class
      correct_confidences = y_pred_clipped[range(samples), y_true]
    elif label_rank == 2:
      # One-hot labels: the mask keeps only the target-class probability
      correct_confidences = np.sum(y_pred_clipped * y_true, axis=1)
    else:
      # Previously this fell through to an UnboundLocalError
      raise ValueError(
          'y_true must be 1-D class indices or 2-D one-hot labels, '
          f'got an array with {label_rank} dimensions')

    return -np.log(correct_confidences)

  def backward(self, dvalues, y_true):
    """Store the gradient of the loss w.r.t. the predictions in self.dinputs."""
    # Number of samples
    samples = len(dvalues)
    # Number of labels per sample, counted on the first sample
    labels = len(dvalues[0])
    # If labels are sparse, turn them into one-hot vectors
    if len(y_true.shape) == 1:
      y_true = np.eye(labels)[y_true]
    # Gradient of -log(p) for the target class, zero elsewhere
    self.dinputs = -y_true / dvalues
    # Normalize by the batch size
    self.dinputs = self.dinputs / samples

In [None]:
# Softmax Classifier - combined Softmax activation
# and cross-entropy loss for faster backward step
class Activation_Softmax_Loss_CategoricalCrossentropy():
  """Softmax activation fused with categorical cross-entropy loss.

  Combining the two gives a simpler, faster backward step.
  """

  def __init__(self):
    """Create the wrapped activation and loss objects."""
    self.activation = Activation_Softmax()
    self.loss = Loss_CategoricalCrossentropy()

  def forward(self, inputs, y_true):
    """Run softmax on inputs, keep its output, and return the loss value."""
    softmax = self.activation
    softmax.forward(inputs)
    self.output = softmax.output
    return self.loss.calculate(self.output, y_true)

  def backward(self, dvalues, y_true):
    """Store (dvalues - one_hot(y_true)) / batch_size in self.dinputs."""
    batch_size = len(dvalues)
    # One-hot labels are reduced to class indices
    targets = y_true
    if len(targets.shape) == 2:
      targets = np.argmax(targets, axis=1)
    # Work on a copy so the caller's array is left as it was
    gradient = dvalues.copy()
    # Subtract 1 at each sample's target class
    gradient[np.arange(batch_size), targets] -= 1
    # Normalize by the batch size
    self.dinputs = gradient / batch_size
    

In [None]:
# SGD optimizer
class Optimizer_SGD:
  """Stochastic gradient descent with optional decay and momentum."""

  def __init__(self, learning_rate=1., decay=0., momentum=0.):
    """Store the settings; the learning rate defaults to 1.0."""
    self.learning_rate = learning_rate
    self.current_learning_rate = learning_rate
    self.decay = decay
    self.momentum = momentum
    self.iterations = 0

  def pre_update_params(self):
    """Refresh current_learning_rate; call once before any update_params."""
    if not self.decay:
      return
    self.current_learning_rate = self.learning_rate * (1. / (1. + self.decay * self.iterations))

  def update_params(self, layer):
    """Apply one update to layer.weights and layer.biases in place."""
    step = self.current_learning_rate

    if not self.momentum:
      # Vanilla SGD: step against the gradient
      weight_updates = -step * layer.dweights
      bias_updates = -step * layer.dbiases
    else:
      # Momentum arrays are created on first use; if the weight array is
      # missing, the bias array does not exist yet either
      if not hasattr(layer, 'weight_momentums'):
        layer.weight_momentums = np.zeros_like(layer.weights)
        layer.bias_momentums = np.zeros_like(layer.biases)

      # Previous update scaled by the retain factor, minus the current
      # gradient step; the result is remembered for the next call
      weight_updates = self.momentum * layer.weight_momentums - step * layer.dweights
      bias_updates = self.momentum * layer.bias_momentums - step * layer.dbiases
      layer.weight_momentums = weight_updates
      layer.bias_momentums = bias_updates

    layer.weights += weight_updates
    layer.biases += bias_updates

  def post_update_params(self):
    """Advance the iteration counter; call once after all update_params."""
    self.iterations += 1

In [None]:
# Adagrad optimizer
class Optimizer_Adagrad:
  """Adagrad: gradient steps scaled by the root of accumulated squared gradients."""

  def __init__(self, learning_rate=1., decay=0., epsilon=1e-7):
    """Store the optimizer settings."""
    self.learning_rate = learning_rate
    self.current_learning_rate = learning_rate
    self.decay = decay
    self.epsilon = epsilon
    self.iterations = 0

  def pre_update_params(self):
    """Refresh current_learning_rate; call once before any update_params."""
    if not self.decay:
      return
    self.current_learning_rate = self.learning_rate * (1. / (1. + self.decay * self.iterations))

  def update_params(self, layer):
    """Apply one update to layer.weights and layer.biases in place."""
    # Cache arrays are created, filled with zeros, on first use
    if not hasattr(layer, 'weight_cache'):
      layer.weight_cache = np.zeros_like(layer.weights)
      layer.bias_cache = np.zeros_like(layer.biases)

    # Accumulate the squared current gradients
    layer.weight_cache += layer.dweights**2
    layer.bias_cache += layer.dbiases**2

    # SGD step divided by the square-rooted cache; epsilon is added to
    # the denominator
    step = self.current_learning_rate
    weight_scale = np.sqrt(layer.weight_cache) + self.epsilon
    bias_scale = np.sqrt(layer.bias_cache) + self.epsilon
    layer.weights += -step * layer.dweights / weight_scale
    layer.biases += -step * layer.dbiases / bias_scale

  def post_update_params(self):
    """Advance the iteration counter; call once after all update_params."""
    self.iterations += 1

In [None]:
# RMSprop optimizer
class Optimizer_RMSprop:
  """RMSprop: gradient steps scaled by a moving average of squared gradients."""

  def __init__(self, learning_rate=0.001, decay=0., epsilon=1e-7, rho=0.9):
    """Store the optimizer settings; rho is the cache decay factor."""
    self.learning_rate = learning_rate
    self.current_learning_rate = learning_rate
    self.decay = decay
    self.epsilon = epsilon
    self.rho = rho
    self.iterations = 0

  def pre_update_params(self):
    """Refresh current_learning_rate; call once before any update_params."""
    if not self.decay:
      return
    self.current_learning_rate = self.learning_rate * (1. / (1. + self.decay * self.iterations))

  def update_params(self, layer):
    """Apply one update to layer.weights and layer.biases in place."""
    # Cache arrays are created, filled with zeros, on first use
    if not hasattr(layer, 'weight_cache'):
      layer.weight_cache = np.zeros_like(layer.weights)
      layer.bias_cache = np.zeros_like(layer.biases)

    keep = self.rho
    step = self.current_learning_rate

    # Blend the old cache with the squared current gradients
    layer.weight_cache = keep * layer.weight_cache + (1 - keep) * layer.dweights**2
    layer.bias_cache = keep * layer.bias_cache + (1 - keep) * layer.dbiases**2

    # SGD step divided by the square-rooted cache; epsilon is added to
    # the denominator
    weight_scale = np.sqrt(layer.weight_cache) + self.epsilon
    bias_scale = np.sqrt(layer.bias_cache) + self.epsilon
    layer.weights += -step * layer.dweights / weight_scale
    layer.biases += -step * layer.dbiases / bias_scale

  def post_update_params(self):
    """Advance the iteration counter; call once after all update_params."""
    self.iterations += 1

In [None]:
# Adam optimizer
class Optimizer_Adam:
  """Adam: momentum plus squared-gradient cache, both bias-corrected."""

  def __init__(self, learning_rate=0.001, decay=0., epsilon=1e-7, beta_1=0.9, beta_2=0.999):
    """Store the settings; beta_1 drives the momentum, beta_2 the cache."""
    self.learning_rate = learning_rate
    self.current_learning_rate = learning_rate
    self.decay = decay
    self.epsilon = epsilon
    self.beta_1 = beta_1
    self.beta_2 = beta_2
    self.iterations = 0

  def pre_update_params(self):
    """Refresh current_learning_rate; call once before any update_params."""
    if not self.decay:
      return
    self.current_learning_rate = self.learning_rate * (1. / (1. + self.decay * self.iterations))

  def update_params(self, layer):
    """Apply one update to layer.weights and layer.biases in place."""
    # Momentum and cache arrays are created, filled with zeros, on first use
    if not hasattr(layer, 'weight_cache'):
      layer.weight_momentums = np.zeros_like(layer.weights)
      layer.bias_momentums = np.zeros_like(layer.biases)
      layer.weight_cache = np.zeros_like(layer.weights)
      layer.bias_cache = np.zeros_like(layer.biases)

    # self.iterations is 0 on the first pass, but the corrections below
    # need a step count that starts at 1
    step_number = self.iterations + 1
    momentum_correction = 1 - self.beta_1 ** step_number
    cache_correction = 1 - self.beta_2 ** step_number

    # Moving average of the gradients, then its corrected version
    layer.weight_momentums = self.beta_1 * layer.weight_momentums + (1 - self.beta_1) * layer.dweights
    layer.bias_momentums = self.beta_1 * layer.bias_momentums + (1 - self.beta_1) * layer.dbiases
    weight_momentums_corrected = layer.weight_momentums / momentum_correction
    bias_momentums_corrected = layer.bias_momentums / momentum_correction

    # Moving average of the squared gradients, then its corrected version
    layer.weight_cache = self.beta_2 * layer.weight_cache + (1 - self.beta_2) * layer.dweights**2
    layer.bias_cache = self.beta_2 * layer.bias_cache + (1 - self.beta_2) * layer.dbiases**2
    weight_cache_corrected = layer.weight_cache / cache_correction
    bias_cache_corrected = layer.bias_cache / cache_correction

    # Step along the corrected momentum, divided by the square-rooted
    # corrected cache plus epsilon
    rate = self.current_learning_rate
    layer.weights += -rate * weight_momentums_corrected / (np.sqrt(weight_cache_corrected) + self.epsilon)
    layer.biases += -rate * bias_momentums_corrected / (np.sqrt(bias_cache_corrected) + self.epsilon)

  def post_update_params(self):
    """Advance the iteration counter; call once after all update_params."""
    self.iterations += 1

In [None]:
# Second version of the training code
# Load the dataset
X, y = spiral_data(samples=1000, classes=3)

# Create first Dense layer with 2 input features and 512 output values,
# with L2 regularization on both weights and biases
dense1 = Layer_Dense(2,512, weight_regularizer_l2=5e-4, bias_regularizer_l2=5e-4)

# Create dropout layer (dropout rate 0.1, i.e. keep probability 0.9)
dropout1 = Layer_Dropout(0.1)

# Create ReLU activation
activation1 = Activation_ReLU()

# Create second Dense layer with 512 input features (as we take output
# of previous layer here) and 3 output values
dense2 = Layer_Dense(512,3)

# Create the combined softmax activation + cross-entropy loss
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()

# Create optimizer (alternatives kept for experimentation)
#optimizer = Optimizer_SGD(decay=1e-3, momentum=0.9)
#optimizer = Optimizer_Adagrad(decay=1e-4)
#optimizer = Optimizer_RMSprop(learning_rate=0.02, decay=1e-5, rho=0.999)
optimizer = Optimizer_Adam(learning_rate=0.05, decay=5e-5)

# Train in loop
for epoch in range(10001):
  # Forward pass of the training data through the first Dense layer
  dense1.forward(X)

  # Forward pass through the ReLU activation,
  # which takes the output of the first Dense layer
  activation1.forward(dense1.output)

  # Forward pass through the Dropout layer
  dropout1.forward(activation1.output)
  
  # Forward pass through the second Dense layer,
  # which takes the output of the Dropout layer as its input
  dense2.forward(dropout1.output)

  # Softmax + cross-entropy forward pass on the second Dense layer's
  # output; returns the mean data loss
  data_loss = loss_activation.forward(dense2.output, y)

  # Calculate regularization penalty of both Dense layers
  regularization_loss = loss_activation.loss.regularization_loss(dense1) + loss_activation.loss.regularization_loss(dense2)

  # Calculate overall loss
  loss = data_loss + regularization_loss

  # Calculate accuracy from the softmax output and the targets;
  # argmax along axis 1 picks the predicted class of each sample
  predictions = np.argmax(loss_activation.output, axis=1)
  # Convert one-hot labels to class indices if needed
  if len(y.shape) == 2:
    y = np.argmax(y, axis=1)
  accuracy = np.mean(predictions == y)

  # Print progress every 100 epochs
  if not epoch % 100:
    print(f'epoch: {epoch}, ' + f'acc: {accuracy:.3f},  ' + 
          f'loss: {loss:.3f}, (' + 
          f'data_loss: {data_loss:.3f}, ' + 
          f'reg_loss: {regularization_loss:.3f}), ' + 
          f'lr: {optimizer.current_learning_rate}')

  # Backward pass, visiting the layers in reverse order of the forward pass
  loss_activation.backward(loss_activation.output, y)
  dense2.backward(loss_activation.dinputs)
  dropout1.backward(dense2.dinputs)
  activation1.backward(dropout1.dinputs)
  dense1.backward(activation1.dinputs)

  # Update weights and biases; pre/post hooks are called once per epoch
  # around the per-layer updates
  optimizer.pre_update_params()
  optimizer.update_params(dense1)
  optimizer.update_params(dense2)
  optimizer.post_update_params()

epoch: 0, acc: 0.373,  loss: 1.099, (data_loss: 1.099, reg_loss: 0.000), lr: 0.05
epoch: 100, acc: 0.715,  loss: 0.733, (data_loss: 0.672, reg_loss: 0.062), lr: 0.04975371909050202
epoch: 200, acc: 0.783,  loss: 0.621, (data_loss: 0.542, reg_loss: 0.079), lr: 0.049507401356502806
epoch: 300, acc: 0.813,  loss: 0.582, (data_loss: 0.501, reg_loss: 0.081), lr: 0.0492635105177595
epoch: 400, acc: 0.821,  loss: 0.568, (data_loss: 0.488, reg_loss: 0.080), lr: 0.04902201088288642
epoch: 500, acc: 0.832,  loss: 0.536, (data_loss: 0.458, reg_loss: 0.078), lr: 0.048782867456949125
epoch: 600, acc: 0.831,  loss: 0.527, (data_loss: 0.449, reg_loss: 0.078), lr: 0.04854604592455945
epoch: 700, acc: 0.805,  loss: 0.537, (data_loss: 0.463, reg_loss: 0.074), lr: 0.048311512633460556
epoch: 800, acc: 0.835,  loss: 0.497, (data_loss: 0.424, reg_loss: 0.073), lr: 0.04807923457858551
epoch: 900, acc: 0.842,  loss: 0.477, (data_loss: 0.405, reg_loss: 0.072), lr: 0.04784917938657352
epoch: 1000, acc: 0.820, 

In [None]:
# Validate the model

# Create test dataset
X_test, y_test = spiral_data(samples=100, classes=3)

# Forward pass of the testing data through the first Dense layer
dense1.forward(X_test)
# Forward pass through the ReLU activation,
# which takes the output of the first Dense layer
activation1.forward(dense1.output)

# Forward pass through the second Dense layer.
# dropout1 is not applied in this pass: dense2 takes the output of
# activation1 directly
dense2.forward(activation1.output)

# Forward pass through the combined activation/loss object, which takes
# the output of the second Dense layer and returns the loss
loss = loss_activation.forward(dense2.output, y_test)

# Calculate accuracy from the softmax output and the targets;
# argmax along axis 1 picks the predicted class of each sample
predictions = np.argmax(loss_activation.output, axis=1)

# Convert one-hot labels to class indices if needed
if len(y_test.shape) == 2:
  y_test = np.argmax(y_test, axis=1)
accuracy = np.mean(predictions == y_test)

print(f'validation, acc: {accuracy:.3f}, loss: {loss:.3f}')


validation, acc: 0.857, loss: 0.399


In [None]:
# Earlier single-pass version of the experiment, kept as a bare
# triple-quoted string: nothing inside it is executed.
'''
# 기존코드
#dataset 가지고 오기
X, y = spiral_data(samples=100, classes=3)

# Dense layer 만들기 2 input features and 3 output values
dense1 = Layer_Dense(2,3)

# ReLU Activation 만들기
activation1 = Activation_ReLU()

# Create second Dense Layer with 3 input features (as we take output of previous layer here) and 3 output values
dense2 = Layer_Dense(3,3)

# Create Softmax activation (to be used with Dense Layer)
#activation2 = Activation_Softmax()

# Create loss function
#loss_function = Loss_CategoricalCrossentropy()
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()

# Make a forward pass of our training data through this layer
dense1.forward(X)

# Forward pass through activation function
# takes in output from previous layer
activation1.forward(dense1.output)

# Make a forward pass through second Dense layer
# it takes output of activation function of first layer as intputs
dense2.forward(activation1.output)

# Make a forward pass through activation function
# it takes output of second Dense layer here
#activation2.forward(dense2.output)

#print(dense1.output[:5])
#print(activation2.output[:5])

# Perform a forward pass through activation function
# it takes the output of second dense layer here and returns loss
#loss = loss_function.calculate(activation2.output, y)

loss = loss_activation.forward(dense2.output, y)
print(loss_activation.output[:5])

print('loss: ', loss)

# Calculate accuracy from output of activation2 and targets
# calculate values along the first axis
#predictions = np.argmax(activation2.output, axis=1)
predictions = np.argmax(loss_activation.output, axis=1)
if len(y.shape) == 2:
  y = np.argmax(y, axis=1)
accuracy = np.mean(predictions == y)

# Print accuracy
print('acc: ', accuracy)

# Backward pass
loss_activation.backward(loss_activation.output, y)
dense2.backward(loss_activation.dinputs)
activation1.backward(dense2.dinputs)
dense1.backward(activation1.dinputs)

# Print graidents
print(dense1.dweights)
print(dense1.dbiases)
print(dense2.dweights)
print(dense2.dbiases)
'''

"\n# 기존코드\n#dataset 가지고 오기\nX, y = spiral_data(samples=100, classes=3)\n\n# Dense layer 만들기 2 input features and 3 output values\ndense1 = Layer_Dense(2,3)\n\n# ReLU Activation 만들기\nactivation1 = Activation_ReLU()\n\n# Create second Dense Layer with 3 input features (as we take output of previous layer here) and 3 output values\ndense2 = Layer_Dense(3,3)\n\n# Create Softmax activation (to be used with Dense Layer)\n#activation2 = Activation_Softmax()\n\n# Create loss function\n#loss_function = Loss_CategoricalCrossentropy()\nloss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()\n\n# Make a forward pass of our training data through this layer\ndense1.forward(X)\n\n# Forward pass through activation function\n# takes in output from previous layer\nactivation1.forward(dense1.output)\n\n# Make a forward pass through second Dense layer\n# it takes output of activation function of first layer as intputs\ndense2.forward(activation1.output)\n\n# Make a forward pass through activat

In [None]:
# Test the new combined loss and activation class.
# The comparison below is kept as a bare triple-quoted string, so none of
# it is executed.
'''
softmax_outputs = np.array([[0.7, 0.1, 0.2],
                            [0.1, 0.5, 0.4],
                            [0.02, 0.9, 0.08]])


class_targets = np.array([0,1,1])

softmax_loss = Activation_Softmax_Loss_CategoricalCrossentropy()
softmax_loss.backward(softmax_outputs, class_targets)
dvalues1 = softmax_loss.dinputs

activation = Activation_Softmax()
activation.output = softmax_outputs
loss = Loss_CategoricalCrossentropy()
loss.backward(softmax_outputs, class_targets)
activation.backward(loss.dinputs)
dvalues2 = activation.dinputs

print('Gradients: combined loss and activation')
print(dvalues1)
print('Gradients: seperate loss and activation')
print(dvalues2)
'''

"\nsoftmax_outputs = np.array([[0.7, 0.1, 0.2],\n                            [0.1, 0.5, 0.4],\n                            [0.02, 0.9, 0.08]])\n\n\nclass_targets = np.array([0,1,1])\n\nsoftmax_loss = Activation_Softmax_Loss_CategoricalCrossentropy()\nsoftmax_loss.backward(softmax_outputs, class_targets)\ndvalues1 = softmax_loss.dinputs\n\nactivation = Activation_Softmax()\nactivation.output = softmax_outputs\nloss = Loss_CategoricalCrossentropy()\nloss.backward(softmax_outputs, class_targets)\nactivation.backward(loss.dinputs)\ndvalues2 = activation.dinputs\n\nprint('Gradients: combined loss and activation')\nprint(dvalues1)\nprint('Gradients: seperate loss and activation')\nprint(dvalues2)\n"