In [9]:
import numpy as np
import matplotlib.pyplot
import nnfs
from nnfs.datasets import spiral_data
from NNS import NeuralNetwork as NN #import neural net code from github to reduce copy/pasting

# Chapter 9

## Backpropagation


In [None]:
# --- Forward pass of a single neuron with three inputs ---
x = [1.0, -2.0, 3.0]   # input values
w = [-3.0, -1.0, 2.0]  # weights
b = 1.0                # bias

# Weight every input: one product per (input, weight) pair
xw0, xw1, xw2 = (x_i * w_i for x_i, w_i in zip(x, w))
# Sum of the weighted inputs plus the bias
z = xw0 + xw1 + xw2 + b
# ReLU activation
y = max(z, 0)


# --- Backward pass ---
# Gradient arriving from the next layer
dvalue = 1.0
# ReLU derivative (1 where z > 0, otherwise 0) chained with the incoming gradient
drelu_dz = dvalue * (1.0 if z > 0 else 0.0)
print(drelu_dz)

# Partial derivatives of the sum: every addend (and the bias) has derivative 1
dsum_dxw0 = dsum_dxw1 = dsum_dxw2 = dsum_db = 1
# Chain rule through the sum
drelu_dxw0 = drelu_dz * dsum_dxw0
drelu_dxw1 = drelu_dz * dsum_dxw1
drelu_dxw2 = drelu_dz * dsum_dxw2
drelu_db = drelu_dz * dsum_db
print(drelu_dxw0, drelu_dxw1, drelu_dxw2, drelu_db)

# Partial derivatives of the multiplication:
# d(x*w)/dx is the weight, d(x*w)/dw is the input
dmul_dx0, dmul_dx1, dmul_dx2 = w
dmul_dw0, dmul_dw1, dmul_dw2 = x
# Chain rule through each product
drelu_dx0 = drelu_dxw0 * dmul_dx0
drelu_dw0 = drelu_dxw0 * dmul_dw0
drelu_dx1 = drelu_dxw1 * dmul_dx1
drelu_dw1 = drelu_dxw1 * dmul_dw1
drelu_dx2 = drelu_dxw2 * dmul_dx2
drelu_dw2 = drelu_dxw2 * dmul_dw2
print(drelu_dx0, drelu_dw0, drelu_dx1, drelu_dw1, drelu_dx2, drelu_dw2)


1.0
1.0 1.0 1.0 1.0
-3.0 1.0 -1.0 -2.0 2.0 3.0


### One Neuron Example

In [None]:
print(y)  # neuron output before the parameter update

# Collect the gradients computed in the backward pass
dx = [drelu_dx0, drelu_dx1, drelu_dx2]  # gradients on inputs
dw = [drelu_dw0, drelu_dw1, drelu_dw2]  # gradients on weights
db = drelu_db  # gradient on the bias (this neuron has a single bias)

print(w, b)  # weights and bias before the update

# Apply a fraction of the negative gradient to every parameter.
# The fixed factor -0.001 is used in place of an optimizer.
# Slice assignment keeps updating the same list object `w` in place.
w[:] = [w_i + -0.001 * dw_i for w_i, dw_i in zip(w, dw)]
b = b + -0.001 * db
print(w, b)  # weights and bias after the update

# Repeat the forward pass with the updated parameters
xw0, xw1, xw2 = (x_i * w_i for x_i, w_i in zip(x, w))
# Sum of the weighted inputs plus the bias
z = xw0 + xw1 + xw2 + b
# ReLU activation
y = max(z, 0)
print(y)  # neuron output after the parameter update

6.0
[-3.0, -1.0, 2.0] 1.0
[-3.001, -0.998, 1.997] 0.999
5.985


### Layer of Neurons Example

Gradient with Respect to Input

In [None]:
# Gradient passed in from the next layer. For this example it is a
# 3x3 array whose rows hold incremental values (1s, 2s and 3s).
dvalues = np.array([[1., 1., 1.],
                    [2., 2., 2.],
                    [3., 3., 3.]])

# Three sets of weights (one per neuron), four weights each (one per input).
# The array is written neuron-by-neuron and then transposed, so `weights`
# ends up with shape (4, 3).
weights = np.array([[0.2, 0.8, -0.5, 1],
                    [0.5, -0.91, 0.26, -0.5],
                    [-0.26, -0.27, 0.17, 0.87]]).T

# For every sample, weight each neuron's incoming gradient by that neuron's
# weights and sum the contributions per input.
dinputs = dvalues.dot(weights.T)  # gradient with respect to inputs
print(dinputs)

[[ 0.44 -0.38 -0.07  1.37]
 [ 0.88 -0.76 -0.14  2.74]
 [ 1.32 -1.14 -0.21  4.11]]


Gradient with Respect to Weights

In [19]:
# Gradient passed in from the next layer: a 3x3 array of incremental values
dvalues = np.array([[1., 1., 1.],
                    [2., 2., 2.],
                    [3., 3., 3.]])
# Three samples with four input features each
inputs = np.array([[1, 2, 3, 2.5],
                   [2., 5., -1., 2],
                   [-1.5, 2.7, 3.3, -0.8]])
# For every (input, neuron) pair, multiply the input values by the gradient
# passed in for that neuron and sum over the samples. Result shape: (4, 3).
dweights = inputs.T.dot(dvalues)  # gradient with respect to weights
print(dweights)

[[ 0.5  0.5  0.5]
 [20.1 20.1 20.1]
 [10.9 10.9 10.9]
 [ 4.1  4.1  4.1]]


Gradient with Respect to bias

In [None]:
# Gradient passed in from the next layer: a 3x3 array of incremental values
dvalues = np.array([[1., 1., 1.],
                    [2., 2., 2.],
                    [3., 3., 3.]])
# One bias per neuron, stored as a row vector of shape (1, neurons).
# Not used in the gradient computation below.
biases = np.array([[2, 3, 0.5]])
# Sum the incoming gradient over the samples (first axis);
# keepdims=True keeps the result as a (1, neurons) row vector.
dbiases = dvalues.sum(axis=0, keepdims=True)  # gradient with respect to bias
print(dbiases)

[[6. 6. 6.]]


Gradient with Respect to Activation (ReLU)

In [None]:
# numpy is already imported as `np` in the notebook's first cell,
# so it is not re-imported here.

# Example layer output (the values that were fed into ReLU)
z = np.array([[1, 2, -3, -4],
              [2, -7, -1, 3],
              [-1, 2, 5, -1]])
# Example gradient passed in from the next layer
dvalues = np.array([[1, 2, 3, 4],
                    [5, 6, 7, 8],
                    [9, 10, 11, 12]])

# ReLU derivative with the chain rule applied: the incoming gradient passes
# through where z > 0 and is zeroed everywhere else.
drelu = dvalues.copy()  # work on a copy so the original dvalues stay intact
drelu[z <= 0] = 0
print(drelu)  # gradient with respect to the activation function's input

[[ 1  2  0  0]
 [ 5  0  0  8]
 [ 0 10 11  0]]


Forward and Backward Pass of Full Layer and Batch-Based Partial Derivatives

In [None]:
# Gradient passed in from the next layer: a 3x3 array of incremental values.
# It is defined here but not used further down in this cell.
dvalues = np.array([[1., 1., 1.],
                    [2., 2., 2.],
                    [3., 3., 3.]])

# Three samples with four input features each
inputs = np.array([[1, 2, 3, 2.5],
                   [2., 5., -1., 2],
                   [-1.5, 2.7, 3.3, -0.8]])

# Three sets of weights (one per neuron), four weights each (one per input);
# written neuron-by-neuron and transposed to shape (4, 3)
weights = np.array([[0.2, 0.8, -0.5, 1],
                    [0.5, -0.91, 0.26, -0.5],
                    [-0.26, -0.27, 0.17, 0.87]]).T

# One bias per neuron, as a row vector of shape (1, neurons)
biases = np.array([[2, 3, 0.5]])

# ----- Forward pass -----
layer_outputs = inputs.dot(weights) + biases  # Dense layer
relu_outputs = np.maximum(0, layer_outputs)   # ReLU activation

# ----- Backward pass -----
# The ReLU outputs stand in for the gradient coming from the next layer;
# it is zeroed wherever the dense layer's output was not positive.
drelu = relu_outputs.copy()
drelu[layer_outputs <= 0] = 0  # gradient with respect to activation function

# Dense layer gradients
dinputs = drelu.dot(weights.T)   # gradient with respect to inputs
dweights = inputs.T.dot(drelu)   # gradient with respect to weights
# Sum over samples (first axis); keepdims keeps the (1, neurons) shape
dbiases = drelu.sum(axis=0, keepdims=True)  # gradient with respect to biases

# ----- Parameter update -----
# Step against the gradient, scaled by a fixed factor of 0.001
weights -= 0.001 * dweights
biases -= 0.001 * dbiases
print(weights)
print(biases)

[[ 0.179515   0.5003665 -0.262746 ]
 [ 0.742093  -0.9152577 -0.2758402]
 [-0.510153   0.2529017  0.1629592]
 [ 0.971328  -0.5021842  0.8636583]]
[[1.98489  2.997739 0.497389]]


### Class Updates

In [28]:
#Dense Layer
class Layer_Dense:
    """Fully connected layer with randomly initialised weights.

    Attributes set by the methods:
        weights  -- array of shape (n_inputs, n_neurons)
        biases   -- array of shape (1, n_neurons)
        inputs, output              -- set by ``forward``
        dweights, dbiases, dinputs  -- set by ``backward``
    """

    def __init__(self, n_inputs, n_neurons):
        """Create weights of shape (n_inputs, n_neurons) and zero biases."""
        # Standard-normal draws scaled by 0.01 so the starting weights are small
        self.weights = np.random.randn(n_inputs, n_neurons) * 0.01
        # Biases start at zero, one per neuron, stored as a row vector
        self.biases = np.zeros(shape=(1, n_neurons))

    def forward(self, inputs):
        """Store ``inputs`` and compute ``inputs . weights + biases``."""
        self.inputs = inputs
        weighted_sum = np.dot(inputs, self.weights)
        self.output = weighted_sum + self.biases

    def backward(self, dvalues):
        """Compute gradients from ``dvalues``, the gradient of the next layer.

        Uses the inputs remembered by the last ``forward`` call.
        """
        # Gradient on the parameters
        self.dweights = np.dot(self.inputs.T, dvalues)
        # Summed over samples (first axis), kept as a 2-D row vector
        self.dbiases = np.sum(dvalues, axis=0, keepdims=True)
        # Gradient on the layer's input values
        self.dinputs = np.dot(dvalues, self.weights.T)

#Relu Activation
class Activation_ReLU:
    """Rectified linear activation: element-wise ``max(0, x)``."""

    def forward(self, inputs):
        """Store ``inputs`` and set ``output`` to their element-wise maximum with 0."""
        self.inputs = inputs
        self.output = np.maximum(0, inputs)

    def backward(self, dvalues):
        """Set ``dinputs`` to ``dvalues`` with entries zeroed where the input was <= 0."""
        # Work on a copy so the caller's gradient array is left untouched
        gradient = dvalues.copy()
        blocked = self.inputs <= 0
        gradient[blocked] = 0
        self.dinputs = gradient
        

#Common Loss Class
class Loss:
    """Common base for loss functions.

    This class does not define ``forward`` itself; ``calculate`` calls
    ``self.forward(output, y)``, so a subclass has to provide it.
    """

    def calculate(self, output, y):
        """Return the mean of the per-sample losses given by ``self.forward``."""
        return np.mean(self.forward(output, y))
    
# Cross-entropy loss
class Loss_CategoricalCrossentropy(Loss):
    """Categorical cross-entropy loss.

    Labels may be sparse (1-D array of class indices) or one-hot encoded
    (2-D array); both methods branch on ``len(y_true.shape)``.
    """

    def forward(self, y_pred, y_true):
        """Return the negative log of the confidence at each sample's target class."""
        # Number of samples in the batch
        n_samples = len(y_pred)

        # Clip on both sides: the lower bound keeps log() away from 0,
        # the upper bound mirrors it so the mean is not shifted
        clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)

        label_dims = len(y_true.shape)
        if label_dims == 1:
            # Sparse labels: pick the predicted value at each target index
            correct_confidences = clipped[range(n_samples), y_true]
        elif label_dims == 2:
            # One-hot labels: mask the predictions and sum each row
            correct_confidences = np.sum(clipped * y_true, axis=1)

        return -np.log(correct_confidences)

    def backward(self, dvalues, y_true):
        """Set ``dinputs`` to ``-y_true / dvalues`` divided by the sample count."""
        # Number of samples, and number of labels counted on the first sample
        n_samples = len(dvalues)
        n_labels = len(dvalues[0])

        # Sparse labels are expanded to one-hot rows
        if len(y_true.shape) == 1:
            y_true = np.eye(n_labels)[y_true]

        # Gradient of the loss, normalized by the number of samples
        self.dinputs = (-y_true / dvalues) / n_samples
    
    
#Softmax Activation 
class Activation_Softmax:
    """Softmax activation applied row-wise (one row per sample)."""

    def forward(self, inputs):
        """Store ``inputs`` and set ``output`` to the row-wise softmax.

        ``inputs`` must support ``axis=1`` reductions, i.e. be 2-D.
        """
        # Remember input values (stored as ``inputs``, matching the other layers)
        self.inputs = inputs
        # Subtract each row's maximum before exponentiating; the largest
        # exponent is then 0, which keeps np.exp from overflowing
        exp_values = np.exp(inputs - np.max(inputs, axis=1, keepdims=True))
        # Normalize per sample so every row sums to 1
        probabilities = exp_values / np.sum(exp_values, axis=1, keepdims=True)

        self.output = probabilities

    def backward(self, dvalues):
        """Set ``dinputs`` by applying each sample's softmax Jacobian to its gradient.

        Uses ``self.output``; ``dvalues`` is the gradient from the next step,
        with one row per sample.
        """
        # Uninitialized array with the same shape and dtype as dvalues
        self.dinputs = np.empty_like(dvalues)

        # Walk over samples: softmax output row paired with its gradient row
        for index, (single_output, single_dvalues) in enumerate(zip(self.output, dvalues)):
            # Turn the output row into a column vector
            single_output = single_output.reshape(-1, 1)
            # Jacobian of softmax for this sample: diag(s) - s s^T
            jacobian_matrix = np.diagflat(single_output) - np.dot(single_output, single_output.T)
            # Sample-wise gradient, stored in the matching row
            self.dinputs[index] = np.dot(jacobian_matrix, single_dvalues)

#### Common Categorical Cross-Entropy loss and Softmax activation derivative

Combining the categorical cross-entropy loss with the softmax activation yields a much simpler partial derivative that is easier to calculate and faster to compute.

In [29]:
# Softmax classifier - combined Softmax activation
# and cross-entropy loss for faster backward step
class Activation_Softmax_Loss_CategoricalCrossentropy:
    """Softmax activation and categorical cross-entropy loss combined.

    The combination exists so the backward step can be done in one go.
    """

    def __init__(self):
        """Create the activation and loss objects used by ``forward``."""
        self.activation = Activation_Softmax()
        self.loss = Loss_CategoricalCrossentropy()

    def forward(self, inputs, y_true):
        """Run the activation on ``inputs``, keep its output, and return the loss."""
        softmax = self.activation
        softmax.forward(inputs)
        # Expose the activation's output on this object
        self.output = softmax.output
        return self.loss.calculate(self.output, y_true)

    def backward(self, dvalues, y_true):
        """Set ``dinputs`` to ``dvalues`` minus 1 at each target index, over sample count."""
        batch_size = len(dvalues)
        # One-hot (2-D) labels are reduced to discrete class indices
        if len(y_true.shape) == 2:
            y_true = np.argmax(y_true, axis=1)
        # Copy so the caller's array is not modified
        gradient = dvalues.copy()
        # Subtract 1 at each sample's target class
        gradient[range(batch_size), y_true] -= 1
        # Normalize by the number of samples
        self.dinputs = gradient / batch_size

Compare

In [36]:
# Example softmax outputs for three samples, and their sparse class targets
softmax_outputs = np.array([[0.7, 0.1, 0.2],
                            [0.1, 0.5, 0.4],
                            [0.02, 0.9, 0.08]])

class_targets = np.array([0, 1, 1])

# Route 1: combined softmax + cross-entropy backward step
softmax_loss = Activation_Softmax_Loss_CategoricalCrossentropy()
softmax_loss.backward(softmax_outputs, class_targets)
dvalues1 = softmax_loss.dinputs

# Route 2: loss backward first, then the softmax backward step.
# The activation's output is set by hand because no forward pass is run here.
activation = Activation_Softmax()
activation.output = softmax_outputs
loss = Loss_CategoricalCrossentropy()
loss.backward(softmax_outputs, class_targets)
activation.backward(loss.dinputs)
dvalues2 = activation.dinputs

# Show both results, one item per line, for comparison
print('Gradients: combined loss and activation:',
      dvalues1,
      'Gradients: separate loss and activation:',
      dvalues2,
      sep='\n')

Gradients: combined loss and activation:
[[-0.1         0.03333333  0.06666667]
 [ 0.03333333 -0.16666667  0.13333333]
 [ 0.00666667 -0.03333333  0.02666667]]
Gradients: separate loss and activation:
[[-0.09999999  0.03333334  0.06666667]
 [ 0.03333334 -0.16666667  0.13333334]
 [ 0.00666667 -0.03333333  0.02666667]]


In [35]:
from timeit import timeit

# Same example data as in the comparison above
softmax_outputs = np.array([[0.7, 0.1, 0.2],
                            [0.1, 0.5, 0.4],
                            [0.02, 0.9, 0.08]])
class_targets = np.array([0, 1, 1])


def f1():
    """Backward step through the combined softmax/cross-entropy object."""
    combined = Activation_Softmax_Loss_CategoricalCrossentropy()
    combined.backward(softmax_outputs, class_targets)
    dvalues1 = combined.dinputs


def f2():
    """Backward step through the separate loss and softmax objects."""
    softmax = Activation_Softmax()
    softmax.output = softmax_outputs
    cross_entropy = Loss_CategoricalCrossentropy()
    cross_entropy.backward(softmax_outputs, class_targets)
    softmax.backward(cross_entropy.dinputs)
    dvalues2 = softmax.dinputs


# Time 10000 calls of each variant
t1 = timeit(lambda: f1(), number=10000)
t2 = timeit(lambda: f2(), number=10000)
# Ratio of the separate route's time to the combined route's time
print(t2 / t1)

8.411686871161951


### Combine Everything

In [37]:
# Create dataset
X, y = spiral_data( samples = 100 , classes = 3 )

# Create Dense layer with 2 input features and 3 output values
dense1 = Layer_Dense( 2 , 3 )
# Create ReLU activation (to be used with Dense layer):
activation1 = Activation_ReLU()
# Create second Dense layer with 3 input features (as we take output
# of previous layer here) and 3 output values (output values)
dense2 = Layer_Dense( 3 , 3 )
# Create Softmax classifier's combined loss and activation
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()

# Perform a forward pass of our training data through this layer
dense1.forward(X)
# Perform a forward pass through activation function
# takes the output of first dense layer here
activation1.forward(dense1.output)
# Perform a forward pass through second Dense layer
# takes outputs of activation function of first layer as inputs
dense2.forward(activation1.output)
# Perform a forward pass through the activation/loss function
# takes the output of second dense layer here and returns loss
loss = loss_activation.forward(dense2.output, y)

# Let's see output of the first few samples:
print (loss_activation.output[: 5 ])
# Print loss value
print ( 'loss:' , loss)
# Calculate accuracy from output of activation2 and targets
# calculate values along first axis
predictions = np.argmax(loss_activation.output, axis = 1 )
if len (y.shape) == 2 :
    y = np.argmax(y, axis = 1 )
accuracy = np.mean(predictions == y)
# Print accuracy
print ( 'acc:' , accuracy)

# Backward pass
loss_activation.backward(loss_activation.output, y)
dense2.backward(loss_activation.dinputs)
activation1.backward(dense2.dinputs)
dense1.backward(activation1.dinputs)

# Print gradients
print (dense1.dweights)
print (dense1.dbiases)
print (dense2.dweights)
print (dense2.dbiases)

[[0.33333334 0.33333334 0.33333334]
 [0.3333332  0.3333332  0.33333364]
 [0.3333329  0.33333293 0.3333342 ]
 [0.3333326  0.33333263 0.33333477]
 [0.33333233 0.3333324  0.33333528]]
loss: 1.0986104
acc: 0.34
[[ 1.5766357e-04  7.8368583e-05  4.7324400e-05]
 [ 1.8161038e-04  1.1045573e-05 -3.3096312e-05]]
[[-3.60553473e-04  9.66117223e-05 -1.03671395e-04]]
[[ 5.44109462e-05  1.07411419e-04 -1.61822361e-04]
 [-4.07913431e-05 -7.16780924e-05  1.12469446e-04]
 [-5.30112993e-05  8.58172934e-05 -3.28059905e-05]]
[[-1.0729185e-05 -9.4610732e-06  2.0027859e-05]]
