In [50]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import nnfs
from nnfs.datasets import sine_data, spiral_data
import random
import requests
from NNS import NeuralNetwork as NN #import neural net code from github to reduce copy/pasting
from tqdm import tqdm
from sklearn.neighbors import KNeighborsClassifier

# Model Object

In [45]:
# Changes to Activation Functions


"""""""""""""""""""""""""""""""""""
Activation Functions
"""""""""""""""""""""""""""""""""""

#Relu Activation
## On/off linear function, easy to optimize
## Most popular, the "go-to" function
## Can cause dying neurons 
class Activation_ReLU:
    """Rectified linear unit: keeps positive inputs and zeroes everything else."""

    def forward(self, inputs, training):
        """Store ``inputs`` and set ``self.output`` to ``max(0, inputs)`` element-wise.

        ``training`` is accepted for interface parity with other layers and
        is not used here.
        """
        self.inputs = inputs
        self.output = np.maximum(0, inputs)

    def backward(self, dvalues):
        """Set ``self.dinputs`` to ``dvalues`` with entries zeroed where the
        stored forward input was less than or equal to zero."""
        # Work on a copy so the caller's gradient array is left untouched
        gradient = dvalues.copy()
        inactive = self.inputs <= 0
        gradient[inactive] = 0
        self.dinputs = gradient

    def predictions(self, outputs):
        """Return ``outputs`` unchanged."""
        return outputs
        
# Softmax Activation
## Typically used as the activation of the final (output) layer
## Calculates the probability distribution over 'n' different events
## Dependent probabilities, sum of probabilities = 1
class Activation_Softmax:
    """Softmax activation: turns each row of inputs into values that sum to 1."""

    def forward(self, inputs, training):
        """Store ``inputs`` and compute the row-wise softmax into ``self.output``.

        ``training`` is accepted for interface parity and is not used here.
        """
        # Remember input values (the attribute was previously misspelled "inpus")
        self.inputs = inputs

        # Subtract each row's maximum before exponentiating so the largest
        # exponent is exp(0)
        exp_values = np.exp(inputs - np.max(inputs, axis=1, keepdims=True))

        # Normalize each sample so its values sum to 1
        self.output = exp_values / np.sum(exp_values, axis=1, keepdims=True)

    def backward(self, dvalues):
        """Compute ``self.dinputs`` from ``dvalues`` using one Jacobian per sample."""
        # Filled row by row below, so it can start uninitialized
        self.dinputs = np.empty_like(dvalues)

        for index, (single_output, single_dvalues) in enumerate(zip(self.output, dvalues)):
            # Column vector of this sample's softmax output
            single_output = single_output.reshape(-1, 1)
            # Jacobian of softmax for this sample: diag(s) - s s^T
            jacobian_matrix = np.diagflat(single_output) - np.dot(single_output, single_output.T)
            # Sample-wise gradient
            self.dinputs[index] = np.dot(jacobian_matrix, single_dvalues)

    def predictions(self, outputs):
        """Return the index of the largest value in each row of ``outputs``."""
        return np.argmax(outputs, axis=1)
            
# Sigmoid Activation
## Creates values between 0 and 1
## Difficult optimization because it is not 0-centered
## Good for classification
## Independent probabilities, sum of probabilities not necessarily equal to 1
class Activate_Sigmoid:
    """Sigmoid activation with a configurable decision threshold for predictions."""

    def __init__(self, confidence = 0.5):
        # Outputs strictly above this value count as a positive prediction
        self.confidence = confidence

    def forward(self, inputs, training):
        """Store ``inputs`` and set ``self.output`` to ``1 / (1 + exp(-inputs))``.

        ``training`` is accepted for interface parity and is not used here.
        """
        self.inputs = inputs
        decay = np.exp(-inputs)
        self.output = 1 / (1 + decay)

    def backward(self, dvalues):
        """Set ``self.dinputs`` from ``dvalues`` using the stored forward output."""
        # Sigmoid derivative expressed through its own output: s * (1 - s)
        scaled = dvalues * (1 - self.output)
        self.dinputs = scaled * self.output

    def predictions(self, outputs):
        """Return 1 where ``outputs`` exceeds ``self.confidence``, otherwise 0."""
        above_threshold = outputs > self.confidence
        return above_threshold * 1
        
    
# Linear activation function
# Most basic regression activation function    
class Activation_Linear:
    """Identity activation: the output is the input."""

    def forward(self, inputs, training):
        """Store ``inputs`` as both ``self.inputs`` and ``self.output``.

        ``training`` is accepted for interface parity and is not used here.
        """
        self.inputs = self.output = inputs

    def backward(self, dvalues):
        """Set ``self.dinputs`` to a copy of ``dvalues``."""
        # The derivative of f(x) = x is 1, so the gradient passes through as-is
        passthrough = dvalues.copy()
        self.dinputs = passthrough

    def predictions(self, outputs):
        """Return ``outputs`` unchanged."""
        return outputs



In [46]:
# Changes to general loss function      

"""""""""""""""""""""""""""""""""""
Loss Functions
"""""""""""""""""""""""""""""""""""

#Common Loss Class
class Loss:
    """Base class shared by the loss functions.

    Subclasses provide ``forward(output, y)`` returning per-sample losses.
    This class averages them and sums the L1/L2 regularization penalties of
    the layers registered through ``remember_trainable_layers``.
    """

    def calculate(self, output, y, *, include_regularization = False):
        """Return the mean data loss for ``output`` against ``y``.

        When ``include_regularization`` is true, a tuple
        ``(data_loss, regularization_loss)`` is returned instead.
        """
        # Per-sample losses come from the subclass
        sample_losses = self.forward(output, y)

        # Mean loss over everything forward() returned
        data_loss = np.mean(sample_losses)

        if not include_regularization:
            return data_loss

        return data_loss, self.regularization_loss()

    def regularization_loss(self):
        """Return the summed L1/L2 penalty over the remembered trainable layers.

        Returns 0 when no layers have been registered yet.
        """
        regularization_loss = 0

        # Fall back to an empty collection so a loss that never had layers
        # registered reports a zero penalty instead of raising AttributeError
        for layer in getattr(self, 'trainable_layers', ()):

            # L1 regularization - weights
            if layer.weight_regularizer_l1 > 0:
                regularization_loss += layer.weight_regularizer_l1 * np.sum(np.abs(layer.weights))

            # L2 regularization - weights
            if layer.weight_regularizer_l2 > 0:
                regularization_loss += layer.weight_regularizer_l2 * np.sum(layer.weights * layer.weights)

            # L1 regularization - biases
            if layer.bias_regularizer_l1 > 0:
                regularization_loss += layer.bias_regularizer_l1 * np.sum(np.abs(layer.biases))

            # L2 regularization - biases
            if layer.bias_regularizer_l2 > 0:
                regularization_loss += layer.bias_regularizer_l2 * np.sum(layer.biases * layer.biases)

        return regularization_loss

    def remember_trainable_layers(self, trainable_layers):
        """Store the layers whose parameters contribute regularization loss."""
        self.trainable_layers = trainable_layers
    
# Categorical Cross-entropy loss
class Loss_CategoricalCrossentropy(Loss):
    """Categorical cross-entropy loss for sparse or one-hot encoded labels."""

    def forward(self, y_pred, y_true):
        """Return the negative log of the predicted value at each true class.

        ``y_true`` may be 1-D (class indices) or 2-D (one-hot rows).

        Raises:
            ValueError: if ``y_true`` is neither 1-D nor 2-D.
        """
        # Number of samples in a batch
        samples = len(y_pred)

        # Clip to keep log() away from 0; clip both sides so the mean is not
        # dragged towards either bound
        y_pred_clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)

        label_rank = len(y_true.shape)
        if label_rank == 1:
            # Sparse labels: pick the predicted value at each target index
            correct_confidences = y_pred_clipped[range(samples), y_true]
        elif label_rank == 2:
            # One-hot labels: mask out everything but the target entry
            correct_confidences = np.sum(y_pred_clipped * y_true, axis=1)
        else:
            # Previously this fell through and failed on an unbound local
            raise ValueError(
                f'y_true must be 1-D or 2-D, got {label_rank} dimensions'
            )

        return -np.log(correct_confidences)

    def backward(self, dvalues, y_true):
        """Set ``self.dinputs`` to the batch-normalized gradient ``-y_true / dvalues``.

        ``dvalues`` is clipped to the same range as in ``forward`` so that a
        value of exactly 0 cannot produce a division by zero (0/0 gives NaN).
        """
        # Number of samples
        samples = len(dvalues)
        # Number of labels in every sample, counted from the first sample
        labels = len(dvalues[0])

        # If labels are sparse, turn them into one-hot vectors
        if len(y_true.shape) == 1:
            y_true = np.eye(labels)[y_true]

        # Same clipping range as forward(), matching Loss_BinaryCrossentropy
        clipped_dvalues = np.clip(dvalues, 1e-7, 1 - 1e-7)

        # Calculate gradient
        self.dinputs = -y_true / clipped_dvalues
        # Normalize gradient by the batch size
        self.dinputs = self.dinputs / samples
        
# Binary Cross-entropy loss
class Loss_BinaryCrossentropy(Loss):
    """Binary cross-entropy loss, averaged over the outputs of each sample."""

    def forward(self, y_pred, y_true):
        """Return per-sample losses: the mean over the last axis of the
        element-wise binary cross-entropy between ``y_pred`` and ``y_true``."""
        # Clip both ends so log() never sees 0 and the mean is not biased
        # towards either bound
        bounded = np.clip(y_pred, 1e-7 , 1 - 1e-7 )

        positive_term = y_true * np.log(bounded)
        negative_term = (1 - y_true) * np.log(1 - bounded)
        elementwise = -(positive_term + negative_term)

        return np.mean(elementwise, axis=-1)

    def backward(self, dvalues, y_true):
        """Set ``self.dinputs`` to the gradient of the loss with respect to
        ``dvalues``, divided by the output count and by the batch size."""
        batch_size = len(dvalues)
        # Output count is taken from the first sample
        n_outputs = len(dvalues[0])

        # Clip to avoid dividing by 0
        bounded = np.clip(dvalues, 1e-7 , 1 - 1e-7 )

        difference = y_true / bounded - (1 - y_true) / (1 - bounded)
        per_output = -difference / n_outputs

        # Normalize by the number of samples
        self.dinputs = per_output / batch_size
    
   
# Mean Squared Error (L2) Loss
class Loss_MeanSquaredError(Loss): # L2 loss
    """Mean squared error loss."""

    def forward(self, y_pred, y_true):
        """Return, per sample, the mean over the last axis of the squared
        difference between ``y_true`` and ``y_pred``."""
        residual = y_true - y_pred
        return np.mean(residual**2, axis=-1)

    def backward(self, dvalues, y_true):
        """Set ``self.dinputs`` to ``-2 * (y_true - dvalues)`` divided by the
        output count and by the batch size."""
        batch_size = len(dvalues)
        # Output count is taken from the first sample
        n_outputs = len(dvalues[0])

        per_output = -2 * (y_true - dvalues) / n_outputs
        # Normalize by the number of samples
        self.dinputs = per_output / batch_size
        
# Mean Absolute Error (L1) Loss        
class Loss_MeanAbsoluteError(Loss): # L1 loss
    """Mean absolute error loss."""

    def forward(self, y_pred, y_true):
        """Return, per sample, the mean over the last axis of
        ``|y_true - y_pred|``."""
        sample_losses = np.mean(np.abs(y_true - y_pred), axis = -1)

        return sample_losses

    def backward(self, dvalues, y_true):
        """Set ``self.dinputs`` to the gradient of the loss with respect to
        the predictions ``dvalues``, divided by the output count and by the
        batch size."""
        # Number of samples
        samples = len(dvalues)
        # Output count is taken from the first sample
        outputs = len(dvalues[0])

        # d|y_true - y_pred| / dy_pred = sign(y_pred - y_true).
        # The previous sign(y_true - y_pred) had the opposite sign, which
        # moves predictions away from the targets under gradient descent.
        self.dinputs = np.sign(dvalues - y_true) / outputs
        # Normalize gradient by the batch size
        self.dinputs = self.dinputs / samples
        
"""""""""""""""""""""""""""""""""""
Combined Activation and Loss Functions
"""""""""""""""""""""""""""""""""""
# Softmax classifier - combined Softmax activation
# and cross-entropy loss for faster backward step

class Activation_Softmax_Loss_CategoricalCrossentropy():
    """Combined softmax + categorical cross-entropy backward step."""

    def backward(self, dvalues, y_true):
        """Set ``self.dinputs`` to ``dvalues`` with 1 subtracted at each
        sample's true class, divided by the batch size.

        ``y_true`` may hold class indices (1-D) or one-hot rows (2-D).
        """
        batch_size = len(dvalues)

        # One-hot rows are collapsed to class indices
        if len(y_true.shape) == 2:
            class_ids = np.argmax(y_true, axis = 1)
        else:
            class_ids = y_true

        # Copy so the caller's array is not modified
        gradient = dvalues.copy()
        gradient[range(batch_size), class_ids] -= 1

        # Normalize by the number of samples
        self.dinputs = gradient / batch_size

In [54]:
#Model Class and additional classes

"""""""""""""""""""""""""""""""""""
Model Class
"""""""""""""""""""""""""""""""""""

# Model class
class Model :
    """Sequential container that links network objects and trains them.

    Objects added with ``add`` are chained in insertion order. ``finalize``
    links them through ``prev``/``next`` attributes, ``forward`` and
    ``backward`` walk that chain, and ``train`` runs a full-batch loop.
    """

    def __init__ (self):
        # Ordered list of network objects
        self.layers = []
        # Combined softmax/cross-entropy object; stays None unless
        # finalize() detects that pairing
        self.softmax_classifier_output = None

    def add (self, layer):
        """Append ``layer`` to the end of the network."""
        self.layers.append(layer)

    def set (self, *, loss, optimizer, accuracy):
        """Store the loss, optimizer and accuracy objects (keyword-only)."""
        self.loss = loss
        self.optimizer = optimizer
        self.accuracy = accuracy

    def finalize (self):
        """Link the added objects together and collect the trainable ones.

        Each object gets ``prev`` (the input layer for the first object) and
        ``next`` (the loss for the last object). Objects with a ``weights``
        attribute are collected in ``self.trainable_layers`` and handed to
        the loss. A model with a single object is supported.
        """
        # Create and set the input layer
        self.input_layer = Layer_Input()

        # Trainable layers are collected while linking
        self.trainable_layers = []

        last_index = len(self.layers) - 1

        for i, layer in enumerate(self.layers):
            # The first object reads from the input layer, the rest from
            # their predecessor
            layer.prev = self.input_layer if i == 0 else self.layers[i - 1]

            if i < last_index:
                layer.next = self.layers[i + 1]
            else:
                # Last object: the next object is the loss, and this
                # object's output is the model's output
                layer.next = self.loss
                self.output_layer_activation = layer

            # An attribute called "weights" marks a trainable layer;
            # checking for biases as well is not needed
            if hasattr(layer, 'weights'):
                self.trainable_layers.append(layer)

        # Always hand the list to the loss, even when it is empty, so the
        # loss object never lacks it
        self.loss.remember_trainable_layers(self.trainable_layers)

        # Softmax output + categorical cross-entropy: use the combined
        # object with the faster gradient calculation. Evaluated once, after
        # linking, and reset otherwise so a re-finalized model does not keep
        # a stale combined object.
        uses_combined = (
            bool(self.layers)
            and isinstance(self.layers[-1], Activation_Softmax)
            and isinstance(self.loss, Loss_CategoricalCrossentropy)
        )
        if uses_combined:
            self.softmax_classifier_output = Activation_Softmax_Loss_CategoricalCrossentropy()
        else:
            self.softmax_classifier_output = None

    def train(self, X, y, *, epochs = 1, print_every = 1, validation_data = None):
        """Train on the full ``X``/``y`` batch for ``epochs`` iterations.

        A summary is printed whenever ``epoch % print_every == 0``. If
        ``validation_data`` (an ``(X_val, y_val)`` pair) is given, a
        validation summary is printed on those same epochs.
        """
        # Initialize accuracy object
        self.accuracy.init(y)

        # Main training loop
        for epoch in range(1, epochs + 1):

            # Perform the forward pass
            output = self.forward(X, training = True)

            # Calculate loss
            data_loss, regularization_loss = self.loss.calculate(
                output, y, include_regularization = True)
            loss = data_loss + regularization_loss

            # Get predictions and calculate an accuracy
            predictions = self.output_layer_activation.predictions(output)
            accuracy = self.accuracy.calculate(predictions, y)

            # Perform backward pass
            self.backward(output, y)

            # Optimize (update parameters)
            self.optimizer.pre_update_params()
            for layer in self.trainable_layers:
                self.optimizer.update_params(layer)
            self.optimizer.post_update_params()

            # Only report on the requested epochs
            if epoch % print_every:
                continue

            print ( f'epoch: {epoch} , ' +
                    f'acc: {accuracy :.3f}, ' +
                    f'loss: {loss :.3f} (' +
                    f'data_loss: {data_loss :.3f} , ' +
                    f'reg_loss: {regularization_loss :.3f}), ' +
                    f'lr: {self.optimizer.current_learning_rate}')

            # If there is the validation data
            if validation_data is not None:
                # For better readability
                X_val, y_val = validation_data

                # Forward pass in inference mode
                output = self.forward(X_val, training=False)

                # Data loss only (no regularization term)
                loss = self.loss.calculate(output, y_val)

                # Get predictions and calculate an accuracy
                predictions = self.output_layer_activation.predictions(output)
                accuracy = self.accuracy.calculate(predictions, y_val)

                print ( f'validation, ' +
                        f'acc: {accuracy :.3f} , ' +
                        f'loss: {loss :.3f}')

    def forward ( self , X , training ):
        """Run ``X`` through every object in order and return the last output.

        With no objects added, the input layer's output is returned.
        """
        # Sets the output property that the first object reads through "prev"
        self.input_layer.forward(X, training)
        output = self.input_layer.output

        # Each object consumes the output of the one before it
        for layer in self.layers:
            layer.forward(layer.prev.output, training)
            output = layer.output

        return output

    def backward ( self , output , y ):
        """Propagate gradients from the loss back through every object."""
        # If softmax classifier
        if self.softmax_classifier_output is not None :
            # The combined activation/loss object sets dinputs directly
            self.softmax_classifier_output.backward(output, y)

            # The last object's own backward() is skipped, so hand it the
            # gradient from the combined object instead
            self.layers[-1].dinputs = self.softmax_classifier_output.dinputs

            # Walk all objects but the last in reversed order
            for layer in reversed(self.layers[:-1]):
                layer.backward(layer.next.dinputs)

            return

        # The loss sets the dinputs that the last object reads through "next"
        self.loss.backward(output, y)

        # Walk all objects in reversed order
        for layer in reversed (self.layers):
            layer.backward(layer.next.dinputs)
        
# Input "Layer"
class Layer_Input:
    """Placeholder "layer" whose output is simply the data it is given."""

    def forward(self, inputs, training):
        """Expose ``inputs`` unchanged as ``self.output``.

        ``training`` is accepted for interface parity and is not used here.
        """
        self.output = inputs
        
        
"""""""""""""""""""""""""""""""""""
Accuracy Classes
"""""""""""""""""""""""""""""""""""

# Common Accuracy Class
class Accuracy:
    """Base class for accuracy metrics; subclasses supply ``compare``."""

    def calculate(self, predictions, y):
        """Return the mean of ``self.compare(predictions, y)``."""
        # compare() yields one comparison result per prediction
        matches = self.compare(predictions, y)
        return np.mean(matches)
    
# Accuracy calculation for regression
class Accuracy_Regression(Accuracy):
    """Accuracy for regression: a prediction counts as correct when it lies
    within ``self.precision`` of the target."""

    def __init__(self):
        # Tolerance; computed from the ground truth on the first init() call
        self.precision = None

    def init(self, y, reinit = False):
        """Set ``self.precision`` to ``np.std(y) / 250``.

        An already computed precision is kept unless ``reinit`` is true.
        """
        already_set = self.precision is not None
        if already_set and not reinit:
            return
        self.precision = np.std(y) / 250

    def compare(self, predictions, y):
        """Return a boolean result: true where ``|predictions - y|`` is below
        ``self.precision``."""
        deviation = np.absolute(predictions - y)
        return deviation < self.precision
    
# Accuracy calculation for classification model
class Accuracy_Categorical(Accuracy):
    """Accuracy for classification: predictions must equal the target class."""

    def init (self , y):
        """No setup is required for categorical accuracy."""
        pass

    def compare (self, predictions, y):
        """Return ``predictions == y``, first collapsing 2-D (one-hot)
        targets to class indices with ``argmax`` over axis 1."""
        targets = np.argmax(y, axis = 1) if len (y.shape) == 2 else y
        return predictions == targets
    

In [48]:
# Changes to Layers

"""""""""""""""""""""""""""""""""""
Layers
"""""""""""""""""""""""""""""""""""

#Dense Layer
class Layer_Dense: #Completely Random Dense Layer
    """Fully connected layer with optional L1/L2 penalties on weights and biases."""

    def __init__(self, n_inputs, n_neurons,
                 weight_regularizer_l1 = 0 , weight_regularizer_l2 = 0 ,
                 bias_regularizer_l1 = 0 , bias_regularizer_l2 = 0 ):
        """Create random weights of shape ``(n_inputs, n_neurons)`` and zero
        biases of shape ``(1, n_neurons)``, and store the regularization
        strengths (lambdas)."""
        # Regularization strengths
        self.weight_regularizer_l1 = weight_regularizer_l1
        self.weight_regularizer_l2 = weight_regularizer_l2
        self.bias_regularizer_l1 = bias_regularizer_l1
        self.bias_regularizer_l2 = bias_regularizer_l2

        # Standard-normal draws scaled by 0.01 to keep the starting weights small
        self.weights = 0.01 * np.random.randn(n_inputs, n_neurons)
        # Biases start at zero
        self.biases = np.zeros((1, n_neurons))

    @staticmethod
    def _l1_direction(params):
        """Return an array shaped like ``params``: -1 where negative, else 1."""
        direction = np.ones_like(params)
        direction[params < 0] = -1
        return direction

    def forward (self, inputs, training):
        """Store ``inputs`` and compute ``inputs . weights + biases``.

        ``training`` is accepted for interface parity and is not used here.
        """
        self.inputs = inputs
        self.output = np.dot(inputs, self.weights) + self.biases

    def backward(self, dvalues):
        """Compute ``dweights``, ``dbiases`` (including any regularization
        terms) and ``dinputs`` from the incoming gradient ``dvalues``."""
        # Gradients on parameters
        self.dweights = np.dot(self.inputs.T, dvalues)
        self.dbiases = np.sum(dvalues, axis=0, keepdims=True)

        # Regularization gradients on the weights: L1 first, then L2
        if self.weight_regularizer_l1 > 0:
            self.dweights += self.weight_regularizer_l1 * self._l1_direction(self.weights)
        if self.weight_regularizer_l2 > 0:
            self.dweights += 2 * self.weight_regularizer_l2 * self.weights

        # Regularization gradients on the biases: L1 first, then L2
        if self.bias_regularizer_l1 > 0:
            self.dbiases += self.bias_regularizer_l1 * self._l1_direction(self.biases)
        if self.bias_regularizer_l2 > 0:
            self.dbiases += 2 * self.bias_regularizer_l2 * self.biases

        # Gradient on the layer's inputs
        self.dinputs = np.dot(dvalues, self.weights.T)

#Dropout layer
class Layer_Dropout:
    """Dropout layer: randomly zeroes inputs during training and rescales the rest."""

    def __init__(self, rate):
        """Store the keep probability ``1 - rate``.

        Args:
            rate: fraction of values to drop; must satisfy ``0 <= rate < 1``.

        Raises:
            ValueError: if ``rate`` is outside ``[0, 1)``. A rate of 1 would
                make the keep probability 0, and the mask scaling in
                ``forward`` would then divide 0 by 0.
        """
        if not 0 <= rate < 1:
            raise ValueError(f'dropout rate must be in [0, 1), got {rate}')

        # Stored inverted: this is the success probability passed to the
        # binomial distribution in forward()
        self.rate = 1 - rate

    def forward(self, inputs, training):
        """Apply a freshly drawn scaled mask when ``training`` is true;
        otherwise output a copy of ``inputs``."""
        # Save input values
        self.inputs = inputs

        # Outside training mode the values pass through unchanged
        if not training:
            self.output = inputs.copy()
            return

        # Draw a 0/1 mask and divide by the keep probability
        self.binary_mask = np.random.binomial(1, self.rate, size = inputs.shape) / self.rate

        # Apply mask to output values
        self.output = inputs * self.binary_mask

    def backward(self, dvalues):
        """Set ``self.dinputs`` to ``dvalues`` times the mask saved by the
        last training-mode forward pass."""
        self.dinputs = dvalues * self.binary_mask
        


In [49]:
# Regression Example
# Fit the data returned by sine_data() with a 1-64-64-1 network.
X, y = sine_data()

model = Model()

# Two Dense+ReLU pairs followed by a linear output; the objects are created
# and added in this exact order
for layer in (
    Layer_Dense(1, 64),
    Activation_ReLU(),
    Layer_Dense(64, 64),
    Activation_ReLU(),
    Layer_Dense(64, 1),
    Activation_Linear(),
):
    model.add(layer)

# Loss, optimizer and accuracy objects
model.set(
    loss=Loss_MeanSquaredError(),
    optimizer=NN.Optimizer_Adam(learning_rate=0.005, decay=1e-3),
    accuracy=Accuracy_Regression(),
)

# Link the layers, then train and report every 1000 epochs
model.finalize()
model.train(X, y, epochs=10000, print_every=1000)

epoch: 1000 , acc: 0.531, loss: 0.031 (data_loss: 0.031 , reg_loss: 0.000), lr: 0.002501250625312656
epoch: 2000 , acc: 0.580, loss: 0.031 (data_loss: 0.031 , reg_loss: 0.000), lr: 0.0016672224074691564
epoch: 3000 , acc: 0.536, loss: 0.031 (data_loss: 0.031 , reg_loss: 0.000), lr: 0.0012503125781445363
epoch: 4000 , acc: 0.602, loss: 0.031 (data_loss: 0.031 , reg_loss: 0.000), lr: 0.0010002000400080014
epoch: 5000 , acc: 0.610, loss: 0.031 (data_loss: 0.031 , reg_loss: 0.000), lr: 0.0008334722453742291
epoch: 6000 , acc: 0.612, loss: 0.031 (data_loss: 0.031 , reg_loss: 0.000), lr: 0.000714387769681383
epoch: 7000 , acc: 0.613, loss: 0.031 (data_loss: 0.031 , reg_loss: 0.000), lr: 0.0006250781347668457
epoch: 8000 , acc: 0.612, loss: 0.031 (data_loss: 0.031 , reg_loss: 0.000), lr: 0.00055561729081009
epoch: 9000 , acc: 0.615, loss: 0.031 (data_loss: 0.031 , reg_loss: 0.000), lr: 0.0005000500050005
epoch: 10000 , acc: 0.616, loss: 0.031 (data_loss: 0.031 , reg_loss: 0.000), lr: 0.000454

In [55]:
# Classification Example
# Training data from spiral_data(samples=1000, classes=3) and a separate
# spiral_data(samples=100, classes=3) draw used for validation.
X, y = spiral_data(samples=1000, classes=3)
X_test, y_test = spiral_data(samples=100, classes=3)

# Instantiate the model
model = Model()

# L2-regularized hidden layer, ReLU, 10% dropout, then a 3-way softmax
# output; the objects are created and added in this exact order
for layer in (
    Layer_Dense(2, 512, weight_regularizer_l2=5e-4, bias_regularizer_l2=5e-4),
    Activation_ReLU(),
    Layer_Dropout(0.1),
    Layer_Dense(512, 3),
    Activation_Softmax(),
):
    model.add(layer)

# Loss, optimizer and accuracy objects
model.set(
    loss=Loss_CategoricalCrossentropy(),
    optimizer=NN.Optimizer_Adam(learning_rate=0.05, decay=5e-5),
    accuracy=Accuracy_Categorical(),
)

# Link the layers
model.finalize()

# Train, printing training and validation summaries every 1000 epochs
model.train(
    X, y,
    validation_data=(X_test, y_test),
    epochs=10000,
    print_every=1000,
)

epoch: 1000 , acc: 0.858, loss: 0.486 (data_loss: 0.407 , reg_loss: 0.079), lr: 0.04762131530072861
validation, acc: 0.873 , loss: 0.369
epoch: 2000 , acc: 0.850, loss: 0.485 (data_loss: 0.418 , reg_loss: 0.067), lr: 0.045456611664166556
validation, acc: 0.863 , loss: 0.350
epoch: 3000 , acc: 0.853, loss: 0.503 (data_loss: 0.445 , reg_loss: 0.058), lr: 0.043480151310926564
validation, acc: 0.883 , loss: 0.338
epoch: 4000 , acc: 0.860, loss: 0.449 (data_loss: 0.389 , reg_loss: 0.061), lr: 0.04166840285011875
validation, acc: 0.900 , loss: 0.331
epoch: 5000 , acc: 0.856, loss: 0.451 (data_loss: 0.398 , reg_loss: 0.053), lr: 0.04000160006400256
validation, acc: 0.873 , loss: 0.323
epoch: 6000 , acc: 0.864, loss: 0.455 (data_loss: 0.399 , reg_loss: 0.055), lr: 0.03846301780837725
validation, acc: 0.897 , loss: 0.331
epoch: 7000 , acc: 0.871, loss: 0.428 (data_loss: 0.377 , reg_loss: 0.051), lr: 0.03703840882995667
validation, acc: 0.873 , loss: 0.323
epoch: 8000 , acc: 0.864, loss: 0.451 (