In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

Activation Function (sigmoid):
$$\sigma(x) = \frac{1} {1 + e^{-x}}$$

Derivative of Activation Function:
$$\sigma'(x) = \sigma'(x)(1-\sigma'(x))$$

In [2]:
class SimpleNeuralNetwork:
    
    # Simple 2 layer neural network
    # First layer has as a node for each feature 
    # Second layer has a configurable number of nodes that map to 1 output
    def __init__(self, num_of_features, second_layer_size):
        self.weights0 = np.random.rand(num_of_features, second_layer_size)
        self.weights1 = np.random.rand(second_layer_size, 1)
    
    def activation_func(self, x):
        return 1/(1+np.exp(-x))
    
    def activation_func_derivative(self, x):
        return self.activation_func(x)*(1-self.activation_func(x))
        
    def predict(self, features):
        results0 = self.activation_func(features.dot(self.weights0))
        results1 = self.activation_func(results0.dot(self.weights1))
        return results1;

For this simple example, we will use mean squared error for our loss function:
$$MSE = \sum_{i=1}^{n}(y_i-\sigma(w_1(\sigma(w_0x_0+b_0))+b_1))^2$$

Ideally, for a logistic regression neural network classifier, the log loss would be the preferred loss function as it generates a convex curve while MSE does not. But for this simple example, we will proceed with using MSE as our loss function so the derivative for back propogation is easier to follow.

$$Log Loss = \sum_{i=1}^{n}(y_i\log(\sigma(w_1(\sigma(w_0x_0+b_0))+b_1)) + (1-y_i)\log(1-\sigma(w_1(\sigma(w_0x_0+b_0))+b_1)))$$

Gradient with respect to layer 1 for backpropagation:
$$\nabla(\boldsymbol{w_1}) = \sum_{i=1}^{n}(2/n)(\sigma(w_1x_1+b_1)-y_i)(\sigma'(w_1x_1+b_1))(x_1)$$

This formula can be mapped to the code below as follows:  
$$\texttt{get_errors(): }\sigma(w_1x_1+b_1)-y_i$$
$$\texttt{activation_func_derivative(results1): }\sigma'(w_1x_1+b_1)$$
$$\texttt{results0: }x_1$$

Gradient with respoect to layer 0 for backpropagation:
$$\nabla(\boldsymbol{w_0}) = \sum_{i=1}^{n}(2/n)(\sigma(w_1(\sigma(w_0x_0+b_0)+b_1))-y_i)\sigma'(w_1(\sigma(w_0x_0+b_0)+b_1))(w_1)\sigma'(w_0x_0+b_0)(x_0)$$

This formula can be mapped to the code below as follows:  
$$\texttt{get_errors(): }\sigma(w_1(\sigma(w_0x_0+b_0)+b_1))-y_i$$
$$\texttt{activation_func_derivative(results1): }\sigma'(w_1(\sigma(w_0x_0+b_0))+b_1)$$
$$\texttt{weights1: }w_1$$
$$\texttt{activation_func_derivative(results0): }\sigma'(w_0x_0+b_0)$$
$$\texttt{features: }x_0$$

Note, these formulas applicable for batch gradient descent because we are summing all data for each single step

In [3]:
class Trainer:
    
    def get_errors(self, labels, predictions):
        return labels - predictions
    
    def get_mean_squared_error(self, errors):
        return np.sum(np.square(errors))/errors.size 
        
    def feedforward(self, nn, features):
        results0 = nn.activation_func(features.dot(nn.weights0))
        results1 = nn.activation_func(results0.dot(nn.weights1))
        return results0, results1
    
    def backpropagate(self, nn, features, results0, results1, errors, learning_rate):
        weights1_delta = (2/errors.size)*results0.T.dot(errors*nn.activation_func_derivative(results1))
        weights0_delta = (2/errors.size)*features.T.dot(((errors*nn.activation_func_derivative(results1)).dot(nn.weights1.T))*(nn.activation_func_derivative(results0)))
                
        nn.weights1 += learning_rate*weights1_delta
        nn.weights0 += learning_rate*weights0_delta
        
    def train(self, nn, features, labels, learning_rate, epochs):
        for epoch in range(epochs):
            results0, results1 = self.feedforward(nn, features)
            errors = self.get_errors(labels, results1)
            mean_squared_errors = self.get_mean_squared_error(errors)
            print("At epoch:", epoch, ", MSE = ", mean_squared_errors) 
            self.backpropagate(nn, features, results0, results1, errors, learning_rate)

In [4]:
trainer = Trainer()

In [5]:
# Creating some simple training data just to check basic funcationality
simple_features = np.array([[-3,-3],
                            [-3,3],
                            [3,-3],
                            [3,3]])

simple_labels = np.array([[0], [1], [1], [1]])

In [6]:
simple_neural_network = SimpleNeuralNetwork(2, 10)
trainer.train(simple_neural_network, simple_features, simple_labels, 0.1, 100)

At epoch: 0 , MSE =  0.08950707966768853
At epoch: 1 , MSE =  0.08893248721317511
At epoch: 2 , MSE =  0.08837173800464301
At epoch: 3 , MSE =  0.08782447410746688
At epoch: 4 , MSE =  0.08729034617195305
At epoch: 5 , MSE =  0.08676901336611811
At epoch: 6 , MSE =  0.08626014329229638
At epoch: 7 , MSE =  0.08576341188962433
At epoch: 8 , MSE =  0.0852785033242708
At epoch: 9 , MSE =  0.08480510986911587
At epoch: 10 , MSE =  0.08434293177442705
At epoch: 11 , MSE =  0.08389167713093824
At epoch: 12 , MSE =  0.08345106172660387
At epoch: 13 , MSE =  0.08302080889817759
At epoch: 14 , MSE =  0.0826006493786532
At epoch: 15 , MSE =  0.0821903211414991
At epoch: 16 , MSE =  0.08178956924252333
At epoch: 17 , MSE =  0.08139814566011778
At epoch: 18 , MSE =  0.08101580913454948
At epoch: 19 , MSE =  0.08064232500689233
At epoch: 20 , MSE =  0.0802774650581273
At epoch: 21 , MSE =  0.07992100734887489
At epoch: 22 , MSE =  0.07957273606017011
At epoch: 23 , MSE =  0.07923244133563731
At epo

In [7]:
# Creating some simple training data just to check basic funcationality
simple_test_data = np.array([[-10,-10],
                             [-10,10],
                             [10,-10],
                             [10,10]])

simple_test_labels = np.array([[0], [1], [1], [1]])
predicted_values = simple_neural_network.predict(simple_test_data)
print("Predicted Values:", predicted_values)
errors = trainer.get_errors(simple_test_labels, predicted_values)
mean_squared_error = trainer.get_mean_squared_error(errors);
print("Mean Squared Error:", mean_squared_error)

Predicted Values: [[0.50000055]
 [0.93977879]
 [0.97550905]
 [0.99839379]]
Mean Squared Error: 0.0635573822502453


In [8]:
# Okay, now that it seems to be working, lets test with some more complex data
# Using data from here: http://archive.ics.uci.edu/ml/datasets/Abalone
df = pd.read_csv("abalone.csv")
df.head()

Unnamed: 0,Type,LongestShell,Diameter,Height,WholeWeight,ShuckedWeight,VisceraWeight,ShellWeight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [9]:
# Doing a bit of preprocessing
def string_to_binary(string):
    if string == "F":
        return 1
    else:
        return 0
    
df = df.loc[df['Type'] != "I"]
df['Type'] = df['Type'].apply(string_to_binary)
df.head()

Unnamed: 0,Type,LongestShell,Diameter,Height,WholeWeight,ShuckedWeight,VisceraWeight,ShellWeight,Rings
0,0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,0,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,1,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,0,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
6,1,0.53,0.415,0.15,0.7775,0.237,0.1415,0.33,20


In [10]:
# Splitting into training and testing data
training_data, testing_data = train_test_split(df, test_size=0.2)
training_features = training_data.iloc[:, ~training_data.columns.isin(['Type'])].values
training_labels = training_data['Type'].values.reshape(len(training_features),1)
testing_features = testing_data.iloc[:, ~testing_data.columns.isin(['Type'])].values
testing_labels = testing_data['Type'].values.reshape(len(testing_features),1)

In [11]:
# Training
classification_neural_network = SimpleNeuralNetwork(8, 10)
trainer.train(classification_neural_network, training_features, training_labels, 0.1, 100)

At epoch: 0 , MSE =  0.5302047682662545
At epoch: 1 , MSE =  0.5281679402731628
At epoch: 2 , MSE =  0.5256469818428043
At epoch: 3 , MSE =  0.5225267840368255
At epoch: 4 , MSE =  0.5186662318214962
At epoch: 5 , MSE =  0.5138944782711514
At epoch: 6 , MSE =  0.508009305027207
At epoch: 7 , MSE =  0.5007811515847652
At epoch: 8 , MSE =  0.4919697078769396
At epoch: 9 , MSE =  0.481362833810289
At epoch: 10 , MSE =  0.4688415166837805
At epoch: 11 , MSE =  0.45445124164223033
At epoch: 12 , MSE =  0.43844008507974636
At epoch: 13 , MSE =  0.42124018256781176
At epoch: 14 , MSE =  0.40340425098764787
At epoch: 15 , MSE =  0.38552482533083554
At epoch: 16 , MSE =  0.3681586024095284
At epoch: 17 , MSE =  0.3517688914837464
At epoch: 18 , MSE =  0.33669259159760095
At epoch: 19 , MSE =  0.32313227592503063
At epoch: 20 , MSE =  0.31116841628664405
At epoch: 21 , MSE =  0.300783633194931
At epoch: 22 , MSE =  0.2918909854767061
At epoch: 23 , MSE =  0.28436054026172025
At epoch: 24 , MSE =

In [12]:
# Checking performance on testing data
# Note: not expecting optimal performance here as a non optimal loss function is being used
predicted_values = classification_neural_network.predict(testing_features)
errors = trainer.get_errors(testing_labels, predicted_values)
print("Mean Squared Error:", trainer.get_mean_squared_error(errors))

Mean Squared Error: 0.24873244064258596
