In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

Activation Function (sigmoid):
$$\sigma(x) = \frac{1} {1 + e^{-x}}$$

Derivative of Activation Function:
$$\sigma'(x) = \sigma'(x)(1-\sigma'(x))$$

In [14]:
class SimpleNeuralNetwork:
    
    # Simple 2 layer neural network
    # First layer has as a node for each feature 
    # Second layer has a configurable number of nodes that map to 1 output
    def __init__(self, num_of_features, second_layer_size):
        self.weights0 = np.random.rand(num_of_features, second_layer_size)
        self.weights1 = np.random.rand(second_layer_size, 1)
    
    def activation_func(self, x):
        return 1/(1+np.exp(-x))
    
    def activation_func_derivative(self, x):
        return self.activation_func(x)*(1-self.activation_func(x))
        
    def predict(self, features):
        results0 = self.activation_func(features.dot(self.weights0))
        results1 = self.activation_func(results0.dot(self.weights1))
        return results1;

For this simple example, we will use mean squared error for our loss function:
$$MSE = \sum_{i=1}^{n}(y_i-\sigma(w_1(\sigma(w_0x_0+b_0))+b_1))^2$$

Ideally, for a logistic regression neural network classifier, the log loss would be the preferred loss function as it generates a convex curve while MSE does not. But for this simple example, we will proceed with using MSE as our loss function so the derivative for back propogation is easier to follow.

$$Log Loss = \sum_{i=1}^{n}(y_i\log(\sigma(w_1(\sigma(w_0x_0+b_0))+b_1)) + (1-y_i)\log(1-\sigma(w_1(\sigma(w_0x_0+b_0))+b_1)))$$

Gradient with respect to layer 1 for backpropagation:
$$\nabla(\boldsymbol{w_1}) = \sum_{i=1}^{n}(2/n)(\sigma(w_1x_1+b_1)-y_i)(\sigma'(w_1x_1+b_1))(x_1)$$

This formula can be mapped to the code below as follows:  
$$\texttt{get_errors(): }\sigma(w_1x_1+b_1)-y_i$$
$$\texttt{activation_func_derivative(results1): }\sigma'(w_1x_1+b_1)$$
$$\texttt{results0: }x_1$$

Gradient with respoect to layer 0 for backpropagation:
$$\nabla(\boldsymbol{w_0}) = \sum_{i=1}^{n}(2/n)(\sigma(w_1(\sigma(w_0x_0+b_0)+b_1))-y_i)(\sigma'(w_1(\sigma(w_0x_0+b_0))+b_1))(w_1)\sigma'(w_0x_0+b_0)(x_0)$$

This formula can be mapped to the code below as follows:  
$$\texttt{get_errors(): }\sigma(w_1(\sigma(w_0x_0+b_0)+b_1))-y_i$$
$$\texttt{activation_func_derivative(results1): }\sigma'(w_1(\sigma(w_0x_0+b_0))+b_1)$$
$$\texttt{weights1: }w_1$$
$$\texttt{activation_func_derivative(results0): }\sigma'(w_0x_0+b_0)$$
$$\texttt{features: }x_0$$

Note, these formulas applicable for batch gradient descent because we are summing all data for each single step

In [25]:
class Trainer:
    
    def get_errors(self, labels, predictions):
        return labels - predictions
    
    def get_mean_squared_error(self, errors):
        return np.sum(np.square(errors))/errors.size 
        
    def feedforward(self, nn, features):
        results0 = nn.activation_func(features.dot(nn.weights0))
        results1 = nn.activation_func(results0.dot(nn.weights1))
        return results0, results1
    
    def backpropagate(self, nn, features, results0, results1, errors, learning_rate):
        weights1_delta = (2/errors.size)*results0.T.dot(errors*nn.activation_func_derivative(results1))
        weights0_delta = (2/errors.size)*features.T.dot(((errors*nn.activation_func_derivative(results1)).dot(nn.weights1.T))*(nn.activation_func_derivative(results0)))
                
        nn.weights1 += learning_rate*weights1_delta
        nn.weights0 += learning_rate*weights0_delta
        
    def train(self, nn, features, labels, learning_rate, epochs):
        for epoch in range(epochs):
            results0, results1 = self.feedforward(nn, features)
            errors = self.get_errors(labels, results1)
            mean_squared_errors = self.get_mean_squared_error(errors)
            print("At epoch:", epoch, ", MSE = ", mean_squared_errors) 
            self.backpropagate(nn, features, results0, results1, errors, learning_rate)

In [16]:
trainer = Trainer()

In [17]:
# Creating some simple training data just to check basic funcationality
simple_features = np.array([[-3,-3],
                            [-3,3],
                            [3,-3],
                            [3,3]])

simple_labels = np.array([[0], [1], [1], [1]])

In [18]:
simple_neural_network = SimpleNeuralNetwork(2, 10)
trainer.train(simple_neural_network, simple_features, simple_labels, 0.1, 100)

At epoch: 0 , MSE =  0.10764871872793315
At epoch: 1 , MSE =  0.10649986303883993
At epoch: 2 , MSE =  0.10538308561518002
At epoch: 3 , MSE =  0.1042975544797234
At epoch: 4 , MSE =  0.10324244293531996
At epoch: 5 , MSE =  0.10221693175146963
At epoch: 6 , MSE =  0.1012202110603651
At epoch: 7 , MSE =  0.10025148198794044
At epoch: 8 , MSE =  0.0993099580440855
At epoch: 9 , MSE =  0.09839486629475931
At epoch: 10 , MSE =  0.09750544833727087
At epoch: 11 , MSE =  0.09664096109853525
At epoch: 12 , MSE =  0.0958006774746701
At epoch: 13 , MSE =  0.09498388682889239
At epoch: 14 , MSE =  0.09418989536331865
At epoch: 15 , MSE =  0.09341802637897592
At epoch: 16 , MSE =  0.0926676204370991
At epoch: 17 , MSE =  0.0919380354336299
At epoch: 18 , MSE =  0.09122864659774091
At epoch: 19 , MSE =  0.09053884642419278
At epoch: 20 , MSE =  0.08986804454838719
At epoch: 21 , MSE =  0.08921566757210077
At epoch: 22 , MSE =  0.08858115884708088
At epoch: 23 , MSE =  0.08796397822294
At epoch: 2

In [19]:
# Creating some simple training data just to check basic funcationality
simple_test_data = np.array([[-10,-10],
                             [-10,10],
                             [10,-10],
                             [10,10]])

simple_test_labels = np.array([[0], [1], [1], [1]])
predicted_values = simple_neural_network.predict(simple_test_data)
print("Predicted Values:", predicted_values)
errors = trainer.get_errors(simple_test_labels, predicted_values)
mean_squared_error = trainer.get_mean_squared_error(errors);
print("Mean Squared Error:", mean_squared_error)

Predicted Values: [[0.50002194]
 [0.97028035]
 [0.96827613]
 [0.99899738]]
Mean Squared Error: 0.06297815072257777


In [20]:
# Okay, now that it seems to be working, lets test with some more complex data
# Using data from here: http://archive.ics.uci.edu/ml/datasets/Abalone
df = pd.read_csv("abalone.csv")
df.head()

Unnamed: 0,Type,LongestShell,Diameter,Height,WholeWeight,ShuckedWeight,VisceraWeight,ShellWeight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [21]:
# Doing a bit of preprocessing
def string_to_binary(string):
    if string == "F":
        return 1
    else:
        return 0
    
df = df.loc[df['Type'] != "I"]
df['Type'] = df['Type'].apply(string_to_binary)
df.head()

Unnamed: 0,Type,LongestShell,Diameter,Height,WholeWeight,ShuckedWeight,VisceraWeight,ShellWeight,Rings
0,0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,0,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,1,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,0,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
6,1,0.53,0.415,0.15,0.7775,0.237,0.1415,0.33,20


In [22]:
# Splitting into training and testing data
training_data, testing_data = train_test_split(df, test_size=0.2)
training_features = training_data.iloc[:, ~training_data.columns.isin(['Type'])].values
training_labels = training_data['Type'].values.reshape(len(training_features),1)
testing_features = testing_data.iloc[:, ~testing_data.columns.isin(['Type'])].values
testing_labels = testing_data['Type'].values.reshape(len(testing_features),1)

In [23]:
# Training
classification_neural_network = SimpleNeuralNetwork(8, 10)
trainer.train(classification_neural_network, training_features, training_labels, 0.1, 1000)

At epoch: 0 , MSE =  0.5326211159994324
At epoch: 1 , MSE =  0.5315523366068559
At epoch: 2 , MSE =  0.530210379070474
At epoch: 3 , MSE =  0.5285162426706952
At epoch: 4 , MSE =  0.5263633920560752
At epoch: 5 , MSE =  0.5236070191756433
At epoch: 6 , MSE =  0.5200505909128662
At epoch: 7 , MSE =  0.5154330864160919
At epoch: 8 , MSE =  0.5094290741554558
At epoch: 9 , MSE =  0.5016867655523124
At epoch: 10 , MSE =  0.49191991797553763
At epoch: 11 , MSE =  0.4800091011613985
At epoch: 12 , MSE =  0.46603644245001796
At epoch: 13 , MSE =  0.45025620801518185
At epoch: 14 , MSE =  0.43305498530569614
At epoch: 15 , MSE =  0.4149147883139391
At epoch: 16 , MSE =  0.39637329177985875
At epoch: 17 , MSE =  0.377980794071867
At epoch: 18 , MSE =  0.3602554667722938
At epoch: 19 , MSE =  0.34364044577565656
At epoch: 20 , MSE =  0.3284696710508029
At epoch: 21 , MSE =  0.3149499020745726
At epoch: 22 , MSE =  0.3031621615670262
At epoch: 23 , MSE =  0.2930797902719215
At epoch: 24 , MSE =  

At epoch: 239 , MSE =  0.24800126622414462
At epoch: 240 , MSE =  0.24800098228383766
At epoch: 241 , MSE =  0.2480007073207041
At epoch: 242 , MSE =  0.24800044134171254
At epoch: 243 , MSE =  0.2480001843537266
At epoch: 244 , MSE =  0.24799993636350567
At epoch: 245 , MSE =  0.2479996973777071
At epoch: 246 , MSE =  0.2479994674028875
At epoch: 247 , MSE =  0.24799924644550395
At epoch: 248 , MSE =  0.24799903451191574
At epoch: 249 , MSE =  0.2479988316083857
At epoch: 250 , MSE =  0.2479986377410816
At epoch: 251 , MSE =  0.24799845291607756
At epoch: 252 , MSE =  0.2479982771393552
At epoch: 253 , MSE =  0.24799811041680528
At epoch: 254 , MSE =  0.2479979527542285
At epoch: 255 , MSE =  0.24799780415733724
At epoch: 256 , MSE =  0.24799766463175654
At epoch: 257 , MSE =  0.2479975341830251
At epoch: 258 , MSE =  0.24799741281659693
At epoch: 259 , MSE =  0.24799730053784186
At epoch: 260 , MSE =  0.24799719735204706
At epoch: 261 , MSE =  0.24799710326441796
At epoch: 262 , MSE 

At epoch: 475 , MSE =  0.24818400727481232
At epoch: 476 , MSE =  0.2481857236246997
At epoch: 477 , MSE =  0.2481874445070466
At epoch: 478 , MSE =  0.24818916982368186
At epoch: 479 , MSE =  0.24819089947495512
At epoch: 480 , MSE =  0.24819263335972366
At epoch: 481 , MSE =  0.2481943713753407
At epoch: 482 , MSE =  0.24819611341764275
At epoch: 483 , MSE =  0.24819785938093764
At epoch: 484 , MSE =  0.24819960915799227
At epoch: 485 , MSE =  0.24820136264002104
At epoch: 486 , MSE =  0.24820311971667386
At epoch: 487 , MSE =  0.24820488027602441
At epoch: 488 , MSE =  0.24820664420455874
At epoch: 489 , MSE =  0.2482084113871637
At epoch: 490 , MSE =  0.24821018170711598
At epoch: 491 , MSE =  0.24821195504607071
At epoch: 492 , MSE =  0.24821373128405047
At epoch: 493 , MSE =  0.24821551029943517
At epoch: 494 , MSE =  0.24821729196895043
At epoch: 495 , MSE =  0.24821907616765815
At epoch: 496 , MSE =  0.24822086276894562
At epoch: 497 , MSE =  0.24822265164451582
At epoch: 498 ,

At epoch: 715 , MSE =  0.24831619180203854
At epoch: 716 , MSE =  0.24831375168762465
At epoch: 717 , MSE =  0.2483112824614363
At epoch: 718 , MSE =  0.2483087843618
At epoch: 719 , MSE =  0.24830625763251185
At epoch: 720 , MSE =  0.24830370252276338
At epoch: 721 , MSE =  0.24830111928706466
At epoch: 722 , MSE =  0.24829850818516502
At epoch: 723 , MSE =  0.2482958694819715
At epoch: 724 , MSE =  0.24829320344746447
At epoch: 725 , MSE =  0.24829051035661134
At epoch: 726 , MSE =  0.248287790489278
At epoch: 727 , MSE =  0.2482850441301377
At epoch: 728 , MSE =  0.24828227156857804
At epoch: 729 , MSE =  0.24827947309860599
At epoch: 730 , MSE =  0.24827664901875054
At epoch: 731 , MSE =  0.24827379963196355
At epoch: 732 , MSE =  0.2482709252455191
At epoch: 733 , MSE =  0.24826802617091046
At epoch: 734 , MSE =  0.24826510272374558
At epoch: 735 , MSE =  0.248262155223641
At epoch: 736 , MSE =  0.2482591839941141
At epoch: 737 , MSE =  0.24825618936247376
At epoch: 738 , MSE =  0

At epoch: 964 , MSE =  0.24757761193906916
At epoch: 965 , MSE =  0.24757614394251562
At epoch: 966 , MSE =  0.2475746919348853
At epoch: 967 , MSE =  0.2475732558620157
At epoch: 968 , MSE =  0.24757183566931756
At epoch: 969 , MSE =  0.24757043130179288
At epoch: 970 , MSE =  0.24756904270405292
At epoch: 971 , MSE =  0.24756766982033543
At epoch: 972 , MSE =  0.24756631259452147
At epoch: 973 , MSE =  0.24756497097015254
At epoch: 974 , MSE =  0.24756364489044672
At epoch: 975 , MSE =  0.24756233429831528
At epoch: 976 , MSE =  0.24756103913637809
At epoch: 977 , MSE =  0.24755975934697955
At epoch: 978 , MSE =  0.24755849487220385
At epoch: 979 , MSE =  0.24755724565388987
At epoch: 980 , MSE =  0.24755601163364596
At epoch: 981 , MSE =  0.24755479275286452
At epoch: 982 , MSE =  0.24755358895273585
At epoch: 983 , MSE =  0.24755240017426244
At epoch: 984 , MSE =  0.24755122635827237
At epoch: 985 , MSE =  0.24755006744543265
At epoch: 986 , MSE =  0.24754892337626214
At epoch: 987

In [24]:
# Checking performance on testing data
# Note: not expecting optimal performance here as a non optimal loss function is being used
predicted_values = classification_neural_network.predict(testing_features)
errors = trainer.get_errors(testing_labels, predicted_values)
print("Mean Squared Error:", trainer.get_mean_squared_error(errors))

Mean Squared Error: 0.24773597265294642
