In [1]:
import numpy as np

In [2]:
class NeuralNetwork:
    """Fully connected feedforward network with sigmoid activations.

    The network is trained one sample at a time with backpropagation and
    plain gradient descent on a sum-of-squared-errors loss. Biases are folded
    into the weight matrices: every non-output layer carries one extra "bias"
    node and the feature matrix gets a matching trailing column of 1s.

    Attributes:
        W: list of weight matrices, one per pair of consecutive layers.
        layers: the architecture exactly as passed in, e.g. ``[2, 2, 1]``.
        alpha: learning rate applied in the weight update.
    """

    def __init__(self, layers, alpha=0.1):
        """Randomly initialize the weights for the given architecture.

        Args:
            layers: sequence of integers, the number of nodes per layer
                (input layer first, output layer last).
            alpha: learning rate.
        """
        # Initialize the list of weight matrices, then store the
        # network architecture and learning rate
        self.W = []
        self.layers = layers
        self.alpha = alpha

        # Start looping from the index of the first layer but
        # stop before we reach the last two layers
        for i in np.arange(0, len(layers) - 2):
            # Randomly initialize a weight matrix connecting the
            # number of nodes in each respective layer together,
            # adding an extra node for the bias; dividing by the square
            # root of the layer size scales the initial weights down
            w = np.random.randn(layers[i] + 1, layers[i + 1] + 1)
            self.W.append(w / np.sqrt(layers[i]))

        # The last two layers are a special case where the input
        # connections need a bias term but the output does not
        w = np.random.randn(layers[-2] + 1, layers[-1])
        self.W.append(w / np.sqrt(layers[-2]))

    def __repr__(self):
        """Return a string describing the architecture, e.g. ``NeuralNetwork: 2-2-1``."""
        return "NeuralNetwork: {}".format(
            "-".join(str(l) for l in self.layers))

    def sigmoid(self, x):
        """Return the element-wise sigmoid activation of `x`."""
        return 1.0 / (1 + np.exp(-x))

    def sigmoid_deriv(self, x):
        """Return the sigmoid derivative, given `x` that is ALREADY a sigmoid output."""
        return x * (1 - x)

    def fit(self, X, y, epochs=1000, displayUpdate=100):
        """Train the network with per-sample gradient descent.

        Args:
            X: array-like feature matrix, one row per sample, WITHOUT a bias
                column (it is appended here).
            y: array-like targets, one entry/row per sample.
            epochs: number of full passes over the data.
            displayUpdate: print the loss on the first epoch and then every
                `displayUpdate` epochs. A non-positive value or ``None``
                disables progress output.
        """
        # Accept plain Python lists as well as arrays
        X = np.asarray(X)
        y = np.asarray(y)

        # Insert a column of 1s as the last entry in the feature matrix -- this
        # little trick allows us to treat the bias as a trainable parameter
        # within the weight matrix
        X = np.c_[X, np.ones((X.shape[0]))]

        # Guarding here also avoids the modulo-by-zero that `displayUpdate=0`
        # would otherwise trigger in the check below
        verbose = displayUpdate is not None and displayUpdate > 0

        # Loop over the desired number of epochs
        for epoch in np.arange(0, epochs):
            # Loop over each individual data point and train our network on it
            for (x, target) in zip(X, y):
                self.fit_partial(x, target)

            # Check to see if we should display a training update
            if verbose and (epoch == 0 or (epoch + 1) % displayUpdate == 0):
                loss = self.calculate_loss(X, y)
                print("[INFO] epoch={}, loss={:.7f}".format(
                    epoch + 1, loss))

    def fit_partial(self, x, y):
        """Run one forward/backward pass for a single sample and update `self.W`.

        Args:
            x: one feature vector that already includes the trailing bias 1.
            y: the target value(s) for that sample.
        """
        # Construct our list of output activations for each layer as our data
        # point flows through the network; the first activation is a special
        # case -- it's just the input feature vector itself
        A = [np.atleast_2d(x)]

        # FEEDFORWARD:
        # Loop over the layers in the network
        for layer in np.arange(0, len(self.W)):
            # Feedforward the activation at the current layer by taking the dot
            # product between the activation and the weight matrix -- this is
            # called the "net input" to the current layer
            net = A[layer].dot(self.W[layer])

            # Computing the "net output" is simply applying our non-linear
            # activation function to the net input
            out = self.sigmoid(net)

            # Once we have the net output, add it to our list of activations
            A.append(out)

        # BACKPROPAGATION:
        # The first phase is to compute the difference between our
        # *prediction* (the final output activation in the activations list)
        # and the true target value
        error = A[-1] - y

        # From here, we apply the chain rule and build our list of deltas `D`;
        # the first entry is simply the error of the output layer times the
        # derivative of our activation function for the output value
        D = [error * self.sigmoid_deriv(A[-1])]

        # Loop over the layers in reverse order (ignoring the last two since
        # we have already taken them into account)
        for layer in np.arange(len(A) - 2, 0, -1):
            # The delta for the current layer is the most recently computed
            # delta dotted with the (transposed) weight matrix of the current
            # layer, multiplied by the derivative of the activation function
            # for the activations of the current layer
            delta = D[-1].dot(self.W[layer].T)
            delta = delta * self.sigmoid_deriv(A[layer])
            D.append(delta)

        # Since we looped over our layers in reverse order we need to reverse
        # the deltas
        D = D[::-1]

        # WEIGHT UPDATE PHASE:
        # Loop over the layers
        for layer in np.arange(0, len(self.W)):
            # Update our weights by taking the dot product of the layer
            # activations with their respective deltas, then multiplying this
            # value by the learning rate and adding it to our weight matrix --
            # this is where the actual "learning" takes place
            self.W[layer] += -self.alpha * A[layer].T.dot(D[layer])

    def predict(self, X, addBias=True):
        """Forward-propagate `X` and return the output activations.

        Args:
            X: one feature vector or a matrix with one row per sample.
            addBias: append the bias column of 1s first; pass ``False`` when
                `X` already contains it.

        Returns:
            2-D array of output activations, one row per sample.
        """
        # Initialize the output prediction as the input features -- this value
        # will be forward propagated through the network to obtain the final
        # prediction
        p = np.atleast_2d(X)

        # Check to see if the bias column should be added
        if addBias:
            # Insert a column of 1s as the last entry in the feature matrix (bias)
            p = np.c_[p, np.ones((p.shape[0]))]

        # Loop over our layers in the network
        for layer in np.arange(0, len(self.W)):
            # Computing the output prediction is as simple as taking the dot
            # product between the current activation value `p` and the weight
            # matrix associated with the current layer, then passing this value
            # through a non-linear activation function
            p = self.sigmoid(np.dot(p, self.W[layer]))

        # Return the predicted value
        return p

    def calculate_loss(self, X, targets):
        """Return the sum-of-squared-errors loss, ``0.5 * sum((pred - target)**2)``.

        Args:
            X: feature matrix. It may already contain the bias column (this
                is how `fit` calls it); if it is exactly one column short of
                what the first weight matrix expects, the bias column is
                added automatically.
            targets: target values; a 1-D sequence is accepted for a
                single-output network.
        """
        X = np.atleast_2d(X)
        # The first weight matrix has one row per input feature plus the bias
        needs_bias = X.shape[1] == self.W[0].shape[0] - 1
        predictions = self.predict(X, addBias=needs_bias)

        targets = np.asarray(targets)
        if targets.ndim == 1 and predictions.shape[1] == 1:
            # One scalar target per sample: make it a column so it lines up
            # with the (N, 1) predictions instead of broadcasting to (N, N)
            targets = targets.reshape(-1, 1)
        else:
            targets = np.atleast_2d(targets)

        loss = 0.5 * np.sum((predictions - targets) ** 2)

        # Return the loss
        return loss
        
        

In [3]:
# Scratch check of the constructor's shapes for a 3-layer (2-2-1) network:
# the hidden-layer loop runs over range(0, len(layers) - 2) ...
list(np.arange(0, 3 - 2))
# ... and a 2 -> 2 connection with one bias node on each side is 3 x 3
w = np.random.randn(2 + 1, 2 + 1)
w

array([[ 1.17879955, -1.77508576,  0.42450692],
       [-0.91843769, -1.14079721,  2.28744076],
       [-1.26140933,  0.35172004,  1.49340923]])

In [6]:
# Seed NumPy's global RNG before the network is built so the random weight
# initialization -- and therefore the training trace -- is reproducible on
# Restart & Run All
np.random.seed(42)

# 2 inputs, three hidden layers of 2 nodes each, 1 output
nn = NeuralNetwork([2, 2, 2, 2, 1], alpha=0.5)

# XOR truth table: inputs and their expected outputs
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y = np.array([[0], [1], [1], [0]])

nn.fit(X, y, epochs=20000)

[INFO] epoch=1, loss=0.5040686
[INFO] epoch=100, loss=0.5004561
[INFO] epoch=200, loss=0.5003506
[INFO] epoch=300, loss=0.5002659
[INFO] epoch=400, loss=0.5001971
[INFO] epoch=500, loss=0.5001408
[INFO] epoch=600, loss=0.5000946
[INFO] epoch=700, loss=0.5000568
[INFO] epoch=800, loss=0.5000259
[INFO] epoch=900, loss=0.5000009
[INFO] epoch=1000, loss=0.4999808
[INFO] epoch=1100, loss=0.4999647
[INFO] epoch=1200, loss=0.4999519
[INFO] epoch=1300, loss=0.4999418
[INFO] epoch=1400, loss=0.4999339
[INFO] epoch=1500, loss=0.4999277
[INFO] epoch=1600, loss=0.4999227
[INFO] epoch=1700, loss=0.4999187
[INFO] epoch=1800, loss=0.4999153
[INFO] epoch=1900, loss=0.4999124
[INFO] epoch=2000, loss=0.4999098
[INFO] epoch=2100, loss=0.4999073
[INFO] epoch=2200, loss=0.4999049
[INFO] epoch=2300, loss=0.4999024
[INFO] epoch=2400, loss=0.4998997
[INFO] epoch=2500, loss=0.4998969
[INFO] epoch=2600, loss=0.4998939
[INFO] epoch=2700, loss=0.4998906
[INFO] epoch=2800, loss=0.4998870
[INFO] epoch=2900, loss=0.

In [5]:
# Run every training point back through the trained network and report the
# raw output next to its thresholded class label
for x, target in zip(X, y):
    # `predict` returns a 2-D array (one row per sample); pull out the scalar
    pred = nn.predict(x)[0, 0]
    # Threshold at 0.5 to turn the sigmoid output into a hard 0/1 label
    step = int(pred > 0.5)
    print("[INFO] data={}, ground-truth={}, pred={:.4f}, step={}".format(
        x, target[0], pred, step))

[INFO] data=[0 0], ground-truth=0, pred=0.0076, step=0
[INFO] data=[0 1], ground-truth=1, pred=0.9872, step=1
[INFO] data=[1 0], ground-truth=1, pred=0.9898, step=1
[INFO] data=[1 1], ground-truth=0, pred=0.0127, step=0
