https://iamtrask.github.io/2015/07/12/basic-python-network/

In [1]:
import numpy as np

# Logistic sigmoid activation and its slope
def nonlin(x,deriv=False):
    """Apply the sigmoid, or compute its slope from an already-activated value.

    With ``deriv`` falsy, returns ``1/(1+exp(-x))`` element-wise.
    With ``deriv`` truthy, returns ``x*(1-x)``. This is the derivative of the
    sigmoid only when ``x`` is itself a sigmoid output, so pass the activated
    layer values, not the raw weighted sums.
    """
    if not deriv:
        return 1/(1+np.exp(-x))
    # x is taken to be sigmoid(z), so sigmoid'(z) = x*(1-x)
    return x*(1-x)

# Training inputs, one sample per row. The first two columns run through
# every 0/1 combination; the third column is the constant 1.
X = np.array([
    [0, 0, 1],
    [0, 1, 1],
    [1, 0, 1],
    [1, 1, 1],
])

In [2]:
# Targets, one row per sample of X. They happen to equal X's first column.
y = np.array([[0,0,1,1]]).T

# Initialize the 3x1 weight matrix uniformly in [-1, 1), i.e. with mean 0.
# Seeding makes the run reproducible.
np.random.seed(42)
syn0 = 2*np.random.random((3,1)) - 1

# The loop variable is `epoch`, not `iter`: a top-level loop variable stays
# bound in the kernel, and `iter` would replace the builtin iter() for every
# later cell.
for epoch in range(10000):

    # forward propagation: weighted sum of the inputs, squashed by the sigmoid
    l0 = X
    l1 = nonlin(np.dot(l0,syn0)) # dot = sum product in numpy

    # how much did we miss?
    l1_error = y - l1

    # multiply how much we missed by the
    # slope of the sigmoid at the values in l1
    l1_delta = l1_error * nonlin(l1,True)

    # update weights (full-batch step, no learning-rate factor)
    syn0 += np.dot(l0.T,l1_delta)

# Display the predictions from the last forward pass.
l1

array([[0.00966808],
       [0.00786589],
       [0.99358863],
       [0.99211705]])

In [3]:
# Learned weights of the single-layer network, one row per column of X.
syn0

array([[ 9.67256303],
       [-0.20811174],
       [-4.62926144]])

Shouldn't `syn0[1]` be much smaller, given that the output can be completely predicted from `x[0]`?

In [4]:
# New targets: y = x_0 XOR x_1, one row per sample of X.
# Learning this pattern needs a 3-layer network (i.e. one hidden layer).
y = np.array([[0],
              [1],
              [1],
              [0]])


def nnet(hidden_layer_size=3, epochs=60000, seed=42, log_errors=True):
    """Train a three-layer sigmoid network on the module-level ``X`` and ``y``.

    The network has a single hidden layer. The input and output widths are
    taken from the column counts of ``X`` and ``y``. Training is full-batch
    and the weight updates are added directly (no learning-rate factor).

    Parameters
    ----------
    hidden_layer_size : int
        Number of neurons in the hidden layer.
    epochs : int
        Number of forward/backward passes over the whole of ``X``.
    seed : int
        Seed passed to ``np.random.seed`` before the weights are drawn.
        Note that this reseeds NumPy's global generator.
    log_errors : bool
        When true, print the mean absolute error every 10000 epochs.

    Returns
    -------
    tuple
        ``(l2_error_mean, syn0, syn1)``: the mean absolute output error
        measured in the last epoch's forward pass (i.e. before that epoch's
        weight update), the input-to-hidden weights and the hidden-to-output
        weights. ``l2_error_mean`` is NaN when ``epochs`` is zero or negative,
        because no forward pass is run in that case.
    """

    # randomly initialize our weights with mean 0 (uniform in [-1, 1))
    np.random.seed(seed)
    n_inputs, n_outputs = X.shape[1], y.shape[1]
    syn0 = 2*np.random.random((n_inputs,hidden_layer_size)) - 1
    syn1 = 2*np.random.random((hidden_layer_size,n_outputs)) - 1

    # Bound up front so the return below also works when the loop never runs.
    l2_error_mean = float("nan")

    for j in range(epochs):

        # Feed forward through layers 0, 1, and 2
        l0 = X
        l1 = nonlin(np.dot(l0,syn0))
        l2 = nonlin(np.dot(l1,syn1))

        # how much did we miss the target value?
        l2_error = y - l2
        l2_error_mean = np.mean(np.abs(l2_error))

        if (log_errors and (j%10000 == 0)):
            print("Error: %f" % l2_error_mean)

        # in what direction is the target value?
        # were we really sure? if so, don't change too much.
        l2_delta = l2_error*nonlin(l2,deriv=True)

        # how much did each l1 value contribute to the l2 error (according to the weights)?
        l1_error = l2_delta.dot(syn1.T)

        # in what direction is the target l1?
        # were we really sure? if so, don't change too much.
        l1_delta = l1_error * nonlin(l1,deriv=True)

        # Both deltas were computed from the pre-update weights, so the order
        # of these two in-place updates does not matter.
        syn1 += l1.T.dot(l2_delta)
        syn0 += l0.T.dot(l1_delta)
    return l2_error_mean, syn0, syn1

3 hidden neurons are required to model the XOR. Two for capturing the `OR` and one more for the `X`.

In [5]:
# Train with the default settings and keep only the two weight matrices.
# The returned error is dropped by slicing rather than by unpacking into `_`,
# because IPython stops refreshing its last-output variable `_` once user
# code has assigned it.
syn0, syn1 = nnet()[1:]
syn0

Error: 0.497227
Error: 0.010790
Error: 0.007464
Error: 0.006036
Error: 0.005196
Error: 0.004629


array([[ 2.42350241,  7.46417695,  5.72198525],
       [ 2.90091623, -6.12413245, -7.11689821],
       [ 0.73820995,  2.71294886, -2.69521821]])

In [6]:
# Hidden-to-output weights returned by nnet() in the previous cell.
syn1

array([[  5.81169952],
       [-11.41548566],
       [ 11.87697828]])

In [7]:
# Final mean absolute error for hidden layers of 1 through 4 neurons.
for s in range(1, 5):
    final_error = nnet(hidden_layer_size=s, log_errors=False)[0]
    print(final_error)

0.4584878090167257
0.01056136294393234
0.004212792374307104
0.004165398133066024
