In [2]:
import numpy as np

In [3]:
# Layer 1
"""
Input = (1x2), weights is (2x2), output of first layer is (1x2). We add a bias to this, same shape
(1x2). 
"""
X = np.array([[2,4]]) # input
W1 = np.array([[1, 2], [3, 4]]) #2x2
b1 = np.array([[1,1]]) # 1x2



# Layer 2
"""Input is the output of layer 1, therefore we have an input of (1x2). We have weights (2x1), output is thus 
(1x1), to which we add a bias, shape (1x1).
"""


# Define the activated output of layer 1 as the input of layer 2
# a = np.array([[a1, a2]])


W2 = np.array([[2], [1]])
b2 = np.array([[1]])

# Print the shapes
print(W1.shape, b1.shape, W2.shape, b2.shape)

(2, 2) (1, 2) (2, 1) (1, 1)


In [4]:
# Randomize the weights to have random values between -1 and 1, if using tanh()
# np.random.rand gives values between 0 and 1, thus by shifting we get -1 to 1.
W1 = 2*np.random.rand(2,2)-1

# For sigmoid have values between 0 and 1
W1 = np.random.rand(2,2)
W2 = np.random.rand(2,1)

In [5]:
# to forward information from layer 1 to layer 2
"""sigmoidf(x dot W1 + b1)"""
def sigmoidf(x):
    """Sigmoid always returns a value between 0 and 1"""
    return (1 + np.exp(-x)) ** -1

# output layer 1:
a1 = sigmoidf(np.dot(X, W1)+b1)

In [6]:
a1

array([[0.98128568, 0.99572291]])

In [7]:
# second layer
y = sigmoidf(np.dot(a1, W2)+b2)
y

array([[0.92766109]])

In [8]:
# Define network
def XOR_net(X, W1, W2, b1, b2):    
    a1 = sigmoidf(np.dot(X, W1)+b1)
    a2 = sigmoidf(np.dot(a1, W2)+b2)
    return a2

In [9]:
# Garbage output - no training yet
XOR_net(np.array([[0,0]]), np.random.rand(2,2), np.random.rand(2,1), np.array([1,1]), np.array([[1]]))

array([[0.73979027]])

In [10]:
def random_weights(depth):
    """Create 'weights' per non-input node, first 2 are random, last is bias of value 1"""
    weights = np.random.rand(1,2)
    weights = np.append(weights, 1)
    for i in range(depth-1):
        weights = np.append(weights, np.random.rand(1,2))
        weights = np.append(weights, 1)
    return weights

# test
random_weights(3)

# to get shape 2x3
random_weights(3)[:6].reshape(2,3)

array([[0.57850845, 0.21806654, 1.        ],
       [0.28870781, 0.05477251, 1.        ]])

In [11]:
# Using what is asked in the exercise
def XOR_net_V2(inputs, weights):   
    """
    Inputs: array 
    weights: (1x9) matrix, the non-input nodes each have 3 weights, 2 incoming, and a bias.
    weights is created in such a way that the first 2 are the (random) weights and the last is the bias
    of node 1, 2 or final.
    """
    # Output of first layer, activated
    a1 = sigmoidf(np.dot(inputs, weights[:6].reshape(2,3)))
    # Output of second layer, activated
    a2 = sigmoidf(np.dot(a1, weights[6:].reshape(3,1)))
    return a2.reshape((1))

In [12]:
# Example
XOR_net_V2(np.array([[1,0]]), random_weights(3))

array([0.74288528])

In [13]:
# Implement the error function - use the mean squared error
def mse(weights, inputs, outputs, net=XOR_net_V2):
    # Calculate the mean squared error over all input and outputs from our network, we divide by
    # the length of the outputs to obtain the mean over all squared errors.
    return sum([(net(inpt, weights) - output)** 2 for inpt, output in zip(inputs, outputs)])/len(outputs)

In [105]:
inputs = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
outputs = np.array([0, 1, 1, 0])

# Example
mse(random_weights(3), inputs, outputs, net=XOR_net_V2)[0]

0.36323202286885903

In [95]:
# Implement the gradient of the mse function, grdmse
def grdmse(init_weights, inputs, outputs, net, eps):
    """Note: this is a lazy gradient, it is not really a true mathematical derivation"""
    # Cost at start of iteration
    weight = init_weights
    s_mse = mse(weight, inputs, outputs, net=XOR_net_V2)
    par_derivs = np.zeros((9))
    
    # Now wiggle the weights
    for j, i in enumerate(weight):
        
        # Wiggle weight
        save = i 
        weight[j] += eps
        
        # Compute difference and divide by eps
        par_derivs[j] = (mse(weight, inputs, outputs, net=XOR_net_V2) - s_mse)/eps
        weight[j] = save
        
        
    #print(mse(weight, inputs, outputs, net=XOR_net_V2))
    return par_derivs


In [104]:
def learn(weights, grd, rate):    
    for j,i in enumerate(weights):
        weights[j] -= rate*grd[j]
    return weights

In [106]:
# Let's do a test run for some random weights
weights = random_weights(3)
#print(weights)
mse1 = mse(weights, inputs, outputs)
print("Before:", mse1)
grd1 = grdmse(weights, inputs, outputs, XOR_net_V2, 0.1)
#print(grd1)
learn(weights, grd1, 0.1)
print("After:", mse(learn(weights, grd1, 0.1), inputs, outputs))

Before: [0.28449469]
After: [0.28287054]


In [108]:
# Initialize random_weights
weights = random_weights(3)
print("Weights before:", weights)

# Simulate
for i in range(1,100000+1):
    mse1 = mse(weights, inputs, outputs)
    if i == 1:
        print("\nInital mse:", mse1)
    grd1 = grdmse(weights, inputs, outputs, XOR_net_V2, 0.1)
    weights = learn(weights, grd1, 0.1)
    if i % 10000 == 0:
        print(f"Mse after step {i}:", mse(weights, inputs, outputs))
print("\nWeights after:", weights)

Weights before: [0.75224896 0.0487717  1.         0.88764052 0.94369309 1.
 0.65188074 0.79927806 1.        ]

Inital mse: [0.35636847]
Mse after step 10000: [0.02794189]
Mse after step 20000: [0.00713357]
Mse after step 30000: [0.00404196]
Mse after step 40000: [0.00281074]
Mse after step 50000: [0.00215083]
Mse after step 60000: [0.00174008]
Mse after step 70000: [0.00146008]
Mse after step 80000: [0.0012571]
Mse after step 90000: [0.00110328]
Mse after step 100000: [0.00098274]

Weights after: [ 6.32500468 -3.97699851  5.34692451 -3.95681898  6.38394552  5.38434373
 -9.82625934 -9.82429421 13.53159104]


In [109]:
# Validation of model: 
for i in inputs:
    print("Input:", i, "prediction:", XOR_net_V2(i, weights))

Input: [0 0] prediction: [0.0448099]
Input: [0 1] prediction: [0.97004568]
Input: [1 0] prediction: [0.97005582]
Input: [1 1] prediction: [0.01136371]
