In [1]:
import autograd.numpy as np  # We need to use this numpy wrapper to make automatic differentiation work later
from autograd import grad, elementwise_grad
from sklearn import datasets
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score

# Defining some activation functions
def ReLU(z):
    return np.where(z > 0, z, 0)

# Derivative of the ReLU function
def ReLU_der(z):
    return np.diag(np.where(z > 0, 1, 0))

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def mse(predict, target):
    return np.mean((predict - target) ** 2)

#### Exercise 2a)

The shape of weights and biases will be...

In [2]:
# Exercise 2b)
def feed_forward_one_layer(W, b, x):
    z = W @ x + b
    a = sigmoid(z)
    return a

def cost_one_layer(W, b, x, target):
    predict = feed_forward_one_layer(W, b, x)
    return mse(predict, target)


x = np.random.rand(2)
target = np.random.rand(3)

W = np.random.randn(len(target), len(x))
b = np.random.randn(len(target))

In [3]:
# Exercise 2c)
autograd_one_layer = grad(cost_one_layer, [0, 1])
W_g, b_g = autograd_one_layer(W, b, x, target)
print(W_g, b_g)

[[-0.00699902 -0.01090799]
 [-0.01907937 -0.02973526]
 [ 0.00934513  0.01456442]] [-0.0167164  -0.04556904  0.02231985]


#### Exercise 3a)

The reusable results are dC/da and da/dz.

In [4]:
# Exercise 3b)
z = W @ x + b
a = sigmoid(z)

predict = a

def mse_der(predict, target):
    return 2/len(predict) * (predict - target).T

print(mse_der(predict, target))

cost_autograd = grad(mse, 0)
print(cost_autograd(predict, target))

[-0.22523209 -0.18437859  0.09324845]
[-0.22523209 -0.18437859  0.09324845]


In [5]:
# Exercise 3c)
def sigmoid_der(z):
    return np.diag(np.exp(-z) / (1 + np.exp(-z))**2)

print(sigmoid_der(z))

sigmoid_autograd = elementwise_grad(sigmoid, 0)
print(sigmoid_autograd(z))

[[0.07421857 0.         0.        ]
 [0.         0.24714928 0.        ]
 [0.         0.         0.23935892]]
[0.07421857 0.24714928 0.23935892]


In [6]:
# Exercise 3d) 
dC_da = mse_der(a, target)
dC_dz = dC_da @ sigmoid_der(z)

print(dC_da.shape, dC_dz.shape)
print(sigmoid_der(z).shape)

(3,) (3,)
(3, 3)


In [7]:
# Exercise 3e)
dz_dW = np.tensordot(np.eye(len(target)), x, axes=0)
dz_db = np.ones(len(b))

In [8]:
# Exercise 3f)
dC_da = mse_der(a, target)
dC_dz = dC_da @ sigmoid_der(z)
dC_dW = dC_dz @ dz_dW
dC_db = dC_dz * dz_db

print(dC_dW, dC_db)

[[-0.00699902 -0.01090799]
 [-0.01907937 -0.02973526]
 [ 0.00934513  0.01456442]] [-0.0167164  -0.04556904  0.02231985]


In [9]:
W_g, b_g = autograd_one_layer(W, b, x, target)
print(W_g, b_g)

[[-0.00699902 -0.01090799]
 [-0.01907937 -0.02973526]
 [ 0.00934513  0.01456442]] [-0.0167164  -0.04556904  0.02231985]


In [10]:
x = np.random.rand(2)
target = np.random.rand(4)

W1 = np.random.rand(3, 2)
b1 = np.random.rand(3)

W2 = np.random.rand(4, 3)
b2 = np.random.rand(4)

layers = [(W1, b1), (W2, b2)]

z1 = W1 @ x + b1
a1 = sigmoid(z1)
z2 = W2 @ a1 + b2
a2 = sigmoid(z2)

In [11]:
# Exercise 4a)
dC_da2 = mse_der(a2, target) # OK
dC_dz2 = dC_da2 @ sigmoid_der(z2) # check vector as exponent
dC_dW2 = dC_dz2 @ np.tensordot(np.eye(len(z2)), a1, axes=0)
dC_db2 = dC_dz2 # deriv wrt b2 is 1

#### Exercise 4b)

The derivative of the second layer intermediate z2 wrt. the first layer activation a1 is a row vector where each entry is the sum of the corresponding row in the matrix.

In [12]:
# Exercise 4c)
dC_da1 = dC_dz2 @ W2 # OK
dC_dz1 = dC_da1 @ sigmoid_der(z1) # check vector as exponent
dC_dW1 = dC_dz1 @ np.tensordot(np.eye(len(z1)), x, axes=0) # OK
dC_db1 = dC_dz1 # deriv wrt b1 is 1

print(dC_dW1, dC_db1)
print(dC_dW2, dC_db2)

[[5.18719449e-04 1.15675628e-04]
 [3.44727935e-04 7.68751211e-05]
 [4.28196777e-04 9.54888646e-05]] [0.0203002  0.013491   0.01675757]
[[0.0112937  0.01193451 0.01380256]
 [0.02833034 0.0299378  0.03462383]
 [0.03609972 0.03814802 0.04411916]
 [0.01116046 0.01179371 0.01363972]] [0.02034986 0.05104778 0.06504725 0.02010978]


In [13]:
# Exercise 4d)
def feed_forward_two_layers(layers, x):
    W1, b1 = layers[0]
    z1 = W1 @ x + b1
    a1 = sigmoid(z1)

    W2, b2 = layers[1]
    z2 = W2 @ a1 + b2
    a2 = sigmoid(z2)

    return a2

def cost_two_layers(layers, x, target):
    predict = feed_forward_two_layers(layers, x)
    return mse(predict, target)


grad_two_layers = grad(cost_two_layers, 0)
grad_two_layers(layers, x, target)

[(array([[5.18719449e-04, 1.15675628e-04],
         [3.44727935e-04, 7.68751211e-05],
         [4.28196777e-04, 9.54888646e-05]]),
  array([0.0203002 , 0.013491  , 0.01675757])),
 (array([[0.0112937 , 0.01193451, 0.01380256],
         [0.02833034, 0.0299378 , 0.03462383],
         [0.03609972, 0.03814802, 0.04411916],
         [0.01116046, 0.01179371, 0.01363972]]),
  array([0.02034986, 0.05104778, 0.06504725, 0.02010978]))]

#### Exercise 4e)
The first derivative (the cost function) will be used one time on the outer layer. On the layer in question, we differentiate wrt W or b, but for intermediate layers we differentiate the activation functions and application of weight and bias over and over, until we reach the layer we are interested in.

In [14]:
def create_layers(network_input_size, layer_output_sizes):
    layers = []

    i_size = network_input_size
    for layer_output_size in layer_output_sizes:
        W = np.random.randn(layer_output_size, i_size)
        b = np.random.randn(layer_output_size)
        layers.append((W, b))

        i_size = layer_output_size
    return layers


def feed_forward(input, layers, activation_funcs):
    a = input
    for (W, b), activation_func in zip(layers, activation_funcs):
        z = W @ a + b
        a = activation_func(z)
    return a

def cost(layers, input, activation_funcs, target):
    predict = feed_forward(input, layers, activation_funcs)
    return mse(predict, target)

def feed_forward_saver(input, layers, activation_funcs):
    layer_inputs = []
    zs = []
    a = input
    for (W, b), activation_func in zip(layers, activation_funcs):
        layer_inputs.append(a)
        z = W @ a + b
        a = activation_func(z)

        zs.append(z)

    return layer_inputs, zs, a

In [15]:
# Exercise 5a)
def backpropagation(
    input, layers, activation_funcs, target, activation_ders, cost_der=mse_der
):
    layer_inputs, zs, predict = feed_forward_saver(input, layers, activation_funcs)

    layer_grads = [() for layer in layers]

    # We loop over the layers, from the last to the first
    for i in reversed(range(len(layers))):
        layer_input, z, activation_der = layer_inputs[i], zs[i], activation_ders[i]

        if i == len(layers) - 1:
            # For last layer we use cost derivative as dC_da(L) can be computed directly
            dC_da = cost_der(predict, target)
        else:
            # For other layers we build on previous z derivative, as dC_da(i) = dC_dz(i+1) * dz(i+1)_da(i)
            (W, b) = layers[i + 1]
            dC_da = dC_dz @ W

        dC_dz = dC_da @ activation_der(z)
        dC_dW = dC_dz @ np.tensordot(np.eye(len(z)), layer_input, axes=0)
        dC_db = dC_dz # deriv wrt b is 1

        layer_grads[i] = (dC_dW, dC_db)

    return layer_grads

In [16]:
network_input_size = 2
layer_output_sizes = [3, 4]
activation_funcs = [sigmoid, ReLU]
activation_ders = [sigmoid_der, ReLU_der]

layers = create_layers(network_input_size, layer_output_sizes)

x = np.random.rand(network_input_size)
target = np.random.rand(4)

layer_grads = backpropagation(x, layers, activation_funcs, target, activation_ders)
print(layer_grads)

cost_grad = grad(cost, 0)
cost_grad(layers, x, [sigmoid, ReLU], target)

[(array([[-0.00365208, -0.05748956],
       [ 0.0144103 ,  0.22684106],
       [ 0.02711306,  0.42680257]]), array([-0.06113624,  0.24123002,  0.45387548])), (array([[0.09984452, 0.29811681, 0.30916917],
       [0.16402653, 0.4897521 , 0.50790913],
       [0.22711308, 0.67811658, 0.70325702],
       [0.13157124, 0.39284675, 0.40741112]]), array([0.39310158, 0.64579494, 0.89417534, 0.518014  ]))]


[(array([[-0.00365208, -0.05748956],
         [ 0.0144103 ,  0.22684106],
         [ 0.02711306,  0.42680257]]),
  array([-0.06113624,  0.24123002,  0.45387548])),
 (array([[0.09984452, 0.29811681, 0.30916917],
         [0.16402653, 0.4897521 , 0.50790913],
         [0.22711308, 0.67811658, 0.70325702],
         [0.13157124, 0.39284675, 0.40741112]]),
  array([0.39310158, 0.64579494, 0.89417534, 0.518014  ]))]

In [17]:
# Exercise 6
def create_layers(network_input_size, layer_output_sizes):
    layers = []

    i_size = network_input_size
    for layer_output_size in layer_output_sizes:
        W = np.random.randn(layer_output_size, i_size)
        b = np.random.randn(layer_output_size)
        layers.append((W, b))

        i_size = layer_output_size
    return layers

def create_layers_batch(network_input_size, layer_output_sizes):
    layers = []

    i_size = network_input_size
    for layer_output_size in layer_output_sizes:
        W = np.random.randn(layer_output_size, i_size).T
        b = np.random.randn(layer_output_size)
        layers.append((W, b))

        i_size = layer_output_size
    return layers

def feed_forward(input, layers, activation_funcs):
    a = input
    for (W, b), activation_func in zip(layers, activation_funcs):
        z = W @ a + b
        a = activation_func(z)
    return a

def feed_forward_batch(inputs, layers, activation_funcs):
    a = inputs
    for (W, b), activation_func in zip(layers, activation_funcs):
        z = a @ W + b
        a = activation_func(z)
    return a

def cost(layers, input, activation_funcs, target):
    predict = feed_forward(input, layers, activation_funcs)
    return mse(predict, target)

def cost_batch(layers, inputs, activation_funcs, target):
    predict = feed_forward_batch(inputs, layers, activation_funcs)
    return np.sum(-target * np.log(predict)) # NOT THE CORRECT COST FUNCTION

def feed_forward_saver_batch(inputs, layers, activation_funcs):
    layer_inputs = []
    zs = []
    a = inputs
    for (W, b), activation_func in zip(layers, activation_funcs):
        layer_inputs.append(a)
        z = a @ W + b
        a = activation_func(z)

        zs.append(z)

    return layer_inputs, zs, a

In [18]:
def backpropagation_batch(
    input, layers, activation_funcs, target, activation_ders, cost_der=mse_der
):
    layer_inputs, zs, predict = feed_forward_saver_batch(input, layers, activation_funcs)

    layer_grads = [() for layer in layers]

    # We loop over the layers, from the last to the first
    for i in reversed(range(len(layers))):
        layer_input, z, activation_der = layer_inputs[i], zs[i], activation_ders[i]

        if i == len(layers) - 1:
            # For last layer we use cost derivative as dC_da(L) can be computed directly
            dC_da = cost_der(predict, target)
        else:
            # For other layers we build on previous z derivative, as dC_da(i) = dC_dz(i+1) * dz(i+1)_da(i)
            (W, b) = layers[i + 1]
            dC_da = dC_dz @ W

        print(dC_da)
        print(activation_der(z))
        dC_dz = dC_da @ activation_der(z)
        dC_dW = dC_dz @ np.tensordot(np.eye(len(z)), layer_input, axes=0)
        dC_db = dC_dz # deriv wrt b is 1

        layer_grads[i] = (dC_dW, dC_db)

    return layer_grads

In [19]:
np.seed(8)

inputs = np.random.rand(10, 2)
network_input_size = 2
layer_output_sizes = [3, 4]
activation_funcs = [sigmoid, ReLU]
activation_ders = [sigmoid_der, ReLU_der]

layers = create_layers_batch(network_input_size, layer_output_sizes)

x = np.random.rand(network_input_size)
target = np.random.rand(4)

layer_grads = backpropagation_batch(inputs, layers, activation_funcs, target, activation_ders)
print(layer_grads)

cost_grad = grad(cost, 0)
cost_grad(layers, x, [sigmoid, ReLU], target)

AttributeError: module 'autograd.numpy' has no attribute 'seed'