# Second assignment

Implement the following features in the code of the NumPy-based neural network described in the exercises:

1. Momentum
2. L1 and L2 regularization

Put the new parts of the source code between the comments “# HF2 start procedure” and “# HF2 end procedure” (procedure = momentum, l1reg, l2reg) and write comments about exactly what you did and why.

Try to keep the code as short as possible!

I wrote the following source code based on the relevant exercise. The modifications are explained in the comments.

In [1]:
import numpy as np
from sklearn import preprocessing


def activation(x):
    """Return the logistic sigmoid 1 / (1 + e^(-x)) of *x*, element-wise."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator


def dactivation(x):
    """Return the derivative of the logistic sigmoid at *x*, element-wise.

    The derivative e^(-x) / (1 + e^(-x))^2 is an even function of x, so it is
    evaluated at |x|. That keeps the exponent non-positive: for strongly
    negative x the direct formula overflows to inf / inf = nan, whereas this
    form simply underflows to 0.
    """
    z = np.exp(-np.abs(x))
    return z / ((1 + z) ** 2)


class MLP:
    """Fully connected feed-forward network with sigmoid units, trained sample by sample.

    The constructor takes the layer sizes, e.g. ``MLP(2, 10, 1)``. Only the
    input layer carries one extra unit that stays at 1 and acts as a bias;
    hidden and output layers have no bias input.
    """

    def __init__(self, *args):
        """Allocate layers and weights for the given layer sizes and randomize the weights.

        Note: this seeds NumPy's global random generator with 123, so the
        initial weights are the same for every instance.
        """
        np.random.seed(123)
        self.shape = args
        # Activations per layer; the input layer gets one extra slot that stays 1 (bias).
        self.layers = [np.ones(self.shape[0] + 1)]
        self.layers.extend(np.ones(size) for size in self.shape[1:])
        # weights[i] connects layers[i] (rows) to layers[i + 1] (columns).
        self.weights = [np.zeros((src.size, dst.size))
                        for src, dst in zip(self.layers[:-1], self.layers[1:])]
        # Previous weight change per weight matrix; read by the momentum term.
        self.dw = [np.zeros_like(w) for w in self.weights]
        self.reset()

    def reset(self):
        """Redraw all weights uniformly from [-1, 1) and clear the momentum history."""
        for i in range(len(self.weights)):
            Z = np.random.random((self.layers[i].size, self.layers[i + 1].size))
            self.weights[i][...] = 2 * Z - 1
        # Without this, the first update after a reset would still be pushed
        # along by the last weight change of the previous training run.
        self.dw = [np.zeros_like(w) for w in self.weights]

    def propagate_forward(self, data):
        """Feed *data* through the network and return the output layer's activations.

        The returned array is the network's own output buffer; it is
        overwritten by the next forward pass.
        """
        # The last input slot is left untouched so it keeps its constant 1 (bias).
        self.layers[0][0:-1] = data

        for i in range(1, len(self.shape)):
            self.layers[i][...] = activation(np.dot(self.layers[i - 1], self.weights[i - 1]))

        return self.layers[-1]

    def propagate_backward(self, target, lrate=0.1, alpha=0.5, lambda1=0.0, lambda2=0.00001):
        """Backpropagate the error of the last forward pass and update the weights.

        Must be called after ``propagate_forward`` for the same sample.

        Parameters:
            target:  desired output for the sample of the last forward pass.
            lrate:   learning rate.
            alpha:   momentum coefficient (0 disables momentum).
            lambda1: L1 regularization strength (0 disables L1).
            lambda2: L2 regularization strength (0 disables L2).

        The defaults give momentum plus L2 regularization.

        Returns the summed squared output error plus the enabled
        regularization penalties, the latter evaluated on the updated weights.
        """
        deltas = []
        error = - (target - self.layers[-1])
        delta = np.multiply(error, dactivation(np.dot(self.layers[-2], self.weights[-1])))
        deltas.append(delta)
        # Walk backwards through the hidden layers, prepending each layer's delta.
        for i in range(len(self.shape) - 2, 0, -1):
            pre_activation = np.dot(self.layers[i - 1], self.weights[i - 1])
            delta = np.dot(deltas[0], self.weights[i].T) * dactivation(pre_activation)
            deltas.insert(0, delta)
        for i in range(len(self.weights)):
            layer = np.atleast_2d(self.layers[i])
            delta = np.atleast_2d(deltas[i])
            dw = -lrate * np.dot(layer.T, delta)
            # HF2 modification start
            # Each helper works on a single weight matrix, hence the index 'i'.

            # Momentum is ADDED to dw (as in the lecture slides downloaded on
            # 10th October). Subtracting it, as in the more recent slides, made
            # convergence slower and the resulting model worse when tried out.
            dw += self.momentum(i, alpha)

            # The L1 / L2 terms are SUBTRACTED from dw, as in the lecture slides.
            if lambda1:
                dw -= self.l1reg(lrate, i, lambda1)
            if lambda2:
                dw -= self.l2reg(lrate, i, lambda2)
            self.weights[i] += dw
            # Remember this step so the next call can apply momentum to it.
            self.dw[i] = dw

        # With L1 / L2 regularization the cost function gains a penalty term,
        # so it is added to the plain squared-error value here.
        ret = (error ** 2).sum()
        if lambda1:
            ret += self.l1cost(lambda1)
        if lambda2:
            ret += self.l2cost(lambda2)
        # HF2 modification end
        return ret

    # HF2 start momentum
    # Momentum term for one weight matrix: alpha times the previous weight
    # change stored in self.dw. The formula and the default alpha = 0.5 follow
    # the lecture slides.
    def momentum(self, current_index, alpha=0.5):
        """Return ``alpha`` times the previous weight change of matrix *current_index*."""
        return alpha * self.dw[current_index]

    # HF2 end momentum
    # HF2 start l1reg
    # L1 weight-update term for one weight matrix. The formula and the default
    # lambda1 = 0.00001 follow the lecture slides.
    def l1reg(self, lrate, current_index, lambda1=0.00001):
        """Return ``lrate * lambda1 * sign(w)`` for weight matrix *current_index*."""
        return lrate * lambda1 * np.sign(self.weights[current_index])

    # L1 penalty added to the cost returned by propagate_backward().
    def l1cost(self, lambda1=0.00001):
        """Return ``lambda1`` times the sum of absolute values of all weights."""
        return lambda1 * sum(np.absolute(w).sum() for w in self.weights)

    # HF2 end l1reg
    # HF2 start l2reg
    # L2 weight-update term for one weight matrix. The formula and the default
    # lambda2 = 0.00001 follow the lecture slides.
    def l2reg(self, lrate, current_index, lambda2=0.00001):
        """Return ``lrate * lambda2 * w`` for weight matrix *current_index*."""
        return lrate * lambda2 * self.weights[current_index]

    # L2 penalty added to the cost returned by propagate_backward().
    def l2cost(self, lambda2=0.00001):
        """Return ``0.5 * lambda2`` times the sum of squares of all weights."""
        return 0.5 * lambda2 * sum(np.power(w, 2).sum() for w in self.weights)

    # HF2 end l2reg


def learn(network, X, Y, valid_split, test_split, epochs=20, lrate=0.1):
    """Train *network* on (X, Y), report validation error per epoch, then evaluate on a test set.

    The samples are split in order into train / validation / test parts; the
    last ``test_split`` fraction is the test set and the ``valid_split``
    fraction before it is the validation set. Inputs are standardized with
    statistics fitted on the training part only, and the training part is
    shuffled once before the first epoch.

    Parameters:
        network:     object providing ``propagate_forward(x)`` and
                     ``propagate_backward(y, lrate)``; its first output value
                     is compared against the scalar target.
        X, Y:        input samples and their targets, indexable by sample.
        valid_split: fraction of the samples used for validation.
        test_split:  fraction of the samples used for testing.
        epochs:      number of passes over the training set.
        lrate:       learning rate handed to ``propagate_backward``.

    Returns the mean squared error on the test set.
    """
    # Derive the sample count from the data instead of relying on a global.
    nb_samples = len(X)
    train_end = int(nb_samples * (1 - valid_split - test_split))
    valid_end = int(nb_samples * (1 - test_split))
    X_train, Y_train = X[:train_end], Y[:train_end]
    X_valid, Y_valid = X[train_end:valid_end], Y[train_end:valid_end]
    X_test, Y_test = X[valid_end:], Y[valid_end:]

    # Fit the scaler on the training data only so no information from the
    # validation / test samples leaks into the preprocessing.
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_valid = scaler.transform(X_valid)
    X_test = scaler.transform(X_test)

    randperm = np.random.permutation(len(X_train))
    X_train, Y_train = X_train[randperm], Y_train[randperm]

    for epoch in range(epochs):
        train_err = 0
        for x, y in zip(X_train, Y_train):
            network.propagate_forward(x)
            train_err += network.propagate_backward(y, lrate)
        train_err /= X_train.shape[0]

        valid_err = 0
        for x, y in zip(X_valid, Y_valid):
            # Take the scalar out of the output array explicitly; implicit
            # conversion of a 1-element array to a scalar is deprecated in NumPy.
            output = network.propagate_forward(x)[0]
            valid_err += (output - y) ** 2
        valid_err /= X_valid.shape[0]

        print("%d epoch, train_err: %.4f, valid_err: %.4f" % (epoch, train_err, valid_err))

    print("\n--- TESTING ---\n")
    test_err = 0
    for k in range(X_test.shape[0]):
        output = network.propagate_forward(X_test[k])[0]
        test_err += (output - Y_test[k]) ** 2
        print(k, X_test[k], '%.2f' % output, ' (required result: %.2f)' % Y_test[k])
    test_err /= X_test.shape[0]
    print("\ntest_err: %.4f" % test_err)
    return test_err


if __name__ == "__main__":
    nb_samples = 1000

    # Created before the data so that the seed set in MLP.__init__ also
    # governs the noise drawn below.
    network = MLP(2, 10, 1)

    # XOR-like problem: four noisy clusters around (+-2, +-2); the two
    # clusters whose coordinates have opposite signs are labelled 1.
    centers = np.array([(-2, -2), (2, -2), (-2, 2), (2, 2)])
    labels = (0, 1, 1, 0)

    X = np.zeros((nb_samples, 2))
    Y = np.zeros(nb_samples)
    for first in range(0, nb_samples, 4):
        # Eight draws per group: one (x, y) noise pair for each of the four clusters.
        noise = np.random.normal(0, 1, 8).reshape(4, 2)
        X[first:first + 4] = centers + noise
        Y[first:first + 4] = labels

    network.reset()
    learn(network, X, Y, 0.2, 0.1)

0 epoch, train_err: 0.2416, valid_err: 0.2196
1 epoch, train_err: 0.1955, valid_err: 0.1593
2 epoch, train_err: 0.1291, valid_err: 0.1043
3 epoch, train_err: 0.0862, valid_err: 0.0788
4 epoch, train_err: 0.0666, valid_err: 0.0667
5 epoch, train_err: 0.0567, valid_err: 0.0600
6 epoch, train_err: 0.0510, valid_err: 0.0556
7 epoch, train_err: 0.0473, valid_err: 0.0525
8 epoch, train_err: 0.0447, valid_err: 0.0503
9 epoch, train_err: 0.0427, valid_err: 0.0485
10 epoch, train_err: 0.0412, valid_err: 0.0471
11 epoch, train_err: 0.0400, valid_err: 0.0460
12 epoch, train_err: 0.0390, valid_err: 0.0450
13 epoch, train_err: 0.0382, valid_err: 0.0442
14 epoch, train_err: 0.0375, valid_err: 0.0435
15 epoch, train_err: 0.0368, valid_err: 0.0430
16 epoch, train_err: 0.0363, valid_err: 0.0424
17 epoch, train_err: 0.0359, valid_err: 0.0420
18 epoch, train_err: 0.0355, valid_err: 0.0416
19 epoch, train_err: 0.0351, valid_err: 0.0413

--- TESTING ---

0 [-0.49551261 -0.7444607 ] 0.13  (required result: 