### Part B and C

In [9]:
"This notebook consist of the own Feed Fordward Neural Network. This code dervies from the weekly exercise from week 42, and lecture notes from week 38, 39, 40 and 41."


# Imports the necessary libraries
import autograd.numpy as np
from autograd import grad, elementwise_grad
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold

np.random.seed(20)

In [10]:
# defining activations functions, and their derivate. 
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def sigmoid_der(z):
    s = sigmoid(z)
    return s * (1 - s)

def ReLU(z):
    return np.where(z > 0, z, 0)

def ReLU_der(z):
    return np.where(z > 0, 1, 0)

def leaky_ReLU(z, alpha=0.01):
    return np.where(z >= 0, z, alpha * z)

def leaky_ReLU_der(z, alpha=0.01):
    return np.where(z >= 0, 1, alpha)

# Defining the MSE
def mse(predict, target):
    return np.mean((predict - target) ** 2)

# Defining the R2 score
def R2(y_data, y_model):
    return 1 - np.sum((y_data - y_model)**2) / np.sum((y_data - np.mean(y_data))**2)  #Lecture notes week 35

In [11]:
# Create layers for batched input
def create_layers_batch(network_input_size, layer_output_sizes):
    layers = []

    # Initialize input size for the first layer
    i_size = network_input_size
    # Iterate through each specified layer output size
    for layer_output_size in layer_output_sizes:
        # Initialize weights (W) with small random values and biases (b) with random values
        W = np.random.randn(i_size, layer_output_size)
        #W = np.random.randn(i_size, layer_output_size) * np.sqrt(2 / (i_size + layer_output_size)) #Xavier-initialising
        b = np.random.randn(layer_output_size)
        #b = np.zeros(layer_output_size)
        # Append the layer (weights and biases) to the layers list
        layers.append((W, b))

        # Update the input size for the next layer
        i_size = layer_output_size
    return layers

# Create Feed-forward function for batched input
def feed_forward_batch(inputs, layers, activation_funcs):
    a = inputs 
    # Iterate through each layer and activation function
    for (W, b), activation_func in zip(layers, activation_funcs):
        z = np.dot(a, W) + b
        a = activation_func(z) # Applying the activation functions
    return a

# Calculate the cost using Mean Squared Error (MSE)
def cost_function(layers, inputs, activation_funcs, target):
    predict = feed_forward_batch(inputs, layers, activation_funcs)
    return mse(predict, target)

# Saves the values from the feed-forward function. To be used in the backpropagation. 
def feed_forward_saver_batch(input_batch, layers, activation_funcs):
    layer_inputs = []
    zs = []
    a = input_batch
    for (W, b), activation_func in zip(layers, activation_funcs):
        layer_inputs.append(a)
        z = np.dot(a, W) + b
        a = activation_func(z)
        zs.append(z)

    return layer_inputs, zs, a


# Backpropagation for batched input
def backpropagation_batch(
    input_batch, layers, activation_funcs, targets, activation_ders):
    layer_inputs, zs, predict = feed_forward_saver_batch(input_batch, layers, activation_funcs)
    
    batch_size = input_batch.shape[0]  # Size of the batch
    layer_grads = []  # List to store the gradients in

    error = predict - targets # Iniital differance between predictions and targets

    # Iterate backwards through each layer
    for i in reversed(range(len(layers))):
        layer_input, z = layer_inputs[i], zs[i]

        # Applies the derivate of the activation function except for the last layer
        if i !=len(layers)-1: #To ecxlude the last layer
            error = error * activation_ders[i](z)

        # Calculates the gradients for the weights and biases
        dW = np.dot(layer_input.T, error)/ batch_size
        db = np.mean(error, axis=0)

        # Inserts the gradients
        layer_grads.insert(0, (dW,db))

        # Calculate the error for the next layer
        if i > 0:
            error = np.dot(error, layers[i][0].T)

    return layer_grads


In [None]:
"The Adam optimzer presented the best result in part A, and is therefore used in this section."

# Function to train the network using Adam optimizer
def train_network_adam(inputs, layers, activation_funcs, targets, activation_ders, learning_rate=0.001, epochs=20, batch_size=16):
    n = len(inputs)  # Number of training samples

    # Adam parameters that are set manually
    beta1 = 0.9
    beta2 = 0.999
    delta = 1e-8

    # Initialize moments for Adam
    first_moment_W = [np.zeros_like(W) for W, b in layers]
    second_moment_W = [np.zeros_like(W) for W, b in layers]
    first_moment_b = [np.zeros_like(b) for W, b in layers]
    second_moment_b = [np.zeros_like(b) for W, b in layers]

    # Iterate through epochs
    for epoch in range(epochs):
        for i in range(n):
            random_index = np.random.randint(n)
            batch_inputs = inputs[random_index:random_index + batch_size]
            batch_targets = targets[random_index:random_index + batch_size]

            # Calculate the gradients using backpropagation
            layer_grads = backpropagation_batch(batch_inputs, layers, activation_funcs, batch_targets, activation_ders)


            # Update the weights and biases for each layer using Adam optimizer
            for j, ((W, b), (dW, db)) in enumerate(zip(layers, layer_grads)):
                # Update first moment estimate for weights
                first_moment_W[j] = beta1 * first_moment_W[j] + (1 - beta1) * dW
                second_moment_W[j] = beta2 * second_moment_W[j] + (1 - beta2) * (dW ** 2)

                # Correct bias in first and second moment estimates for weights
                m_hat_w = first_moment_W[j] / (1 - beta1 ** (epoch + 1))
                v_hat_w = second_moment_W[j] / (1 - beta2 ** (epoch + 1))

                # Update the weights
                W_update = learning_rate * m_hat_w / (delta + np.sqrt(v_hat_w))
                layers[j] = (W - W_update, b)

                # Update the first moment estimate for biases
                first_moment_b[j] = beta1 * first_moment_b[j] + (1 - beta1) * db
                second_moment_b[j] = beta2 * second_moment_b[j] + (1 - beta2) * (db ** 2)

                # Correct the bias in first and second moment estimates for biases
                m_hat_b = first_moment_b[j] / (1 - beta1 ** (epoch + 1))
                v_hat_b = second_moment_b[j] / (1 - beta2 ** (epoch + 1))

                # Update the biases
                b_update = learning_rate * m_hat_b / (delta + np.sqrt(v_hat_b))
                layers[j] = (layers[j][0], b - b_update)

    return layers


In [56]:
# Function to make predictions using the trained network
def predict(inputs, layers, activation_funcs):
    return feed_forward_batch(inputs, layers, activation_funcs)

In [None]:
from sklearn.model_selection import train_test_split

# Generate data
x_train = np.linspace(-10,10,100).reshape(-1, 1)
y_train = (2.0 + 5 * x_train + 0.1 * x_train**2).reshape(-1,1)


# Scales the data using StandardScaler from Scikit-learn
scaler_y = StandardScaler()
scaler_x= StandardScaler()

# Standardiser x_train og y_train
x_train_scaled = scaler_x.fit_transform(x_train)
y_train_scaled = scaler_y.fit_transform(y_train)

# Splits the dataset into test and training
x_train_split, x_test_split, y_train_split, y_test_split = train_test_split(x_train_scaled, y_train_scaled, test_size=0.2, random_state=20)


# Defines the in- and output-size of the network
network_input_size = 1 
layer_output_sizes = [10,10, 1] #Two hidden layers with 10 nodes. 

# Creates the layers with the specified structure
layers = create_layers_batch(network_input_size, layer_output_sizes)

Sigmoid Activation 

In [156]:
np.random.seed(20)
layers = create_layers_batch(network_input_size, layer_output_sizes)
# Sets the sigmoid funciton for the hidden layers

activation_funcs = [sigmoid, sigmoid, lambda x:x]
activation_ders = [sigmoid_der, sigmoid_der, lambda x: np.ones_like(x)]

# Train the network, defines learning rate, batch size and epochs. 
train_network_adam(x_train_split, layers, activation_funcs, y_train_split, activation_ders, learning_rate=0.001,epochs=100, batch_size=16)

# Makes predictions using the trained network 
pred_scaled = predict(x_test_split, layers, activation_funcs)
# Invers the predicitons to make them the original scale
pred = scaler_y.inverse_transform(pred_scaled)
# Inverse the train split data to make them the original scale
y_test_inverse = scaler_y.inverse_transform(y_test_split)

#Calculates the MSE for the sigmoid function
mse_sig= mse(y_test_inverse, pred)
print( f"MSE:{mse_sig:.4f}")
r2_sig = R2(y_test_inverse,pred)
print( f"R2:{r2_sig:.4f}")

MSE:4.6478
R2:0.9949


ReLU activation function

In [126]:
np.random.seed(20)
# Re-initialize the network layers before testing with ReLU activation function
layers = create_layers_batch(network_input_size, layer_output_sizes)

# Sets the ReLU funciton for the hidden layers
activation_funcs = [ReLU, ReLU, lambda x:x]
activation_ders = [ReLU_der, ReLU_der, lambda x: np.ones_like(x)]

# Train the network, defines learning rate, batch size and epochs. 
train_network_adam(x_train_split, layers, activation_funcs, y_train_split, activation_ders, learning_rate=0.001,epochs=100, batch_size=16)

# Makes predictions using the trained network 
pred_scaled = predict(x_test_split, layers, activation_funcs)
# Inverstransforms the predicitons to make them the original scale
pred = scaler_y.inverse_transform(pred_scaled)
# Inverse the train split data to make them the original scale
y_test_inverse = scaler_y.inverse_transform(y_test_split)

#Calculates the MSE for the ReLU function
mse_ReLU= mse(y_test_inverse, pred)
print( f"MSE:{mse_ReLU:.4f}")

r2_ReLU = R2(y_test_inverse, pred)
print( f"R2:{r2_ReLU:.4f}")

MSE:1.4478
R2:0.9984


Leaky-ReLU activation function

In [127]:
np.random.seed(20)

# Re-initialize the network layers before testing with ReLU activation function
layers = create_layers_batch(network_input_size, layer_output_sizes)

# Sets the leaky-ReLU funciton for the hidden layers
activation_funcs = [leaky_ReLU, leaky_ReLU, lambda x:x]
activation_ders = [leaky_ReLU_der, leaky_ReLU_der, lambda x: np.ones_like(x)]

# Train the network, defines learning rate, batch size and epochs. 
train_network_adam(x_train_split, layers, activation_funcs, y_train_split, activation_ders, learning_rate=0.001,epochs=100, batch_size=16)

# Makes predictions using the trained network 
pred_scaled = predict(x_test_split, layers, activation_funcs)
# Inverstransforms the predicitons to make them the original scale
pred = scaler_y.inverse_transform(pred_scaled)
# Inverse the train split data to make them the original scale
y_test_inverse = scaler_y.inverse_transform(y_test_split)

#Calculates the MSE for the sigmoid function
mse_leakyReLU= mse(y_test_inverse, pred)
print( f"MSE:{mse_leakyReLU:.4f}")

r2_LeakyReLU = R2(y_test_inverse, pred)
print( f"R2:{r2_LeakyReLU:.4f}")

MSE:2.0979
R2:0.9977


Comparing the results with MLP-regressor from Scikit-learn

In [149]:
from sklearn.model_selection import train_test_split

# Generate data
x_train = np.linspace(-10,10,100).reshape(-1, 1)
y_train = (2.0 + 5 * x_train + 0.1 * x_train**2).reshape(-1,1)


# Scales the data using StandardScaler from Scikit-learn
scaler_y = StandardScaler()
scaler_x= StandardScaler()

# Standardiser x_train og y_train
x_train_scaled = scaler_x.fit_transform(x_train)
y_train_scaled = scaler_y.fit_transform(y_train)


x_train_split, x_test_split, y_train_split, y_test_split = train_test_split(x_train_scaled, y_train_scaled, test_size=0.2, random_state=20)


# Defines the in- and output-size of the network
network_input_size = 1 
layer_output_sizes = [10,10, 1] #Two hidden layers with 10 nodes. 

# Creates the layers with the specified structure
layers = create_layers_batch(network_input_size, layer_output_sizes)

In [148]:
# Comparing with mlp-regressor from Skicit-learn 
np.random.seed(20)

from sklearn.neural_network import MLPRegressor

mlp_regressor = MLPRegressor(hidden_layer_sizes=(10,10), activation="logistic", solver="adam", learning_rate_init=0.001, max_iter=500, tol=1e-6, random_state=20, early_stopping=True)
mlp_regressor.fit(x_train_split, y_train_split.ravel())
mlp_pred_scaled = mlp_regressor.predict(x_test_split).reshape(-1,1)
mlp_pred = scaler_y.inverse_transform(mlp_pred_scaled)
y_test_inverse = scaler_y.inverse_transform(y_test_split)

mse_mlp = mse(y_test_inverse, mlp_pred)
print( f"MSE using Scikit-learn:{mse_mlp:.4f}")
print( f"MSE own NN using Sigmoid activation function:{mse_sig:.4f}")

r2_mlp = R2(y_test_inverse, mlp_pred)
print(f"R² using Scikit-learn: {r2_mlp:.4f}")
print(f"R² own NN using Sigmoid activation function: {r2_sig:.4f}")

MSE using Scikit-learn:4.4887
MSE own NN using Sigmoid activation function:4.6478
R² using Scikit-learn: 0.9950
R² own NN using Sigmoid activation function: 0.9949


In [152]:
from sklearn.model_selection import train_test_split

# Generate data
x_train = np.linspace(-10,10,100).reshape(-1, 1)
y_train = (2.0 + 5 * x_train + 0.1 * x_train**2).reshape(-1,1)


# Scales the data using StandardScaler from Scikit-learn
scaler_y = StandardScaler()
scaler_x= StandardScaler()

# Standardiser x_train og y_train
x_train_scaled = scaler_x.fit_transform(x_train)
y_train_scaled = scaler_y.fit_transform(y_train)


x_train_split, x_test_split, y_train_split, y_test_split = train_test_split(x_train_scaled, y_train_scaled, test_size=0.2, random_state=20)


# Defines the in- and output-size of the network
network_input_size = 1 
layer_output_sizes = [10,10, 1] #Two hidden layers with 10 nodes. 

# Creates the layers with the specified structure
layers = create_layers_batch(network_input_size, layer_output_sizes)


mlp_regressor = MLPRegressor(hidden_layer_sizes=(10,10), activation="relu", solver="adam", learning_rate_init=0.001, max_iter=600, tol=1e-6, random_state=20, early_stopping=True)
mlp_regressor.fit(x_train_split, y_train_split.ravel())
mlp_pred_scaled = mlp_regressor.predict(x_test_split)  #Predict on the test data
mlp_pred = scaler_y.inverse_transform(mlp_pred_scaled.reshape(-1,1))
y_test_inverse = scaler_y.inverse_transform(y_test_split)

mse_mlp = mse(y_test_inverse, mlp_pred)
print( f"MSE using Scikit-learn:{mse_mlp:.4f}")
print( f"MSE own NN using ReLU activation function:{mse_ReLU:.4f}")

r2_mlp = R2(y_test_inverse, mlp_pred)
print(f"R² using Scikit-learn: {r2_mlp:.4f}")
print(f"R² own NN using ReLU activation function: {r2_ReLU:.4f}")

MSE using Scikit-learn:3.4037
MSE own NN using ReLU activation function:1.4478
R² using Scikit-learn: 0.9962
R² own NN using ReLU activation function: 0.9984
