# Machine Learning Essentials SS25 - Exercise Sheet 4

## Instructions
- `TODO`s indicate where you need to complete the implementations.
- You may use external resources, but <b>write your own solutions</b>.
- Provide concise, but comprehensible comments to explain what your code does.
- Code that's unnecessarily extensive and/or not well commented will not be scored.

## Exercise 2

In [259]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
from matplotlib.colors import ListedColormap

### Task 2

In [None]:
# TODO: Define the needed helper functions
def tanh_prime(x_activated):
    """
    Derivative of tanh, expressed through the already-activated value.

    Args:
        x_activated (np.ndarray or float): a = tanh(z), i.e. the *output* of tanh, not its input z.
    Returns:
        1 - a**2 (which equals d/dz tanh(z)), with the same shape as x_activated.
    """
    # Working on the activation lets backprop reuse the value cached in the forward pass
    # instead of evaluating tanh a second time.
    return 1.0 - np.square(x_activated)

def sigmoid(x):
    """
    Logistic sigmoid 1 / (1 + exp(-x)), applied element-wise.

    Args:
        x (np.ndarray or float): Pre-activation value(s).
    Returns:
        Value(s) in [0, 1] with the same shape as x.
    """
    # sigmoid(x) = exp(-log(1 + exp(-x))). np.logaddexp(0, -x) evaluates log(1 + exp(-x))
    # without overflowing, so large |x| does not produce inf/nan or overflow warnings
    # (the naive 1 / (1 + np.exp(-x)) overflows for strongly negative x).
    return np.exp(-np.logaddexp(0.0, -np.asarray(x, dtype=float)))

def binary_cross_entropy(y_hat, y):
    """
    Computes the BCE loss, averaged over samples.

    Args:
        y_hat (np.ndarray): Predicted probabilities in [0, 1], one per sample (any shape, e.g. (1, N)).
        y (np.ndarray): Binary targets (0 or 1), one per sample (any shape, e.g. (N,) or (1, N)).
    Returns:
        float: Mean of -[y * log(y_hat) + (1 - y) * log(1 - y_hat)] over all samples.
    """
    eps = 1e-12
    # Flatten both inputs so that e.g. (1, N) predictions and (N, 1) labels are compared
    # element by element instead of silently broadcasting to an (N, N) matrix.
    # Clipping keeps the predictions away from exactly 0 and 1, so log() never sees 0.
    probs = np.clip(np.asarray(y_hat, dtype=float).ravel(), eps, 1.0 - eps)
    targets = np.asarray(y, dtype=float).ravel()
    per_sample = targets * np.log(probs) + (1.0 - targets) * np.log(1.0 - probs)
    return float(-np.mean(per_sample))

In [None]:
# MLP Class
class MLP:
    """
    Two-layer perceptron (input -> tanh hidden layer -> sigmoid output) trained with gradient descent.

    Convention: samples are stored column-wise, i.e. inputs have shape (d0, n_samples) and the
    network output has shape (d2, n_samples). This follows from the weight shapes (d_out, d_in)
    and the bias shapes (d_out, 1) used below.
    """

    def __init__(self, layer_dims, initialization_scale=1):
        """
        Initializes the multi-layer perceptron.
        Args:
            layer_dims (list of int): List containing the number of neurons in each layer.
                                      [d0, d1, d2] = [2, 10, 1] for the exercise.
                                      d0: input dimension
                                      d1: hidden layer dimension
                                      d2: output dimension
            initialization_scale (float): Scaling factor for weight initialization (i.e. standard deviation of the normal distribution)
        """
        self.parameters = {}
        self.num_layers = len(layer_dims)

        # Weights are drawn from a normal distribution with standard deviation initialization_scale, biases are initialized as zero.
        # For more complex networks, one usually uses techniques like Xavier or He initialization. Play around with the initialization_scale parameter to see how it affects the training!
        # Layer 0 -> Layer 1
        self.parameters['W0'] = np.random.randn(layer_dims[1], layer_dims[0]) * initialization_scale
        self.parameters['b0'] = np.zeros((layer_dims[1], 1))
        # Layer 1 -> Layer 2
        self.parameters['W1'] = np.random.randn(layer_dims[2], layer_dims[1]) * initialization_scale
        self.parameters['b1'] = np.zeros((layer_dims[2], 1))

        self.cache = {}  # For storing intermediate values (needed for backprop)

    def forward(self, X_batch):
        """
        Forward pass through the network. Stores intermediate values in self.cache for the backward pass.
        Args:
            X_batch (np.ndarray): Inputs of shape (d0, n_samples); a single sample of shape (d0,) is also accepted.
        Returns:
            np.ndarray: Predicted probabilities y_hat of shape (d2, n_samples).
        """
        X = np.asarray(X_batch, dtype=float)
        if X.ndim == 1:
            # Treat a flat vector as one sample; otherwise adding the (d1, 1) bias would broadcast to a matrix.
            X = X.reshape(-1, 1)

        # Hidden layer: affine map followed by tanh. The (d1, 1) bias broadcasts over the sample axis.
        Z1 = self.parameters['W0'] @ X + self.parameters['b0']
        A1 = np.tanh(Z1)

        # Output layer: affine map followed by a sigmoid, written as exp(-log(1 + exp(-z)))
        # so that large |z| cannot overflow.
        Z2 = self.parameters['W1'] @ A1 + self.parameters['b1']
        y_hat = np.exp(-np.logaddexp(0.0, -Z2))

        # Backprop only needs the input, the hidden activation and the prediction.
        self.cache = {'X': X, 'A1': A1, 'y_hat': y_hat}
        return y_hat

    def backward(self, Y_batch):
        """
        Performs the backward pass (= backpropagation) to compute gradients of the mean BCE loss with respect to the parameters.
        Gradients are returned in the grads dictionary (see update_params method).
        Args:
            Y_batch (np.ndarray): Binary targets with one entry per prediction, e.g. shape (n_samples,) or (1, n_samples).
        Returns:
            dict: Gradients 'dW0', 'db0', 'dW1', 'db1', each shaped like the corresponding parameter.
        Raises:
            RuntimeError: If forward() has not been called beforehand.
        """
        if not self.cache:
            raise RuntimeError("forward() must be called before backward().")

        X = self.cache['X']
        A1 = self.cache['A1']
        y_hat = self.cache['y_hat']
        n_samples = X.shape[1]

        # Bring the labels into the layout of the predictions so the subtraction below is element-wise.
        Y = np.asarray(Y_batch, dtype=float).reshape(y_hat.shape)

        # For sigmoid + BCE the derivative w.r.t. the output pre-activation collapses to (y_hat - y);
        # the 1/n factor comes from averaging the loss over the samples.
        dZ2 = (y_hat - Y) / n_samples
        dW1 = dZ2 @ A1.T
        db1 = np.sum(dZ2, axis=1, keepdims=True)

        # Propagate through the hidden layer; tanh'(z) = 1 - tanh(z)^2 = 1 - A1^2.
        dA1 = self.parameters['W1'].T @ dZ2
        dZ1 = dA1 * (1.0 - np.square(A1))
        dW0 = dZ1 @ X.T
        db0 = np.sum(dZ1, axis=1, keepdims=True)

        grads = {'dW0': dW0, 'db0': db0, 'dW1': dW1, 'db1': db1}
        return grads

    def update_params(self, grads, learning_rate):
        """
        Updates the parameters using gradient descent.
        Args:
            grads (dict): Dictionary of gradients.
            learning_rate (float): The learning rate.
        """
        self.parameters['W0'] -= learning_rate * grads['dW0']
        self.parameters['b0'] -= learning_rate * grads['db0']
        self.parameters['W1'] -= learning_rate * grads['dW1']
        self.parameters['b1'] -= learning_rate * grads['db1']

### Task 3
TODO: Explain why using vectorized operations is generally preferred in ML.

### Task 4

In [None]:
# Data loading and preprocessing (predefined)
X, y = make_moons(n_samples=500, noise=0.2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


# NOTE: Different libraries/languages use different conventions for the shape of the data matrix X, which comes from the way they store data in memory:
#   - In ML textbooks/mathematical notation, X is often (n_features, n_samples), because each sample is a (n_features, 1) column vector and we stack them "horizontally".
#   - This is consistent with some languages (e.g. Julia, Matlab), which store data in column-major order.
#   - However, most ML code (e.g. NumPy, sklearn, Pytorch) is optimized for row-major order, so in code you will most often see data matrices of shape (n_samples, n_features).
#     (The reason for that is that most of these libraries run C/C++/CUDA code under the hood, which is optimized for row-major order)
# --> Juggling the shapes of arrays to be correctly aligned with the used model implementation / convention is a day-to-day task in practical ML and the cause of many bugs, so always double-check the expected format.

# The MLP class stores weights as (d_out, d_in) and biases as (d_out, 1), so it expects samples as columns:
# X must be (n_features, n_samples) and y must be (1, n_samples). sklearn returns (n_samples, n_features) and (n_samples,),
# hence the transpose / reshape below.
X_train, X_test = X_train.T, X_test.T
y_train, y_test = y_train.reshape(1, -1), y_test.reshape(1, -1)
print(f"Shape of X_train: {X_train.shape}") 
print(f"Shape of y_train: {y_train.shape}") 
print(f"Shape of X_test: {X_test.shape}") 
print(f"Shape of y_test: {y_test.shape}")

In [None]:
# Training
# NOTE: X_train.shape[0] is only the input dimension d0 if X_train is laid out as (n_features, n_samples).
layer_dimensions = [X_train.shape[0], 10, 1] # d0, d1, d2 as given in the exercise
mlp = MLP(layer_dimensions) # Initialize the MLP

# Hyperparameters (reasonable starting values for full-batch gradient descent; experiment with them)
learning_rate = 0.5
num_epochs = 5000
print_loss = 500 # To monitor the training process, print the loss every few epochs
train_losses = []

for epoch in range(1, num_epochs + 1):
    # Forward pass
    y_hat_train = mlp.forward(X_train)
    # Compute loss
    train_loss = binary_cross_entropy(y_hat_train, y_train)
    # Backward pass = backprop
    grads = mlp.backward(y_train)
    # Update parameters by gradient descent
    mlp.update_params(grads, learning_rate)

    # Log every print_loss epochs, and always on the final epoch
    if epoch % print_loss == 0 or epoch == num_epochs:
        train_losses.append(train_loss)
        print(f"Epoch {epoch}/{num_epochs} - Training Loss: {train_loss:.4f}")

In [None]:
# Evaluation 
# TODO: Compute the accuracy on the test set and plot the decision boundary over the test set, comment on the performance