# Avoiding overfitting with regularisation
In this experiment, our primary focus is on mitigating the issue of overfitting the training data. In the initial phase, we closely monitor the training process of a compact neural network designed for a binary classification problem on a synthetic dataset. By observing the model's performance metrics, such as loss and accuracy, on the validation dataset, we aim to assess whether the model is exhibiting signs of overfitting the training data.

Subsequently, we implement various regularisation techniques to tackle the problem of overfitting: L1 and L2 regularisation, Dropout and Early Stopping.

In [None]:
# Author: Roberto Doriguzzi-Corin
# Project: Course on Network Intrusion and Anomaly Detection with Machine Learning
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.datasets import make_classification
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.utils import set_random_seed

# Disable GPUs for test reproducibility: hide every GPU from TensorFlow so that
# all computations run on the CPU
tf.config.set_visible_devices([], 'GPU')

# Fix the random seeds so that repeated runs produce the same results
SEED = 0
np.random.seed(SEED)
set_random_seed(SEED)

# Generate synthetic data

SAMPLES = 200
# Generate synthetic data for binary classification: two informative features,
# one cluster per class, and a small fraction (flip_y) of randomly assigned labels
X, y = make_classification(n_samples=SAMPLES, n_features=2, n_informative=2, n_redundant=0, n_clusters_per_class=1, class_sep=1,flip_y=0.03,random_state=SEED)
# Split data into training (70%) and validation (30%) sets
X_train_orig, X_val_orig, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=SEED)


# Add polynomial features (terms of the two inputs up to degree 4, without the constant term)
poly_features = PolynomialFeatures(degree=4, include_bias=False)
X_train = poly_features.fit_transform(X_train_orig)
# Reuse the transformer fitted on the training set: preprocessing must never be
# re-fitted on the validation data
X_val = poly_features.transform(X_val_orig)

In [None]:
def plot_history(history):
    """Show the learning curves recorded during a call to ``fit``.

    Two figures are displayed one after the other: accuracy first, then loss.
    Each figure overlays the training curve and the validation curve.

    Args:
        history: object whose ``history`` attribute maps the keys
            'accuracy', 'val_accuracy', 'loss' and 'val_loss' to
            per-epoch sequences of values.
    """
    curves = history.history

    # (key in the history dict, human-readable name, fixed y-axis range or None)
    panels = (
        ('accuracy', 'Accuracy', None),
        ('loss', 'Loss', [0, 2]),
    )

    for key, name, y_range in panels:
        plt.plot(curves[key], label=f'Training {name}')
        plt.plot(curves[f'val_{key}'], label=f'Validation {name}')
        plt.xlabel('Epochs')
        plt.ylabel(name)
        if y_range is not None:
            # Only the loss figure uses a fixed y-axis range
            plt.ylim(y_range)
        plt.legend()
        plt.title(f'Training and Validation {name}')
        plt.show()

In [None]:
def evaluate_model(model):
    """Print the loss and accuracy of ``model`` on both data splits.

    The model is evaluated first on the module-level training set
    (``X_train``, ``y_train``) and then on the validation set
    (``X_val``, ``y_val``).

    Args:
        model: object whose ``evaluate(features, labels)`` method returns a
            ``(loss, accuracy)`` pair.
    """
    def _report(split_name, features, labels):
        # Evaluate on one split and print both metrics with four decimals
        split_loss, split_accuracy = model.evaluate(features, labels)
        print(f'{split_name} Loss: {split_loss:.4f}')
        print(f'{split_name} Accuracy: {split_accuracy:.4f}')

    _report('Training', X_train, y_train)
    _report('Validation', X_val, y_val)

# Model definition and training
In the following cell, we define and compile a neural network model. Then we train it for 1000 epochs.

In [None]:
# Create a neural network model using Keras: one hidden layer with 24 ReLU units
# followed by a single sigmoid output unit for binary classification
overfit_model = Sequential()
overfit_model.add(Dense(24, input_shape=(X_train.shape[1],), activation='relu'))
overfit_model.add(Dense(1, activation='sigmoid'))

# Compile the model (`learning_rate` replaces the deprecated `lr` argument,
# which recent Keras versions ignore or reject)
overfit_model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Save the initial weights for later, so that other models with the same
# architecture can start training from the same point
model_initial_weights = overfit_model.get_weights()

# Print a summary of the model architecture
overfit_model.summary()

# Train the model for a fixed number of epochs, tracking the metrics on the validation set
history = overfit_model.fit(X_train, y_train, epochs=1000, validation_data=(X_val, y_val), verbose=1)

plot_history(history)

# Model evaluation
Let's compare the model's performance on training and validation data

In [None]:
# Print loss and accuracy of the model trained without regularisation on the training and validation sets
evaluate_model(overfit_model)

# Early stopping strategy
Instead of setting a fixed number of training epochs, one can implement an early-stopping strategy, which automatically stops the training process when the validation accuracy stops increasing or the validation loss stops decreasing. An important parameter is called **patience**, which represents the number of epochs with no improvement (on the validation set) after which the training will be stopped.

In [None]:
from tensorflow.keras.callbacks import EarlyStopping
PATIENCE = 10 #number of epochs with no improvement on the validation set

# Create a neural network model using Keras (same architecture as the first model:
# 24 ReLU hidden units and one sigmoid output unit)
early_stopping_model = Sequential()
early_stopping_model.add(Dense(24, input_shape=(X_train.shape[1],), activation='relu'))
early_stopping_model.add(Dense(1, activation='sigmoid'))

# Compile the model (`learning_rate` replaces the deprecated `lr` argument,
# which recent Keras versions ignore or reject)
early_stopping_model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Define early stopping criteria (you can monitor either val_loss or val_accuracy).
# With restore_best_weights=True the model is rolled back to the weights of the
# epoch with the best monitored value once training stops.
early_stopping = EarlyStopping(monitor='val_accuracy', patience=PATIENCE, verbose=1, restore_best_weights=True)


# Re-initialize the weights of the model, so that training starts from the
# same point as the first model
early_stopping_model.set_weights(model_initial_weights)
# Retrain the model with early-stopping (1000 is now only an upper bound on the number of epochs)
history = early_stopping_model.fit(X_train, y_train, epochs=1000, validation_data=(X_val, y_val), verbose=1, callbacks=[early_stopping])

plot_history(history)

# Model evaluation
Let's compare the model's performance on training and validation data

In [None]:
# Print loss and accuracy of the early-stopping model on the training and validation sets
evaluate_model(early_stopping_model)

# L1 and L2 regularisation
Instead of stopping the training process early, this time we try to avoid overfitting by applying L1 and L2 regularisation. The key parameter here is \(\lambda\), which controls the strength of the regularisation term.

**L1 regularisation**:  \( J_W(y,\hat{y}) = -\frac{1}{m} \sum_{i=1}^{m} \left( y_i \log(\hat{y}_i) + (1 - y_i) \log(1 - \hat{y}_i) \right) + \lambda \sum_{l=1}^{L} \sum_{j=1}^{n^{(l)}} |w_{j}^{(l)}| \)
**L2 regularisation**:  \( J_W(y,\hat{y}) = -\frac{1}{m} \sum_{i=1}^{m} \left( y_i \log(\hat{y}_i) + (1 - y_i) \log(1 - \hat{y}_i) \right) + \lambda \sum_{l=1}^{L} \sum_{j=1}^{n^{(l)}} (w_{j}^{(l)})^2 \)

- \( m \) is the number of samples in a mini-batch of training samples,
- \( y_i \) represents the true labels (either 0 or 1) of the \( i \)th sample,
- \( \hat{y}_i \) represents the predicted probability of the \( i \)th sample being in class 1,
- \( L \) is the total number of layers in the neural network,
- \( n^{(l)} \) is the total number of weights in layer \( l \),
- \( w_{j}^{(l)} \) represents the individual weights in layer \( l \),
- \( \lambda \) (lambda) is the regularisation parameter, controlling the strength of the L1 or L2 regularisation. A higher value of \( \lambda \) leads to stronger regularisation.

In [None]:
# Import from tensorflow.keras, like the rest of the notebook, to avoid mixing two Keras packages
from tensorflow.keras.regularizers import l1, l2
# Create a neural network model using Keras
# (the variable is called l1_model, but it holds whichever regulariser is selected below)
l1_model = Sequential()

# Use either l1 or l2 regularisation and try increasing the value of lambda
# (the argument of l1(...)/l2(...) is lambda; the penalty is applied to the hidden-layer weights)
l1_model.add(Dense(24, input_shape=(X_train.shape[1],), activation='relu',kernel_regularizer=l2(0.3)))
l1_model.add(Dense(1, activation='sigmoid'))

# Compile the model (`learning_rate` replaces the deprecated `lr` argument,
# which recent Keras versions ignore or reject)
l1_model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Re-initialize the weights of the model, so that training starts from the
# same point as the first model
l1_model.set_weights(model_initial_weights)
# Retrain the model with weight regularisation (fixed number of epochs, no early stopping)
history = l1_model.fit(X_train, y_train, epochs=1000, validation_data=(X_val, y_val), verbose=1)

plot_history(history)

# Model evaluation
Let's compare the model's performance on training and validation data

In [None]:
# Print loss and accuracy of the weight-regularised model on the training and validation sets
evaluate_model(l1_model)

# Dropout
Dropout randomly deactivates a fraction of neurons during training, meaning these neurons are ignored during forward and backward passes. At every training step, every neuron (including the input neurons, but always excluding the output neurons) has a probability \(p\) of being temporarily “dropped out,” meaning it will be entirely ignored during this training step, but it may be active during the next step. The hyperparameter \(p\) is called the ```dropout rate```, and it is typically set between 10\% and 50\%.

In [None]:
# Import from tensorflow.keras, like the rest of the notebook, to avoid mixing two Keras packages
from tensorflow.keras.layers import Dropout
# Probability with which each hidden-layer activation is dropped at every training step.
# NOTE(review): 0.95 is far above the 10%-50% range that is typically used;
# confirm it is intentional, and try lower values to compare the learning curves.
DROPOUT_RATE = 0.95

# Create a neural network model using Keras
dropout_model = Sequential()

# Same architecture as the first model, with a Dropout layer between the hidden and the output layer
dropout_model.add(Dense(24, input_shape=(X_train.shape[1],), activation='relu'))
dropout_model.add(Dropout(DROPOUT_RATE))  # Drops a fraction DROPOUT_RATE of the hidden activations (during training only)
dropout_model.add(Dense(1, activation='sigmoid'))

# Compile the model (`learning_rate` replaces the deprecated `lr` argument,
# which recent Keras versions ignore or reject)
dropout_model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Re-initialize the weights of the model, so that training starts from the
# same point as the first model (the Dropout layer has no weights of its own)
dropout_model.set_weights(model_initial_weights)
# Retrain the model with dropout (fixed number of epochs, no early stopping)
history = dropout_model.fit(X_train, y_train, epochs=1000, validation_data=(X_val, y_val), verbose=1)

plot_history(history)

# Model evaluation
Let's compare the model's performance on training and validation data

In [None]:
# Print loss and accuracy of the dropout model on the training and validation sets
evaluate_model(dropout_model)