# Binary classification: overfitting and early stopping
In this experiment, our primary focus is on mitigating the issue of overfitting in the training data. In the initial phase, we closely monitor the training process of a compact neural network designed for a binary classification problem on a synthetic dataset. By observing the model's performance metrics, such as loss and accuracy, on the validation dataset, we aim to assess whether the model is exhibiting signs of overfitting the training data.

Subsequently, we implement an early stopping technique to tackle the problem of overfitting. Early stopping is a preventive measure used during training to halt the process when the model's performance on the validation dataset ceases to improve. This approach helps us maintain a balance between learning from the training data and generalizing to unseen data, mitigating the risk of overfitting.

In [None]:
# Author: Roberto Doriguzzi-Corin
# Project: Course on Network Intrusion and Anomaly Detection with Machine Learning
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.datasets import make_classification
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.utils import set_random_seed

# disable GPUs for test reproducibility
# (an empty device list hides every GPU from TensorFlow)
tf.config.set_visible_devices([], 'GPU')

# Single seed reused below for NumPy, Keras and the scikit-learn `random_state` arguments
SEED = 0
np.random.seed(SEED)  # seed NumPy's global random generator
set_random_seed(SEED)  # seed the Keras/TensorFlow side through the Keras utility

In [None]:
# Generate synthetic data

SAMPLES = 200
# Generate a two-feature synthetic dataset for binary classification.
# flip_y=0.03 assigns a random class to 3% of the samples (label noise).
X, y = make_classification(
    n_samples=SAMPLES,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    n_clusters_per_class=1,
    class_sep=1,
    flip_y=0.03,
    random_state=SEED,
)
# Split data into training (70%) and validation (30%) sets
X_train_orig, X_val_orig, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=SEED)


# Add polynomial features (degree 4, no constant bias column).
# The transformer is fitted on the training split only; the validation split
# is just transformed, so no preprocessing step is ever fitted on held-out data.
poly_features = PolynomialFeatures(degree=4, include_bias=False)
X_train = poly_features.fit_transform(X_train_orig)
X_val = poly_features.transform(X_val_orig)

# Model definition and training
In the following cell, we define and compile a neural network model. Then we train it for 1000 epochs.

In [None]:
# Create a neural network model using Keras:
# one hidden layer of 24 ReLU units followed by a single sigmoid output unit
model = Sequential()
model.add(Dense(24, input_shape=(X_train.shape[1],), activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile the model.
# `learning_rate` is the supported keyword; the legacy `lr` alias is deprecated
# in TF 2.x and rejected by recent Keras releases.
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Save the initial (untrained) weights so the model can be reset before retraining
model_initial_weights = model.get_weights()

# Print a summary of the model architecture
model.summary()

# Train the model for a fixed 1000 epochs, tracking loss/accuracy on the validation set
history = model.fit(X_train, y_train, epochs=1000, validation_data=(X_val, y_val), verbose=1)

# Training log analysis
By looking at the plots of accuracy and loss, one can understand whether the model overfits the training data, and whether the training process should have been stopped earlier. For instance, one can check where the validation loss starts increasing.

In [None]:
# One figure per metric: training curve vs. validation curve over the epochs
for metric, pretty in (('accuracy', 'Accuracy'), ('loss', 'Loss')):
    plt.plot(history.history[metric], label=f'Training {pretty}')
    plt.plot(history.history['val_' + metric], label=f'Validation {pretty}')
    plt.xlabel('Epochs')
    plt.ylabel(pretty)
    if metric == 'loss':
        # Only the loss figure uses a fixed y-range
        plt.ylim([0, 2])
    plt.legend()
    plt.title(f'Training and Validation {pretty}')
    plt.show()

# Decision boundary
Let's check the decision boundary and how it fits the training data

In [None]:
# Draw the model's decision boundary on top of the training samples
plt.figure(figsize=(8, 6))
h = .02  # Step size in the mesh

# Bounding box of the whole dataset, padded by one unit on each side
x_min, y_min = X[:, 0].min() - 1, X[:, 1].min() - 1
x_max, y_max = X[:, 0].max() + 1, X[:, 1].max() + 1
x_axis = np.arange(x_min, x_max, h)
y_axis = np.arange(y_min, y_max, h)
xx, yy = np.meshgrid(x_axis, y_axis)

# Flatten the grid into (x, y) rows and expand them with the fitted polynomial transformer
grid_points = np.column_stack((xx.ravel(), yy.ravel()))
xx_poly = poly_features.transform(grid_points)

# Model output for every grid point, laid back out on the grid
Z = model.predict(xx_poly).reshape(xx.shape)

# The 0.5 probability contour is the decision boundary
plt.contour(xx, yy, Z, levels=[0.5], colors='black')

# Overlay the training samples, coloured by label
plt.scatter(X_train_orig[:, 0], X_train_orig[:, 1],  c=y_train, edgecolors='k', cmap='viridis')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('Data Points')
plt.show()

# Report loss and accuracy on the training data
loss, accuracy = model.evaluate(X_train, y_train)
for metric_name, metric_value in (('Loss', loss), ('Accuracy', accuracy)):
    print(f'Training {metric_name}: {metric_value:.4f}')

# Decision boundary on unseen data
Let's check the decision boundary and how it splits the validation data

In [None]:
# Draw the model's decision boundary on top of the validation samples
plt.figure(figsize=(8, 6))
h = .02  # Step size in the mesh

# Per-feature minimum and maximum over the whole dataset, padded by one unit
lower = X.min(axis=0) - 1
upper = X.max(axis=0) + 1
x_min, x_max = lower[0], upper[0]
y_min, y_max = lower[1], upper[1]
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# Pair up the flattened grid coordinates and apply the polynomial expansion
flat_grid = np.stack([xx.ravel(), yy.ravel()], axis=1)
xx_poly = poly_features.transform(flat_grid)

# Predict a probability for each mesh point, then restore the mesh layout
probabilities = model.predict(xx_poly)
Z = probabilities.reshape(xx.shape)

# Decision boundary = contour line at probability 0.5
plt.contour(xx, yy, Z, levels=[0.5], colors='black')

# Validation samples, coloured by label
val_x1 = X_val_orig[:, 0]
val_x2 = X_val_orig[:, 1]
plt.scatter(val_x1, val_x2,  c=y_val, edgecolors='k', cmap='viridis')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('Data Points')
plt.show()

# Report loss and accuracy on the validation data
loss, accuracy = model.evaluate(X_val, y_val)
print(f'Validation Loss: {loss:.4f}')
print(f'Validation Accuracy: {accuracy:.4f}')

# Early stopping strategy
Instead of setting a fixed number of training epochs, one can implement an early-stopping strategy, which automatically stops the training process when the validation accuracy stops increasing or the validation loss stops decreasing. An important parameter is called **patience**, which represents the number of epochs with no improvement (on the validation set) after which the training will be stopped.

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

PATIENCE = 10 #number of epochs with no improvement on the validation set

# Define early stopping criteria (you can monitor either val_loss or val_accuracy).
# The patience comes from PATIENCE so the constant above actually controls the callback.
# restore_best_weights=True asks the callback to put back the weights from the
# epoch with the best monitored value once training stops.
early_stopping = EarlyStopping(monitor='val_loss', patience=PATIENCE, verbose=1, restore_best_weights=True)


# Re-initialize the weights of the model
model.set_weights(model_initial_weights)
# Re-compile with a fresh optimizer: set_weights() only restores the layer weights,
# while the Adam state accumulated during the previous 1000-epoch run would otherwise
# carry over and make this second run start from a different condition than the first.
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
# Retrain the model with early-stopping
history = model.fit(X_train, y_train, epochs=1000, validation_data=(X_val, y_val), verbose=1, callbacks=[early_stopping])

# Training log analysis
By looking at the plots of accuracy and loss, now we can see where the training process has been stopped.

In [None]:
# Each entry: (history key, display name, optional fixed y-range)
panels = [
    ('accuracy', 'Accuracy', None),
    ('loss', 'Loss', [0, 2]),
]

# Plot training vs. validation curves over epochs, one figure per entry
for key, name, y_range in panels:
    training_curve = history.history[key]
    validation_curve = history.history[f'val_{key}']
    plt.plot(training_curve, label=f'Training {name}')
    plt.plot(validation_curve, label=f'Validation {name}')
    plt.xlabel('Epochs')
    plt.ylabel(name)
    if y_range is not None:
        plt.ylim(y_range)
    plt.legend()
    plt.title(f'Training and Validation {name}')
    plt.show()

In [None]:
# Decision boundary of the retrained model, drawn over the training samples
plt.figure(figsize=(8, 6))
h = .02  # Step size in the mesh

# Mesh limits: range of each feature over the whole dataset, widened by 1 on both ends
feature_1, feature_2 = X[:, 0], X[:, 1]
x_min, x_max = feature_1.min() - 1, feature_1.max() + 1
y_min, y_max = feature_2.min() - 1, feature_2.max() + 1
xx, yy = np.meshgrid(
    np.arange(x_min, x_max, h),
    np.arange(y_min, y_max, h),
)

# Mesh points as two-column rows, mapped through the polynomial transformer
mesh_points = np.column_stack((xx.ravel(), yy.ravel()))
xx_poly = poly_features.transform(mesh_points)

# Model prediction per mesh point, reshaped to the mesh dimensions
Z = model.predict(xx_poly)
Z = Z.reshape(xx.shape)

# Single contour at probability 0.5 marks the decision boundary
plt.contour(xx, yy, Z, levels=[0.5], colors='black')

# Scatter the training samples, coloured by label
plt.scatter(X_train_orig[:, 0], X_train_orig[:, 1],  c=y_train, edgecolors='k', cmap='viridis')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('Data Points')
plt.show()

# Loss and accuracy of the model on the training data
loss, accuracy = model.evaluate(X_train, y_train)
print(f'Training Loss: {loss:.4f}')
print(f'Training Accuracy: {accuracy:.4f}')

In [None]:
# Decision boundary of the retrained model, drawn over the validation samples
plt.figure(figsize=(8, 6))
h = .02  # Step size in the mesh

# Mesh limits from the per-feature extremes of the whole dataset, padded by one unit
mins = X.min(axis=0) - 1
maxs = X.max(axis=0) + 1
x_min, y_min = mins[0], mins[1]
x_max, y_max = maxs[0], maxs[1]
xs = np.arange(x_min, x_max, h)
ys = np.arange(y_min, y_max, h)
xx, yy = np.meshgrid(xs, ys)

# Polynomial expansion of every mesh point
mesh_rows = np.stack((xx.ravel(), yy.ravel()), axis=1)
xx_poly = poly_features.transform(mesh_rows)

# Predictions for the mesh, restored to the mesh shape
Z = model.predict(xx_poly).reshape(xx.shape)

# Contour at probability 0.5 = decision boundary
plt.contour(xx, yy, Z, levels=[0.5], colors='black')

# Scatter the validation samples, coloured by label
plt.scatter(X_val_orig[:, 0], X_val_orig[:, 1],  c=y_val, edgecolors='k', cmap='viridis')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('Data Points')
plt.show()

# Loss and accuracy of the model on the validation data
loss, accuracy = model.evaluate(X_val, y_val)
for metric_name, metric_value in (('Loss', loss), ('Accuracy', accuracy)):
    print(f'Validation {metric_name}: {metric_value:.4f}')