# TensorFlow: Overfit and underfit (L2 + Dropout)

In [None]:
import os
import pathlib
import tempfile
import shutil
import matplotlib.pyplot as plt
import numpy as np
from scipy.ndimage import gaussian_filter1d

In [None]:
# Configure TensorFlow's native log verbosity through the environment.
# NOTE(review): this is placed ahead of `import tensorflow` on purpose —
# presumably the variable is only read when the library loads; keep the order.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "1"

import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_docs as tfdocs
import tensorflow_docs.modeling
import tensorflow_docs.plots

# Report the runtime environment: TF version, eager-execution state and
# whether at least one physical GPU device is visible.
print("TF Version: ", tf.__version__)
print("TF Eager mode: ", tf.executing_eagerly())
print("TF GPU is", "available" if tf.config.list_physical_devices("GPU") else "not available")

## Prepare dataset

In [None]:
# Fetch the "higgs" dataset from TensorFlow Datasets together with its
# metadata object (used later to count the feature columns).
whole_ds, ds_info = tfds.load(name="higgs", with_info=True)

In [None]:
# Pull a single example out of the "train" split as NumPy values.
first_example_ds = whole_ds["train"].take(1)
element = next(first_example_ds.as_numpy_iterator())

# Show which keys the example carries and the values stored under them.
print("Data structure:")
print(f"Keys: {list(element.keys())}")
print()
print(f"Values: {list(element.values())}")

In [None]:
# Number of input features: every dataset column except the single label
# column (1 label + 28 features).
N_FEATURES = len(ds_info.features) - 1

# Upper bounds on how many examples go into validation and training.
N_VALID = 1_000
N_TRAIN = 10_000

# Shuffle buffer size and mini-batch size.
BUFFER_SIZE = 10_000
BATCH_SIZE = 1_000

# Number of optimizer steps that make up one epoch.
STEPS_PER_EPOCH = N_TRAIN // BATCH_SIZE

In [None]:
#
# Re-pack dataset in order to have the following element signature: (features, label)
#

def pack(elem):
    """Split one dataset element into a ``(features, label)`` pair.

    The first value of ``elem`` (in mapping order) is treated as the label and
    all remaining values as features.

    Args:
        elem: Mapping from column name to value.

    Returns:
        A ``(feats, label)`` tuple where ``feats`` is a list with every value
        but the first and ``label`` is a one-element list holding the first.
        Both are empty lists when ``elem`` is empty.
    """
    columns = tuple(elem.values())
    # Both parts are returned as lists (not tuples), exactly like plain list slices.
    return list(columns[1:]), list(columns[:1])

# Validation pipeline: the first N_VALID examples of the "train" split,
# cached, re-packed into (features, label), batched and prefetched.
val_examples = whole_ds["train"].take(N_VALID).cache()
val_batches = val_examples.map(pack).batch(BATCH_SIZE)
val_ds = val_batches.prefetch(buffer_size=tf.data.AUTOTUNE)

# Training pipeline: the N_TRAIN examples that follow the validation slice,
# cached, re-packed into (features, label), shuffled, repeated and batched.
#
# `.repeat()` is required because `compile_and_fit` calls `model.fit` with an
# explicit `steps_per_epoch`: Keras may then keep drawing from one iterator
# across epochs, and a finite dataset would be exhausted after the first pass
# ("Your input ran out of data; interrupting training").
# Repeating after `shuffle` and before `batch` reshuffles on every pass
# without mixing examples across pass boundaries inside the shuffle buffer.
train_ds = (whole_ds["train"]
            .skip(N_VALID)
            .take(N_TRAIN)
            .cache()
            .map(pack)
            .shuffle(BUFFER_SIZE)
            .repeat()
            .batch(BATCH_SIZE)
            .prefetch(buffer_size=tf.data.AUTOTUNE))

## Demonstrate overfitting

In [None]:
# Training histories of the un-regularized models, keyed by model size name.
size_histories: dict = {}

In [None]:
def plot_history(history, metrics=None):
    """Plot smoothed training and validation curves for the given metrics.

    Args:
        history: Object whose ``history`` attribute maps a metric name to its
            per-epoch values; validation series are looked up as
            ``val_<metric>``.
        metrics: Names of the metrics to draw. Defaults to
            ``["loss", "accuracy"]`` when ``None``.
    """
    selected = ["loss", "accuracy"] if metrics is None else metrics
    plt.figure(figsize=(8, 5))
    for metric in selected:
        title = metric.capitalize()
        # Train curve first, then the matching validation curve.
        for key, split in ((metric, "Train"), (f"val_{metric}", "Validation")):
            smoothed = gaussian_filter1d(history.history[key], sigma=2)
            plt.plot(smoothed, label=f"{title} ({split})")
    plt.xlabel("Epochs")
    plt.ylabel("Value")
    plt.legend()
    plt.show()

In [None]:
def compile_and_fit(model, optimizer, callbacks=None, max_epochs=1_000):
    """Compile ``model`` for binary classification on logits and train it.

    Args:
        model: Keras model producing one raw logit per example.
        optimizer: Optimizer instance handed to ``model.compile``.
        callbacks: Optional list of Keras callbacks for ``model.fit``.
        max_epochs: Upper bound on the number of training epochs.

    Returns:
        The history object returned by ``model.fit``.
    """
    loss_fn = tf.losses.BinaryCrossentropy(from_logits=True)
    # NOTE(review): "accuracy" is resolved by Keras; with raw logits as
    # predictions its decision threshold may not be the intended one — confirm.
    tracked_metrics = [
        tf.keras.metrics.BinaryCrossentropy(from_logits=True),
        "accuracy",
    ]
    model.compile(optimizer=optimizer, loss=loss_fn, metrics=tracked_metrics)

    # Train on the module-level pipelines; one epoch is STEPS_PER_EPOCH batches.
    return model.fit(
        train_ds,
        steps_per_epoch=STEPS_PER_EPOCH,
        epochs=max_epochs,
        validation_data=val_ds,
        callbacks=callbacks,
        verbose=2)

### Define callbacks

* EarlyStopping - callback to avoid long and unnecessary training times
* TensorBoard - callback to generate TensorBoard logs for the training

In [None]:
# TensorBoard logs live in a sub-folder of a freshly created temporary directory.
tmp_root = pathlib.Path(tempfile.mkdtemp())
log_dir = tmp_root / "tensorboard_logs"
# Start from a clean slate; a missing folder is not an error.
shutil.rmtree(log_dir, ignore_errors=True)

In [None]:
def get_callbacks(name):
    """Build the callback list used for one training run.

    Args:
        name: Sub-path (relative to ``log_dir``) where this run's TensorBoard
            logs are written.

    Returns:
        A list with an ``EarlyStopping`` callback watching
        ``val_binary_crossentropy`` (patience of 200 epochs) followed by a
        ``TensorBoard`` callback.
    """
    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor="val_binary_crossentropy",
        patience=200)
    tensorboard = tf.keras.callbacks.TensorBoard(log_dir / name)
    return [early_stopping, tensorboard]

### Define learning rate scheduler

In [None]:
# Learning-rate schedule: inverse time decay,
#   lr(step) = initial_learning_rate / (1 + decay_rate * step / decay_steps)
# With decay_rate=1 and decay_steps equal to 1,000 epochs' worth of steps, the
# rate falls to 1/2 of its start value after 1,000 epochs, 1/3 after 2,000, ...
lr_scheduler = tf.keras.optimizers.schedules.InverseTimeDecay(
    initial_learning_rate=1e-3,          # learning rate at step 0
    decay_steps=STEPS_PER_EPOCH * 1_000, # number of steps over which one decay unit is applied
    decay_rate=1,                        # strength of the decay
    staircase=False)                     # False: decay continuously instead of in discrete jumps

In [None]:
# Visualize the learning-rate schedule.
# The schedule is a function of the optimizer *step*, while the x-axis is
# labelled in epochs, so steps are converted with STEPS_PER_EPOCH before plotting.
step = np.linspace(0, 100_000)
lr = lr_scheduler(step)
plt.figure(figsize=(8, 6))
plt.plot(step / STEPS_PER_EPOCH, lr)
# Anchor the y-axis at zero so the relative decay is easy to read.
plt.ylim([0, max(plt.ylim())])
plt.xlabel("Epoch")
_ = plt.ylabel("Learning Rate")

### Define optimizer

In [None]:
def get_optimizer(scheduler=None):
    """Create an Adam optimizer driven by a learning-rate schedule.

    Args:
        scheduler: Learning rate (or schedule) passed to Adam. When ``None``,
            the module-level ``lr_scheduler`` is used.

    Returns:
        A new ``tf.keras.optimizers.Adam`` instance.
    """
    learning_rate = lr_scheduler if scheduler is None else scheduler
    return tf.keras.optimizers.Adam(learning_rate)

### Try models without regularization

#### Small model

In [None]:
# "Small" model: two 16-unit ELU hidden layers and a single-unit output layer.
small_model = tf.keras.Sequential()
small_model.add(
    tf.keras.layers.Dense(16, activation=tf.nn.elu, input_shape=(N_FEATURES,)))
small_model.add(tf.keras.layers.Dense(16, activation=tf.nn.elu))
small_model.add(tf.keras.layers.Dense(1))

In [None]:
# Train the "Small" model; its TensorBoard logs go under "sizes/Small".
size_histories["Small"] = compile_and_fit(
    model=small_model,
    optimizer=get_optimizer(),
    callbacks=get_callbacks("sizes/Small"),
)

In [None]:
# Training vs. validation curves of the "Small" model.
small_history = size_histories["Small"]
plot_history(small_history)

#### Medium model

In [None]:
# "Medium" model: three 64-unit ELU hidden layers and a single-unit output layer.
medium_layers = [
    tf.keras.layers.Dense(64, activation=tf.nn.elu, input_shape=(N_FEATURES,)),
]
medium_layers += [
    tf.keras.layers.Dense(64, activation=tf.nn.elu) for _ in range(2)
]
medium_layers.append(tf.keras.layers.Dense(1))
medium_model = tf.keras.Sequential(medium_layers)

In [None]:
# Train the "Medium" model; its TensorBoard logs go under "sizes/Medium".
size_histories["Medium"] = compile_and_fit(
    model=medium_model,
    optimizer=get_optimizer(),
    callbacks=get_callbacks("sizes/Medium"),
)

In [None]:
# Training vs. validation curves of the "Medium" model.
medium_history = size_histories["Medium"]
plot_history(medium_history)

#### Large model

In [None]:
# "Large" model: four 512-unit ELU hidden layers and a single-unit output layer.
large_layers = [
    tf.keras.layers.Dense(512, activation=tf.nn.elu, input_shape=(N_FEATURES,)),
]
large_layers.extend(
    tf.keras.layers.Dense(512, activation=tf.nn.elu) for _ in range(3))
large_layers.append(tf.keras.layers.Dense(1))
large_model = tf.keras.Sequential(large_layers)

In [None]:
# Train the "Large" model; its TensorBoard logs go under "sizes/Large".
size_histories["Large"] = compile_and_fit(
    model=large_model,
    optimizer=get_optimizer(),
    callbacks=get_callbacks("sizes/Large"),
)

In [None]:
# Training vs. validation curves of the "Large" model.
large_history = size_histories["Large"]
plot_history(large_history)

#### Plot results

* "Small" model - does not overfit
* "Medium" model - **overfits**
* "Large" model - **overfits**

In [None]:
# Compare the binary cross-entropy curves of all model sizes on one figure.
# `plotter` was referenced without ever being created (NameError); build it
# here from tensorflow_docs, plotting the metric that the models are compiled
# with and that EarlyStopping monitors.
plotter = tfdocs.plots.HistoryPlotter(metric="binary_crossentropy", smoothing_std=10)
plotter.plot(size_histories)
# Log-scaled epoch axis, skipping the first few noisy epochs.
plt.xscale("log")
plt.xlim([5, max(plt.xlim())])
plt.ylim([0.5, 0.7])
plt.xlabel("Epochs [Log Scale]")

In [None]:
# Load the TensorBoard notebook extension
# (IPython magic — only valid when run inside a notebook/IPython session)
%load_ext tensorboard

# Open an embedded TensorBoard viewer
# pointed at the logs written by the "sizes/..." training runs under log_dir
%tensorboard --logdir {log_dir}/sizes

### Try models with regularization

In [None]:
# Reuse the "Small" run as the baseline for the regularization comparison:
# mirror its TensorBoard logs under "regularizers/" (replacing any stale copy)
# and seed the history dictionary with its training history.
baseline_src = log_dir / "sizes/Small"
baseline_dst = log_dir / "regularizers/Small"
shutil.rmtree(baseline_dst, ignore_errors=True)
shutil.copytree(baseline_src, baseline_dst)

regularizer_histories = {"Small": size_histories["Small"]}

* L1 regularization: the added cost is proportional to the absolute value of the weight coefficients (it pushes weights towards exactly zero, encouraging a sparse model)
* L2 regularization: the added cost is proportional to the square of the value of the weight coefficients (it penalizes the weight parameters without making them sparse, since the penalty goes to zero for small weights - usually the preferred type of regularization)

#### With L2 regularization

In [None]:
# "Large" architecture with an L2 kernel penalty (factor 0.001) on every hidden
# layer. A fresh regularizer instance is created per layer.
l2_layers = [
    tf.keras.layers.Dense(
        512,
        activation=tf.nn.elu,
        kernel_regularizer=tf.keras.regularizers.l2(0.001),
        input_shape=(N_FEATURES,)),
]
l2_layers.extend(
    tf.keras.layers.Dense(
        512,
        activation=tf.nn.elu,
        kernel_regularizer=tf.keras.regularizers.l2(0.001))
    for _ in range(3))
l2_layers.append(tf.keras.layers.Dense(1))
l2_model = tf.keras.Sequential(l2_layers)

In [None]:
# Train the L2-regularized model; logs go under "regularizers/l2".
regularizer_histories["l2"] = compile_and_fit(
    model=l2_model,
    optimizer=get_optimizer(),
    callbacks=get_callbacks("regularizers/l2"),
)

#### With Dropout regularization

In [None]:
# "Large" architecture with a Dropout(0.5) layer after every hidden Dense layer.
dropout_layers = [
    tf.keras.layers.Dense(512, activation=tf.nn.elu, input_shape=(N_FEATURES,)),
    tf.keras.layers.Dropout(0.5),
]
for _block in range(3):
    # Remaining hidden blocks: Dense followed by Dropout.
    dropout_layers.extend([
        tf.keras.layers.Dense(512, activation=tf.nn.elu),
        tf.keras.layers.Dropout(0.5),
    ])
dropout_layers.append(tf.keras.layers.Dense(1))
dropout_model = tf.keras.Sequential(dropout_layers)

In [None]:
# Train the dropout-regularized model; logs go under "regularizers/dropout".
regularizer_histories["dropout"] = compile_and_fit(
    model=dropout_model,
    optimizer=get_optimizer(),
    callbacks=get_callbacks("regularizers/dropout"),
)

#### With L2+Dropout regularization

In [None]:
# "Large" architecture combining both techniques: an L2 kernel penalty
# (factor 0.0001) on every hidden Dense layer, each followed by Dropout(0.5).
combined_layers = [
    tf.keras.layers.Dense(
        512,
        kernel_regularizer=tf.keras.regularizers.l2(0.0001),
        activation=tf.nn.elu,
        input_shape=(N_FEATURES,)),
    tf.keras.layers.Dropout(0.5),
]
for _block in range(3):
    # Remaining hidden blocks: regularized Dense followed by Dropout.
    combined_layers.extend([
        tf.keras.layers.Dense(
            512,
            kernel_regularizer=tf.keras.regularizers.l2(0.0001),
            activation=tf.nn.elu),
        tf.keras.layers.Dropout(0.5),
    ])
combined_layers.append(tf.keras.layers.Dense(1))
combined_model = tf.keras.Sequential(combined_layers)

In [None]:
# Train the L2 + dropout model; logs go under "regularizers/combined".
regularizer_histories["combined"] = compile_and_fit(
    model=combined_model,
    optimizer=get_optimizer(),
    callbacks=get_callbacks("regularizers/combined"),
)

#### Plot results

In [None]:
# Compare the binary cross-entropy curves of the baseline and all regularized
# models on one figure. The plotter is created here because no `plotter`
# object is defined anywhere earlier in the notebook (the bare name raised
# NameError).
regularizer_plotter = tfdocs.plots.HistoryPlotter(
    metric="binary_crossentropy", smoothing_std=10)
regularizer_plotter.plot(regularizer_histories)
# Log-scaled epoch axis, skipping the first few noisy epochs.
plt.xscale("log")
plt.xlim([5, max(plt.xlim())])
plt.ylim([0.5, 0.7])
plt.xlabel("Epochs [Log Scale]")

# According to the results, the model with combined regularization (L2 + dropout) performs best so far.