# Overfitting in Neural Networks

You are advised to run this Jupyter Notebook on Google Colab. From the Colab toolbar, select *Runtime* > *Change runtime type* > *T4 GPU* > *Save* before running the Notebook.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from keras.utils import img_to_array
from keras.utils import load_img

from keras import Model
from keras import Sequential
from keras import Input
from keras.layers import Dense
from keras.layers import Rescaling
from keras.layers import Conv2D
from keras.layers import MaxPooling2D
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import RandomFlip
from keras.layers import RandomRotation
from keras.layers import RandomZoom
from keras.layers import RandomTranslation

from keras.optimizers import RMSprop

from keras.regularizers import l2

from keras.callbacks import EarlyStopping

from keras.datasets import cifar10

In [None]:
import os

# Work out where the notebook's files live: the current directory when running
# locally, or the mounted Google Drive when running on Colab.
if 'google.colab' not in str(get_ipython()):
    base_dir = "."
else:
    from google.colab import drive
    drive.mount('/content/drive')
    # You may need to change this, depending on where your notebooks are on Google Drive
    base_dir = "./drive/My Drive/Colab Notebooks/"
dataset_dir = os.path.join(base_dir, "datasets")

In [None]:
def plot_keras_history(history, metric):
    """Plot training and validation curves from a Keras training history.

    Draws two side-by-side panels: the loss on the left and the chosen
    metric on the right. Each panel has one curve for the training data and
    one for the validation data.

    Parameters:
        history: object whose ``history`` attribute is a dict of per-epoch
            values. It must contain the keys "loss", "val_loss", ``metric``
            and ``"val_" + metric``.
        metric: name of the metric to plot, e.g. "accuracy".
    """
    fig, (loss_ax, metric_ax) = plt.subplots(1, 2, figsize=(6, 3))
    # Each panel: (axes to draw on, key into history.history, panel title).
    panels = ((loss_ax, "loss", "Loss"), (metric_ax, metric, metric))
    for panel_ax, key, title in panels:
        panel_ax.plot(history.history[key], label="train " + key)
        panel_ax.plot(history.history["val_" + key], label="val " + key)
        panel_ax.set_title(title)
        panel_ax.legend()
    # tight_layout only adjusts for what exists when it is called, so run it
    # after the titles and legends have been added.
    fig.tight_layout()
    plt.show()

## CIFAR10 dataset

The CIFAR10 dataset has 50,000 32x32 colour training images and 10,000 test images, with 10 classes: 0 = airplane, 1 = automobile, 2 = bird, 3 = cat, 4 = deer, 5 = dog, 6 = frog, 7 = horse, 8 = ship and 9 = truck.

In [None]:
# Load CIFAR10 and unpack it into training and test images (X) and labels (y).
(X_train, y_train), (X_test, y_test) = cifar10.load_data()

In [None]:
# Class names indexed by CIFAR10 label (0-9).
class_names = ["airplane", "automobile", "bird", "cat", "deer",
               "dog", "frog", "horse", "ship", "truck"]

# Show the first 12 training images in a 3x4 grid, titled with their class.
plt.figure(figsize=(8, 8))
for i in range(12):
    plt.subplot(3, 4, i + 1)
    plt.imshow(X_train[i])
    # np.ravel copes with the label being either a scalar or a one-element
    # array; using the raw array as the title would render as e.g. "[6]".
    label = int(np.ravel(y_train[i])[0])
    plt.title(f"{label}: {class_names[label]}")
    plt.axis("off")

## A network that overfits

In [None]:
# A deliberately large convnet: two convolution/pooling stages followed by a
# dense classifier head.
inputs = Input(shape=(32, 32, 3))
# Multiply pixel values by 1/255.
x = Rescaling(scale=1 / 255)(inputs)
x = Conv2D(filters=128, kernel_size=3, activation="relu")(x)
x = MaxPooling2D(pool_size=2)(x)
x = Conv2D(filters=64, kernel_size=3, activation="relu")(x)
x = MaxPooling2D(pool_size=2)(x)
x = Flatten()(x)
x = Dense(units=64, activation="relu")(x)
# One softmax output per class.
outputs = Dense(units=10, activation="softmax")(x)
overfitting_convnet = Model(inputs=inputs, outputs=outputs)

In [None]:
# Configure training: RMSprop optimizer, sparse categorical cross-entropy loss,
# and accuracy as the reported metric.
overfitting_convnet.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=RMSprop(learning_rate=0.001),
    metrics=["accuracy"],
)

In [None]:
# Train silently for 20 epochs, holding out a quarter of the training data
# for validation.
overfitting_convnet_history = overfitting_convnet.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=20,
    validation_split=0.25,
    verbose=0,
)

In [None]:
# Training and validation accuracy recorded for the final epoch.
train_acc = overfitting_convnet_history.history["accuracy"][-1]
val_acc = overfitting_convnet_history.history["val_accuracy"][-1]
train_acc, val_acc

## Reducing complexity - by reducing network size

In [None]:
# Same architecture as the overfitting network, but with fewer filters and
# fewer hidden units.
inputs = Input(shape=(32, 32, 3))
# Multiply pixel values by 1/255.
x = Rescaling(scale=1 / 255)(inputs)
x = Conv2D(filters=64, kernel_size=3, activation="relu")(x)
x = MaxPooling2D(pool_size=2)(x)
x = Conv2D(filters=32, kernel_size=3, activation="relu")(x)
x = MaxPooling2D(pool_size=2)(x)
x = Flatten()(x)
x = Dense(units=16, activation="relu")(x)
# One softmax output per class.
outputs = Dense(units=10, activation="softmax")(x)
convnet = Model(inputs=inputs, outputs=outputs)

In [None]:
# Configure training: RMSprop optimizer, sparse categorical cross-entropy loss,
# and accuracy as the reported metric.
convnet.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=RMSprop(learning_rate=0.001),
    metrics=["accuracy"],
)

In [None]:
# Train silently for 20 epochs, holding out a quarter of the training data
# for validation.
convnet_history = convnet.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=20,
    validation_split=0.25,
    verbose=0,
)

In [None]:
# Training and validation accuracy recorded for the final epoch.
train_acc = convnet_history.history["accuracy"][-1]
val_acc = convnet_history.history["val_accuracy"][-1]
train_acc, val_acc

## Weight regularization

In [None]:
# The large architecture again, now with an L2 penalty (0.001) on the kernel
# of every convolutional and dense layer.
inputs = Input(shape=(32, 32, 3))
# Multiply pixel values by 1/255.
x = Rescaling(scale=1 / 255)(inputs)
x = Conv2D(filters=128, kernel_size=3, activation="relu", kernel_regularizer=l2(0.001))(x)
x = MaxPooling2D(pool_size=2)(x)
x = Conv2D(filters=64, kernel_size=3, activation="relu", kernel_regularizer=l2(0.001))(x)
x = MaxPooling2D(pool_size=2)(x)
x = Flatten()(x)
x = Dense(units=64, activation="relu", kernel_regularizer=l2(0.001))(x)
# One softmax output per class.
outputs = Dense(units=10, activation="softmax", kernel_regularizer=l2(0.001))(x)
convnet = Model(inputs=inputs, outputs=outputs)

In [None]:
# Configure training: RMSprop optimizer, sparse categorical cross-entropy loss,
# and accuracy as the reported metric.
convnet.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=RMSprop(learning_rate=0.001),
    metrics=["accuracy"],
)

In [None]:
# Train silently for 20 epochs, holding out a quarter of the training data
# for validation.
convnet_history = convnet.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=20,
    validation_split=0.25,
    verbose=0,
)

In [None]:
# Training and validation accuracy recorded for the final epoch.
train_acc = convnet_history.history["accuracy"][-1]
val_acc = convnet_history.history["val_accuracy"][-1]
train_acc, val_acc

## Dropout

In [None]:
# The large architecture again, with a Dropout layer (rate 0.3) after each
# convolution/pooling stage.
inputs = Input(shape=(32, 32, 3))
# Multiply pixel values by 1/255.
x = Rescaling(scale=1 / 255)(inputs)
x = Conv2D(filters=128, kernel_size=3, activation="relu")(x)
x = MaxPooling2D(pool_size=2)(x)
x = Dropout(rate=0.3)(x)
x = Conv2D(filters=64, kernel_size=3, activation="relu")(x)
x = MaxPooling2D(pool_size=2)(x)
x = Dropout(rate=0.3)(x)
x = Flatten()(x)
x = Dense(units=64, activation="relu")(x)
# One softmax output per class.
outputs = Dense(units=10, activation="softmax")(x)
convnet = Model(inputs=inputs, outputs=outputs)

In [None]:
# Configure training: RMSprop optimizer, sparse categorical cross-entropy loss,
# and accuracy as the reported metric.
convnet.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=RMSprop(learning_rate=0.001),
    metrics=["accuracy"],
)

In [None]:
# Train silently for 20 epochs, holding out a quarter of the training data
# for validation.
convnet_history = convnet.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=20,
    validation_split=0.25,
    verbose=0,
)

In [None]:
# Training and validation accuracy recorded for the final epoch.
train_acc = convnet_history.history["accuracy"][-1]
val_acc = convnet_history.history["val_accuracy"][-1]
train_acc, val_acc

## Early Stopping

In [None]:
# Plot the learning curves of the overfitting network. Use the History object
# returned by fit() rather than the model's implicit `history` attribute, so
# the plot still works if the model variable is later rebound.
plot_keras_history(overfitting_convnet_history, "accuracy")

In [None]:
# The same large architecture as the overfitting network; the only change in
# this section is how it is trained.
inputs = Input(shape=(32, 32, 3))
# Multiply pixel values by 1/255.
x = Rescaling(scale=1 / 255)(inputs)
x = Conv2D(filters=128, kernel_size=3, activation="relu")(x)
x = MaxPooling2D(pool_size=2)(x)
x = Conv2D(filters=64, kernel_size=3, activation="relu")(x)
x = MaxPooling2D(pool_size=2)(x)
x = Flatten()(x)
x = Dense(units=64, activation="relu")(x)
# One softmax output per class.
outputs = Dense(units=10, activation="softmax")(x)
convnet = Model(inputs=inputs, outputs=outputs)

In [None]:
# Configure training: RMSprop optimizer, sparse categorical cross-entropy loss,
# and accuracy as the reported metric.
convnet.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=RMSprop(learning_rate=0.001),
    metrics=["accuracy"],
)

In [None]:
# Train for up to 20 epochs with an EarlyStopping callback that monitors the
# validation loss with a patience of 5 epochs and restores the best weights.
convnet_history = convnet.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=20,
    validation_split=0.25,
    verbose=0,
    callbacks=[
        EarlyStopping(restore_best_weights=True, patience=5, monitor="val_loss"),
    ],
)


In [None]:
# Plot the learning curves of the early-stopped run. Use the History object
# returned by fit() rather than the model's implicit `history` attribute,
# consistent with how the accuracies are read elsewhere in the notebook.
plot_keras_history(convnet_history, "accuracy")

In [None]:
# EarlyStopping was asked to restore the best weights, so the last entry in
# the history (several epochs past the best one) does not describe the model
# we end up with. Report the epoch with the lowest validation loss instead.
# NOTE(review): assumes the callback restored the best weights; confirm for
# the Keras version in use.
val_losses = convnet_history.history["val_loss"]
# list.index returns the first occurrence of the minimum.
best_epoch = val_losses.index(min(val_losses))
train_acc = convnet_history.history["accuracy"][best_epoch]
val_acc = convnet_history.history["val_accuracy"][best_epoch]
train_acc, val_acc

## Data Augmentation

Let's see what data augmentation does - before we use it in our neural network.

You can look-up the details in the Keras API documentation, but a quick summary is:
- We may flip. For this dataset, only horizontal flipping makes sense.
- We may rotate. Here the rotations are random in the range $[-0.1\times 2\pi, 0.1\times 2\pi]$ radians. (Recall radians from school: $2 \pi$ radians is a full circle.)
- We may zoom. Here aspect ratio is preserved because we only specify a height factor. Negative means zooming in; positive means zooming out.
- And we may shift. Here, we've asked for left/right shifts of up to 20% and up/down shifts of up to 20%.

In [None]:
# Load an image and wrap it in a batch of size one
img = load_img(os.path.join(dataset_dir, "wikipedia_cats_and_dogs/Retriever_in_water.jpg"))
input_array = np.array([img_to_array(img)])

# Create some augmentation layers
augmentation_layers = Sequential([
    # Take (height, width, channels) from the loaded image itself, so the
    # demo does not depend on the file having one particular size.
    Input(shape=input_array.shape[1:]),
    RandomFlip(mode="horizontal"),
    RandomRotation(factor=0.1),
    RandomZoom(height_factor=(-0.2, 0.2)),
    RandomTranslation(height_factor=0.2, width_factor=0.2),
])

# Apply the augmentation layers to the image and display
plt.figure(figsize=(10, 10))
for i in range(9):
    # Ask for training-mode behaviour explicitly: the random layers only
    # transform their input in training mode.
    augmented_images = augmentation_layers(input_array, training=True)
    plt.subplot(3, 3, i + 1)
    plt.imshow(augmented_images[0].numpy().astype("uint8"))
    plt.axis("off")

Now let's use these layers in our network. These layers will only augment the training data, not the validation or test data.

In [None]:
# The large architecture again, preceded by the data augmentation layers.
inputs = Input(shape=(32, 32, 3))
# The augmentation chain must start from this model's own inputs, and each
# layer must feed the next so that none of them is dropped from the graph.
x = RandomFlip(mode="horizontal")(inputs)
x = RandomRotation(factor=0.1)(x)
x = RandomZoom(height_factor=(-0.2, 0.2))(x)
x = RandomTranslation(height_factor=0.2, width_factor=0.2)(x)
# Rescale the augmented images (not the raw inputs), multiplying by 1/255.
x = Rescaling(scale=1./255)(x)
x = Conv2D(filters=128, kernel_size=(3, 3), activation="relu")(x)
x = MaxPooling2D(pool_size=(2, 2))(x)
x = Conv2D(filters=64, kernel_size=(3, 3), activation="relu")(x)
x = MaxPooling2D(pool_size=(2, 2))(x)
x = Flatten()(x)
x = Dense(units=64, activation="relu")(x)
outputs = Dense(units=10, activation="softmax")(x)
# Bind to `convnet`: that is the model the following cells compile and fit.
convnet = Model(inputs, outputs)

In [None]:
# Configure training: RMSprop optimizer, sparse categorical cross-entropy loss,
# and accuracy as the reported metric.
convnet.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=RMSprop(learning_rate=0.001),
    metrics=["accuracy"],
)

We train for more epochs, and the augmentation layers synthesize new training examples on every batch. (On its own, data augmentation is not very effective here.)

In [None]:
# Train silently for 40 epochs, holding out a quarter of the training data
# for validation.
convnet_history = convnet.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=40,
    validation_split=0.25,
    verbose=0,
)

In [None]:
# Training and validation accuracy recorded for the final epoch.
train_acc = convnet_history.history["accuracy"][-1]
val_acc = convnet_history.history["val_accuracy"][-1]
train_acc, val_acc

Finally, we should do Error Estimation - to compare the models on the test set. We should - but we won't, since you're already familiar with how to do it.

# Closing Remarks

Don't forget that Batch Normalization also has a regularizing effect.

CIFAR10 is a relatively easy dataset. Therefore, in practice, you wouldn't have started with such a large network. And, of course, you can use these solutions together. For example, I would have started with a smaller network, added a Dropout layer and Early Stopping - and I would see where that got me before trying other things.