In [1]:
%%bash

# Install Lightning so that `import lightning as L` in the next cell succeeds.
pip install lightning

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import numpy as np
import pandas as pd
from sklearn import model_selection, pipeline, preprocessing
import torch
from torch import nn, optim, utils
import lightning as L

# Vanishing and Exploding Gradients

**Vanishing gradients**: gradients get smaller and smaller until parameters in early layers get updates so small that the model effectively stops learning. When this happens the training process fails to converge to a good solution. 

**Exploding gradients**: gradients get bigger and bigger until the parameters get updates so large that the training process begins to diverge!



### Define some utility functions

The code in the cell below defines a few utility functions that will make our life easier.

In [3]:
def compute_average_loss(dataloader, loss_fn, model_fn):
    """Average the per-batch loss of ``model_fn`` over ``dataloader``.

    Args:
        dataloader: Sized iterable yielding ``(features, targets)`` batches.
        loss_fn: Callable mapping ``(predictions, targets)`` to a scalar
            loss tensor.
        model_fn: Callable mapping a batch of features to predictions.

    Returns:
        A tensor of shape ``(1, 1)`` holding the sum of the batch losses
        divided by ``len(dataloader)`` (the number of batches, not samples).
        The result is detached from the autograd graph.
    """
    total_loss = torch.zeros(1, 1)
    for features, targets in dataloader:
        predictions = model_fn(features)
        # Detach before accumulating so the running total never keeps each
        # batch's autograd graph alive when this helper is called outside of
        # an inference/no-grad context.
        total_loss += loss_fn(predictions, targets).detach()
    average_loss = total_loss / len(dataloader)
    return average_loss


def fit(
    loss_fn,
    model_fn,
    optimizer,
    train_dataloader,
    val_dataloader,
    log_epochs=1,
    max_epochs=1):
    """Train ``model_fn`` and record the average train/validation loss per epoch.

    Args:
        loss_fn: Callable mapping ``(predictions, targets)`` to a scalar loss.
        model_fn: Module to train; switched between ``train()`` and ``eval()``.
        optimizer: Optimizer whose ``step``/``zero_grad`` are called per batch.
        train_dataloader: Sized iterable of ``(features, targets)`` batches.
        val_dataloader: Sized iterable of ``(features, targets)`` batches.
        log_epochs: Print a progress line every ``log_epochs`` epochs.
        max_epochs: Number of passes over ``train_dataloader``.

    Returns:
        A DataFrame indexed by ``epoch`` with columns ``average_train_loss``
        and ``average_val_loss`` (both averaged over batches).
    """
    history = {
        "epoch": [],
        "average_train_loss": [],
        "average_val_loss": []
    }

    for epoch in range(max_epochs):
        total_train_loss = torch.zeros(1, 1)
        model_fn = model_fn.train()
        for features, targets in train_dataloader:

            # forward pass
            predictions = model_fn(features)
            loss = loss_fn(predictions, targets)
            # Accumulate a detached copy: adding the live loss would chain
            # every batch's autograd graph onto the running total and keep
            # them all in memory until the end of the epoch.
            total_train_loss += loss.detach()

            # backward pass
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

        average_train_loss = total_train_loss / len(train_dataloader)
        train_loss_value = average_train_loss.item()
        history["epoch"].append(epoch)
        history["average_train_loss"].append(train_loss_value)

        # validation after every training epoch
        model_fn = model_fn.eval()
        with torch.inference_mode():
            average_val_loss = compute_average_loss(
                val_dataloader,
                loss_fn,
                model_fn
            )
        val_loss_value = average_val_loss.item()
        history["average_val_loss"].append(val_loss_value)

        if epoch % log_epochs == 0:
            message = f"Epoch {epoch}, Average train Loss {train_loss_value:.4f}, Average val Loss {val_loss_value:.4f}"
            print(message)

    history_df = (pd.DataFrame.from_dict(history)
                              .set_index("epoch"))
    return history_df


## Load the MNIST data

In [4]:
%%bash
cat ./sample_data/mnist_train_small.csv | head -n 5

6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,24,67,67,18,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,131,252,252,66,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,159,250,232,30,32,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,15,222,252,108,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,147,252,183,5,0,0,0,0,0,0,0,20,89,89,73,0,0,0,0,0,0,0,0,0,0,0,0,48,247,252,159,0,0,0,0,0,0,0,79,236,252,252,249,198,16,0,0,0,0,0,0,0,0,0,41,193,252,199,22,0,0,0,0,0,12,135,248,252,252,252,252,252,100,0,0,0,0,0,0,0,0,0,100,252,252,88,0,0,0,0,0,11,171,252,252,235,175,178,252,252,224,0,0,0,0,0,0,0,0,15,209,252,233,12,0,0,0,0,49,177,252,252,89,26,0,2,166,252,252,0,0,0,0,0,0,0,0,96,253,253,59,0,0,0,0,11,177,255,253,92,0,0,0,0,155,253,128,0,0,0,0,0,0,0,0,143,252,252,10,0,0,0,12,171,252,216,110,13,0,0,0,3,180,

In [5]:
INPUT_SIZE = 784
OUTPUT_SIZE = 10
RANDOM_STATE = 42  # fixes the train/validation split so re-runs are comparable

# The CSV files are read without a header row: the first column is named
# "label" and the remaining INPUT_SIZE columns are named p0, p1, ...
COLUMN_NAMES = ["label"] + [f"p{i}" for i in range(INPUT_SIZE)]

_train_data = pd.read_csv(
    "./sample_data/mnist_train_small.csv",
    header=None,
    names=COLUMN_NAMES,
)
# Hold out 10% of the training file for validation, keeping the label
# proportions the same in both parts.
train_data, val_data = model_selection.train_test_split(
    _train_data,
    test_size=0.1,
    stratify=_train_data.loc[:, "label"],
    random_state=RANDOM_STATE,
)

test_data = pd.read_csv(
    "./sample_data/mnist_test.csv",
    header=None,
    names=COLUMN_NAMES,
)

### Create preprocessing pipelines

In [6]:
def _as_float32(array):
    """Cast a NumPy array to 32-bit floats."""
    return array.astype(np.float32)


def _as_tensor(array):
    """Convert a NumPy array into a torch tensor."""
    return torch.from_numpy(array)


def _to_numpy(pandas_object):
    """Convert a pandas object into a NumPy array."""
    return pandas_object.to_numpy()


# Features: min-max scale, cast to float32, then convert to a tensor.
_feature_steps = [
    preprocessing.MinMaxScaler(),
    preprocessing.FunctionTransformer(_as_float32),
    preprocessing.FunctionTransformer(_as_tensor),
]
features_preprocessor = pipeline.make_pipeline(*_feature_steps)

# Targets: convert to NumPy, then to a tensor (no scaling or casting).
_target_steps = [
    preprocessing.FunctionTransformer(_to_numpy),
    preprocessing.FunctionTransformer(_as_tensor),
]
target_preprocessor = pipeline.make_pipeline(*_target_steps)


### Create Datasets and DataLoaders

In [7]:
BATCH_SIZE = 64
NUM_WORKERS = 2


def _build_split(frame, fit_preprocessors=False):
    """Turn one labelled DataFrame into tensors, a TensorDataset and a DataLoader.

    The "label" column becomes the target and every other column a feature.
    When ``fit_preprocessors`` is true the preprocessing pipelines are fitted
    on ``frame`` before transforming it; otherwise they are only applied.

    Returns:
        ``(features_tensor, target_tensor, dataset, dataloader)``.
    """
    feature_frame = frame.drop("label", axis=1)
    label_series = frame.loc[:, "label"]
    if fit_preprocessors:
        features_tensor = features_preprocessor.fit_transform(feature_frame)
        target_tensor = target_preprocessor.fit_transform(label_series)
    else:
        features_tensor = features_preprocessor.transform(feature_frame)
        target_tensor = target_preprocessor.transform(label_series)
    dataset = utils.data.TensorDataset(features_tensor, target_tensor)
    dataloader = utils.data.DataLoader(
        dataset,
        batch_size=BATCH_SIZE,
        num_workers=NUM_WORKERS,
    )
    return features_tensor, target_tensor, dataset, dataloader


# training split: the only one the preprocessors are fitted on
(train_features_tensor,
 train_target_tensor,
 train_dataset,
 train_dataloader) = _build_split(train_data, fit_preprocessors=True)

# validation split: reuses the preprocessors fitted on the training split
(val_features_tensor,
 val_target_tensor,
 val_dataset,
 val_dataloader) = _build_split(val_data)

# test split: reuses the preprocessors fitted on the training split
(test_features_tensor,
 test_target_tensor,
 test_dataset,
 test_dataloader) = _build_split(test_data)


## Vanishing gradients example

In [8]:
HIDDEN_SIZE = 100
LEARNING_RATE = 1e-2
MAX_EPOCHS = 10

# Two sigmoid hidden layers followed by a log-softmax output layer; the
# log-probabilities are paired with the negative log-likelihood loss.
_sigmoid_mlp_layers = [
    nn.Linear(INPUT_SIZE, HIDDEN_SIZE),
    nn.Sigmoid(),
    nn.Linear(HIDDEN_SIZE, HIDDEN_SIZE),
    nn.Sigmoid(),
    nn.Linear(HIDDEN_SIZE, OUTPUT_SIZE),
    nn.LogSoftmax(dim=1),
]
model_fn = nn.Sequential(*_sigmoid_mlp_layers)
loss_fn = nn.NLLLoss()
optimizer = optim.SGD(params=model_fn.parameters(), lr=LEARNING_RATE)

history = fit(
    loss_fn=loss_fn,
    model_fn=model_fn,
    optimizer=optimizer,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    max_epochs=MAX_EPOCHS,
)

Epoch 0, Average train Loss 2.3056, Average val Loss 2.2995
Epoch 1, Average train Loss 2.2996, Average val Loss 2.2980
Epoch 2, Average train Loss 2.2981, Average val Loss 2.2964
Epoch 3, Average train Loss 2.2965, Average val Loss 2.2948
Epoch 4, Average train Loss 2.2947, Average val Loss 2.2929
Epoch 5, Average train Loss 2.2928, Average val Loss 2.2909
Epoch 6, Average train Loss 2.2906, Average val Loss 2.2886
Epoch 7, Average train Loss 2.2881, Average val Loss 2.2859
Epoch 8, Average train Loss 2.2852, Average val Loss 2.2828
Epoch 9, Average train Loss 2.2818, Average val Loss 2.2791


Notice how both the train and val losses barely decrease: the Sigmoid activation function has become saturated and as such the gradient is basically zero. The parameters are hardly updating, the model is not learning, and training is making almost no progress.

## Better Activation Functions

In this section we will explore how different activation functions can address the problem of vanishing gradients. 

The code in the cell below defines another utility function to help us generate MLP classifiers with different activation functions.

In [9]:
def make_mlp_classifier(
    input_size,
    hidden_sizes=None,
    output_size=2,
    activation_fn=None):
    """Build an MLP classifier and its matching loss function.

    Args:
        input_size: Number of input features.
        hidden_sizes: Sizes of the hidden layers, in order; ``None`` or an
            empty list gives a model with only the output layer.
        output_size: Number of classes.
        activation_fn: Activation module placed after each hidden layer, or
            ``None`` for no activation.

    Returns:
        ``(model_fn, loss_fn)`` where ``model_fn`` is an ``nn.Sequential``
        ending in ``nn.LogSoftmax(dim=1)`` and ``loss_fn`` is ``nn.NLLLoss()``.
    """
    import copy  # local import keeps this cell self-contained

    modules = []
    hidden_sizes = [] if hidden_sizes is None else hidden_sizes
    for hidden_size in hidden_sizes:
        modules.append(nn.Linear(input_size, hidden_size))
        if activation_fn is not None:
            # Give every hidden layer its own copy: reusing one instance would
            # make parametric activations (e.g. nn.PReLU) share their learned
            # parameters across all layers.
            modules.append(copy.deepcopy(activation_fn))
        input_size = hidden_size
    modules.append(nn.Linear(input_size, output_size))
    modules.append(nn.LogSoftmax(dim=1))
    model_fn = nn.Sequential(*modules)
    return model_fn, nn.NLLLoss()

### ReLU

* ReLU does not suffer from vanishing gradients for positive values and is very fast to compute.

* ReLU can suffer from the problem of "dying" ReLUs if too many neurons output negative values as the ReLU will then output zero.

In [None]:
nn.ReLU?

In [10]:
# Two hidden layers with ReLU activations, trained with SGD.
model_fn, loss_fn = make_mlp_classifier(
    INPUT_SIZE,
    hidden_sizes=[HIDDEN_SIZE] * 2,
    output_size=OUTPUT_SIZE,
    activation_fn=nn.ReLU(),
)
optimizer = optim.SGD(params=model_fn.parameters(), lr=LEARNING_RATE)

history = fit(
    loss_fn=loss_fn,
    model_fn=model_fn,
    optimizer=optimizer,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    max_epochs=MAX_EPOCHS,
)

Epoch 0, Average train Loss 2.2412, Average val Loss 2.1458
Epoch 1, Average train Loss 1.8716, Average val Loss 1.4871
Epoch 2, Average train Loss 1.1156, Average val Loss 0.8570
Epoch 3, Average train Loss 0.7152, Average val Loss 0.6424
Epoch 4, Average train Loss 0.5658, Average val Loss 0.5510
Epoch 5, Average train Loss 0.4905, Average val Loss 0.4991
Epoch 6, Average train Loss 0.4432, Average val Loss 0.4648
Epoch 7, Average train Loss 0.4102, Average val Loss 0.4406
Epoch 8, Average train Loss 0.3860, Average val Loss 0.4223
Epoch 9, Average train Loss 0.3674, Average val Loss 0.4082


### Leaky ReLU

* Hyperparameter controls how much the activation function "leaks".
* Having a non-zero slope for negative outputs solves the "dying ReLUs" problem.

In [11]:
nn.LeakyReLU?

In [12]:
# Two hidden layers with Leaky ReLU activations (negative slope 0.01), trained with SGD.
model_fn, loss_fn = make_mlp_classifier(
    INPUT_SIZE,
    hidden_sizes=[HIDDEN_SIZE] * 2,
    output_size=OUTPUT_SIZE,
    activation_fn=nn.LeakyReLU(negative_slope=0.01),
)
optimizer = optim.SGD(params=model_fn.parameters(), lr=LEARNING_RATE)

history = fit(
    loss_fn=loss_fn,
    model_fn=model_fn,
    optimizer=optimizer,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    max_epochs=MAX_EPOCHS,
)

Epoch 0, Average train Loss 2.2452, Average val Loss 2.1518
Epoch 1, Average train Loss 1.8940, Average val Loss 1.5137
Epoch 2, Average train Loss 1.1479, Average val Loss 0.8773
Epoch 3, Average train Loss 0.7304, Average val Loss 0.6412
Epoch 4, Average train Loss 0.5649, Average val Loss 0.5421
Epoch 5, Average train Loss 0.4844, Average val Loss 0.4891
Epoch 6, Average train Loss 0.4360, Average val Loss 0.4549
Epoch 7, Average train Loss 0.4028, Average val Loss 0.4302
Epoch 8, Average train Loss 0.3782, Average val Loss 0.4113
Epoch 9, Average train Loss 0.3591, Average val Loss 0.3964


### Exercise:

There are a couple of variants of Leaky ReLU: Randomized Leaky ReLU and Parametric Leaky ReLU. Adapt the code above to train models using these variants. Plot the training and validation losses and discuss.

In [None]:
nn.RReLU?

In [None]:
nn.PReLU?

### ELU and SELU

**Exponential Linear Unit (ELU)** is *negative* when the neuron outputs a negative number. It has a hyperparameter that determines the value that the function approaches when neuron outputs are large and negative.

**Scaled ELU (SELU)** often used when training MLPs as this activation function allows the network to self-normalize!

In [13]:
nn.ELU?

In [14]:
# Two hidden layers with ELU activations (alpha 1.0), trained with SGD.
model_fn, loss_fn = make_mlp_classifier(
    INPUT_SIZE,
    hidden_sizes=[HIDDEN_SIZE] * 2,
    output_size=OUTPUT_SIZE,
    activation_fn=nn.ELU(alpha=1.0),
)
optimizer = optim.SGD(params=model_fn.parameters(), lr=LEARNING_RATE)

history = fit(
    loss_fn=loss_fn,
    model_fn=model_fn,
    optimizer=optimizer,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    max_epochs=MAX_EPOCHS,
)

Epoch 0, Average train Loss 2.0847, Average val Loss 1.7655
Epoch 1, Average train Loss 1.3292, Average val Loss 0.9811
Epoch 2, Average train Loss 0.7929, Average val Loss 0.6858
Epoch 3, Average train Loss 0.5930, Average val Loss 0.5630
Epoch 4, Average train Loss 0.4961, Average val Loss 0.4968
Epoch 5, Average train Loss 0.4395, Average val Loss 0.4563
Epoch 6, Average train Loss 0.4032, Average val Loss 0.4297
Epoch 7, Average train Loss 0.3785, Average val Loss 0.4110
Epoch 8, Average train Loss 0.3606, Average val Loss 0.3971
Epoch 9, Average train Loss 0.3468, Average val Loss 0.3861


In [15]:
nn.SELU?

In [16]:
# Two hidden layers with SELU activations, trained with SGD.
model_fn, loss_fn = make_mlp_classifier(
    INPUT_SIZE,
    hidden_sizes=[HIDDEN_SIZE] * 2,
    output_size=OUTPUT_SIZE,
    activation_fn=nn.SELU(),
)
optimizer = optim.SGD(params=model_fn.parameters(), lr=LEARNING_RATE)

history = fit(
    loss_fn=loss_fn,
    model_fn=model_fn,
    optimizer=optimizer,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    max_epochs=MAX_EPOCHS,
)

Epoch 0, Average train Loss 1.6407, Average val Loss 1.0191
Epoch 1, Average train Loss 0.7501, Average val Loss 0.6132
Epoch 2, Average train Loss 0.5182, Average val Loss 0.4937
Epoch 3, Average train Loss 0.4310, Average val Loss 0.4380
Epoch 4, Average train Loss 0.3854, Average val Loss 0.4063
Epoch 5, Average train Loss 0.3572, Average val Loss 0.3854
Epoch 6, Average train Loss 0.3377, Average val Loss 0.3703
Epoch 7, Average train Loss 0.3230, Average val Loss 0.3588
Epoch 8, Average train Loss 0.3113, Average val Loss 0.3495
Epoch 9, Average train Loss 0.3016, Average val Loss 0.3418


### GELU, Swish, Mish

**Gaussian Error Linear Unit (GELU)**, **Sigmoid Linear Unit (SiLU or Swish)**, and **Mish** are all smooth, non-monotonic, non-convex, ReLU variants.

The idea with these activation functions is that while the extra complexity of the functions (relative to ReLU) takes more compute time during training (and inference), the training process will converge to a good solution in fewer iterations.

In [17]:
nn.GELU?

In [18]:
# Two hidden layers with GELU activations (approximate="none"), trained with SGD.
model_fn, loss_fn = make_mlp_classifier(
    INPUT_SIZE,
    hidden_sizes=[HIDDEN_SIZE] * 2,
    output_size=OUTPUT_SIZE,
    activation_fn=nn.GELU(approximate="none"),
)
optimizer = optim.SGD(params=model_fn.parameters(), lr=LEARNING_RATE)

history = fit(
    loss_fn=loss_fn,
    model_fn=model_fn,
    optimizer=optimizer,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    max_epochs=MAX_EPOCHS,
)

Epoch 0, Average train Loss 2.2895, Average val Loss 2.2719
Epoch 1, Average train Loss 2.2406, Average val Loss 2.1902
Epoch 2, Average train Loss 2.0386, Average val Loss 1.8033
Epoch 3, Average train Loss 1.3748, Average val Loss 1.0004
Epoch 4, Average train Loss 0.8001, Average val Loss 0.6912
Epoch 5, Average train Loss 0.5940, Average val Loss 0.5693
Epoch 6, Average train Loss 0.4955, Average val Loss 0.5059
Epoch 7, Average train Loss 0.4391, Average val Loss 0.4677
Epoch 8, Average train Loss 0.4030, Average val Loss 0.4420
Epoch 9, Average train Loss 0.3778, Average val Loss 0.4233


In [19]:
nn.SiLU?

In [20]:
# Two hidden layers with SiLU (Swish) activations, trained with SGD.
model_fn, loss_fn = make_mlp_classifier(
    INPUT_SIZE,
    hidden_sizes=[HIDDEN_SIZE] * 2,
    output_size=OUTPUT_SIZE,
    activation_fn=nn.SiLU(),
)
optimizer = optim.SGD(params=model_fn.parameters(), lr=LEARNING_RATE)

history = fit(
    loss_fn=loss_fn,
    model_fn=model_fn,
    optimizer=optimizer,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    max_epochs=MAX_EPOCHS,
)

Epoch 0, Average train Loss 2.2834, Average val Loss 2.2626
Epoch 1, Average train Loss 2.2278, Average val Loss 2.1775
Epoch 2, Average train Loss 2.0554, Average val Loss 1.8715
Epoch 3, Average train Loss 1.5951, Average val Loss 1.2825
Epoch 4, Average train Loss 1.0040, Average val Loss 0.8080
Epoch 5, Average train Loss 0.6879, Average val Loss 0.6329
Epoch 6, Average train Loss 0.5572, Average val Loss 0.5496
Epoch 7, Average train Loss 0.4853, Average val Loss 0.5012
Epoch 8, Average train Loss 0.4396, Average val Loss 0.4700
Epoch 9, Average train Loss 0.4083, Average val Loss 0.4485


In [23]:
nn.Mish?

In [24]:
# Two hidden layers with Mish activations, trained with SGD.
model_fn, loss_fn = make_mlp_classifier(
    INPUT_SIZE,
    hidden_sizes=[HIDDEN_SIZE] * 2,
    output_size=OUTPUT_SIZE,
    activation_fn=nn.Mish(),
)
optimizer = optim.SGD(params=model_fn.parameters(), lr=LEARNING_RATE)

history = fit(
    loss_fn=loss_fn,
    model_fn=model_fn,
    optimizer=optimizer,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    max_epochs=MAX_EPOCHS,
)

Epoch 0, Average train Loss 2.2665, Average val Loss 2.2188
Epoch 1, Average train Loss 2.1056, Average val Loss 1.9295
Epoch 2, Average train Loss 1.5795, Average val Loss 1.1802
Epoch 3, Average train Loss 0.8998, Average val Loss 0.7299
Epoch 4, Average train Loss 0.6209, Average val Loss 0.5850
Epoch 5, Average train Loss 0.5125, Average val Loss 0.5176
Epoch 6, Average train Loss 0.4538, Average val Loss 0.4770
Epoch 7, Average train Loss 0.4160, Average val Loss 0.4498
Epoch 8, Average train Loss 0.3896, Average val Loss 0.4302
Epoch 9, Average train Loss 0.3700, Average val Loss 0.4154


### Exercise:

Train an MLP with three hidden layers, each with 100 neurons, for 10 epochs on the MNIST dataset using Stochastic Gradient Descent with a learning rate of 1e-2 for three different activation functions. Plot the training and validation loss (or accuracy) curves. Compare and contrast the differences across activation functions.

## Better Parameter Initialization Strategies

With deeper models that have more parameters, choosing the right parameter initialization strategy can be critical.

In [25]:
def initialize_linear_layer(
    in_features,
    out_features,
    init_strategy_=nn.init.kaiming_uniform_):
    """Create an ``nn.Linear`` layer and re-initialize its weight in place.

    Args:
        in_features: Size of each input sample.
        out_features: Size of each output sample.
        init_strategy_: In-place initializer called with the layer's weight.

    Returns:
        The newly created ``nn.Linear`` layer.
    """
    layer = nn.Linear(in_features=in_features, out_features=out_features)
    # Only the weight is passed to the initializer; the bias is left as
    # nn.Linear created it.
    init_strategy_(layer.weight)
    return layer


In [26]:
def make_mlp_classifier(
    input_size,
    hidden_sizes=None,
    output_size=2,
    activation_fn=None,
    init_strategy_=nn.init.kaiming_uniform_):
    """Build an MLP classifier with a configurable weight initializer.

    Args:
        input_size: Number of input features.
        hidden_sizes: Sizes of the hidden layers, in order; ``None`` or an
            empty list gives a model with only the output layer.
        output_size: Number of classes.
        activation_fn: Activation module placed after each hidden layer, or
            ``None`` for no activation.
        init_strategy_: In-place initializer forwarded to
            ``initialize_linear_layer`` for every linear layer, including the
            output layer.

    Returns:
        ``(model_fn, loss_fn)`` where ``model_fn`` is an ``nn.Sequential``
        ending in ``nn.LogSoftmax(dim=1)`` and ``loss_fn`` is ``nn.NLLLoss()``.
    """
    import copy  # local import keeps this cell self-contained

    modules = []
    hidden_sizes = [] if hidden_sizes is None else hidden_sizes
    for hidden_size in hidden_sizes:
        hidden_layer = initialize_linear_layer(
            input_size,
            hidden_size,
            init_strategy_,
        )
        modules.append(hidden_layer)
        if activation_fn is not None:
            # Give every hidden layer its own copy: reusing one instance would
            # make parametric activations (e.g. nn.PReLU) share their learned
            # parameters across all layers.
            modules.append(copy.deepcopy(activation_fn))
        input_size = hidden_size
    output_layer = initialize_linear_layer(
        input_size,
        output_size,
        init_strategy_,
    )
    modules.append(output_layer)
    modules.append(nn.LogSoftmax(dim=1))
    model_fn = nn.Sequential(*modules)
    return model_fn, nn.NLLLoss()

### Kaiming

In [27]:
nn.init.kaiming_normal_?

In [28]:
nn.init.kaiming_uniform_?

In [29]:
# ReLU MLP whose linear layers are initialized with Kaiming-uniform weights.
model_fn, loss_fn = make_mlp_classifier(
    INPUT_SIZE,
    hidden_sizes=[HIDDEN_SIZE] * 2,
    output_size=OUTPUT_SIZE,
    activation_fn=nn.ReLU(),
    init_strategy_=nn.init.kaiming_uniform_,
)
optimizer = optim.SGD(params=model_fn.parameters(), lr=LEARNING_RATE)

history = fit(
    loss_fn=loss_fn,
    model_fn=model_fn,
    optimizer=optimizer,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    max_epochs=MAX_EPOCHS,
)

Epoch 0, Average train Loss 1.3993, Average val Loss 0.7746
Epoch 1, Average train Loss 0.5853, Average val Loss 0.5064
Epoch 2, Average train Loss 0.4283, Average val Loss 0.4255
Epoch 3, Average train Loss 0.3656, Average val Loss 0.3845
Epoch 4, Average train Loss 0.3299, Average val Loss 0.3581
Epoch 5, Average train Loss 0.3055, Average val Loss 0.3385
Epoch 6, Average train Loss 0.2869, Average val Loss 0.3228
Epoch 7, Average train Loss 0.2717, Average val Loss 0.3095
Epoch 8, Average train Loss 0.2589, Average val Loss 0.2975
Epoch 9, Average train Loss 0.2478, Average val Loss 0.2871


### Xavier

In [30]:
nn.init.xavier_normal_?

In [31]:
nn.init.xavier_uniform_?

In [32]:
# ReLU MLP whose linear layers are initialized with Xavier-normal weights.
model_fn, loss_fn = make_mlp_classifier(
    INPUT_SIZE,
    hidden_sizes=[HIDDEN_SIZE] * 2,
    output_size=OUTPUT_SIZE,
    activation_fn=nn.ReLU(),
    init_strategy_=nn.init.xavier_normal_,
)
optimizer = optim.SGD(params=model_fn.parameters(), lr=LEARNING_RATE)

history = fit(
    loss_fn=loss_fn,
    model_fn=model_fn,
    optimizer=optimizer,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    max_epochs=MAX_EPOCHS,
)

Epoch 0, Average train Loss 1.7259, Average val Loss 1.0496
Epoch 1, Average train Loss 0.7400, Average val Loss 0.5965
Epoch 2, Average train Loss 0.5029, Average val Loss 0.4838
Epoch 3, Average train Loss 0.4217, Average val Loss 0.4326
Epoch 4, Average train Loss 0.3786, Average val Loss 0.4020
Epoch 5, Average train Loss 0.3504, Average val Loss 0.3805
Epoch 6, Average train Loss 0.3298, Average val Loss 0.3641
Epoch 7, Average train Loss 0.3134, Average val Loss 0.3504
Epoch 8, Average train Loss 0.2999, Average val Loss 0.3387
Epoch 9, Average train Loss 0.2882, Average val Loss 0.3285


### Exercise:

Choose the appropriate activation function and initialization strategy to implement a self-normalizing MLP. Your MLP should have three hidden layers, each with 100 neurons. Train your MLP for 10 epochs on the MNIST dataset using Stochastic Gradient Descent with a learning rate of 1e-2. Plot the training and validation loss (or accuracy) curves.

## Batch Normalization

In [33]:
nn.BatchNorm1d?

In [34]:
def make_mlp_classifier(
    input_size,
    hidden_sizes=None,
    output_size=2,
    activation_fn=None,
    init_strategy_=nn.init.kaiming_uniform_,
    batch_normalization=False):
    """Build an MLP classifier with optional batch normalization.

    Each hidden block is ``Linear -> [BatchNorm1d] -> [activation]``.

    Args:
        input_size: Number of input features.
        hidden_sizes: Sizes of the hidden layers, in order; ``None`` or an
            empty list gives a model with only the output layer.
        output_size: Number of classes.
        activation_fn: Activation module placed after each hidden layer, or
            ``None`` for no activation.
        init_strategy_: In-place initializer forwarded to
            ``initialize_linear_layer`` for every linear layer, including the
            output layer.
        batch_normalization: If true, insert ``nn.BatchNorm1d`` between each
            hidden linear layer and its activation.

    Returns:
        ``(model_fn, loss_fn)`` where ``model_fn`` is an ``nn.Sequential``
        ending in ``nn.LogSoftmax(dim=1)`` and ``loss_fn`` is ``nn.NLLLoss()``.
    """
    import copy  # local import keeps this cell self-contained

    modules = []
    hidden_sizes = [] if hidden_sizes is None else hidden_sizes
    for hidden_size in hidden_sizes:
        hidden_layer = initialize_linear_layer(
            input_size,
            hidden_size,
            init_strategy_,
        )
        modules.append(hidden_layer)
        if batch_normalization:
            modules.append(nn.BatchNorm1d(hidden_size))
        if activation_fn is not None:
            # Give every hidden layer its own copy: reusing one instance would
            # make parametric activations (e.g. nn.PReLU) share their learned
            # parameters across all layers.
            modules.append(copy.deepcopy(activation_fn))
        input_size = hidden_size
    output_layer = initialize_linear_layer(
        input_size,
        output_size,
        init_strategy_,
    )
    modules.append(output_layer)
    modules.append(nn.LogSoftmax(dim=1))
    model_fn = nn.Sequential(*modules)
    return model_fn, nn.NLLLoss()

In [35]:
# ReLU MLP with Kaiming-uniform weights and batch normalization enabled.
model_fn, loss_fn = make_mlp_classifier(
    INPUT_SIZE,
    hidden_sizes=[HIDDEN_SIZE] * 2,
    output_size=OUTPUT_SIZE,
    activation_fn=nn.ReLU(),
    init_strategy_=nn.init.kaiming_uniform_,
    batch_normalization=True,
)
optimizer = optim.SGD(params=model_fn.parameters(), lr=LEARNING_RATE)

history = fit(
    loss_fn=loss_fn,
    model_fn=model_fn,
    optimizer=optimizer,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    max_epochs=MAX_EPOCHS,
)

Epoch 0, Average train Loss 1.0466, Average val Loss 0.6011
Epoch 1, Average train Loss 0.4988, Average val Loss 0.4253
Epoch 2, Average train Loss 0.3780, Average val Loss 0.3521
Epoch 3, Average train Loss 0.3158, Average val Loss 0.3109
Epoch 4, Average train Loss 0.2756, Average val Loss 0.2832
Epoch 5, Average train Loss 0.2464, Average val Loss 0.2631
Epoch 6, Average train Loss 0.2236, Average val Loss 0.2479
Epoch 7, Average train Loss 0.2049, Average val Loss 0.2358
Epoch 8, Average train Loss 0.1892, Average val Loss 0.2261
Epoch 9, Average train Loss 0.1756, Average val Loss 0.2176


### Exercise:

Add batch normalization to your MLP. Your MLP should have three hidden layers, each with 100 neurons. Train your MLP for 10 epochs on the MNIST dataset using Stochastic Gradient Descent with a learning rate of 1e-2. Plot the training and validation loss (or accuracy) curves.

## Gradient Clipping

In [36]:
nn.utils.clip_grad_value_?

In [37]:
nn.utils.clip_grad_norm_?

In [38]:
def clip_gradients_(
    clip_grad_strategy,
    model_fn,
    clip_value=None,
    error_if_nonfinite=False,
    max_norm=None,
    norm_type=2.0):
    """Clip the gradients of ``model_fn``'s parameters in place.

    Args:
        clip_grad_strategy: ``"value"`` to clip each gradient element with
            ``nn.utils.clip_grad_value_``, ``"norm"`` to rescale by total norm
            with ``nn.utils.clip_grad_norm_``, or ``None`` to leave the
            gradients untouched.
        model_fn: Module whose parameters' gradients are clipped.
        clip_value: Threshold for the ``"value"`` strategy.
        error_if_nonfinite: Forwarded to ``clip_grad_norm_``.
        max_norm: Threshold for the ``"norm"`` strategy.
        norm_type: Forwarded to ``clip_grad_norm_``.

    Raises:
        NotImplementedError: If the strategy is unknown, or if the threshold
            required by the chosen strategy is ``None``.
    """
    if clip_grad_strategy is None:
        # No strategy requested: clipping is disabled, so do nothing.
        return

    if clip_grad_strategy == "value":
        if clip_value is None:
            raise NotImplementedError(
                "clip_grad_strategy='value' requires clip_value to be set"
            )
        nn.utils.clip_grad_value_(
            model_fn.parameters(),
            clip_value
        )
    elif clip_grad_strategy == "norm":
        if max_norm is None:
            raise NotImplementedError(
                "clip_grad_strategy='norm' requires max_norm to be set"
            )
        nn.utils.clip_grad_norm_(
            model_fn.parameters(),
            max_norm,
            norm_type=norm_type,
            error_if_nonfinite=error_if_nonfinite
        )
    else:
        raise NotImplementedError(
            f"Unknown clip_grad_strategy: {clip_grad_strategy!r}"
        )


def compute_average_loss(dataloader, loss_fn, model_fn):
    """Average the per-batch loss of ``model_fn`` over ``dataloader``.

    Args:
        dataloader: Sized iterable yielding ``(features, targets)`` batches.
        loss_fn: Callable mapping ``(predictions, targets)`` to a scalar
            loss tensor.
        model_fn: Callable mapping a batch of features to predictions.

    Returns:
        A tensor of shape ``(1, 1)`` holding the sum of the batch losses
        divided by ``len(dataloader)`` (the number of batches, not samples).
        The result is detached from the autograd graph.
    """
    total_loss = torch.zeros(1, 1)
    for features, targets in dataloader:
        predictions = model_fn(features)
        # Detach before accumulating so the running total never keeps each
        # batch's autograd graph alive when this helper is called outside of
        # an inference/no-grad context.
        total_loss += loss_fn(predictions, targets).detach()
    average_loss = total_loss / len(dataloader)
    return average_loss


def fit(
    loss_fn,
    model_fn,
    optimizer,
    train_dataloader,
    val_dataloader,
    clip_grad_strategy=None,
    clip_value=None,
    error_if_nonfinite=False,
    log_epochs=1,
    max_epochs=1,
    max_norm=None,
    norm_type=2.0):
    """Train ``model_fn`` with optional gradient clipping.

    Args:
        loss_fn: Callable mapping ``(predictions, targets)`` to a scalar loss.
        model_fn: Module to train; switched between ``train()`` and ``eval()``.
        optimizer: Optimizer whose ``step``/``zero_grad`` are called per batch.
        train_dataloader: Sized iterable of ``(features, targets)`` batches.
        val_dataloader: Sized iterable of ``(features, targets)`` batches.
        clip_grad_strategy: ``"value"``, ``"norm"`` or ``None`` (no clipping).
        clip_value: Threshold forwarded to ``clip_gradients_``.
        error_if_nonfinite: Forwarded to ``clip_gradients_``.
        log_epochs: Print a progress line every ``log_epochs`` epochs.
        max_epochs: Number of passes over ``train_dataloader``.
        max_norm: Threshold forwarded to ``clip_gradients_``.
        norm_type: Forwarded to ``clip_gradients_``.

    Returns:
        A DataFrame indexed by ``epoch`` with columns ``average_train_loss``
        and ``average_val_loss`` (both averaged over batches).
    """
    history = {
        "epoch": [],
        "average_train_loss": [],
        "average_val_loss": []
    }

    for epoch in range(max_epochs):
        total_train_loss = torch.zeros(1, 1)
        model_fn = model_fn.train()
        for features, targets in train_dataloader:

            # forward pass
            predictions = model_fn(features)
            loss = loss_fn(predictions, targets)
            # Accumulate a detached copy: adding the live loss would chain
            # every batch's autograd graph onto the running total and keep
            # them all in memory until the end of the epoch.
            total_train_loss += loss.detach()

            # backward pass
            loss.backward()
            # Clipping is opt-in: with the default strategy of None the
            # gradients are used as computed instead of raising.
            if clip_grad_strategy is not None:
                clip_gradients_(
                    clip_grad_strategy,
                    model_fn,
                    clip_value,
                    error_if_nonfinite,
                    max_norm,
                    norm_type
                )
            optimizer.step()
            optimizer.zero_grad()

        average_train_loss = total_train_loss / len(train_dataloader)
        train_loss_value = average_train_loss.item()
        history["epoch"].append(epoch)
        history["average_train_loss"].append(train_loss_value)

        # validation after every training epoch
        model_fn = model_fn.eval()
        with torch.inference_mode():
            average_val_loss = compute_average_loss(
                val_dataloader,
                loss_fn,
                model_fn
            )
        val_loss_value = average_val_loss.item()
        history["average_val_loss"].append(val_loss_value)

        if epoch % log_epochs == 0:
            message = f"Epoch {epoch}, Average train Loss {train_loss_value:.4f}, Average val Loss {val_loss_value:.4f}"
            print(message)

    history_df = (pd.DataFrame.from_dict(history)
                              .set_index("epoch"))
    return history_df


In [40]:
# ReLU MLP (no batch normalization) trained with gradients clipped to a total norm of 1.0.
model_fn, loss_fn = make_mlp_classifier(
    INPUT_SIZE,
    hidden_sizes=[HIDDEN_SIZE] * 2,
    output_size=OUTPUT_SIZE,
    activation_fn=nn.ReLU(),
    init_strategy_=nn.init.kaiming_uniform_,
    batch_normalization=False,
)
optimizer = optim.SGD(params=model_fn.parameters(), lr=LEARNING_RATE)

history = fit(
    loss_fn=loss_fn,
    model_fn=model_fn,
    optimizer=optimizer,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    clip_grad_strategy="norm",
    max_norm=1.0,
    error_if_nonfinite=True,
    max_epochs=MAX_EPOCHS,
)

Epoch 0, Average train Loss 1.6808, Average val Loss 1.0832
Epoch 1, Average train Loss 0.7940, Average val Loss 0.6429
Epoch 2, Average train Loss 0.5382, Average val Loss 0.5134
Epoch 3, Average train Loss 0.4442, Average val Loss 0.4538
Epoch 4, Average train Loss 0.3948, Average val Loss 0.4184
Epoch 5, Average train Loss 0.3632, Average val Loss 0.3939
Epoch 6, Average train Loss 0.3406, Average val Loss 0.3753
Epoch 7, Average train Loss 0.3230, Average val Loss 0.3603
Epoch 8, Average train Loss 0.3084, Average val Loss 0.3474
Epoch 9, Average train Loss 0.2959, Average val Loss 0.3364


### Exercise:

Create an MLP with three hidden layers, each layer with 100 neurons per layer. Train your MLP for 10 epochs on the MNIST dataset using Stochastic Gradient Descent with a learning rate of 1e-2 and clip gradients by value and then repeat the training process and clip gradients by norm. Plot the training and validation loss (or accuracy) curves. Discuss.