In [1]:
%%bash

pip install lightning

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import numpy as np
import pandas as pd
from sklearn import model_selection, pipeline, preprocessing
import torch
from torch import nn, optim, utils
import lightning as L

### Define some utility functions

The code in the cell below defines a few utility functions that will make our life easier.

In [3]:
def clip_gradients_(
    clip_grad_strategy,
    model_fn,
    clip_value=None,
    error_if_nonfinite=False,
    max_norm=None,
    norm_type=2.0):
    """Clip the gradients of ``model_fn``'s parameters in place.

    Parameters
    ----------
    clip_grad_strategy : {"value", "norm", None}
        ``"value"`` clips every gradient element with
        ``nn.utils.clip_grad_value_``; ``"norm"`` rescales the gradients with
        ``nn.utils.clip_grad_norm_``; ``None`` leaves the gradients untouched.
    model_fn : object exposing ``.parameters()``
        Model whose parameter gradients are clipped.
    clip_value : optional
        Threshold forwarded to ``clip_grad_value_``; required for ``"value"``.
    error_if_nonfinite : bool
        Forwarded to ``clip_grad_norm_``.
    max_norm : optional
        Threshold forwarded to ``clip_grad_norm_``; required for ``"norm"``.
    norm_type : float
        Forwarded to ``clip_grad_norm_``.

    Raises
    ------
    NotImplementedError
        If the strategy is unknown, or if the threshold required by the
        selected strategy was not supplied. (The exception type is kept for
        backward compatibility; the message now states what is wrong.)
    """
    if clip_grad_strategy is None:
        return

    if clip_grad_strategy == "value":
        if clip_value is None:
            raise NotImplementedError(
                "clip_grad_strategy='value' requires clip_value to be set."
            )
        nn.utils.clip_grad_value_(model_fn.parameters(), clip_value)
    elif clip_grad_strategy == "norm":
        if max_norm is None:
            raise NotImplementedError(
                "clip_grad_strategy='norm' requires max_norm to be set."
            )
        # keywords guard against the two trailing arguments being swapped
        nn.utils.clip_grad_norm_(
            model_fn.parameters(),
            max_norm,
            norm_type=norm_type,
            error_if_nonfinite=error_if_nonfinite,
        )
    else:
        raise NotImplementedError(
            f"Unknown clip_grad_strategy {clip_grad_strategy!r}; "
            "expected 'value', 'norm' or None."
        )


def compute_average_loss(dataloader, loss_fn, model_fn):
    """Return the mean of the per-batch losses of ``model_fn`` over ``dataloader``.

    Parameters
    ----------
    dataloader : sized iterable of ``(features, targets)`` batches
    loss_fn : callable
        Called as ``loss_fn(predictions, targets)``.
    model_fn : callable
        Called as ``model_fn(features)``.

    Returns
    -------
    torch.Tensor
        Tensor of shape ``(1, 1)`` holding the sum of the batch losses divided
        by ``len(dataloader)`` (the number of batches, so every batch carries
        the same weight regardless of its size). The result is detached from
        the autograd graph.
    """
    total_loss = torch.zeros(1, 1)
    for features, targets in dataloader:
        predictions = model_fn(features)
        # detach: otherwise, outside no-grad/inference mode, the running total
        # would require grad and keep the graph of every batch alive
        total_loss += loss_fn(predictions, targets).detach()
    average_loss = total_loss / len(dataloader)
    return average_loss


def initialize_linear_layer(
    in_features,
    out_features,
    init_strategy_=nn.init.kaiming_uniform_):
    """Build an ``nn.Linear`` layer and re-initialize its weight matrix.

    ``init_strategy_`` is called in place on the layer's ``weight`` only; the
    bias keeps whatever ``nn.Linear`` gave it at construction time.

    Returns the initialized ``nn.Linear`` module.
    """
    layer = nn.Linear(in_features=in_features, out_features=out_features)
    init_strategy_(layer.weight)
    return layer


def make_mlp_classifier(
    input_size,
    hidden_sizes=None,
    output_size=2,
    activation_fn=None,
    init_strategy_=nn.init.kaiming_uniform_,
    batch_normalization=False):
    """Assemble a fully connected classifier and its matching loss.

    Each entry of ``hidden_sizes`` adds a linear layer (initialized through
    ``initialize_linear_layer``), optionally followed by ``nn.BatchNorm1d`` and
    by ``activation_fn``. The network ends with a linear layer of width
    ``output_size`` and ``nn.LogSoftmax(dim=1)``.

    Returns
    -------
    (nn.Sequential, nn.NLLLoss)
        The model and the loss that pairs with its log-probability output.
    """
    # consecutive entries of `widths` are the (in, out) sizes of the hidden layers
    widths = [input_size]
    if hidden_sizes is not None:
        widths.extend(hidden_sizes)

    layers = []
    for fan_in, fan_out in zip(widths, widths[1:]):
        layers.append(initialize_linear_layer(fan_in, fan_out, init_strategy_))
        if batch_normalization:
            layers.append(nn.BatchNorm1d(fan_out))
        if activation_fn is not None:
            # the very same activation instance is reused after every hidden layer
            layers.append(activation_fn)

    layers.append(initialize_linear_layer(widths[-1], output_size, init_strategy_))
    layers.append(nn.LogSoftmax(dim=1))
    return nn.Sequential(*layers), nn.NLLLoss()


In [4]:
def fit(
    loss_fn,
    model_fn,
    optimizer,
    train_dataloader,
    val_dataloader,
    clip_grad_strategy=None,
    clip_value=None,
    error_if_nonfinite=False,
    log_epochs=1,
    max_epochs=1,
    max_norm=None,
    norm_type=2.0):
    """Train ``model_fn`` and record the average train/validation loss per epoch.

    Parameters
    ----------
    loss_fn : callable
        Called as ``loss_fn(predictions, targets)``.
    model_fn : nn.Module
        Model to train; switched between ``train()`` and ``eval()`` mode.
    optimizer : torch optimizer bound to ``model_fn``'s parameters
    train_dataloader, val_dataloader : sized iterables of ``(features, targets)``
    clip_grad_strategy, clip_value, error_if_nonfinite, max_norm, norm_type
        Forwarded unchanged to ``clip_gradients_`` after every backward pass.
    log_epochs : int
        Print a progress line every ``log_epochs`` epochs; ``0`` or ``None``
        disables printing.
    max_epochs : int
        Number of passes over ``train_dataloader``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``epoch`` with columns ``average_train_loss`` and
        ``average_val_loss``. The train loss is the mean of the per-batch
        losses observed while the weights were being updated.
    """
    history = {
        "epoch": [],
        "average_train_loss": [],
        "average_val_loss": []
    }

    for epoch in range(max_epochs):
        total_train_loss = torch.zeros(1, 1)
        model_fn = model_fn.train()
        for features, targets in train_dataloader:
            # clear gradients *before* the backward pass so that anything left
            # over from earlier work cannot leak into the first update
            optimizer.zero_grad()

            # forward pass
            predictions = model_fn(features)
            loss = loss_fn(predictions, targets)
            # accumulate a detached value: the running total is for reporting
            # only and must not hold on to the autograd graph of every batch
            total_train_loss += loss.detach()

            # backward pass
            loss.backward()
            clip_gradients_(
                clip_grad_strategy,
                model_fn,
                clip_value,
                error_if_nonfinite,
                max_norm,
                norm_type
            )
            optimizer.step()

        average_train_loss = total_train_loss / len(train_dataloader)
        history["epoch"].append(epoch)
        history["average_train_loss"].append(average_train_loss.item())

        # validation after every training epoch
        model_fn = model_fn.eval()
        with torch.inference_mode():
            average_val_loss = compute_average_loss(
                val_dataloader,
                loss_fn,
                model_fn
            )
        history["average_val_loss"].append(average_val_loss.item())

        # a falsy log_epochs turns logging off instead of dividing by zero
        if log_epochs and epoch % log_epochs == 0:
            message = f"Epoch {epoch}, Average train Loss {average_train_loss.item():.4f}, Average val Loss {average_val_loss.item():.4f}"
            print(message)

    history_df = (pd.DataFrame.from_dict(history)
                              .set_index("epoch"))
    return history_df


# Transfer Learning

In this section we will train a DNN model on the MNIST dataset and then use this pre-trained model as a starting point for training a model to classify an Arabic version of the MNIST dataset.

## Load the MNIST data

In [5]:
INPUT_SIZE = 784
OUTPUT_SIZE = 10

# the CSV files have no header row: the first column is the label and the
# remaining INPUT_SIZE columns are the pixel values
_train_data = pd.read_csv(
    "./sample_data/mnist_train_small.csv",
    names=["label", *(f"p{i}" for i in range(INPUT_SIZE))],
    header=None,
)

# hold out 10% of the training file for validation, keeping class proportions
mnist_train_data, mnist_val_data = model_selection.train_test_split(
    _train_data,
    stratify=_train_data["label"],
    test_size=0.1,
)

mnist_test_data = pd.read_csv(
    "./sample_data/mnist_test.csv",
    names=["label", *(f"p{i}" for i in range(INPUT_SIZE))],
    header=None,
)

### Create preprocessing pipelines

In [6]:
# features: scale with MinMaxScaler, cast to float32, hand over to torch
features_preprocessor = pipeline.make_pipeline(
    preprocessing.MinMaxScaler(),
    preprocessing.FunctionTransformer(lambda features: features.astype(np.float32)),
    preprocessing.FunctionTransformer(torch.from_numpy),
)

# targets: pandas object -> numpy array -> torch tensor
target_preprocessor = pipeline.make_pipeline(
    preprocessing.FunctionTransformer(lambda labels: labels.to_numpy()),
    preprocessing.FunctionTransformer(torch.from_numpy),
)


### Create Datasets and DataLoaders

In [7]:
BATCH_SIZE = 64
NUM_WORKERS = 2

# create the training dataset and dataloader
# (fit_transform: the preprocessors are fitted on the training split only and
# merely applied to the validation and test splits)
_train_features_tensor = features_preprocessor.fit_transform(
    mnist_train_data.drop("label", axis=1)
)

_train_target_tensor = target_preprocessor.fit_transform(
    mnist_train_data.loc[:, "label"]
)

_train_dataset = utils.data.TensorDataset(
    _train_features_tensor,
    _train_target_tensor
)

mnist_train_dataloader = utils.data.DataLoader(
    _train_dataset,
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    # reshuffle every epoch so SGD does not see the same batches in the same order
    shuffle=True,
)

# validation split: apply the already-fitted preprocessors (no re-fitting)
_val_features_tensor = features_preprocessor.transform(
    mnist_val_data.drop(columns="label")
)
_val_target_tensor = target_preprocessor.transform(mnist_val_data["label"])
_val_dataset = utils.data.TensorDataset(_val_features_tensor, _val_target_tensor)
mnist_val_dataloader = utils.data.DataLoader(
    _val_dataset,
    num_workers=NUM_WORKERS,
    batch_size=BATCH_SIZE,
)

# test split: same treatment as the validation split
_test_features_tensor = features_preprocessor.transform(
    mnist_test_data.drop(columns="label")
)
_test_target_tensor = target_preprocessor.transform(mnist_test_data["label"])
_test_dataset = utils.data.TensorDataset(_test_features_tensor, _test_target_tensor)
mnist_test_dataloader = utils.data.DataLoader(
    _test_dataset,
    num_workers=NUM_WORKERS,
    batch_size=BATCH_SIZE,
)


## Train a DNN on the MNIST data

In [9]:
HIDDEN_SIZE = 100
LEARNING_RATE = 1e-2
MAX_EPOCHS = 20

# three hidden layers of equal width, SELU activations, Xavier-normal weights
pretrained_mnist_model_fn, loss_fn = make_mlp_classifier(
    INPUT_SIZE,
    [HIDDEN_SIZE] * 3,
    OUTPUT_SIZE,
    nn.SELU(),
    nn.init.xavier_normal_,
)

# plain SGD over all parameters of the freshly built network
optimizer = optim.SGD(
    pretrained_mnist_model_fn.parameters(),
    lr=LEARNING_RATE,
)

history = fit(
    loss_fn=loss_fn,
    model_fn=pretrained_mnist_model_fn,
    optimizer=optimizer,
    train_dataloader=mnist_train_dataloader,
    val_dataloader=mnist_val_dataloader,
    max_epochs=MAX_EPOCHS,
)

Epoch 0, Average train Loss 0.7595, Average val Loss 0.4884
Epoch 1, Average train Loss 0.3964, Average val Loss 0.4094
Epoch 2, Average train Loss 0.3372, Average val Loss 0.3775
Epoch 3, Average train Loss 0.3066, Average val Loss 0.3588
Epoch 4, Average train Loss 0.2859, Average val Loss 0.3458
Epoch 5, Average train Loss 0.2700, Average val Loss 0.3359
Epoch 6, Average train Loss 0.2569, Average val Loss 0.3281
Epoch 7, Average train Loss 0.2455, Average val Loss 0.3213
Epoch 8, Average train Loss 0.2354, Average val Loss 0.3156
Epoch 9, Average train Loss 0.2262, Average val Loss 0.3101
Epoch 10, Average train Loss 0.2176, Average val Loss 0.3051
Epoch 11, Average train Loss 0.2096, Average val Loss 0.3005
Epoch 12, Average train Loss 0.2021, Average val Loss 0.2961
Epoch 13, Average train Loss 0.1950, Average val Loss 0.2920
Epoch 14, Average train Loss 0.1883, Average val Loss 0.2880
Epoch 15, Average train Loss 0.1819, Average val Loss 0.2845
Epoch 16, Average train Loss 0.175

## Download Arabic Handwritten Digits Data

Here we will download only the test dataset of 10k images. All images have the same size (28x28 = 784 pixels) as the original MNIST data, and the dataset has the same number of classes.

In [None]:
# TODO INSERT CODE HERE!

In [10]:
# features and labels come in two separate, header-less CSV files
_test_features = pd.read_csv(
    "/content/csvTestImages 10k x 784.csv",
    names=[f"p{i}" for i in range(INPUT_SIZE)],
    header=None,
)
_test_target = pd.read_csv(
    "/content/csvTestLabel 10k x 1.csv",
    names=["label"],
    header=None,
)

# shuffled, stratified 90/10 split of features and labels together
_splits = model_selection.train_test_split(
    _test_features,
    _test_target,
    stratify=_test_target["label"],
    shuffle=True,
    test_size=0.1,
)

# train_test_split returns [features_train, features_val, target_train, target_val]
(
    arabic_mnist_train_features,
    arabic_mnist_val_features,
    arabic_mnist_train_target,
    arabic_mnist_val_target,
) = _splits

In [11]:
# create the training dataset and dataloader
# NOTE: fit_transform re-fits the shared `features_preprocessor` on the Arabic
# training features, replacing whatever it had been fitted on before.
_train_features_tensor = features_preprocessor.fit_transform(
    arabic_mnist_train_features
)

_train_target_tensor = target_preprocessor.fit_transform(
    arabic_mnist_train_target.loc[:, "label"]
)

_train_dataset = utils.data.TensorDataset(
    _train_features_tensor,
    _train_target_tensor
)

arabic_mnist_train_dataloader = utils.data.DataLoader(
    _train_dataset,
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    # reshuffle every epoch so SGD does not see the same batches in the same order
    shuffle=True,
)

# validation split: apply the already-fitted preprocessors (no re-fitting)
_val_features_tensor = features_preprocessor.transform(arabic_mnist_val_features)
_val_target_tensor = target_preprocessor.transform(arabic_mnist_val_target["label"])
_val_dataset = utils.data.TensorDataset(_val_features_tensor, _val_target_tensor)

arabic_mnist_val_dataloader = utils.data.DataLoader(
    _val_dataset,
    num_workers=NUM_WORKERS,
    batch_size=BATCH_SIZE,
)

### Fine-tune the MNIST DNN

Since Arabic MNIST images have the exact same size as the original MNIST images, and the number of classes is also the same, we can just take the pre-trained MNIST classifier and fine-tune it using Arabic MNIST images.

In [12]:
# same architecture as the MNIST network, so the parameter names line up
arabic_mnist_model_fn, loss_fn = make_mlp_classifier(
    INPUT_SIZE,
    [HIDDEN_SIZE] * 3,
    OUTPUT_SIZE,
    nn.SELU(),
    nn.init.xavier_normal_,
)

# overwrite the fresh initialization with the weights learned on MNIST
_pretrained_state_dict = pretrained_mnist_model_fn.state_dict()
arabic_mnist_model_fn.load_state_dict(_pretrained_state_dict)

<All keys matched successfully>

In [13]:
# a new optimizer bound to the fine-tuned model's parameters
optimizer = optim.SGD(
    arabic_mnist_model_fn.parameters(),
    lr=LEARNING_RATE,
)

history = fit(
    loss_fn=loss_fn,
    model_fn=arabic_mnist_model_fn,
    optimizer=optimizer,
    train_dataloader=arabic_mnist_train_dataloader,
    val_dataloader=arabic_mnist_val_dataloader,
    max_epochs=MAX_EPOCHS,
)

Epoch 0, Average train Loss 0.9494, Average val Loss 0.4363
Epoch 1, Average train Loss 0.2840, Average val Loss 0.3166
Epoch 2, Average train Loss 0.2092, Average val Loss 0.2694
Epoch 3, Average train Loss 0.1729, Average val Loss 0.2433
Epoch 4, Average train Loss 0.1505, Average val Loss 0.2265
Epoch 5, Average train Loss 0.1349, Average val Loss 0.2146
Epoch 6, Average train Loss 0.1231, Average val Loss 0.2058
Epoch 7, Average train Loss 0.1139, Average val Loss 0.1989
Epoch 8, Average train Loss 0.1064, Average val Loss 0.1934
Epoch 9, Average train Loss 0.1000, Average val Loss 0.1889
Epoch 10, Average train Loss 0.0945, Average val Loss 0.1852
Epoch 11, Average train Loss 0.0897, Average val Loss 0.1821
Epoch 12, Average train Loss 0.0854, Average val Loss 0.1795
Epoch 13, Average train Loss 0.0816, Average val Loss 0.1773
Epoch 14, Average train Loss 0.0781, Average val Loss 0.1754
Epoch 15, Average train Loss 0.0749, Average val Loss 0.1737
Epoch 16, Average train Loss 0.072

### Exercise:

Compare the performance of the original MNIST model at predicting the Arabic Handwritten digits with the performance of the Arabic MNIST model that was fine-tuned on the Arabic MNIST data.

# Unsupervised pre-training

Often you will have a large amount of unlabeled data and a small amount of labeled data. In this situation, one possible solution is to use unsupervised pre-training.

In [19]:
# header-less CSV of pixel columns only; the labels are not loaded here
arabic_mnist_unlabeled_features = pd.read_csv(
    "/content/csvTrainImages 60k x 784.csv",
    names=[f"p{i}" for i in range(INPUT_SIZE)],
    header=None,
)


## Generate an embedding

When applying unsupervised pre-training you first need to find a "good" embedding of your unlabeled features. For computer vision applications you would want to use more powerful models such as autoencoders, generative adversarial networks (GANs) or similar to generate your embeddings. Here we just use PCA.

In [20]:
from sklearn import decomposition

In [21]:
# A float n_components in (0, 1) asks PCA to keep as many components as are
# needed to explain that fraction of the variance (here 95%).
encoder_pipeline = pipeline.make_pipeline(
    decomposition.PCA(n_components=0.95),
)

In [22]:
# fit the encoder on the unlabeled images and project them into the embedding
feature_embedding = encoder_pipeline.fit_transform(arabic_mnist_unlabeled_features)

In [23]:
# width of the embedding = input size of the network trained on it
# (indexing instead of unpacking into `_`, which would shadow IPython's
# last-output variable for the rest of the session)
input_size = feature_embedding.shape[1]

How do we define a "good" embedding? Reconstruction error! Once we have a feature embedding, we can invert the embedding transformation in order to reconstruct the original features. Once we have a reconstruction of the original features we can estimate the reconstruction error by comparing the original features and the reconstructed features. 

A good embedding will have a small reconstruction error.

In [24]:
from sklearn import metrics

In [28]:
# map the embedding back to pixel space and measure how far the reconstruction
# is from the original features (mean squared error)
reconstructed_features = encoder_pipeline.inverse_transform(feature_embedding)
metrics.mean_squared_error(
    y_true=arabic_mnist_unlabeled_features,
    y_pred=reconstructed_features,
)

192.25560211574674

### Exercise:

How could we improve the performance of our feature encoder pipeline? What impact will improving the performance of the feature encoder pipeline have on the performance of the DNN trained on the encoded features?

## Use encoder pipeline to embed your labeled data

Hopefully our "good" embedding of the unlabeled features has learned useful information for our supervised classification task. Now we use the trained encoder pipeline to embed our labeled features.

In [29]:
# create the training dataset and dataloader
_embedded_train_features = encoder_pipeline.transform(arabic_mnist_train_features)
# NOTE: fit_transform re-fits the shared `features_preprocessor`, this time on
# the embedded features, replacing whatever it had been fitted on before.
_train_features_tensor = features_preprocessor.fit_transform(
    _embedded_train_features
)

_train_target_tensor = target_preprocessor.fit_transform(
    arabic_mnist_train_target.loc[:, "label"]
)

_train_dataset = utils.data.TensorDataset(
    _train_features_tensor,
    _train_target_tensor
)

embedded_train_dataloader = utils.data.DataLoader(
    _train_dataset,
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    # reshuffle every epoch so SGD does not see the same batches in the same order
    shuffle=True,
)

# validation split: embed with the fitted encoder, then apply the
# already-fitted preprocessors (no re-fitting)
_embedded_val_features = encoder_pipeline.transform(arabic_mnist_val_features)
_val_features_tensor = features_preprocessor.transform(_embedded_val_features)
_val_target_tensor = target_preprocessor.transform(arabic_mnist_val_target["label"])
_val_dataset = utils.data.TensorDataset(_val_features_tensor, _val_target_tensor)

embedded_val_dataloader = utils.data.DataLoader(
    _val_dataset,
    num_workers=NUM_WORKERS,
    batch_size=BATCH_SIZE,
)

### Train a DNN using the embedded features

In [31]:
# same hidden architecture as before, but the input width is the embedding size
unsupervised_pretraining_model_fn, loss_fn = make_mlp_classifier(
    input_size,
    [HIDDEN_SIZE] * 3,
    OUTPUT_SIZE,
    nn.SELU(),
    nn.init.xavier_normal_,
)

optimizer = optim.SGD(
    unsupervised_pretraining_model_fn.parameters(),
    lr=LEARNING_RATE,
)

history = fit(
    loss_fn=loss_fn,
    model_fn=unsupervised_pretraining_model_fn,
    optimizer=optimizer,
    train_dataloader=embedded_train_dataloader,
    val_dataloader=embedded_val_dataloader,
    max_epochs=MAX_EPOCHS,
)

Epoch 0, Average train Loss 2.2518, Average val Loss 2.1412
Epoch 1, Average train Loss 2.0201, Average val Loss 1.9120
Epoch 2, Average train Loss 1.7352, Average val Loss 1.6060
Epoch 3, Average train Loss 1.4126, Average val Loss 1.2500
Epoch 4, Average train Loss 1.1201, Average val Loss 0.9848
Epoch 5, Average train Loss 0.8801, Average val Loss 0.7840
Epoch 6, Average train Loss 0.6943, Average val Loss 0.6362
Epoch 7, Average train Loss 0.5602, Average val Loss 0.5313
Epoch 8, Average train Loss 0.4649, Average val Loss 0.4524
Epoch 9, Average train Loss 0.3998, Average val Loss 0.3968
Epoch 10, Average train Loss 0.3530, Average val Loss 0.3547
Epoch 11, Average train Loss 0.3166, Average val Loss 0.3246
Epoch 12, Average train Loss 0.2880, Average val Loss 0.3025
Epoch 13, Average train Loss 0.2632, Average val Loss 0.2850
Epoch 14, Average train Loss 0.2429, Average val Loss 0.2689
Epoch 15, Average train Loss 0.2249, Average val Loss 0.2563
Epoch 16, Average train Loss 0.211

### Exercise:

Compare the performance of the transfer learning and unsupervised pre-training approaches to classifying the Arabic Handwritten Digits images.