In [None]:
import glob
import pathlib
import pickle
import requests
import tarfile
import time

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import model_selection
import torch
from torch import nn, optim
import torch.nn.functional as F
from torch.utils import data
import torchinfo
import torchmetrics
from torchvision import models, transforms

# Training Deep Neural Networks using GPUs

# Data

## CIFAR-10 Dataset

The original [CIFAR-10](http://www.cs.toronto.edu/~kriz/cifar.html) dataset consists of 60000 32x32 colour images in 10 classes, with 6000 images per class. There are 50000 training images and 10000 test images. The dataset is divided into five training batches and one test batch, each with 10000 images. The test batch contains exactly 1000 randomly-selected images from each class. The training batches contain the remaining images in random order, but some training batches may contain more images from one class than another. Between them, the training batches contain exactly 5000 images from each class.

In [None]:
# Maps each integer CIFAR-10 label to its human-readable class name; the
# position of a name in the list below is its label.
CLASS_LABELS = dict(enumerate([
    "airplane",
    "automobile",
    "bird",
    "cat",
    "deer",
    "dog",
    "frog",
    "horse",
    "ship",
    "truck",
]))

### Download and extract the data

In [None]:
DATA_DIR = pathlib.Path("../data/")
RAW_DATA_DIR = DATA_DIR / "cifar-10"
URL = "https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz"


RAW_DATA_DIR.mkdir(parents=True, exist_ok=True)
_archive_path = RAW_DATA_DIR / "cifar-10-python.tar.gz"

# Only hit the network when the archive is not already on disk, so that
# "Restart & Run All" stays cheap. Delete the archive to force a re-download.
if not _archive_path.exists():
    response = requests.get(URL, timeout=60)
    # Fail loudly on HTTP errors instead of saving an error page as a .tar.gz.
    response.raise_for_status()
    # Write to a temporary name first and rename afterwards, so an interrupted
    # run never leaves a truncated file under the final archive name.
    _partial_path = _archive_path.with_name(_archive_path.name + ".part")
    _partial_path.write_bytes(response.content)
    _partial_path.replace(_archive_path)

# NOTE: extractall() trusts the member paths stored in the archive; only use
# it on archives obtained from a trusted source.
with tarfile.open(_archive_path, "r:gz") as f:
    f.extractall(RAW_DATA_DIR)


### Load the data

We will load the data using the [Pandas](https://pandas.pydata.org/) library. We highly recommend the most recent edition of [*Python for Data Analysis*](https://learning.oreilly.com/library/view/python-for-data/9781491957653/) by Pandas creator Wes McKinney for anyone interested in learning how to use Pandas.

In [None]:
_data = []
_labels = []

# Build the search path from RAW_DATA_DIR so the download location is defined
# in exactly one place. The pattern matches every "*_batch*" file in the
# extracted folder; they are read in sorted filename order.
filepaths = sorted((RAW_DATA_DIR / "cifar-10-batches-py").glob("*_batch*"))
if not filepaths:
    raise FileNotFoundError(
        f"No CIFAR-10 batch files found under {RAW_DATA_DIR / 'cifar-10-batches-py'}; "
        "run the download cell first."
    )

for filepath in filepaths:
    with open(filepath, "rb") as f:
        # NOTE: unpickling can execute arbitrary code; only load batch files
        # that came from a trusted archive.
        _batch = pickle.load(f, encoding="latin1")
    _data.append(_batch["data"])
    _labels.extend(_batch["labels"])

# each image has 3 channels with height and width of 32 pixels
features = pd.DataFrame(
    np.vstack(_data),
    columns=[f"p{i}" for i in range(3 * 32 * 32)],
    dtype="uint8",
)
target = pd.Series(_labels, dtype="uint8", name="labels")

# every image must have exactly one label
assert len(features) == len(target)

### Explore the data

In [None]:
# Print a concise summary of the feature table.
features.info()

In [None]:
# Preview the first rows of the pixel columns (one row per image).
features.head()

In [None]:
# Preview the first integer class labels (see CLASS_LABELS for their names).
target.head()

### Visualize the data

In [None]:
# Show a 10 x 10 grid of randomly chosen images, each titled with its class.
n_rows, n_cols = 10, 10
n_images, _ = features.shape  # loop-invariant, so look it up only once

fig, axes = plt.subplots(n_rows, n_cols, sharex=True, sharey=True, figsize=(15, 15))

# Draw every row position up front (sampling with replacement).
sampled_positions = np.random.randint(n_images, size=n_rows * n_cols)

for ax, k in zip(axes.flat, sampled_positions):
    # k is a row *position*, so index positionally with .iloc; this stays
    # correct even if the frame's index labels are not 0..n-1.
    img = (features.iloc[k]
                   .to_numpy()
                   .reshape((3, 32, 32))   # flat pixels -> (channel, height, width)
                   .transpose(1, 2, 0))    # -> (height, width, channel) for imshow
    _ = ax.imshow(img)
    _ = ax.set_title(CLASS_LABELS[int(target.iloc[k])])

fig.suptitle("Random CIFAR-10 images", x=0.5, y=1.0, fontsize=25)
fig.tight_layout()

# Creating Train, Val, and Test Data

Before we look at the data any further, we need to create a test set, put it aside, and never look at it (until we are ready to test our trained machine learning model!). Why? We don't want our machine learning model to memorize our dataset (this is called overfitting). Instead we want a model that will generalize well (i.e., make good predictions) for inputs that it didn't see during training. To do this we split our dataset into training and testing datasets. The training dataset will be used to train our machine learning model(s) and the testing dataset will be used to make a final evaluation of our machine learning model(s). We also need to create a validation dataset for tuning hyperparameters and deciding when to stop training.

## If you might refresh data in the future...

...then you want to use some particular hashing function to compute the hash of a unique identifier for each observation of data and include the observation in the test set if the resulting hash value is less than some fixed percentage of the maximum possible hash value for your algorithm. This way, even if you fetch more data, your test set will never include data that was previously included in the training data.

In [None]:
import zlib


def in_holdout_data(identifier, test_size):
    """Decide deterministically whether an observation belongs in the holdout set.

    The identifier is hashed with CRC-32 and the observation is held out when
    the hash falls below ``test_size`` times the size of the 32-bit hash range.
    The same identifier therefore always lands on the same side of the split.

    Parameters
    ----------
    identifier : int, str or bytes-like
        Unique identifier of the observation. Any object supporting
        ``__index__`` (e.g. a NumPy integer) is accepted as an integer.
    test_size : float
        Fraction of the hash range assigned to the holdout set.

    Returns
    -------
    bool
        True when the observation belongs in the holdout set.

    Raises
    ------
    TypeError
        If ``identifier`` is not a string, bytes-like object or integer.
    """
    import operator

    if isinstance(identifier, (bytes, bytearray, memoryview)):
        payload = bytes(identifier)
    elif isinstance(identifier, str):
        payload = identifier.encode("utf-8")
    else:
        # bytes(n) on an integer builds a buffer of n zero bytes, so it must
        # not be used to serialise the id: memory would grow with the id's
        # value and negative ids would raise. Encode the value itself as a
        # fixed-width little-endian signed integer (widened only for values
        # that do not fit in 64 bits).
        number = operator.index(identifier)
        width = max(8, (number.bit_length() + 8) // 8)
        payload = number.to_bytes(width, byteorder="little", signed=True)

    _hash = zlib.crc32(payload)
    return _hash & 0xffffffff < test_size * 2**32


def split_data_by_id(data, test_size, id_column):
    """Split ``data`` into (remaining, holdout) parts based on hashed identifiers.

    Each value in ``data[id_column]`` is passed to ``in_holdout_data`` together
    with ``test_size``; rows flagged by it form the second element of the
    returned pair and all other rows form the first.
    """
    def _is_held_out(identifier):
        return in_holdout_data(identifier, test_size)

    holdout_mask = data[id_column].apply(_is_held_out)
    remaining_rows = data.loc[~holdout_mask]
    holdout_rows = data.loc[holdout_mask]
    return remaining_rows, holdout_rows


## If this is all the data you will ever have...

...then you can just set a seed for the random number generator and then randomly split the data. Scikit-Learn has a [`model_selection`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html) module that contains tools for splitting datasets. First, split the dataset into training and testing datasets. Next split the training dataset into training and validation datasets.

In [None]:
# Master seed; every other seed in the notebook is drawn from this generator.
SEED = 42
SEED_GENERATOR = np.random.RandomState(SEED)


def generate_seed():
    """Return the next integer drawn from SEED_GENERATOR.

    The value lies in ``[0, np.iinfo("uint16").max)``. Because the generator
    is seeded with SEED, the sequence of returned seeds is reproducible.
    """
    exclusive_upper_bound = np.iinfo("uint16").max
    return SEED_GENERATOR.randint(exclusive_upper_bound)

In [None]:
# Hold out 10% of all images as the test set, then 10% of what remains as the
# validation set. One RandomState instance is shared by both calls, so the
# pair of splits is reproducible as a whole.
_seed = generate_seed()
_random_state = np.random.RandomState(_seed)

_train_features, test_features, _train_target, test_target = (
    model_selection.train_test_split(
        features, target, test_size=0.1, random_state=_random_state
    )
)

train_features, val_features, train_target, val_target = (
    model_selection.train_test_split(
        _train_features, _train_target, test_size=0.1, random_state=_random_state
    )
)

In [None]:
# Concise summary of the training split.
train_features.info()

In [None]:
# Concise summary of the validation split.
val_features.info()

In [None]:
# Concise summary of the test split.
test_features.info()

# Training a Neural Network

When working with GPUs we need to tell PyTorch which device to use when training.

In [None]:
# Train on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Next we need to define the components of the training loop that we developed in this morning's session.

In [None]:
def accuracy(output, target):
    """Return the fraction of correct predictions as a 0-dim float tensor.

    Computed with plain tensor operations so that it does not depend on the
    signature of ``torchmetrics.functional.accuracy``, which differs between
    torchmetrics releases (newer ones require a ``task`` argument).

    Parameters
    ----------
    output : torch.Tensor
        Either per-class scores with one more dimension than ``target``
        (classes along dimension 1), or already-predicted class labels with
        the same shape as ``target``.
    target : torch.Tensor
        Integer class labels.

    Returns
    -------
    torch.Tensor
        Scalar tensor holding the mean of the element-wise matches.
    """
    # Scores carry an extra class dimension; collapse it to predicted labels.
    predictions = output.argmax(dim=1) if output.dim() > target.dim() else output
    return (predictions == target).float().mean()


def partial_fit(model_fn, loss_fn, X_batch, y_batch, opt):
    """Run one optimisation step on a single batch.

    Computes the loss of ``model_fn`` on the batch, back-propagates it, lets
    ``opt`` update the parameters and finally clears the gradients so they do
    not accumulate into the next batch. Returns nothing.
    """
    # forward pass
    predictions = model_fn(X_batch)
    batch_loss = loss_fn(predictions, y_batch)

    # backward pass and parameter update
    batch_loss.backward()
    opt.step()

    # gradients accumulate by default, so reset them after every batch
    opt.zero_grad()
    

def validate(model_fn, loss_fn, data_loader):
    """Evaluate ``model_fn`` on every batch of ``data_loader`` without gradients.

    Parameters
    ----------
    model_fn : callable
        Maps a batch of inputs to predictions.
    loss_fn : callable
        Maps (predictions, targets) to a scalar loss tensor.
    data_loader : iterable
        Yields ``(X, y)`` batches.

    Returns
    -------
    tuple of torch.Tensor
        ``(avg_accuracy, avg_loss)``: the unweighted means of the per-batch
        accuracy and per-batch loss.

    Raises
    ------
    ValueError
        If ``data_loader`` yields no batches.
    """
    batch_accs = []
    batch_losses = []

    with torch.no_grad():
        for X, y in data_loader:
            # One forward pass per batch, shared by both metrics: this halves
            # the compute and guarantees accuracy and loss describe the very
            # same predictions.
            output = model_fn(X)
            batch_accs.append(accuracy(output, y))
            batch_losses.append(loss_fn(output, y))

        if not batch_losses:
            raise ValueError("data_loader yielded no batches; nothing to validate")

        avg_accuracy = (torch.stack(batch_accs)
                             .mean())
        avg_loss = (torch.stack(batch_losses)
                         .mean())

    return avg_accuracy, avg_loss


def fit(model_fn, loss_fn, train_data_loader, opt, lr_scheduler, val_data_loader=None, number_epochs=2):
    """Train ``model_fn`` for ``number_epochs`` epochs.

    In every epoch the model is put in training mode and ``partial_fit`` is
    run on each batch of ``train_data_loader``. The model is then switched to
    evaluation mode and, when ``val_data_loader`` is given, validation accuracy
    and loss are computed and printed. ``lr_scheduler.step()`` is called once
    at the end of every epoch.

    Parameters
    ----------
    model_fn : object
        Model exposing ``train()`` and ``eval()`` and callable on a batch.
    loss_fn : callable
        Maps (predictions, targets) to a scalar loss.
    train_data_loader : iterable
        Yields ``(X_batch, y_batch)`` training batches.
    opt : optimizer
        Passed through to ``partial_fit``.
    lr_scheduler : object
        Learning-rate scheduler exposing ``step()``.
    val_data_loader : iterable, optional
        Yields validation batches; validation is skipped when None.
    number_epochs : int
        Number of passes over ``train_data_loader``.
    """
    for epoch in range(number_epochs):
        # train the model
        model_fn.train()
        for X_batch, y_batch in train_data_loader:
            partial_fit(model_fn, loss_fn, X_batch, y_batch, opt)

        # Always leave the model in evaluation mode after the training pass,
        # whether or not validation data was supplied.
        model_fn.eval()
        if val_data_loader is not None:
            # compute validation metrics after each training epoch
            val_acc, val_loss = validate(model_fn, loss_fn, val_data_loader)
            print(f"Training epoch: {epoch}, Validation accuracy: {val_acc}, Validation loss: {val_loss}")
        else:
            # No validation data: the metrics do not exist, so do not
            # reference them (doing so used to raise NameError).
            print(f"Training epoch: {epoch}")

        # update the learning rate
        lr_scheduler.step()

In this section we introduce a `CustomDataset` to better encapsulate data preprocessing transformations using PyTorch primitives instead of Scikit-Learn. We also reuse the `LambdaLayer` and the `WrappedDataLoader` classes from this morning's session. However, instead of using the `WrappedDataLoader` to implement data preprocessing steps, we will use the class to send our training data batches from the CPU to the GPU during the training loop.

In [None]:
class CustomDataset(data.Dataset):
    """Map-style dataset that serves CIFAR-style images from flat pixel rows.

    Parameters
    ----------
    features : pandas.DataFrame
        One row per image with 3 * 32 * 32 pixel values laid out channel by
        channel; reshaped once, up front, to (N, 32, 32, 3).
    target : pandas.Series
        One integer class label per image.
    transforms : callable, optional
        Applied to each image when it is fetched; labels are never transformed.

    Raises
    ------
    ValueError
        If ``features`` and ``target`` do not contain the same number of rows.
    """

    def __init__(self, features, target, transforms = None):
        # (N, 3072) -> (N, channel, height, width) -> (N, height, width, channel)
        self._data = (features.to_numpy()
                              .reshape(-1, 3, 32, 32)
                              .transpose(0, 2, 3, 1))
        # Store labels as int64 so that batches collate to int64 tensors, the
        # dtype PyTorch documents for class-index targets, instead of
        # inheriting a narrow storage dtype such as uint8 from the Series.
        self._target = target.to_numpy().astype(np.int64)
        if len(self._data) != len(self._target):
            raise ValueError(
                f"features has {len(self._data)} rows but target has {len(self._target)}"
            )
        self._transforms = transforms

    def __getitem__(self, index):
        """Return the ``(image, label)`` pair at ``index``, image transformed if requested."""
        X, y = self._data[index], self._target[index]
        return (self._transforms(X), y) if self._transforms is not None else (X, y)

    def __len__(self):
        """Return the number of images in the dataset."""
        return len(self._data)


class LambdaLayer(nn.Module):
    """Wrap an arbitrary callable so it can be used as a layer in a model."""

    def __init__(self, f):
        """Store the callable ``f`` that ``forward`` will apply."""
        super().__init__()
        self._f = f

    def forward(self, X):
        """Apply the wrapped callable to ``X`` and return its result."""
        result = self._f(X)
        return result


class WrappedDataLoader:
    """Iterable that applies a function to every batch of an underlying loader."""

    def __init__(self, data_loader, f):
        """Remember the loader to wrap and the function ``f`` to apply per batch."""
        self._f = f
        self._data_loader = data_loader

    def __iter__(self):
        """Yield ``f(*batch)`` for each batch produced by the wrapped loader."""
        for batch in self._data_loader:
            transformed = self._f(*batch)
            yield transformed

    def __len__(self):
        """Return the length reported by the wrapped loader."""
        return len(self._data_loader)


## Defining the LeNet-5 architecture

In [None]:
# LeNet-5 style network for 3-channel 32x32 inputs. The shape comments assume
# a batch of shape (N, 3, 32, 32). Stride and padding are left at the layer
# defaults (Conv2d: stride 1, no padding; AvgPool2d: stride = kernel size).
model_fn = nn.Sequential(
    nn.Conv2d(3, 6, kernel_size=5),                  # -> (N, 6, 28, 28)
    nn.Tanh(),
    nn.AvgPool2d(kernel_size=2),                     # -> (N, 6, 14, 14)
    nn.Conv2d(6, 16, kernel_size=5),                 # -> (N, 16, 10, 10)
    nn.Tanh(),
    nn.AvgPool2d(kernel_size=2),                     # -> (N, 16, 5, 5)
    LambdaLayer(lambda X: X.view(X.size(0), -1)),    # flatten -> (N, 400)
    nn.Linear(16 * 5 * 5, 120),
    nn.Tanh(),
    nn.Linear(120, 84),
    nn.Tanh(),
    nn.Linear(84, 10),                               # one score per class
)
_ = model_fn.to(device)

In [None]:
# Summarise the model for an input batch of 64 three-channel 32x32 images.
torchinfo.summary(model_fn, input_size=(64, 3, 32, 32))

## Train the neural network

In [None]:
# use same loss function from last time
loss_fn = F.cross_entropy

# define some preprocessing transforms (done on CPU!)
_transforms = transforms.Compose([
    transforms.ToTensor(),
])

# move the tensor from the CPU to the GPU
_to_device = lambda X, y: (X.to(device), y.to(device))

# define the datasets and dataloaders
_train_dataset = CustomDataset(train_features, train_target, _transforms)
_train_data_loader = data.DataLoader(_train_dataset, batch_size=64, shuffle=True)
train_data_loader = WrappedDataLoader(_train_data_loader, _to_device)

_val_dataset = CustomDataset(val_features, val_target, _transforms)
_val_data_loader = data.DataLoader(_val_dataset, batch_size=128, shuffle=False)
val_data_loader = WrappedDataLoader(_val_data_loader, _to_device)

_test_dataset = CustomDataset(test_features, test_target, _transforms)
_test_data_loader = data.DataLoader(_test_dataset, batch_size=128, shuffle=False)
test_data_loader = WrappedDataLoader(_test_data_loader, _to_device)

# define the optimizer and the learning rate scheduler
opt = optim.SGD(model_fn.parameters(), lr=1e-2, momentum=0.9)
lr_scheduler = optim.lr_scheduler.ExponentialLR(opt, gamma=0.9, verbose=True)

In [None]:
# Train for 10 epochs, reporting validation metrics after each one.
fit(
    model_fn=model_fn,
    loss_fn=loss_fn,
    train_data_loader=train_data_loader,
    opt=opt,
    lr_scheduler=lr_scheduler,
    val_data_loader=val_data_loader,
    number_epochs=10,
)

In [None]:
# Final evaluation: average accuracy and loss over the held-out test batches.
average_accuracy, average_loss = validate(model_fn, loss_fn, test_data_loader)

### Exercise: Build your own neural network

Modify the LeNet-5 architecture as you see fit in order to gain experience building your own neural network.

In [None]:
# insert code here!

### Exercise: Experiment with different batch sizes

Train your model for 10 epochs with different batch sizes: 1, 4, 16, 64, 256. Do you notice any patterns?

In [None]:
# insert code here!

### Exercise: Experiment with different learning rate schedulers

Train your model for 10 epochs with a batch size of 64, but experiment with different learning rate schedulers. Does one learning rate scheduler outperform the others?

In [None]:
# insert code here!

## Experimenting with different architectures

In practice, it is unlikely that you will be designing your own neural network architectures from scratch. Instead you will be starting from some pre-existing neural network architecture. The [torchvision](https://pytorch.org/vision/stable/) project contains a number of neural network architectures that have found widespread use in computer vision applications.

For the remainder of this notebook we will be using the [ResNet-18](https://arxiv.org/pdf/1512.03385.pdf) model which was developed in 2015. The ResNet family of models were designed to be trained on larger images (224 x 224) and a larger number of classes (1000) so we need to make some small modifications in order to adapt this network for our dataset.

In [None]:
# List the ResNet entries exposed by torchvision.models. (Interactively, typing
# `models.` and pressing <TAB> shows everything that is available; a bare
# `models.` is not valid Python and would stop "Run All" with a SyntaxError.)
sorted(name for name in dir(models) if name.startswith("resnet"))

In [None]:
# ResNet-18 with a 10-way output layer to match the ten CIFAR-10 classes.
model_fn = models.resnet18(num_classes=10)

# Replace the first convolution with a 3x3, stride-1, padding-1 layer so the
# 32x32 inputs keep their spatial size through it.
model_fn.conv1 = nn.Conv2d(
    in_channels=3,
    out_channels=64,
    kernel_size=3,
    stride=1,
    padding=1,
    bias=False,
)
_ = model_fn.to(device)

In [None]:
# Summarise the adapted ResNet-18 for a batch of 64 three-channel 32x32 images.
torchinfo.summary(model_fn, input_size=(64, 3, 32, 32))

In [None]:
# loss: the same cross-entropy function as last time
loss_fn = F.cross_entropy

# per-sample preprocessing, executed on the CPU by the dataset
_transforms = transforms.Compose([transforms.ToTensor()])


def _to_device(X, y):
    """Move one (features, labels) batch from the CPU to the training device."""
    return X.to(device), y.to(device)


# training data: reshuffled every epoch, batches of 128
_train_dataset = CustomDataset(train_features, train_target, _transforms)
_train_data_loader = data.DataLoader(_train_dataset, batch_size=128, shuffle=True)
train_data_loader = WrappedDataLoader(_train_data_loader, _to_device)

# validation data: fixed order, batches of 256
_val_dataset = CustomDataset(val_features, val_target, _transforms)
_val_data_loader = data.DataLoader(_val_dataset, batch_size=256, shuffle=False)
val_data_loader = WrappedDataLoader(_val_data_loader, _to_device)

# test data: fixed order, batches of 256
_test_dataset = CustomDataset(test_features, test_target, _transforms)
_test_data_loader = data.DataLoader(_test_dataset, batch_size=256, shuffle=False)
test_data_loader = WrappedDataLoader(_test_data_loader, _to_device)

# optimizer: SGD with momentum; the learning rate is multiplied by 0.1 after
# every 2 scheduler steps
opt = optim.SGD(model_fn.parameters(), lr=0.1, momentum=0.9)
lr_scheduler = optim.lr_scheduler.StepLR(opt, step_size=2, gamma=0.1, verbose=True)

In [None]:
# Train for 20 epochs, reporting validation metrics after each one.
fit(
    model_fn=model_fn,
    loss_fn=loss_fn,
    train_data_loader=train_data_loader,
    opt=opt,
    lr_scheduler=lr_scheduler,
    val_data_loader=val_data_loader,
    number_epochs=20,
)

In [None]:
# Final evaluation: average accuracy and loss over the held-out test batches.
average_accuracy, average_loss = validate(model_fn, loss_fn, test_data_loader)

In [None]:
# Display the test-set accuracy and loss computed in the previous cell.
average_accuracy, average_loss