In [None]:
import glob
import pathlib
import pickle
import requests
import tarfile
import time

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import ensemble, model_selection, preprocessing
import torch
import torchinfo
import torchvision


# Introduction

* Tutorial materials are derived from [_What is torch.nn really?_](https://pytorch.org/tutorials/beginner/nn_tutorial.html) by Jeremy Howard, Rachel Thomas, Francisco Ingham.

## CIFAR-10 Dataset

The original [CIFAR-10](http://www.cs.toronto.edu/~kriz/cifar.html) dataset consists of 60000 32x32 colour images in 10 classes, with 6000 images per class. There are 50000 training images and 10000 test images. The dataset is divided into five training batches and one test batch, each with 10000 images. The test batch contains exactly 1000 randomly-selected images from each class. The training batches contain the remaining images in random order, but some training batches may contain more images from one class than another. Between them, the training batches contain exactly 5000 images from each class.

In [None]:
# Maps each CIFAR-10 integer class index (0-9) to its human-readable name.
CLASS_LABELS = dict(enumerate((
    "airplane",
    "automobile",
    "bird",
    "cat",
    "deer",
    "dog",
    "frog",
    "horse",
    "ship",
    "truck",
)))

### Download and extract the data

In [None]:
DATA_DIR = pathlib.Path("../data/")
RAW_DATA_DIR = DATA_DIR / "cifar-10"
URL = "https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz"
ARCHIVE_PATH = RAW_DATA_DIR / "cifar-10-python.tar.gz"


RAW_DATA_DIR.mkdir(parents=True, exist_ok=True)

# Download only when the archive is missing so that re-running the notebook
# does not fetch the dataset again.
if not ARCHIVE_PATH.exists():
    response = requests.get(URL, timeout=60)
    # Fail loudly on an HTTP error instead of saving an error page as the
    # archive; the file is only written after the request has succeeded.
    response.raise_for_status()
    ARCHIVE_PATH.write_bytes(response.content)

# NOTE(review): extractall trusts the member paths stored in the archive; that
# is acceptable for this well-known dataset but not for untrusted archives.
with tarfile.open(ARCHIVE_PATH, "r:gz") as archive:
    archive.extractall(RAW_DATA_DIR)


### Load the data

We will load the data using the [Pandas](https://pandas.pydata.org/) library. I highly recommend the most recent edition of [*Python for Data Analysis*](https://learning.oreilly.com/library/view/python-for-data/9781491957653/) by Pandas creator Wes McKinney for anyone interested in learning how to use Pandas.

In [None]:
# Locate the pickled batch files below RAW_DATA_DIR instead of repeating the
# directory as a hard-coded string.
filepaths = sorted((RAW_DATA_DIR / "cifar-10-batches-py").glob("*_batch*"))
if not filepaths:
    raise FileNotFoundError(
        f"No CIFAR-10 batch files found under {RAW_DATA_DIR}; run the download cell first."
    )

data = []
labels = []
for filepath in filepaths:
    with open(filepath, "rb") as f:
        # NOTE: unpickling can execute arbitrary code; only load archives
        # obtained from a trusted source.
        batch = pickle.load(f, encoding="latin1")
    data.append(batch["data"])
    labels.extend(batch["labels"])

# each image has 3 channels with height and width of 32 pixels
features = pd.DataFrame(
    np.vstack(data),
    columns=[f"p{i}" for i in range(3 * 32 * 32)],
    dtype="uint8",
)
target = pd.Series(labels, dtype="uint8", name="labels")

### Explore the data

In [None]:
features.info()

In [None]:
features.head()

In [None]:
target.head()

### Visualize the data

In [None]:
# Draw a 10 x 10 grid of randomly chosen images, each titled with its class name.
fig, axes = plt.subplots(10, 10, sharex=True, sharey=True, figsize=(15, 15))
for ax in axes.ravel():
    number_images, _ = features.shape
    k = np.random.randint(number_images)
    # pixel rows are stored channel-first; matplotlib expects (height, width, channel)
    pixels = features.loc[k, :].to_numpy()
    img = pixels.reshape((3, 32, 32)).transpose(1, 2, 0)
    ax.imshow(img)
    ax.set_title(CLASS_LABELS[target[k]])

fig.suptitle("Random CIFAR-10 images", x=0.5, y=1.0, fontsize=25)
fig.tight_layout()

# Creating a Test Dataset

Before we look at the data any further, we need to create a test set, put it aside, and never look at it (until we are ready to test our trained machine learning model!). Why? We don't want our machine learning model to memorize our dataset (this is called overfitting). Instead we want a model that will generalize well (i.e., make good predictions) for inputs that it didn't see during training. To do this we split our dataset into training and testing datasets. The training dataset will be used to train our machine learning model(s) and the testing dataset will be used to make a final evaluation of our machine learning model(s).

## If you might refresh data in the future...

...then you want to use some particular hashing function to compute the hash of a unique identifier for each observation of data and include the observation in the test set if resulting hash value is less than some fixed percentage of the maximum possible hash value for your algorithm. This way even if you fetch more data, your test set will never include data that was previously included in the training data.

In [None]:
import zlib


def in_testing_data(identifier, test_size):
    """Decide deterministically whether an observation belongs to the test set.

    The identifier is hashed with CRC32 and the observation is assigned to the
    test set when the hash falls below ``test_size`` times the size of the
    32-bit hash range.

    Parameters
    ----------
    identifier : int-like, str, bytes or bytearray
        Unique identifier of the observation. Integers (including NumPy
        integers) are encoded as 8-byte little-endian signed values, strings
        as UTF-8; byte strings are hashed as they are.
    test_size : float
        Fraction of the hash range assigned to the test set.

    Returns
    -------
    bool
        ``True`` when the observation belongs to the test set.

    Raises
    ------
    OverflowError
        If an integer identifier does not fit into a signed 64-bit value.
    """
    if isinstance(identifier, (bytes, bytearray)):
        raw = bytes(identifier)
    elif isinstance(identifier, str):
        raw = identifier.encode("utf-8")
    else:
        # bytes(n) on a plain Python int would allocate n zero bytes (and fail
        # for negative values); encode the integer value itself instead.
        raw = int(identifier).to_bytes(8, byteorder="little", signed=True)
    _hash = zlib.crc32(raw)
    return _hash & 0xffffffff < test_size * 2**32


def split_train_test_by_id(data, test_size, id_column):
    """Split ``data`` into (train, test) parts based on a hash of ``id_column``.

    Each identifier in ``data[id_column]`` is passed to ``in_testing_data``;
    rows flagged by it form the test part, all remaining rows the train part.
    """
    def _belongs_to_test(identifier):
        return in_testing_data(identifier, test_size)

    test_mask = data[id_column].apply(_belongs_to_test)
    train_part = data.loc[~test_mask]
    test_part = data.loc[test_mask]
    return train_part, test_part


## If this is all the data you will ever have...

...then you can just set a seed for the random number generator and then randomly split the data. Scikit-Learn has a [`model_selection`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html) module that contains tools for splitting datasets into training and testing sets.

In [None]:
# Master seed and the generator from which every derived seed is drawn.
SEED = 42
SEED_GENERATOR = np.random.RandomState(seed=SEED)


def generate_seed():
    """Return the next seed drawn from the shared ``SEED_GENERATOR``."""
    upper_bound = np.iinfo("uint16").max  # exclusive upper bound of the draw
    return SEED_GENERATOR.randint(upper_bound)

In [None]:
# hold out 10% of the observations as the final test set
_seed = generate_seed()
_random_state = np.random.RandomState(_seed)
_split = model_selection.train_test_split(features, target, test_size=1e-1, random_state=_random_state)
train_features, test_features, train_target, test_target = _split

In [None]:
train_features.info()

In [None]:
test_features.info()

## Feature scaling

Data for individual pixels is stored as integers between 0 and 255. Neural network models work best when numerical features are scaled. To rescale the raw features we can use tools from the [Scikit-Learn preprocessing module](https://scikit-learn.org/stable/modules/preprocessing.html).

In [None]:
# hyper-parameters: rescale every pixel column into the interval [0, 1]
_min_max_scaler_hyperparameters = dict(feature_range=(0, 1))

preprocessor = preprocessing.MinMaxScaler(**_min_max_scaler_hyperparameters)

In [None]:
# fit the scaler on the training features only; PyTorch expects float32
# features and int64 class labels
preprocessed_train_features = np.asarray(preprocessor.fit_transform(train_features), dtype="float32")
preprocessed_train_target = np.asarray(train_target.to_numpy(), dtype="int64")

# Classical ML Benchmark Model

We have several of these from yesterday!

In [None]:
# random forest with out-of-bag scoring, each tree grown on 90% of the samples
_seed = generate_seed()
_estimator_hyperpararmeters = dict(
    bootstrap=True,
    oob_score=True,
    max_samples=0.9,
    random_state=np.random.RandomState(_seed),
)
estimator = ensemble.RandomForestClassifier(**_estimator_hyperpararmeters)

In [None]:
_ = estimator.fit(preprocessed_train_features, preprocessed_train_target)

In [None]:
estimator.oob_score_

# Neural network from scratch

## Split the training data into training and validation sets

In [None]:
# Carve a validation set (10%) out of the preprocessed training data.
# NOTE: the training names are re-bound to the smaller split, so re-running
# this cell would split the already reduced training set again.
_seed = generate_seed()
_random_state = np.random.RandomState(_seed)
preprocessed_train_features, preprocessed_val_features, preprocessed_train_target, preprocessed_val_target = (
    model_selection.train_test_split(preprocessed_train_features,
                                     preprocessed_train_target,
                                     test_size=1e-1,
                                     random_state=_random_state)
)

In [None]:
preprocessed_train_features.shape

In [None]:
preprocessed_val_features.shape


Next let's create a simple model using nothing but [PyTorch tensor operations](https://pytorch.org/docs/stable/tensors.html). PyTorch uses `torch.tensor` rather than `numpy.ndarray` so we need to convert data.

In [None]:
# torch.as_tensor accepts NumPy arrays as well as existing tensors, so this
# cell can be re-run safely even though it re-binds its own inputs
# (torch.from_numpy raises a TypeError when handed a tensor).
preprocessed_train_target = torch.as_tensor(preprocessed_train_target)
preprocessed_train_features = torch.as_tensor(preprocessed_train_features)

preprocessed_val_target = torch.as_tensor(preprocessed_val_target)
preprocessed_val_features = torch.as_tensor(preprocessed_val_features)

In [None]:
preprocessed_train_features

In [None]:
preprocessed_train_target

PyTorch provides methods to create random or zero-filled tensors, which we will use to create our weights and bias for a simple linear model. These are just regular tensors, with one very special addition: we tell PyTorch that they require a gradient. This causes PyTorch to record all of the operations done on the tensor, so that it can calculate the gradient during back-propagation automatically!

For the weights, we set `requires_grad` after the initialization, since we don’t want that step included in the gradient. (Note that a trailing `_` in PyTorch signifies that the operation is performed _in-place_.)

In [None]:
number_samples, number_features = preprocessed_train_features.shape

# seed PyTorch's global random number generator so that the random
# initialization below is reproducible from run to run
torch.manual_seed(int(generate_seed()))

# using Xavier initialization (divide weights by sqrt(number_features))
weights = torch.randn(number_features, 10) / number_features**0.5
weights.requires_grad_() # trailing underscore indicates in-place operation
bias = torch.zeros(10, requires_grad=True)

Thanks to PyTorch’s ability to calculate gradients automatically, we can use any standard Python function (or callable object) in a model! So we will start by writing a function to perform matrix multiplication and broadcasted addition called `linear_transformation`. We will also need an activation function, so we’ll write a function called `log_softmax_activation` and use it. 

**N.B.** Although PyTorch provides lots of pre-written loss functions, activation functions, and so forth, you can easily write your own using plain python. PyTorch will even create fast GPU or vectorized CPU code for your function automatically.

In [None]:
def linear_transformation(X):
    """Apply the affine map defined by the module-level ``weights`` and ``bias``."""
    projected = torch.matmul(X, weights)
    return projected + bias

def log_softmax_activation(X):
    return X - X.exp().sum(-1).log().unsqueeze(-1)
    
def logistic_regression(X):
    """Compute class log-probabilities: linear transformation followed by log-softmax."""
    return log_softmax_activation(linear_transformation(X))

In the above, the `@` stands for the matrix multiplication operation. We will call our function on one batch of data (in this case, 64 images). Note that our predictions won’t be any better than random at this stage, since we start with random weights.

In [None]:
batch_size = 64
output = logistic_regression(preprocessed_train_features[:batch_size])

In [None]:
output[1]

As you see, the `output` tensor contains not only the tensor values, but also a gradient function, `grad_fn`. We’ll use this later to do back propagation to update the model parameters.

Let’s implement `negative_log_likelihood` to use as the loss function. Again, we can just use standard Python code.

In [None]:
def negative_log_likelihood(output, target):
    """Mean negative log-likelihood of the target classes.

    ``output`` holds one row of log-probabilities per sample and ``target``
    the class index of each sample.
    """
    number_rows, _ = output.shape
    row_index = torch.arange(number_rows)
    # pick, for every row, the log-probability assigned to its target class
    picked = output[row_index, target]
    return picked.mean().neg()
    

In [None]:
negative_log_likelihood(output, preprocessed_train_target[:batch_size])

Let’s also implement a function to calculate the `accuracy` of our model: for each prediction, if the index with the largest value matches the target value, then the prediction was correct.

In [None]:
def accuracy(output, target):
    """Fraction of rows whose highest-scoring column equals the target class."""
    predicted_classes = output.argmax(dim=1)
    hits = predicted_classes.eq(target)
    return hits.float().mean()

For comparison purposes we can compute the accuracy of our model with randomly initialized parameters.

In [None]:
accuracy(output, preprocessed_train_target[:batch_size])

We can now run a training loop. For each iteration, we will:

* select a mini-batch of data (of size `batch_size`)
* use the model to make predictions
* calculate the loss
* `loss.backward()` updates the gradients of the model.

We now use these gradients to update the weights and bias (i.e., model parameters). We do this within the `torch.no_grad()` context manager, because we do not want these actions to be recorded for our next calculation of the gradient. You can read more about how PyTorch’s Autograd records operations [here](https://pytorch.org/docs/stable/notes/autograd.html).

We then set the gradients to zero, so that we are ready for the next loop. Otherwise, our gradients would record a running tally of all the operations that had happened (i.e. loss.backward() adds the gradients to whatever is already stored, rather than replacing them).

In [None]:
# at this stage the "model" and the loss are plain Python functions
model_fn = logistic_regression
loss_fn = negative_log_likelihood

number_epochs = 15
# ceiling division: a final, smaller batch is included when batch_size does
# not evenly divide number_samples
number_batches = (number_samples - 1) // batch_size + 1

learning_rate = 1e-2
for epoch in range(number_epochs):
    for batch in range(number_batches):
        
        # forward pass
        start = batch * batch_size
        X = preprocessed_train_features[start:(start + batch_size)]
        y = preprocessed_train_target[start:(start + batch_size)]
        loss = loss_fn(model_fn(X), y)
        
        # back propagation
        loss.backward()
        # update the parameters in place without recording the update itself
        with torch.no_grad():
            weights -= learning_rate * weights.grad
            bias -= learning_rate * bias.grad
            # reset the accumulated gradients before the next batch
            weights.grad.zero_()
            bias.grad.zero_()
            

That’s it: we’ve created and trained a minimal neural network (in this case, a logistic regression, since we have no hidden layers) entirely from scratch! Let’s check the loss and accuracy and compare those to what we got earlier. We expect that the loss will have decreased and accuracy to have increased, and they have.

In [None]:
# evaluate without recording gradients and run the forward pass only once
with torch.no_grad():
    training_output = model_fn(preprocessed_train_features)
    training_loss = loss_fn(training_output, preprocessed_train_target)
    training_accuracy = accuracy(training_output, preprocessed_train_target)

print(f"Training loss: {training_loss}")
print(f"Training accuracy: {training_accuracy}")

# Refactor using `torch.nn.functional`

We will now refactor our code using [torch.nn](https://pytorch.org/docs/stable/nn.html) modules to make it more concise and flexible. The first and easiest step is to make our code shorter by replacing our hand-written activation and loss functions with those from [torch.nn.functional](https://pytorch.org/docs/stable/nn.html#torch-nn-functional).

Since we are using negative log likelihood loss and log softmax activation in this tutorial, we can use [torch.nn.functional.cross_entropy](https://pytorch.org/docs/stable/nn.html#cross-entropy) which combines the two.

In [None]:
import torch.nn.functional as F

In [None]:
Z = linear_transformation(preprocessed_train_features)
F.cross_entropy(Z, preprocessed_train_target)

# Refactor using `torch.nn.Module`

Next up, we’ll use [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#module) and [torch.nn.Parameter](https://pytorch.org/docs/stable/nn.html#parameters), for a clearer and more concise training loop. In this case, we want to create a class that holds our weights, bias, and method for the forward step. `torch.nn.Module` has a number of attributes and methods (such as `parameters()` and `zero_grad()`) which we will be using.

In [None]:
from torch import nn


class LogisticRegression(nn.Module):
    """Multinomial logistic regression with explicitly managed parameters.

    Parameters
    ----------
    number_features : int, optional
        Length of each input row; defaults to a flattened 3 x 32 x 32 image.
    number_classes : int, optional
        Number of output logits; defaults to 10.
    """

    def __init__(self, number_features=3 * 32 * 32, number_classes=10):
        super().__init__()
        # Xavier-style initialization: scale the weights by 1 / sqrt(number_features)
        self._weights = nn.Parameter(torch.randn(number_features, number_classes) / number_features**0.5)
        self._bias = nn.Parameter(torch.zeros(number_classes))

    def forward(self, X):
        """Return the raw class scores (logits) for each row of ``X``."""
        return X @ self._weights + self._bias
    


Since we’re now using an object instead of just using a function, we first have to instantiate our model.

In [None]:
model_fn = LogisticRegression()

Now we can calculate the loss in the same way as before. Note that `torch.nn.Module` objects are used as if they are functions (i.e they are callable), but behind the scenes Pytorch will call the `forward` method.

In [None]:
F.cross_entropy(model_fn(preprocessed_train_features), preprocessed_train_target)

Previously in our training loop we had to update the values for each parameter by name and manually zero out the grads for each parameter separately.  With our refactoring we can take advantage of `model_fn.parameters()` and `model_fn.zero_grad()` (which are both defined by PyTorch for `torch.nn.Module` base class!) to make those steps more concise and less prone to the error of forgetting some of our parameters, particularly if we had a more complicated model.

In order to facilitate re-use and continued refactoring, we can encapsulate the logic of our deep learning pipeline in the following functions. 

In [None]:
def partial_fit(model_fn, loss_fn, learning_rate, X_batch, y_batch):
    """Perform one gradient-descent update of ``model_fn`` on a single batch.

    Parameters
    ----------
    model_fn : torch.nn.Module
        Model whose parameters are updated in place.
    loss_fn : callable
        Called as ``loss_fn(model_fn(X_batch), y_batch)``; must return a
        scalar tensor.
    learning_rate : float
        Step size of the update.
    X_batch, y_batch : torch.Tensor
        Features and targets of the batch.
    """
    # forward pass
    loss = loss_fn(model_fn(X_batch), y_batch)

    # back propagation
    loss.backward()
    with torch.no_grad():
        for parameter in model_fn.parameters():
            # a parameter that received no gradient (e.g. a frozen one) has
            # grad set to None; leave it untouched instead of failing
            if parameter.grad is None:
                continue
            parameter -= learning_rate * parameter.grad
        model_fn.zero_grad()


def fit(model_fn, loss_fn, X, y, learning_rate=1e-2, number_epochs=2, batch_size=64):
    """Train ``model_fn`` by sweeping over ``X`` and ``y`` in consecutive batches.

    Every epoch visits the rows in their stored order; the last batch of an
    epoch may be smaller than ``batch_size``.
    """
    number_samples, _ = X.shape
    batch_starts = range(0, number_samples, batch_size)
    for _epoch in range(number_epochs):
        for start in batch_starts:
            stop = start + batch_size
            partial_fit(model_fn, loss_fn, learning_rate, X[start:stop], y[start:stop])

In [None]:
model_fn = LogisticRegression()
loss_fn = F.cross_entropy

In [None]:
fit(model_fn, loss_fn, preprocessed_train_features, preprocessed_train_target, number_epochs=2)

In [None]:
# evaluate without recording gradients and run the forward pass only once
with torch.no_grad():
    training_output = model_fn(preprocessed_train_features)
    training_loss = loss_fn(training_output, preprocessed_train_target)
    training_accuracy = accuracy(training_output, preprocessed_train_target)

print(f"Training loss: {training_loss}")
print(f"Training accuracy: {training_accuracy}")

# Refactoring using `torch.nn.Linear`

Instead of defining and initializing `self._weights` and `self._bias`, and calculating `X  @ self._weights + self._bias`, we will instead use the Pytorch class [torch.nn.Linear](https://pytorch.org/docs/stable/nn.html#linear) to define a linear layer which does all that for us. Pytorch has many types of predefined layers that can greatly simplify our code, and since the library code is highly optimized using PyTorch's predefined layers often makes our code faster too.

In [None]:
from torch import nn


class LogisticRegression(nn.Module):
    """Multinomial logistic regression built on a single ``nn.Linear`` layer.

    Parameters
    ----------
    number_features : int, optional
        Length of each input row; defaults to a flattened 3 x 32 x 32 image.
    number_classes : int, optional
        Number of output logits; defaults to 10.
    """

    def __init__(self, number_features=3 * 32 * 32, number_classes=10):
        super().__init__()
        self._linear_layer = nn.Linear(number_features, number_classes)

    def forward(self, X):
        """Return the raw class scores (logits) for each row of ``X``."""
        return self._linear_layer(X)
    


In [None]:
model_fn = LogisticRegression()
loss_fn = F.cross_entropy

In [None]:
fit(model_fn, loss_fn, preprocessed_train_features, preprocessed_train_target, number_epochs=15)

In [None]:
# evaluate without recording gradients and run the forward pass only once
with torch.no_grad():
    training_output = model_fn(preprocessed_train_features)
    training_loss = loss_fn(training_output, preprocessed_train_target)
    training_accuracy = accuracy(training_output, preprocessed_train_target)

print(f"Training loss: {training_loss}")
print(f"Training accuracy: {training_accuracy}")

# Refactoring using `torch.optim`

PyTorch also has a package with various optimization algorithms, [torch.optim](https://pytorch.org/docs/stable/optim.html). We can use the `step` method of our optimizer to take an optimization step, instead of manually updating each parameter. Also note that now the `learning_rate` is a parameter of the optimizer and we do not need to manually pass it as an argument to the `fit` and `partial_fit` functions.

In [None]:
from torch import optim

In [None]:
def partial_fit(model_fn, loss_fn, X_batch, y_batch, opt):
    """Run one optimizer step of ``model_fn`` on a single batch.

    The loss ``loss_fn(model_fn(X_batch), y_batch)`` is back-propagated,
    ``opt`` applies the update, and the gradients are reset afterwards.
    """
    # forward pass
    predictions = model_fn(X_batch)
    batch_loss = loss_fn(predictions, y_batch)

    # back propagation followed by the parameter update
    batch_loss.backward()
    opt.step()
    opt.zero_grad()  # gradients accumulate, so reset them after every batch

        
def fit(model_fn, loss_fn, X, y, opt, number_epochs=2, batch_size=64):
    """Train ``model_fn`` with optimizer ``opt`` over consecutive batches of ``X`` and ``y``.

    Every epoch visits the rows in their stored order; the last batch of an
    epoch may be smaller than ``batch_size``.
    """
    number_samples, _ = X.shape
    batch_starts = range(0, number_samples, batch_size)
    for _epoch in range(number_epochs):
        for start in batch_starts:
            stop = start + batch_size
            partial_fit(model_fn, loss_fn, X[start:stop], y[start:stop], opt)

In [None]:
model_fn = LogisticRegression()
loss_fn = F.cross_entropy
opt = optim.SGD(model_fn.parameters(), lr=1e-2)

In [None]:
fit(model_fn, loss_fn, preprocessed_train_features, preprocessed_train_target, opt, number_epochs=15)

In [None]:
# evaluate without recording gradients and run the forward pass only once
with torch.no_grad():
    training_output = model_fn(preprocessed_train_features)
    training_loss = loss_fn(training_output, preprocessed_train_target)
    training_accuracy = accuracy(training_output, preprocessed_train_target)

print(f"Training loss: {training_loss}")
print(f"Training accuracy: {training_accuracy}")

# Refactor using `torch.utils.data.TensorDataset`

The [torch.utils.data](https://pytorch.org/docs/stable/data.html#module-torch.utils.data) module contains a number of useful classes that we can use to further simplify our code. PyTorch has an abstract `Dataset` class. A Dataset can be anything that has a `__len__` function (called by Python’s standard `len` function) and a `__getitem__` function as a way of indexing into it.

PyTorch’s `TensorDataset` is a `Dataset` wrapping tensors. By defining a length and way of indexing, this also gives us a way to iterate, index, and slice along the first dimension of a tensor. This will make it easier to access both the independent and dependent variables in the same line as we train.


In [None]:
from torch.utils import data

In [None]:
def fit(model_fn, loss_fn, data_set, number_samples, opt, number_epochs=2, batch_size=64):
    """Train ``model_fn`` on slices of ``data_set``.

    ``data_set`` must support slicing and return a ``(features, targets)``
    pair; ``number_samples`` is the number of leading items to sweep over in
    every epoch.
    """
    batch_starts = range(0, number_samples, batch_size)
    for _epoch in range(number_epochs):
        for start in batch_starts:
            X_batch, y_batch = data_set[start:(start + batch_size)]
            partial_fit(model_fn, loss_fn, X_batch, y_batch, opt)

In [None]:
model_fn = LogisticRegression()

In [None]:
torchinfo.summary(model_fn)

In [None]:
loss_fn = F.cross_entropy
train_dataset = data.TensorDataset(preprocessed_train_features, preprocessed_train_target)
opt = optim.SGD(model_fn.parameters(), lr=1e-2)

In [None]:
# note the annoying dependence on number of samples! Taking it from the
# dataset itself at least avoids relying on a variable set many cells earlier.
fit(model_fn, loss_fn, train_dataset, len(train_dataset), opt, number_epochs=15)

In [None]:
# evaluate without recording gradients and run the forward pass only once
with torch.no_grad():
    training_output = model_fn(preprocessed_train_features)
    training_loss = loss_fn(training_output, preprocessed_train_target)
    training_accuracy = accuracy(training_output, preprocessed_train_target)

print(f"Training loss: {training_loss}")
print(f"Training accuracy: {training_accuracy}")

# Refactor using `torch.utils.data.DataLoader`

Pytorch’s `DataLoader` is responsible for managing batches. You can create a `DataLoader` from any `Dataset`. `DataLoader` makes it easier to iterate over batches. Rather than having to use `data_set[start:(start + batch_size)]`, the `DataLoader` gives us each minibatch automatically.

In [None]:
data.DataLoader?

In [None]:
def fit(model_fn, loss_fn, data_loader, opt, number_epochs=2):
    """Train ``model_fn`` for ``number_epochs`` passes over ``data_loader``.

    ``data_loader`` must yield ``(features, targets)`` pairs.
    """
    for _epoch in range(number_epochs):
        for batch in data_loader:
            X_batch, y_batch = batch
            partial_fit(model_fn, loss_fn, X_batch, y_batch, opt)

In [None]:
model_fn = LogisticRegression()
loss_fn = F.cross_entropy
train_data_loader = data.DataLoader(train_dataset, batch_size=batch_size, num_workers=4, shuffle=True)
opt = optim.SGD(model_fn.parameters(), lr=1e-2)

In [None]:
# now we no longer have the annoying dependency on number of samples!
fit(model_fn, loss_fn, train_data_loader, opt)

In [None]:
# evaluate without recording gradients and run the forward pass only once
with torch.no_grad():
    training_output = model_fn(preprocessed_train_features)
    training_loss = loss_fn(training_output, preprocessed_train_target)
    training_accuracy = accuracy(training_output, preprocessed_train_target)

print(f"Training loss: {training_loss}")
print(f"Training accuracy: {training_accuracy}")

Thanks to PyTorch’s `torch.nn.Module`, `torch.nn.Parameter`, `Dataset`, and `DataLoader`, our training loop is now dramatically smaller and easier to understand. Let’s now try to add the basic features necessary to create effective models in practice.

# Adding Validation

In the first part of this tutorial, we were just trying to get a reasonable training loop set up for use on our training data. In reality, you always should also have a validation set, in order to identify if you are overfitting.

Shuffling the training data is important to prevent correlation between batches and overfitting. On the other hand, the validation loss will be identical whether we shuffle the validation set or not. Since shuffling takes extra time, it makes no sense to shuffle the validation data.

We’ll use a batch size for the validation set that is twice as large as that for the training set. This is because the validation set does not need backpropagation and thus takes less memory (it doesn’t need to store the gradients). We take advantage of this to use a larger batch size and compute the loss more quickly.

In [None]:
def validate(epoch, model_fn, loss_fn, val_data_loader):
    """Compute and print the sample-weighted mean validation loss.

    Parameters
    ----------
    epoch : int
        Epoch number, used only in the printed message.
    model_fn : torch.nn.Module
        Model to evaluate; it is switched to evaluation mode.
    loss_fn : callable
        Called as ``loss_fn(model_fn(X), y)``; must return a scalar tensor
        holding the mean loss of the batch.
    val_data_loader : iterable
        Yields ``(X, y)`` batches.

    Returns
    -------
    float
        Mean loss over all validation samples.

    Raises
    ------
    ValueError
        If ``val_data_loader`` yields no samples.
    """
    model_fn.eval()
    total_loss = 0.0
    total_samples = 0
    with torch.no_grad():
        for X, y in val_data_loader:
            number_in_batch = len(X)
            # .item() converts the 0-dim loss tensor into a Python float, so
            # the aggregation no longer relies on NumPy converting tensors
            # (which fails for tensors that do not live on the CPU)
            total_loss += loss_fn(model_fn(X), y).item() * number_in_batch
            total_samples += number_in_batch
    if total_samples == 0:
        raise ValueError("val_data_loader did not yield any samples")
    val_loss = total_loss / total_samples
    print(f"Training epoch: {epoch}, Validation loss: {val_loss}")
    return val_loss


def fit(model_fn, loss_fn, train_data_loader, opt, val_data_loader=None, number_epochs=2):
    """Train ``model_fn`` and, when a validation loader is given, validate after every epoch."""
    run_validation = val_data_loader is not None
    for epoch in range(number_epochs):
        # switch back to training mode (validation leaves the model in eval mode)
        model_fn.train()
        for batch in train_data_loader:
            X_batch, y_batch = batch
            partial_fit(model_fn, loss_fn, X_batch, y_batch, opt)

        if run_validation:
            validate(epoch, model_fn, loss_fn, val_data_loader)

In [None]:
model_fn = LogisticRegression()
loss_fn = F.cross_entropy
train_data_loader = data.DataLoader(train_dataset, batch_size=64, shuffle=True)
opt = optim.SGD(model_fn.parameters(), lr=1e-2)

val_dataset = data.TensorDataset(preprocessed_val_features, preprocessed_val_target)
# no gradients are needed for validation, so use twice the training batch
# size; shuffling is unnecessary because the loss does not depend on the order
val_data_loader = data.DataLoader(val_dataset, batch_size=2 * 64)

In [None]:
fit(model_fn, loss_fn, train_data_loader, opt, val_data_loader, number_epochs=2)

In [None]:
# evaluate without recording gradients and run the forward pass only once
with torch.no_grad():
    training_output = model_fn(preprocessed_train_features)
    training_loss = loss_fn(training_output, preprocessed_train_target)
    training_accuracy = accuracy(training_output, preprocessed_train_target)

print(f"Training loss: {training_loss}")
print(f"Training accuracy: {training_accuracy}")

### Exercise: Logging Accuracy during Validation

Make the necessary changes to the `validate` function so that you log out your model's accuracy on the validation data after every epoch.

In [None]:
# insert code here!

### Exercise: Overfitting or Underfitting?

Train your model for 15-20 epochs. Do you think the model is overfitting or underfitting? Why?

In [None]:
# insert code here!

# Switching to CNN

We are now going to build our neural network with two convolutional-subsampling layers. Because none of the functions in the previous section assume anything about the model form, we’ll be able to use them to train a CNN without any modification!

The first architecture that we will implement is the classic [LeNet-5](https://www.datasciencecentral.com/lenet-5-a-classic-cnn-architecture/) architecture. We will use PyTorch’s predefined [torch.nn.Conv2d](https://pytorch.org/docs/stable/nn.html#conv2d) class as our convolutional layer. We define a CNN with two convolutional layers. Each convolution is followed by a [hyperbolic tangent](https://pytorch.org/docs/stable/generated/torch.nn.Tanh.html#torch.nn.Tanh) non-linear activation function and [average pooling](https://pytorch.org/docs/stable/generated/torch.nn.AvgPool2d.html#torch.nn.AvgPool2d). After the two convolutional-subsampling layers, we flatten the result and add densely connected linear layers.

In [None]:
class LeNet5(nn.Module):
    """LeNet-5 style CNN mapping flattened 3 x 32 x 32 images to 10 class logits.

    Two convolution / tanh / average-pooling stages are followed by three
    dense layers; the last dense layer produces one logit per class.
    """

    def __init__(self):
        super().__init__()
        self._conv1 = nn.Conv2d(3, 6, kernel_size=5, stride=1, padding=0)
        self._conv2 = nn.Conv2d(6, 16, kernel_size=5, stride=1, padding=0)
        self._dense1 = nn.Linear(400, 120)
        self._dense2 = nn.Linear(120, 84)
        # output layer: without it the network emitted 84 values instead of
        # one logit for each of the 10 classes
        self._dense3 = nn.Linear(84, 10)

    def forward(self, X):
        """Return the class logits for a batch of flattened images."""
        X = X.view(-1, 3, 32, 32) # implicit knowledge of CIFAR-10 data shape!
        X = F.avg_pool2d(torch.tanh(self._conv1(X)), 2)  # -> (N, 6, 14, 14)
        X = F.avg_pool2d(torch.tanh(self._conv2(X)), 2)  # -> (N, 16, 5, 5)
        X = X.view(X.size(0), -1)                        # -> (N, 400)
        X = torch.tanh(self._dense1(X))
        X = torch.tanh(self._dense2(X))
        return self._dense3(X)
    

In [None]:
model_fn = LeNet5()

In [None]:
torchinfo.summary(model_fn)

In [None]:
opt = optim.SGD(model_fn.parameters(), lr=1e-2, momentum=0.9)

In [None]:
# note that we can re-use the loss function as well as the training and validation data loaders
fit(model_fn, loss_fn, train_data_loader, opt, val_data_loader)

# Refactor using `torch.nn.Sequential`

PyTorch has another handy class we can use to simplify our code: [torch.nn.Sequential](https://pytorch.org/docs/stable/nn.html#sequential). A `Sequential` object runs each of the modules contained within it, in a sequential manner. This is a simpler way of writing our neural network.

To take advantage of this, we need to be able to easily define a custom layer from a given function. For instance, PyTorch doesn’t have a view layer, and we need to create one for our network. `LambdaLayer` will create a layer that we can then use when defining a network with `Sequential`.

In [None]:
class LambdaLayer(nn.Module):
    """Layer that applies an arbitrary callable to its input.

    It has no parameters of its own; it only lets a plain function take part
    in a module container such as ``nn.Sequential``.
    """

    def __init__(self, f):
        super().__init__()
        self._f = f

    def forward(self, X):
        """Return ``f(X)`` for the wrapped callable ``f``."""
        transformed = self._f(X)
        return transformed


In [None]:
# LeNet-5 style network expressed as a flat sequence of layers
model_fn = nn.Sequential(
    # restore the (channels, height, width) layout from the flattened pixel rows
    LambdaLayer(lambda X: X.view(-1, 3, 32, 32)),
    nn.Conv2d(3, 6, kernel_size=5, stride=1, padding=0),
    nn.Tanh(),
    nn.AvgPool2d(kernel_size=2, stride=2),
    nn.Conv2d(6, 16, kernel_size=5, stride=1, padding=0),
    nn.Tanh(),
    nn.AvgPool2d(kernel_size=2, stride=2),
    # flatten the feature maps of every sample before the dense layers
    # (16 * 5 * 5 = 400 values for 32 x 32 inputs)
    LambdaLayer(lambda X: X.view(X.size(0), -1)),
    nn.Linear(400, 120),
    nn.Tanh(),
    nn.Linear(120, 84),
    nn.Tanh(),
    # output layer: 10 logits
    nn.Linear(84, 10)
)

opt = optim.SGD(model_fn.parameters(), lr=1e-2, momentum=0.9)

In [None]:
fit(model_fn,
    loss_fn,
    train_data_loader,
    opt,
    val_data_loader,
    number_epochs=2)

# Generalize our pipeline by wrapping our DataLoader

Our CNN is fairly concise, but it only works with CIFAR-10, because it assumes the input is a 3 * 32 * 32 long vector. Let’s get rid of this assumption, so our model works with any three channel image. First, we can remove the initial Lambda layer by moving the data preprocessing into a generator.

In [None]:
class WrappedDataLoader:
    """Iterable that applies a function to every batch of another loader.

    ``f`` is called with the unpacked elements of each batch and its return
    value is yielded in place of the batch.
    """

    def __init__(self, data_loader, f):
        self._data_loader, self._f = data_loader, f

    def __len__(self):
        """Number of batches, as reported by the wrapped loader."""
        return len(self._data_loader)

    def __iter__(self):
        """Yield ``f(*batch)`` for every batch of the wrapped loader."""
        yield from (self._f(*batch) for batch in self._data_loader)


In [None]:
# same network as before, but without the initial reshaping layer: the input
# must already arrive as (N, 3, height, width) image batches
model_fn = nn.Sequential(
    nn.Conv2d(3, 6, kernel_size=5, stride=1, padding=0),
    nn.Tanh(),
    nn.AvgPool2d(kernel_size=2, stride=2),
    nn.Conv2d(6, 16, kernel_size=5, stride=1, padding=0),
    nn.Tanh(),
    nn.AvgPool2d(kernel_size=2, stride=2),
    # flatten the feature maps of every sample before the dense layers
    LambdaLayer(lambda X: X.view(X.size(0), -1)),
    # NOTE: 400 input features correspond to 32 x 32 images (16 * 5 * 5)
    nn.Linear(400, 120),
    nn.Tanh(),
    nn.Linear(120, 84),
    nn.Tanh(),
    # output layer: 10 logits
    nn.Linear(84, 10)
)

In [None]:
torchinfo.summary(model_fn, input_size=(64, 3, 32, 32))

In [None]:
opt = optim.SGD(model_fn.parameters(), lr=1e-2, momentum=0.9)

_preprocess = lambda X, y: (X.view(-1, 3, 32, 32), y)
# Wrap freshly built loaders instead of re-binding the already wrapped names,
# so that re-running this cell does not stack one more wrapper on every run.
# The validation loader uses twice the training batch size.
train_data_loader = WrappedDataLoader(
    data.DataLoader(train_dataset, batch_size=64, shuffle=True), _preprocess)
val_data_loader = WrappedDataLoader(
    data.DataLoader(val_dataset, batch_size=2 * 64), _preprocess)

In [None]:
fit(model_fn,
    loss_fn,
    train_data_loader,
    opt,
    val_data_loader)

# Add a learning rate scheduler

Adjusting the learning rate is often critical to achieving good convergence to a local optimum. Fortunately, adjusting the learning rate using PyTorch requires only minor modifications to our training loop. While the "best" way to adjust the learning rate is nearly always problem specific, starting with larger values and then decaying the learning rate each epoch is often a good strategy to try first. See the official PyTorch documentation for more on [tuning learning rates](https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate).

In [None]:
optim.lr_scheduler?

In [None]:
def fit(model_fn, loss_fn, train_data_loader, opt, lr_scheduler, val_data_loader=None, number_epochs=2):
    """Train ``model_fn``, validating (optionally) and stepping ``lr_scheduler`` once per epoch."""
    run_validation = val_data_loader is not None
    for epoch in range(number_epochs):
        # switch back to training mode (validation leaves the model in eval mode)
        model_fn.train()
        for batch in train_data_loader:
            X_batch, y_batch = batch
            partial_fit(model_fn, loss_fn, X_batch, y_batch, opt)

        if run_validation:
            validate(epoch, model_fn, loss_fn, val_data_loader)

        # adjust the learning rate once the epoch is complete
        lr_scheduler.step()

In [None]:
opt = optim.SGD(model_fn.parameters(), lr=1e-2, momentum=0.9)
# NOTE(review): `verbose` is reported as deprecated by recent PyTorch releases;
# confirm against the installed version.
lr_scheduler = optim.lr_scheduler.ExponentialLR(opt, gamma=0.9, verbose=True)

# train_data_loader and val_data_loader were already wrapped with the reshaping
# step in the previous section; wrapping them again here would only apply the
# same view a second time to every batch.

In [None]:
fit(model_fn,
    loss_fn,
    train_data_loader,
    opt,
    lr_scheduler,
    val_data_loader)

# Experimenting with different architectures

In practice, it is unlikely that you will be designing your own neural network architectures from scratch. Instead you will be starting from some pre-existing neural network architecture. The [torchvision](https://pytorch.org/vision/stable/) project contains a number of neural network architectures that have found widespread use in computer vision applications.

In [None]:
from torchvision import models

In [None]:
# list the public names exposed by torchvision.models (a bare `models.` is an
# incomplete expression and stops "Run All" with a SyntaxError)
[name for name in dir(models) if not name.startswith("_")]

In [None]:
model_fn = models.resnet18(num_classes=10)

In [None]:
torchinfo.summary(model_fn, input_size=(64, 3, 32, 32))

Training this model with just a few CPUs would be impossible. In the next section we will see how to train large models like this using a GPU.