In [7]:
import sys
sys.path.insert(0, "../..")

import torch
import torch.nn as nn
from src.data import make_dataset
from pathlib import Path

In [8]:
datadir = Path("../../data/raw/")
train_dataloader, test_dataloader = make_dataset.get_MNIST(datadir, batch_size=64) 

In [9]:
len(train_dataloader), len(test_dataloader)

(938, 157)

We can obtain an item:

In [10]:
x, y = next(iter(train_dataloader))
x.shape, y.shape

(torch.Size([64, 1, 28, 28]), torch.Size([64]))

The image follows the channels-first convention: (channel, width, height). The label is an integer.

Lets pull this through a Conv2d layer:

In [11]:
conv = nn.Conv2d(
    in_channels=1, 
    out_channels=32,
    kernel_size=3,
    padding=(1,1))
out = conv(x)
out.shape

torch.Size([64, 32, 28, 28])

What is happening here? Can you explain all the parameters, and relate them to the outputshape?

Let's see what happens if we change the padding:

In [12]:
conv = nn.Conv2d(
    in_channels=1, 
    out_channels=32,
    kernel_size=3,
    padding=(0,0))
out = conv(x)
out.shape

torch.Size([64, 32, 26, 26])

And if we change the stride from the default 1 to 2:

In [13]:
conv = nn.Conv2d(
    in_channels=1, 
    out_channels=32,
    kernel_size=3,
    padding=(1,1),
    stride=2)
out = conv(x)
out.shape

torch.Size([64, 32, 14, 14])

As you can see, you need to think about what is going in and out of the convolution. We can stitch multiple layers together like this:

In [36]:
filter_size = 32

convolutions = nn.Sequential(
    nn.Conv2d(1, filter_size, kernel_size=3, stride=1, padding=1),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=2),
    nn.Conv2d(filter_size, filter_size, kernel_size=3, stride=1, padding=0),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=2),
    nn.Conv2d(filter_size, filter_size, kernel_size=3, stride=1, padding=0),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=2),
)
out = convolutions(x)
out.shape

torch.Size([64, 32, 2, 2])

In [38]:

convolutions().size

TypeError: forward() missing 1 required positional argument: 'input'

As you can see, the dimensions of the featuremap have become really small. You need to take this into account: If we would have started with a smaller image, we could get errors...

In [15]:
x_too_small = torch.rand((32, 1, 12, 12))

try:
    convolutions(x_too_small)
except RuntimeError as err:
    print("ERROR:", err)

ERROR: Calculated padded input size per channel: (2 x 2). Kernel size: (3 x 3). Kernel size can't be greater than actual input size


At this point our `out` has 32 activation maps, each 2x2 big.

If we want to pull the activation maps through a neural network (A dense layer) we will need to flatten them (do you understand what happens if you dont do that?)

In [16]:
input_nn = nn.Flatten()(out)
input_nn.shape

torch.Size([64, 128])

Let's combine it all together:

In [34]:
import torch
from torch import nn

# Get cpu or gpu device for training.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

# Define model
class CNN(nn.Module ):
    def __init__(self,filter_size):
        super().__init__()
        self.filter_size = filter_size

        self.convolutions = nn.Sequential(
            nn.Conv2d(1, self.filter_size, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
            nn.Conv2d(self.filter_size, self.filter_size, kernel_size=3, stride=1, padding=0),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
            nn.Conv2d(self.filter_size, self.filter_size, kernel_size=3, stride=1, padding=0),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
        )
        
        self.dense = nn.Sequential(
            nn.Flatten(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 10)
        )

    def forward(self, x):
        x = self.convolutions(x)
        logits = self.dense(x)
        return logits

model = CNN(filter_size=32).to(device)
print(model)

Using cpu device
CNN(
  (convolutions): Sequential(
    (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1))
    (4): ReLU()
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1))
    (7): ReLU()
    (8): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (dense): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1): Linear(in_features=128, out_features=64, bias=True)
    (2): ReLU()
    (3): Linear(in_features=64, out_features=32, bias=True)
    (4): ReLU()
    (5): Linear(in_features=32, out_features=10, bias=True)
  )
)


In [20]:
from src.models import train_model
train_model.count_parameters(model)

29482

We have about 30k parameters. You will always need to judge that relative to your input data: how many observations do you have? Do you think the model needs a lot of complexity, or not so much?

What is the trade off between adding more complexity? Or reducing complexity?

Try to answer this trade of in terms of:

- speed
- generalization
- accuracy

We will need to tell the model how good it is performing. To do that, we will need to pick a loss function $\mathcal{L}$. We will discuss this in more depth, but for now, just take my word for it that a CrossEntropyLoss is a good pick.

In [21]:
import torch.optim as optim
from src.models import metrics
optimizer = optim.Adam
loss_fn = torch.nn.CrossEntropyLoss()
accuracy = metrics.Accuracy()

In [22]:
yhat = model(x)
accuracy(y, yhat)

tensor(0.1562)

We now have everything we need to train the model.

In [24]:
model = train_model.trainloop(
    epochs=1,
    model=model,
    optimizer=optimizer,
    learning_rate=1e-3,
    loss_fn=loss_fn,
    metrics=[accuracy],
    train_dataloader=train_dataloader,
    test_dataloader=test_dataloader,
    log_dir="../../models/test/",
    train_steps=len(train_dataloader),
    eval_steps=len(test_dataloader),
)

2022-12-04 10:53:55.334 | INFO     | src.data.data_tools:dir_add_timestamp:114 - Logging to ../../models/test/20221204-1053
100%|██████████| 938/938 [00:22<00:00, 41.89it/s]
2022-12-04 10:54:20.229 | INFO     | src.models.train_model:trainloop:171 - Epoch 0 train 0.2265 test 0.2825 metric ['0.9010']
100%|██████████| 1/1 [00:24<00:00, 24.72s/it]


In [None]:
%tensorboard