In [1]:
# Sinan Yumurtaci
# Code along for Chapter 4 of fast.ai's Practical Deep Learning for Coders

# Digit classification task
# Distinguish between images of digits
# Data source: MNIST

# 2nd section dedicated for using a NN on MNIST

# 2021-03-30

In [4]:
# imports as we follow along with fastbook

from fastbook import *
setup_book()
from fastai.vision.widgets import *

matplotlib.rc("image", cmap = "Greys")

In [5]:
# load the MNIST dataset
# book loads only 3 and 7 for demonstration.

# I'll be working on the classification for 3s and 5s
# so that I learn new stuff!
path = untar_data(URLs.MNIST)
# sorted file listings for the two digit classes we want to tell apart
threes, fives = [(path/"training"/digit).ls().sorted() for digit in ("3", "5")]
print(f"There are {len(threes)} 3s and {len(fives)} 5s.")

# open every image file and turn it into a tensor
three_tensors = [tensor(Image.open(img_file)) for img_file in threes]
five_tensors = [tensor(Image.open(img_file)) for img_file in fives]

print(f"Loaded {len(three_tensors)} 3s and {len(five_tensors)} 5s!")

# stack each class into a single tensor so PyTorch can process it in one go,
# then rescale the pixel values from [0, 255] to [0, 1]
stacked_threes = torch.stack(three_tensors).float().div(255)
stacked_fives = torch.stack(five_tensors).float().div(255)

There are 6131 3s and 5421 5s.
Loaded 6131 3s and 5421 5s!


In [8]:
# put our training data in one variable: all 3s first, then all 5s,
# with every image flattened into a single row of 28*28 values
train_x = torch.cat((stacked_threes, stacked_fives), dim=0).view(-1, 28 * 28)

In [9]:
# create a tensor of labels for our training data: 1 marks a 3, 0 marks a 5,
# in the same order in which train_x was concatenated
train_y = tensor([1] * len(threes) + [0] * len(fives))[:, None]  # trailing axis -> one label per row
(train_x.shape, train_y.shape)

(torch.Size([11552, 784]), torch.Size([11552, 1]))

In [11]:
# pair every input row with its label: the dataset is a list of (x, y) tuples
dset = [(x, y) for x, y in zip(train_x, train_y)]

In [22]:
# repeat the above steps for our test set: load, stack, and rescale to [0, 1]
test_3_tens = torch.stack(
    [tensor(Image.open(img_file)) for img_file in (path/'testing'/'3').ls()]
).float().div(255)
test_5_tens = torch.stack(
    [tensor(Image.open(img_file)) for img_file in (path/'testing'/'5').ls()]
).float().div(255)

# flatten the images, build the matching labels (1 for a 3, 0 for a 5), and pair them up
test_x = torch.cat((test_3_tens, test_5_tens), dim=0).view(-1, 28 * 28)
test_y = tensor([1] * len(test_3_tens) + [0] * len(test_5_tens))[:, None]
test_dset = [(x, y) for x, y in zip(test_x, test_y)]

(len(dset), len(test_dset))

(11552, 1902)

In [25]:
# create function that will return randomly initialized parameters of the specified size
def init_params(size, std = 1.0):
    """Return a trainable tensor of shape `size` filled with normally distributed values.

    size: an int or a tuple of ints, passed straight to `torch.randn`.
    std:  factor the standard-normal samples are multiplied by.

    The returned tensor has `requires_grad` switched on so that gradients can be
    computed with respect to it.
    """
    samples = torch.randn(size).mul(std)
    return samples.requires_grad_()

# a column of 28*28 weights (one per input value) and a single bias term
weights = init_params((28 * 28, 1))
bias = init_params(1)

In [26]:
# create function that will calculate prediction using the input, and the weights and the bias
def linear_calc(xb):
    """Linear model: matrix-multiply the batch `xb` by the global `weights`, then add the global `bias`."""
    return torch.matmul(xb, weights) + bias

# calculate predictions for the training set using our initial, random parameters
# (these are the raw outputs of linear_calc: no sigmoid is applied, so they are not limited to [0, 1])
preds = linear_calc(train_x)
preds

tensor([[-3.3087],
        [-3.4366],
        [ 0.8637],
        ...,
        [ 7.1891],
        [ 9.6996],
        [16.6568]], grad_fn=<AddBackward0>)

In [30]:
# calculate our accuracy on the training set
# an output above 0.5 counts as predicting a 3; anything else counts as a 5
# NOTE(review): preds are raw linear outputs here, whereas batch_accuracy further down
# applies a sigmoid before thresholding at 0.5 -- confirm which decision boundary is intended.
corrects = (preds > 0.5).float().eq(train_y)
corrects.float().mean().item()

0.36088988184928894

## A Loss Function
Accuracy alone won't work as a loss function, because it is not continuous: it only changes when a prediction crosses the decision threshold, so its gradient is zero almost everywhere. If we change one of the parameters slightly, the accuracy does not change, as demonstrated below:

In [31]:
# Nudge a single weight by a tiny factor and recompute the accuracy.
# `weights` is a leaf tensor that requires grad, so PyTorch refuses to modify it in place
# while autograd is recording; the update therefore runs with gradient tracking disabled.
with torch.no_grad():
    weights[0] *= 1.00001
preds = linear_calc(train_x)
((preds>0.5).float() == train_y).float().mean().item()

0.36088988184928894

## What's the Problem?
The accuracy is the same after the small change, so our model has no way of knowing which small steps to take to better fit the training data. We need another measure that is continuous and responds to small changes like the one above, so that we can deduce the next small step to take in the optimization loop.

In [33]:
# create a loss function for this specific MNIST task of distinguishing between 3s and 5s
def mnist_loss(predictions, targets):
    """Mean distance between the sigmoid of `predictions` and the 0/1 `targets`.

    predictions: raw model outputs; they are squashed into (0, 1) with a sigmoid.
    targets:     tensor of labels, compared element-wise against 1.

    Where the target is 1 the per-item loss is `1 - p`; elsewhere it is `p`,
    with `p` the squashed prediction. The mean over all items is returned.
    """
    probs = torch.sigmoid(predictions)
    per_item = torch.where(targets == 1, 1 - probs, probs)
    return per_item.mean()

## Important Note: Metrics vs Loss Functions
Metrics are what we care about as humans. A metric is usually some form of accuracy; we care about how accurate models are.

The machine learning model does not directly optimize for the metric. It instead minimizes its loss function. This is easier because loss functions are designed to have meaningful gradients.

Improving the loss function also tends to improve the metric simply because we design loss functions specifically for that use case.

## Putting It All Together

optimization_step:

    pred = model(x)
    loss = loss_func(pred, y)
    loss.backward()
    params -= params.grad * lr

In [34]:
# start again from fresh random parameters before training
weights = init_params((28 * 28, 1))
bias = init_params(1)

In [37]:
# wrap the dataset (pairs of inputs and targets) in a DataLoader that serves mini-batches of 256
dl = DataLoader(dset, batch_size=256)
# grab the first mini-batch to check the shapes
xb, yb = first(dl)
(xb.shape, yb.shape)

(torch.Size([256, 784]), torch.Size([256, 1]))

In [39]:
# same for our testing set: test_dset served in mini-batches of 256
test_dl = DataLoader(test_dset, batch_size=256) # same for our testing set

In [40]:
# create a sub-function that will do a forward and backward pass
def calc_grad(xb, yb, model):
    """Forward pass of `model` on `xb`, `mnist_loss` against `yb`, then backpropagation.

    Returns nothing; the only effect is the `backward()` call on the loss.
    """
    mnist_loss(model(xb), yb).backward()

# function for one epoch (iteration over the dataset)
def train_epoch(model, lr, params):
    """Run one pass over the global DataLoader `dl`, updating `params` after every mini-batch.

    model:  callable that maps a batch of inputs to predictions.
    lr:     learning rate multiplied into each gradient.
    params: iterable of tensors to update; it is walked once per mini-batch.
    """
    for inputs, targets in dl:
        calc_grad(inputs, targets, model)
        for param in params:
            # going through .data rather than the tensor itself keeps this update out of the gradient calculation
            update = param.grad * lr
            param.data -= update
            # clear the gradient so the next mini-batch starts from zero
            param.grad.zero_()

In [41]:
# create functions for tracking the accuracy of our model
def batch_accuracy(xb, yb):
    """Fraction of items in a batch whose thresholded prediction equals the label.

    xb: model outputs for the batch (despite the name, these are predictions, as
        passed in by `epoch_accuracy`); a sigmoid is applied before thresholding at 0.5.
    yb: labels the thresholded predictions are compared with.
    """
    is_positive = torch.sigmoid(xb) > 0.5
    hits = is_positive == yb
    return hits.float().mean()

def epoch_accuracy(model):
    """Accuracy of `model` over the global `test_dl`, rounded to 4 decimal places.

    The result is the plain mean of the per-batch accuracies, so every batch
    carries the same weight regardless of how many items it holds.
    """
    batch_accs = []
    for inputs, targets in test_dl:
        batch_accs.append(batch_accuracy(model(inputs), targets))
    mean_acc = torch.stack(batch_accs).mean().item()
    return round(mean_acc, 4)

# accuracy of linear_calc over test_dl with the current parameters
epoch_accuracy(linear_calc) # this is our starting accuracy with random parameters


0.6444

In [48]:
# train for 100 epochs to test our approach
lr = 1.0
params = (weights, bias)  # both tensors travel together as one tuple

for i in range(100):
    train_epoch(linear_calc, lr, params)
    # report the accuracy after every epoch, all on one line
    print(epoch_accuracy(linear_calc), end=' ')

0.9036 0.9061 0.908 0.911 0.9124 0.9139 0.9154 0.9173 0.9173 0.9198 0.9222 0.9242 0.9251 0.9261 0.9266 0.9276 0.9281 0.9295 0.931 0.9315 0.931 0.9315 0.931 0.9315 0.9315 0.9329 0.9329 0.9325 0.9344 0.9349 0.9349 0.9349 0.9349 0.9349 0.9349 0.9349 0.9349 0.9354 0.9369 0.9378 0.9383 0.9383 0.9393 0.9393 0.9393 0.9398 0.9408 0.9408 0.9412 0.9417 0.9422 0.9422 0.9427 0.9432 0.9432 0.9437 0.9432 0.9442 0.9442 0.9447 0.9452 0.9452 0.9452 0.9456 0.9461 0.9471 0.9471 0.9476 0.9476 0.9471 0.9471 0.9471 0.9476 0.9476 0.9476 0.9476 0.9476 0.9476 0.9476 0.9481 0.9481 0.9476 0.9476 0.9481 0.9481 0.9486 0.9486 0.9486 0.9486 0.9491 0.9491 0.9491 0.9491 0.9491 0.9491 0.9495 0.9495 0.95 0.95 0.9505 

## Using PyTorch Modules
Modules are classes that encapsulate the most common behavior we've implemented above. We can use `nn.Linear` in place of our simple linear model above.

In [None]:
# a linear layer taking 28*28 input values and producing a single output
linear_model = nn.Linear(in_features=28 * 28, out_features=1)

# optimizer will handle the changes in the parameters
class BasicOptim:
    """Minimal gradient-descent optimizer.

    params: iterable of tensors to update; it is copied into a list so that a
            one-shot iterator (such as the result of `module.parameters()`) can
            be walked on every call.
    lr:     learning rate multiplied into each gradient.
    """
    def __init__(self,params,lr): self.params,self.lr = list(params),lr

    def step(self, *args, **kwargs):
        """Subtract `grad * lr` from every parameter that currently has a gradient."""
        for p in self.params:
            # A parameter has no gradient before the first backward pass, right after
            # zero_grad(), or when it took no part in the loss; skip it instead of
            # failing on `None.data`.
            if p.grad is None: continue
            p.data -= p.grad.data * self.lr

    def zero_grad(self, *args, **kwargs):
        """Drop the gradient of every parameter by setting it to None."""
        for p in self.params: p.grad = None

In [None]:
# create an optimizer instance over the linear model's parameters, using the learning rate `lr`
opt = BasicOptim(params=linear_model.parameters(), lr=lr)