In [None]:
#hide
! [ -e /content ] && pip install -Uqq fastbook
import fastbook
fastbook.setup_book()

In [None]:
#hide
from fastai.vision.all import *
from fastbook import *

matplotlib.rc('image', cmap='Greys')

# Under the Hood: Training a Digit Classifier

## Pixels: The Foundations of Computer Vision

## Sidebar: Tenacity and Deep Learning

## End sidebar

In [None]:
path = untar_data(URLs.MNIST_SAMPLE)

In [None]:
#hide
Path.BASE_PATH = path

In [None]:
path.ls()

In [None]:
(path/'train').ls()

In [None]:
# Sorted listings of the files under train/3 and train/7 (opened as images in later cells).
threes = (path/'train'/'3').ls().sorted()
sevens = (path/'train'/'7').ls().sorted()
# Display the listing of 3s as the cell output.
threes

In [None]:
im3_path = threes[1]
im3 = Image.open(im3_path)
im3

In [None]:
array(im3)[4:10,4:10]

In [None]:
tensor(im3)[4:10,4:10]

In [None]:
im3_t = tensor(im3)
df = pd.DataFrame(im3_t[4:15,4:22])
df.style.set_properties(**{'font-size':'6pt'}).background_gradient('Greys')

## First Try: Pixel Similarity

In [None]:
# Open every file in `sevens` and `threes` as an image and convert each one to a tensor.
seven_tensors = [tensor(Image.open(o)) for o in sevens]
three_tensors = [tensor(Image.open(o)) for o in threes]
# Display how many tensors were built for each digit (3s first, then 7s).
len(three_tensors),len(seven_tensors)

In [None]:
show_image(three_tensors[1]);

In [None]:
# Stack each list of image tensors into one tensor, cast to float and divide by 255.
# presumably the source pixels are 0-255 integers, so this rescales them to 0-1 — confirm
stacked_sevens = torch.stack(seven_tensors).float()/255
stacked_threes = torch.stack(three_tensors).float()/255
# Display the shape of the stacked 3s.
stacked_threes.shape

In [None]:
len(stacked_threes.shape)

In [None]:
stacked_threes.ndim

In [None]:
mean3 = stacked_threes.mean(0)
show_image(mean3);

In [None]:
mean7 = stacked_sevens.mean(0)
show_image(mean7);

In [None]:
a_3 = stacked_threes[1]
show_image(a_3);

In [None]:
# Two distances between the sample 3 and the mean 3:
# the mean absolute difference ...
dist_3_abs = (a_3 - mean3).abs().mean()
# ... and the square root of the mean squared difference.
dist_3_sqr = ((a_3 - mean3)**2).mean().sqrt()
dist_3_abs,dist_3_sqr

In [None]:
dist_7_abs = (a_3 - mean7).abs().mean()
dist_7_sqr = ((a_3 - mean7)**2).mean().sqrt()
dist_7_abs,dist_7_sqr

In [None]:
F.l1_loss(a_3.float(),mean7), F.mse_loss(a_3,mean7).sqrt()

### NumPy Arrays and PyTorch Tensors

In [None]:
data = [[1,2,3],[4,5,6]]
arr = array (data)
tns = tensor(data)

In [None]:
arr  # numpy

In [None]:
tns  # pytorch

In [None]:
tns[1]

In [None]:
tns[:,1]

In [None]:
tns[1,1:3]

In [None]:
tns+1

In [None]:
tns.type()

In [None]:
tns*1.5

## Computing Metrics Using Broadcasting

In [None]:
# Build the validation tensors the same way as the training stacks: open each file
# under valid/3 and valid/7, stack, cast to float and divide by 255.
# Unlike the training listings, these `.ls()` results are not sorted.
valid_3_tens = torch.stack([tensor(Image.open(o)) 
                            for o in (path/'valid'/'3').ls()])
valid_3_tens = valid_3_tens.float()/255
valid_7_tens = torch.stack([tensor(Image.open(o)) 
                            for o in (path/'valid'/'7').ls()])
valid_7_tens = valid_7_tens.float()/255
# Display both shapes.
valid_3_tens.shape,valid_7_tens.shape

In [None]:
def mnist_distance(a,b):
    """Mean absolute difference between `a` and `b`, averaged over the last two axes.

    Any leading axes of the (broadcast) difference are kept, so passing a stack of
    images for `a` and a single image for `b` yields one distance per image.
    """
    abs_diff = (a - b).abs()
    return abs_diff.mean(dim=(-1, -2))
mnist_distance(a_3, mean3)

In [None]:
valid_3_dist = mnist_distance(valid_3_tens, mean3)
valid_3_dist, valid_3_dist.shape

In [None]:
tensor([1,2,3]) + tensor(1)

In [None]:
(valid_3_tens-mean3).shape

In [None]:
def is_3(x):
    """True where `x` is closer (by `mnist_distance`) to `mean3` than to `mean7`."""
    dist_to_3 = mnist_distance(x, mean3)
    dist_to_7 = mnist_distance(x, mean7)
    return dist_to_3 < dist_to_7

In [None]:
is_3(a_3), is_3(a_3).float()

In [None]:
is_3(valid_3_tens)

In [None]:
# Fraction of validation 3s that is_3 labels as a 3.
accuracy_3s =      is_3(valid_3_tens).float() .mean()
# Fraction of validation 7s that is_3 does NOT label as a 3 (hence the `1 - ...`).
accuracy_7s = (1 - is_3(valid_7_tens).float()).mean()

# Display both accuracies and their unweighted average.
accuracy_3s,accuracy_7s,(accuracy_3s+accuracy_7s)/2

## Stochastic Gradient Descent (SGD)

In [None]:
gv('''
init->predict->loss->gradient->step->stop
step->predict[label=repeat]
''')

In [None]:
def f(x):
    """Return `x` raised to the power 2."""
    return pow(x, 2)

In [None]:
plot_function(f, 'x', 'x**2')

In [None]:
plot_function(f, 'x', 'x**2')
plt.scatter(-1.5, f(-1.5), color='red');

### Calculating Gradients

In [None]:
xt = tensor(3.).requires_grad_()

In [None]:
yt = f(xt)
yt

In [None]:
yt.backward()

In [None]:
xt.grad

In [None]:
xt = tensor([3.,4.,10.]).requires_grad_()
xt

In [None]:
def f(x):
    """Return the sum of the squares of the elements of `x`."""
    squared = x**2
    return squared.sum()

yt = f(xt)
yt

In [None]:
yt.backward()
xt.grad

### Stepping With a Learning Rate

### An End-to-End SGD Example

In [None]:
time = torch.arange(0,20).float(); time

In [None]:
# Synthetic target: a quadratic in `time` (lowest at time 9.5) plus Gaussian noise scaled by 3.
# NOTE(review): no random seed is set in the visible cells, so these values presumably
# differ from run to run — confirm whether that is intended.
speed = torch.randn(20)*3 + 0.75*(time-9.5)**2 + 1
plt.scatter(time,speed);

In [None]:
def f(t, params):
    """Evaluate the quadratic `a*t**2 + b*t + c`, where `params` unpacks to `(a, b, c)`."""
    quad, lin, const = params
    t_squared = t**2
    return quad*t_squared + (lin*t) + const

In [None]:
def mse(preds, targets):
    """Mean of the squared element-wise differences between `preds` and `targets`."""
    errors = preds - targets
    return (errors**2).mean()

#### Step 1: Initialize the parameters

In [None]:
params = torch.randn(3).requires_grad_()

In [None]:
#hide
orig_params = params.clone()

#### Step 2: Calculate the predictions

In [None]:
preds = f(time, params)

In [None]:
def show_preds(preds, ax=None):
    """Scatter the observed `speed` and, in red, `preds` against `time`.

    A new figure is created when no `ax` is supplied. The y-axis is fixed to
    (-300, 100) so that successive plots share the same scale.
    """
    if ax is None:
        _, ax = plt.subplots()
    ax.scatter(time, speed)
    ax.scatter(time, to_np(preds), color='red')
    ax.set_ylim(-300,100)

In [None]:
show_preds(preds)

#### Step 3: Calculate the loss

In [None]:
loss = mse(preds, speed)
loss

#### Step 4: Calculate the gradients

In [None]:
loss.backward()
params.grad

In [None]:
params.grad * 1e-5

In [None]:
params

#### Step 5: Step the weights. 

In [None]:
# Learning rate: the factor applied to the gradient before it is subtracted.
lr = 1e-5
# Update the values through `.data` so the subtraction itself is not recorded by autograd.
params.data -= lr * params.grad.data
# Drop the gradient so the next backward() starts fresh instead of adding to this one.
params.grad = None

In [None]:
preds = f(time,params)
mse(preds, speed)

In [None]:
show_preds(preds)

In [None]:
def apply_step(params, prn=True):
    """Take one gradient-descent step on `params` using the module-level `lr`.

    Returns the predictions computed *before* the update. When `prn` is true the
    loss for those predictions is printed.
    """
    preds = f(time, params)
    loss = mse(preds, speed)
    loss.backward()
    update = lr * params.grad.data
    params.data -= update
    # Clear the gradient so the next call does not accumulate onto it.
    params.grad = None
    if prn:
        print(loss.item())
    return preds

#### Step 6: Repeat the process 

In [None]:
for i in range(10): apply_step(params)

In [None]:
#hide
params = orig_params.detach().requires_grad_()

In [None]:
# One row of four plots. Each apply_step call performs a further update and returns
# the predictions it computed before that update; those are drawn on the next axis.
_,axs = plt.subplots(1,4,figsize=(12,3))
for ax in axs: show_preds(apply_step(params, False), ax)
plt.tight_layout()

#### Step 7: stop

### Summarizing Gradient Descent

In [None]:
gv('''
init->predict->loss->gradient->step->stop
step->predict[label=repeat]
''')

## The MNIST Loss Function

In [None]:
# Concatenate the 3s followed by the 7s, then flatten each image into a row of 28*28 values.
train_x = torch.cat([stacked_threes, stacked_sevens]).view(-1, 28*28)

In [None]:
# Labels in the same order as train_x: 1 for every 3, 0 for every 7.
# unsqueeze(1) adds a trailing axis of size 1, giving one label row per image.
train_y = tensor([1]*len(threes) + [0]*len(sevens)).unsqueeze(1)
train_x.shape,train_y.shape

In [None]:
dset = list(zip(train_x,train_y))
x,y = dset[0]
x.shape,y

In [None]:
# Validation counterpart of train_x/train_y/dset: flattened images (3s then 7s),
# labels 1 for 3s and 0 for 7s, paired up as a list of (x, y) tuples.
valid_x = torch.cat([valid_3_tens, valid_7_tens]).view(-1, 28*28)
valid_y = tensor([1]*len(valid_3_tens) + [0]*len(valid_7_tens)).unsqueeze(1)
valid_dset = list(zip(valid_x,valid_y))

In [None]:
def init_params(size, std=1.0):
    """Create a tensor of shape `size` drawn from `torch.randn` and scaled by `std`.

    The returned tensor has gradient tracking switched on.
    """
    noise = torch.randn(size)
    scaled = noise * std
    return scaled.requires_grad_()

In [None]:
weights = init_params((28*28,1))

In [None]:
bias = init_params(1)

In [None]:
(train_x[0]*weights.T).sum() + bias

In [None]:
def linear1(xb):
    """Linear model: matrix-multiply the batch `xb` by `weights`, then add `bias`."""
    activations = xb@weights
    return activations + bias
preds = linear1(train_x)
preds

In [None]:
corrects = (preds>0.0).float() == train_y
corrects

In [None]:
corrects.float().mean().item()

In [None]:
# Scale weights[0] by 1.0001 in place; torch.no_grad() keeps this edit out of the autograd graph.
with torch.no_grad(): weights[0] *= 1.0001

In [None]:
preds = linear1(train_x)
((preds>0.0).float() == train_y).float().mean().item()

In [None]:
trgts  = tensor([1,0,1])
prds   = tensor([0.9, 0.4, 0.2])

In [None]:
def mnist_loss(predictions, targets):
    """Mean distance of each prediction from its target.

    Where the target is 1 the per-item loss is `1 - prediction`; elsewhere it is
    the prediction itself.
    """
    is_positive = targets == 1
    per_item = torch.where(is_positive, 1-predictions, predictions)
    return per_item.mean()

In [None]:
torch.where(trgts==1, 1-prds, prds)

In [None]:
mnist_loss(prds,trgts)

In [None]:
mnist_loss(tensor([0.9, 0.4, 0.8]),trgts)

### Sigmoid

In [None]:
def sigmoid(x):
    """Logistic function `1 / (1 + exp(-x))`, computed with `torch.exp`."""
    denom = 1 + torch.exp(-x)
    return 1/denom

In [None]:
plot_function(torch.sigmoid, title='Sigmoid', min=-4, max=4)

In [None]:
def mnist_loss(predictions, targets):
    """Mean distance from target after passing `predictions` through a sigmoid.

    Where the target is 1 the per-item loss is `1 - sigmoid(prediction)`;
    elsewhere it is `sigmoid(prediction)`.
    """
    probs = torch.sigmoid(predictions)
    per_item = torch.where(targets==1, 1-probs, probs)
    return per_item.mean()

### SGD and Mini-Batches

In [None]:
coll = range(15)
dl = DataLoader(coll, batch_size=5, shuffle=True)
list(dl)

In [None]:
ds = L(enumerate(string.ascii_lowercase))
ds

In [None]:
dl = DataLoader(ds, batch_size=6, shuffle=True)
list(dl)

## Putting It All Together

In [None]:
weights = init_params((28*28,1))
bias = init_params(1)

In [None]:
# Iterate over the training dataset in batches of 256 items.
# NOTE(review): no `shuffle` is requested here, and `dset` is built as all 3s followed
# by all 7s — confirm that feeding batches in that order is intended.
dl = DataLoader(dset, batch_size=256)
# Pull the first batch to inspect the shapes of its inputs and labels.
xb,yb = first(dl)
xb.shape,yb.shape

In [None]:
valid_dl = DataLoader(valid_dset, batch_size=256)

In [None]:
batch = train_x[:4]
batch.shape

In [None]:
preds = linear1(batch)
preds

In [None]:
loss = mnist_loss(preds, train_y[:4])
loss

In [None]:
loss.backward()
weights.grad.shape,weights.grad.mean(),bias.grad

In [None]:
def calc_grad(xb, yb, model):
    """Run `model` on `xb`, score it against `yb` with `mnist_loss`, and backpropagate.

    Nothing is returned; the effect is the gradients that `backward()` leaves on
    the tensors the loss depends on.
    """
    mnist_loss(model(xb), yb).backward()

In [None]:
calc_grad(batch, train_y[:4], linear1)
weights.grad.mean(),bias.grad

In [None]:
calc_grad(batch, train_y[:4], linear1)
weights.grad.mean(),bias.grad

In [None]:
weights.grad.zero_()
bias.grad.zero_();

In [None]:
def train_epoch(model, lr, params):
    """Run one pass over `dl`, taking a gradient-descent step on `params` after every batch."""
    for inputs,targets in dl:
        calc_grad(inputs, targets, model)
        for param in params:
            param.data -= param.grad*lr
            # Reset in place so the next batch's gradients are not added onto these.
            param.grad.zero_()

In [None]:
(preds>0.0).float() == train_y[:4]

In [None]:
def batch_accuracy(xb, yb):
    """Fraction of items whose thresholded prediction equals the label in `yb`.

    `xb` holds raw model outputs; they are passed through a sigmoid and counted
    as a positive prediction when the result exceeds 0.5.
    """
    predicted_positive = torch.sigmoid(xb) > 0.5
    return (predicted_positive == yb).float().mean()

In [None]:
batch_accuracy(linear1(batch), train_y[:4])

In [None]:
def validate_epoch(model):
    """Return the mean of the per-batch accuracies over `valid_dl`, rounded to 4 decimals."""
    accs = []
    for xb,yb in valid_dl:
        accs.append(batch_accuracy(model(xb), yb))
    mean_acc = torch.stack(accs).mean().item()
    return round(mean_acc, 4)

In [None]:
validate_epoch(linear1)

In [None]:
lr = 1.
# The two tensors train_epoch will update; linear1 reads these same module-level objects.
params = weights,bias
# One epoch of training, then the validation accuracy as the cell's output.
train_epoch(linear1, lr, params)
validate_epoch(linear1)

In [None]:
for i in range(20):
    train_epoch(linear1, lr, params)
    print(validate_epoch(linear1), end=' ')

### Creating an Optimizer

In [None]:
linear_model = nn.Linear(28*28,1)

In [None]:
w,b = linear_model.parameters()
w.shape,b.shape

In [None]:
class BasicOptim:
    """Minimal gradient-descent optimizer: subtracts `lr * grad` from each parameter.

    `params` is copied into a list once, so a generator (such as the result of
    `Module.parameters()`) can be passed in and still be iterated on every step.
    """
    def __init__(self,params,lr): self.params,self.lr = list(params),lr

    def step(self, *args, **kwargs):
        """Update, in place, every parameter that currently has a gradient."""
        for p in self.params:
            # A parameter that took no part in the last backward pass, or whose
            # gradient was already cleared by zero_grad(), has `grad is None`.
            # Skip it rather than fail on `None.data`.
            if p.grad is None: continue
            p.data -= p.grad.data * self.lr

    def zero_grad(self, *args, **kwargs):
        """Discard all gradients by resetting each parameter's `.grad` to None."""
        for p in self.params: p.grad = None

In [None]:
opt = BasicOptim(linear_model.parameters(), lr)

In [None]:
def train_epoch(model):
    """Run one pass over `dl`: compute gradients per batch, then step and reset the module-level `opt`."""
    for inputs,targets in dl:
        calc_grad(inputs, targets, model)
        opt.step()
        # Clear gradients before the next batch is processed.
        opt.zero_grad()

In [None]:
validate_epoch(linear_model)

In [None]:
def train_model(model, epochs):
    """Train `model` for `epochs` epochs, printing the validation accuracy after each one.

    The accuracies are printed on a single line, separated by spaces.
    """
    for _ in range(epochs):
        train_epoch(model)
        acc = validate_epoch(model)
        print(acc, end=' ')

In [None]:
train_model(linear_model, 20)

In [None]:
# Same experiment with a freshly created linear model, swapping the hand-written BasicOptim
# for `SGD` (not defined in this notebook; presumably fastai's, via the star import — confirm).
linear_model = nn.Linear(28*28,1)
opt = SGD(linear_model.parameters(), lr)
# train_epoch reads the module-level `opt`, so rebinding it above is what switches optimizers.
train_model(linear_model, 20)

In [None]:
dls = DataLoaders(dl, valid_dl)

In [None]:
learn = Learner(dls, nn.Linear(28*28,1), opt_func=SGD,
                loss_func=mnist_loss, metrics=batch_accuracy)

In [None]:
learn.fit(10, lr=lr)

## Adding a Nonlinearity

In [None]:
def simple_net(xb):
    """Two-layer network: linear (`w1`, `b1`), clamp negatives to zero, linear (`w2`, `b2`)."""
    # Element-wise max against 0.0 replaces every negative activation with zero.
    hidden = (xb@w1 + b1).max(tensor(0.0))
    return hidden@w2 + b2

In [None]:
# Randomly initialised parameters read by simple_net:
# 28*28 inputs -> 30 hidden activations (w1, b1) -> 1 output (w2, b2).
w1 = init_params((28*28,30))
b1 = init_params(30)
w2 = init_params((30,1))
b2 = init_params(1)

In [None]:
plot_function(F.relu)

In [None]:
# The same layer sizes (28*28 -> 30 -> ReLU -> 1) expressed with PyTorch modules.
# Note that this rebinds the name `simple_net`, replacing the function of the same
# name defined earlier in the notebook.
simple_net = nn.Sequential(
    nn.Linear(28*28,30),
    nn.ReLU(),
    nn.Linear(30,1)
)

In [None]:
learn = Learner(dls, simple_net, opt_func=SGD,
                loss_func=mnist_loss, metrics=batch_accuracy)

In [None]:
learn.fit(40, 0.1)

In [None]:
plt.plot(L(learn.recorder.values).itemgot(2));

In [None]:
learn.recorder.values[-1][2]

### Going Deeper

In [None]:
# Rebuild `dls` directly from the folder layout at `path`, replacing the hand-made loaders.
dls = ImageDataLoaders.from_folder(path)
# resnet18 created without pretrained weights, trained with cross-entropy loss and
# reporting accuracy.
learn = vision_learner(dls, resnet18, pretrained=False,
                    loss_func=F.cross_entropy, metrics=accuracy)
# presumably one epoch with a peak learning rate of 0.1 — confirm against fit_one_cycle's signature
learn.fit_one_cycle(1, 0.1)

## Jargon Recap

## Questionnaire

1. How is a grayscale image represented on a computer? How about a color image?
> A grayscale image is represented by a matrix with a value corresponding to the intensity of each pixel. 0 corresponds to white, and 255 corresponds to black. Gray values are hence somewhere in the middle. An RGB image is similar, but each "pixel" is encoded by a vector of length 3, that contains the intensity of the red, green and blue channels. This results in a 3D array or a rank-3 tensor.

2. How are the files and folders in the `MNIST_SAMPLE` dataset structured? Why?
> `train` and `valid` denote the training and validation datasets: the training set is used for gradient descent (fitting the parameters), and the validation set is used to measure how well the model predicts on data it was not trained on. Each of these is separated into subfolders for each digit (`3` and `7`), as is conventional with classification datasets. Each subfolder contains all the images of its associated digit.

3. Explain how the "pixel similarity" approach to classifying digits works.
> First, an average image of each digit is obtained by calculating the pixelwise mean of all the training images of that digit. Then, to classify whether an image is a three or a seven, the test image is compared against the mean image of each: if it is closer (smaller "distance") to the mean 3, it's a 3, otherwise it's a 7. The distance is quantified either by the L1 norm (the mean of the absolute pixel differences, `(test - mean).abs().mean()`) or by the RMSE (the square root of the mean of the squared differences, `((test - mean)**2).mean().sqrt()`).

4. What is a list comprehension? Create one now that selects odd numbers from a list and doubles them.
> List comprehension is a process in python where a list of objects can be created in a single line. This is usually done via a for loop in other languages. `if` statements can be included for filtering.  
> For Example:  
        `
        lst = [2 * el for el in original_list if el % 2 != 0]
        `

5. What is a "rank-3 tensor"?
> A rank 3 tensor is a tensor with 3 dimensions.

6. What is the difference between tensor rank and shape? How do you get the rank from the shape?
> The shape of a tensor is the size of each of its dimensions. The rank is the number of dimensions it has. The rank is therefore the length of the shape: `len(t.shape)` (also available as `t.ndim`).

7. What are RMSE and L1 norm?
> RMSE is the "root mean squared error": the square root of the mean of the squared differences between each data point and its prediction. It corresponds to the L2 norm (scaled by the number of elements). The L1 norm, as used in this chapter, is the mean of the absolute values of the differences between the data points and their predictions.

8. How can you apply a calculation on thousands of numbers at once, many thousands of times faster than a Python loop?
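> This can be done with NumPy arrays or PyTorch tensors: an operation is applied to the whole array/tensor at once by optimised, compiled (C) code rather than element by element in Python, and PyTorch can additionally run it on the GPU. Both are much faster than Python for loops.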

9. Create a 3×3 tensor or array containing the numbers from 1 to 9. Double it. Select the bottom-right four numbers.
>
> ```ten = torch.Tensor(list(range(1, 10))).view(3, 3)```  
> ```ten_2 = 2 * ten```  
> ```ten_2[1:, 1:] # tensor([[10., 12.], [16., 18.]])```

10. What is broadcasting?
> Broadcasting occurs when an operation between two tensors of different ranks is executed. The lower-rank tensor is broadcasted to the same rank as the higher-rank tensor, by 'duplicating' it in one or more dimensions (*note:* it does not actually create copies in memory).

11. Are metrics generally calculated using the training set, or the validation set? Why?
> They are calculated on the validation set because this tests the model's ability to predict patterns in the data generally, and not simply memorise the dataset that it is training on.

12. What is SGD?
> Stochastic gradient descent is an optimisation algorithm. It uses the parameters in a model to minimise a given loss function. To do this, it computes the gradient of the loss function given the current parameters and modifies the parameters in such a way that the loss function may become smaller on the next iteration. 

13. Why does SGD use mini-batches?
> SGD uses mini batches to train the model, and calculates the loss at one (or more) data points. If we used every data point, each loss calculation would be extremely computationally expensive, especially in large datasets. However, if only one data point was used, the loss would fluctuate heavily and be very unstable. The mini-batches provide a medium between these two extremes.

14. What are the seven steps in SGD for machine learning?
> 1. Initialize the parameters (random).
> 2. Calculate the predictions.
> 3. Calculate the loss (average loss over a minibatch).
> 4. Calculate the gradients (how the loss changes with a small change in parameters).
> 5. Step the weights according to the gradient and the learning rate hyperparameter.
> 6. Repeat the process.
> 7. Stop when a loss/metric/number of epochs/time has elapsed.
  
15. How do we initialize the weights in a model?
> This is done randomly.
16. What is "loss"?
> Loss is a measure of how close the model is to predicting the correct results, lower values correspond to better model predictions (convention).

17. Why can't we always use a high learning rate?
> The loss may in this case bounce around *near* the minimum, but never reach the minimum (loss). 

18. What is a "gradient"?
> It is a vector describing how quickly the loss changes with respect to each weight. It is calculated at the current parameter set.

19. Do you need to know how to calculate gradients yourself?
> No, machine learning toolkits such as PyTorch calculate them for you automatically: calling `backward()` on the loss fills in the `.grad` attribute of each parameter.

20. Why can't we use accuracy as a loss function?
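> Accuracy is a fine metric but a poor loss function, especially for classification models. This is because accuracy only changes when a prediction flips from one class to the other, even if new weights make the model's predictions more (or less) confident. A small change in the weights therefore usually leaves accuracy unchanged, so its gradient is zero almost everywhere and gradient descent gets no signal to update the weights; hence the need for a loss function that responds to small changes in the weights.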

21. Draw the sigmoid function. What is special about its shape?
> It is bounded between 0 and 1.

22. What is the difference between a loss function and a metric?
> Metrics are used for human understanding, and reflect the performance of the model. Loss functions are differentiable (have a meaningful derivative) and are solely for the purpose of training the model. Their values are not that useful for humans.

23. What is the function to calculate new weights using a learning rate?
> Optimizer step.

24. What does the `DataLoader` class do?
> Takes in a Python collection and turns it into an iterator over many batches.

25. Write pseudocode showing the basic steps taken in each epoch for SGD.
> `for x, y in dl:`  
> `     pred = model(x)`  
> `     loss = loss_func(pred, y)`  
> `     loss.backward()`  
> `     params -= params.grad * lr`  
> `     params.grad = None`  

26. Create a function that, if passed two arguments `[1,2,3,4]` and `'abcd'`, returns `[(1, 'a'), (2, 'b'), (3, 'c'), (4, 'd')]`. What is special about that output data structure?
> `def func(a, b): return list(zip(a, b))`  
> This is useful because it maps things in one list to things in another, useful in ML where we need features mapped to a prediction.

27. What does `view` do in PyTorch?
> This reshapes a tensor.

28. What are the "bias" parameters in a neural network? Why do we need them?
> This is the `b` in `y = mx + b`, ensures that if a neuron has an input of 0, its output isn't necessarily 0 for any weight.
29. What does the `@` operator do in Python?
> Matrix multiplication.
30. What does the `backward` method do?
> Computes the gradient (derivative).
31. Why do we have to zero the gradients?
> Because otherwise the current gradients are added to the previous value of the gradients.

32. What information do we have to pass to `Learner`?
> The dataloaders, the model, the optimization function, the loss function and any metrics to print (optional).

33. Show Python or pseudocode for the basic steps of a training loop.
> ```python
> def train_epoch(model, lr, params):
>     for xb,yb in dl:
>         calc_grad(xb, yb, model)
>         for p in params:
>             p.data -= p.grad*lr
>             p.grad.zero_()
>
> for i in range(20):
>     train_epoch(model, lr, params)
> ```
34. What is "ReLU"? Draw a plot of it for values from `-2` to `+2`.
> ReLU is a function that returns 0 if its input is negative and returns the input unchanged otherwise, i.e. `max(0, x)`.
35. What is an "activation function"?
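> An activation function is a nonlinear function applied to the output of a layer. It decouples consecutive linear layers from each other (without it they would collapse into a single linear layer), making non-linear models possible.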
36. What's the difference between `F.relu` and `nn.ReLU`?
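> `nn.ReLU` is a PyTorch module, i.e. a Python class that has to be instantiated before it is called (as in `nn.ReLU()`), whereas `F.relu` is a plain function.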
37. The universal approximation theorem shows that any function can be approximated as closely as needed using just one nonlinearity. So why do we normally use more?
> There are practical performance benefits to using more than one nonlinearity. We can use a deeper model with fewer parameters, better performance, faster training, and lower compute/memory requirements.

### Further Research

1. Create your own implementation of `Learner` from scratch, based on the training loop shown in this chapter.
1. Complete all the steps in this chapter using the full MNIST datasets (that is, for all digits, not just 3s and 7s). This is a significant project and will take you quite a bit of time to complete! You'll need to do some of your own research to figure out how to overcome some obstacles you'll meet on the way.