In [146]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [147]:
#expor
from exp.nb_02 import *
import torch.nn.functional as F

## Initial setup
### Data

In [148]:
mpl.rcParams['image.cmap'] = 'gray'

In [149]:
x_train, y_train, x_valid, y_valid = get_data()

In [150]:
n, m = x_train.shape
c = y_train.max() + 1
nh = 50

In [151]:
class Model(nn.Module):
    def __init__(self, n_in, n_h, n_out):
        super().__init__()
        self.layers = [nn.Linear(n_in, nh), nn.ReLU(), nn.Linear(nh, n_out)]
        
    def __call__(self, x):
        for l in self.layers:
            x = l(x)
        return x

In [152]:
model = Model(m, nh, c.item())

In [153]:
pred = model(x_train)

## Cross entropy loss
\begin{equation}\operatorname{softmax}(x)_i=\frac{e^{x_i}}{\sum_j e^{x_j}}\end{equation}

*The `keepdim=True` so that the sum can be broadcasted along the `x.exp()` tensor.*

In [154]:
def log_softmax(x):
    return (x.exp()/(x.exp().sum(dim=-1, keepdim=True))).log()

In [155]:
sm_pred = log_softmax(pred)

The cross entropy loss is given by 
\begin{equation}-\sum x\log p(x)
\end{equation}
where $x$ is the target and $p(x)$ the prediction. As our targets are one-hot encoded, this can be rewritten as $-\log p_i$. How can we express this in a nice way in PyTorch? We use numpy-style *integer array indexing*.

In [156]:
y_train[:5]

tensor([5, 0, 4, 1, 9])

What are our predictions associated with *5, 0, 4, 1, 9*?

In [157]:
sm_pred[0][5], sm_pred[1][0], sm_pred[2][4], sm_pred[3][1], sm_pred[4][9]

(tensor(-2.4213, grad_fn=<SelectBackward>),
 tensor(-2.2970, grad_fn=<SelectBackward>),
 tensor(-2.2156, grad_fn=<SelectBackward>),
 tensor(-2.1837, grad_fn=<SelectBackward>),
 tensor(-2.2987, grad_fn=<SelectBackward>))

In [158]:
sm_pred[[0,1,2,3,4], [5, 0, 4, 1, 9]]

tensor([-2.4213, -2.2970, -2.2156, -2.1837, -2.2987], grad_fn=<IndexBackward>)

or

In [159]:
sm_pred[range(5), y_train[:5]]

tensor([-2.4213, -2.2970, -2.2156, -2.1837, -2.2987], grad_fn=<IndexBackward>)

In [160]:
y_train.shape[0]

50000

In [161]:
def nll(input, target):
    return -input[range(target.shape[0]), target].mean()

In [162]:
loss = nll(sm_pred, y_train)

In [163]:
loss

tensor(2.3110, grad_fn=<NegBackward>)

We can rewrite the loss using $\log(a/b)=\log(a)-\log(b)$:

In [164]:
def log_softmax(x):
    return x - (x.exp().sum(dim=-1, keepdim=True)).log()

In [165]:
test_near(loss, nll(log_softmax(pred), y_train))

However, we would like to find a way to calculate the nll loss in a more stable way because the sum of the exponentials can get very large and overflow when taking the exp of a big activation. We use the so-called LogSumExp trick:

\begin{equation}
\log\left(\sum_{j=1}^ne^{x_j}\right)=\log\left(e^a\sum_{j=1}^ne^{x_j - a}\right) =a+ \log\left(\sum_{j=1}^ne^{x_j-a}\right)
\end{equation}

For $a$ we use the maximum of all $x_j$.

In [166]:
def logSumExp(x):
    a = x.max(dim=-1)[0]  # The shape of pred is torch.Size([50000, 10]) and [0] to get the values (not indxs)
    return a + (x-a.unsqueeze(-1)).exp().sum(dim=-1).log()  # The .unsqueeze so that a can be broadcasted

In [167]:
logSumExp(pred)

tensor([2.2823, 2.2651, 2.2626,  ..., 2.2613, 2.2491, 2.2374],
       grad_fn=<AddBackward0>)

*In PyTorch this is already implemented for us!*

In [168]:
test_near(logSumExp(pred), pred.logsumexp(-1))

In [169]:
def log_softmax(x):
    return x - x.logsumexp(-1, keepdim=True)

In [170]:
test_near(nll(log_softmax(pred), y_train), loss)

Now we can also use PyTorch's implementation:

In [171]:
test_near(F.nll_loss(F.log_softmax(pred, -1), y_train), loss)

PyTorch combines `F.log_softmax` and `nll_loss` into one optimized function called `F.cross_entropy`:

In [172]:
test_near(F.cross_entropy(pred, y_train), loss)

# Basic training loop

In [173]:
loss_func = F.cross_entropy

In [174]:
#export
def accuracy(pred, targ):
    return (torch.argmax(pred, dim=-1)==targ).float().mean()

In [175]:
bs = 64                   # batch size
x_batch  = x_train[:bs]   # a mini-batch
preds = model(x_batch)
preds[0], preds.shape

(tensor([-0.0938,  0.1359,  0.0830, -0.1576,  0.1403, -0.1390, -0.0376, -0.1107,
          0.0235, -0.1058], grad_fn=<SelectBackward>), torch.Size([64, 10]))

In [176]:
accuracy(preds, y_train[:bs])

tensor(0.0938)

In [177]:
loss_func(preds, y_train[:bs])

tensor(2.3151, grad_fn=<NllLossBackward>)

In [178]:
lr = 0.5
epochs = 1

In [188]:
for epoch in range(epochs):
    for i in range((n-1)//bs):
        
        start_idx = i * bs
        end_idx = start_idx + bs
        x_batch = x_train[start_idx:end_idx]
        y_batch = y_train[start_idx:end_idx]
        
        loss = loss_func(model(x_batch), y_batch)
        
        loss.backward()
        with torch.no_grad():  # Not part of the gradient calculation but the result of it
            for l in model.layers:
                if hasattr(l, 'weight'):
                    l.weight -= l.weight.grad * lr
                    l.bias   -= l.bias.grad * lr
                    l.weight.grad.zero_()
                    l.bias.grad.zero_()

In [189]:
loss_func(model(x_valid), y_valid)

tensor(0.9456, grad_fn=<NllLossBackward>)

In [190]:
accuracy(model(x_valid), y_valid)

tensor(0.7174)

**Let's make this less messy!**

## Introduce parameters
We use `nn.Module.__setattr__` and move the relu to functional:

In [193]:
class Model(nn.Module):
    def __init__(self, n_in, nh, n_out):
        super().__init__()
        self.l1 = nn.Linear(n_in, nh)
        self.l2 = nn.Linear(nh, n_out)
    
    def __call__(self, x):
        return self.l2(F.relu(self.l1(x)))

In [194]:
model = Model(m, nh, 10)

In [195]:
for name, l in model.named_children():
    print(f"{name}: {l}")

l1: Linear(in_features=784, out_features=50, bias=True)
l2: Linear(in_features=50, out_features=10, bias=True)


In [196]:
model

Model(
  (l1): Linear(in_features=784, out_features=50, bias=True)
  (l2): Linear(in_features=50, out_features=10, bias=True)
)

In [197]:
model.l1

Linear(in_features=784, out_features=50, bias=True)

In [204]:
def fit():
    for epoch in range(epochs):
        for i in range((n-1)//bs):
            
            start_idx = i * bs
            end_idx = start_idx + bs
            x_batch = x_train[start_idx:end_idx]
            y_batch = y_train[start_idx:end_idx]
            
            loss = loss_func(model(x_batch), y_batch)
            
            loss.backward()
            with torch.no_grad():  # Not part of the gradient calculation but the result of it
                for p in model.parameters():
                    p -= p.grad * lr
                model.zero_grad()

In [205]:
fit()

In [207]:
loss_func(model(x_valid), y_valid)

tensor(0.1695, grad_fn=<NllLossBackward>)

In [208]:
accuracy(model(x_valid), y_valid)

tensor(0.9497)

**How does this work behind the scenes?**

In [232]:
class DummyModule():
    def __init__(self, n_in, nh, n_out):
        self._modules = {}
        self.l1 = nn.Linear(n_in, nh)
        self.l2 = nn.Linear(nh, n_out)
        
    def __setattr__(self, k, v):
        if not k.startswith("_"):
            self._modules[k] = v
        super().__setattr__(k,v)
    
    def __repr__(self):
        return f'{self._modules}'
    
    def parameters(self):
        for l in self._modules.values():
            for p in l.parameters():
                yield p

In [233]:
mdl = DummyModule(m, nh, 10)

In [234]:
mdl

{'l1': Linear(in_features=784, out_features=50, bias=True), 'l2': Linear(in_features=50, out_features=10, bias=True)}

In [235]:
[o.shape for o in mdl.parameters()]

[torch.Size([50, 784]),
 torch.Size([50]),
 torch.Size([10, 50]),
 torch.Size([10])]

In [236]:
mdl.__getattribute__("l1")

Linear(in_features=784, out_features=50, bias=True)

**What do we do if we want to pass a list of Layers to the constructor instead of defining them individually with `self.`?**

In [237]:
layers = [nn.Linear(m, nh), nn.ReLU(), nn.Linear(nh, 10)]

In [238]:
class Model(nn.Module):
    def __init__(self, layers):
        super().__init__()
        self.layers = layers
        for i,l in enumerate(layers):
            self.add_module(f'layer_{i}', l)
            
    def __call__(self, x):
        for l in self.layers: x = l(x)
        return x

In [239]:
nn.Module.add_module??

**Summary: checks if `l` has a compatible type and then does `self._modules[name] = module`, just as we did!**

In [243]:
model = Model(layers)

In [244]:
model

Model(
  (layer_0): Linear(in_features=784, out_features=50, bias=True)
  (layer_1): ReLU()
  (layer_2): Linear(in_features=50, out_features=10, bias=True)
)

In [245]:
nn.Module.__repr__??

**`nn.ModuleList` does exactly that for us!**

In [247]:
class SequentialModel(nn.Module):
    def __init__(self, layers):
        super().__init__()
        self.layers = nn.ModuleList(layers)
        
    def __call__(self, x):
        for l in self.layers: x = l(x)
        return x  

In [248]:
model = SequentialModel(layers)

In [249]:
model

SequentialModel(
  (layers): ModuleList(
    (0): Linear(in_features=784, out_features=50, bias=True)
    (1): ReLU()
    (2): Linear(in_features=50, out_features=10, bias=True)
  )
)

In [250]:
fit()

In [251]:
loss_func(model(x_valid), y_valid)

tensor(0.1641, grad_fn=<NllLossBackward>)

In [252]:
accuracy(model(x_valid), y_valid)

tensor(0.9519)

**`nn.Sequential` does the exact same thing very conveniently four us:**

In [254]:
model = nn.Sequential(*layers)

In [255]:
fit()

In [256]:
loss_func(model(x_valid), y_valid)

tensor(0.1441, grad_fn=<NllLossBackward>)

In [257]:
accuracy(model(x_valid), y_valid)

tensor(0.9588)

In [258]:
model

Sequential(
  (0): Linear(in_features=784, out_features=50, bias=True)
  (1): ReLU()
  (2): Linear(in_features=50, out_features=10, bias=True)
)

In [259]:
nn.Sequential??

## Introduce optimizers
Whe would like to replace the lines

```
with torch.no_grad():
    for p in model.parameters():
        p -= p.grad * lr
        model.zero_grad()
```

with something like this:

```
opt.step()
opt.zero_grad()
```

Also note that it is important that we change from `model.zero_grad()` to `opt.zero_grad()` as we might want to ask our optimizer to take care of only *part* of the model's parameters, i.e. when gradually unfreezing.

In [261]:
class Optimizer():
    def __init__(self, parameters, lr=0.5):
        self.parameters = parameters
        self.lr = lr
    
    def step(self):
        with torch.no_grad():
            for p in self.parameters:
                p -= p.grad * lr
    
    def zero_grad(self):
        for p in self.parameters:
            p.grad.data.zero_()

In [262]:
model = nn.Sequential(*layers)

In [263]:
opt = Optimizer(model.parameters())

In [264]:
def fit():
    for epoch in range(epochs):
        for i in range((n-1)//bs):
            
            start_idx = i * bs
            end_idx = start_idx + bs
            x_batch = x_train[start_idx:end_idx]
            y_batch = y_train[start_idx:end_idx]
            
            loss = loss_func(model(x_batch), y_batch)
            
            loss.backward()
            opt.step()
            opt.zero_grad()

In [265]:
fit()

In [266]:
loss_func(model(x_valid), y_valid)

tensor(0.1361, grad_fn=<NllLossBackward>)

In [267]:
accuracy(model(x_valid), y_valid)

tensor(0.9615)

**Of course PyTorch already provides this functionality!**

In [268]:
#export
from torch import optim

In [271]:
optim.SGD.step??

PyTorch's `optim.SGD` applies the updates for each layer group at a time:

```
for group in self.param_groups:
    weight_decay = group['weight_decay']
    momentum = group['momentum']
    dampening = group['dampening']
    nesterov = group['nesterov']
    
    # ...
    for p in group['params']:
       d_p = p.grad.data
       # ...
       p.data.add_(-group['lr'], d_p)
```

In [296]:
def get_model():
    model = nn.Sequential(nn.Linear(m, nh), nn.ReLU(), nn.Linear(nh, 10))
    return model, optim.SGD(model.parameters(), lr=lr)

In [297]:
model, opt = get_model()

In [298]:
loss_func(model(x_valid), y_valid)

tensor(2.2969, grad_fn=<NllLossBackward>)

In [299]:
fit()

In [300]:
loss_func(model(x_valid), y_valid)

tensor(0.1785, grad_fn=<NllLossBackward>)

In [304]:
acc = accuracy(model(x_valid), y_valid); acc

tensor(0.9489)

In [305]:
assert acc>0.7

## Dataset and DataLoader