In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
#export
from exp.nb_02 import *
import torch.nn.functional as F

## Initial Setup

### Data

In [3]:
# mpl -> Matplot lib
mpl.rcParams['image.cmap'] = 'gray'

In [4]:
x_train, y_train, x_valid, y_valid = get_data()

In [5]:
n, m = x_train.shape
c = y_train.max() + 1
nh = 50

In [6]:
class Model(nn.Module):
    def __init__(self, n_in, nh, n_out):
        super().__init__()
        self.layers = [nn.Linear(n_in, nh), nn.ReLU(), nn.Linear(nh, n_out)]
        
    def __call__(self, x):
        for l in self.layers:
            x = l(x)
        return x

In [7]:
model = Model(m, nh, 10)

In [8]:
pred = model(x_train)

### Cross Entropy Loss

First we need to calculate softmax (log softmax)

$$\hbox{softmax(x)}_{i} = \frac{e^{x_{i}}}{\sum_{0 \leq j \leq n-1} e^{x_{j}}}$$

In [9]:
def log_softmax(x):
    return (x.exp() / (x.exp().sum(dim=-1, keepdim=True))).log()

In [10]:
sm_pred = log_softmax(pred)

In [11]:
sm_pred.shape

torch.Size([50000, 10])

The cross entropy loss for some target $x$ and some prediction $p(x)$ is given by:

$$ -\sum\, \log p(x) $$

But since our $x$s are 1-hot encoded, this can be rewritten as $-\log(p_{i})$ where i is the index of the desired target.

This can be done using numpy-style [integer array indexing](https://docs.scipy.org/doc/numpy-1.13.0/reference/arrays.indexing.html#integer-array-indexing). Note that PyTorch supports all the tricks in the advanced indexing methods discussed in that link.

In [12]:
y_train[:3]

tensor([5, 0, 4])

In [13]:
sm_pred[[0, 1, 2], [0, 1, 0]]

tensor([-2.1959, -2.4599, -2.2337], grad_fn=<IndexBackward>)

In [14]:
def nlloss(input, target):
    return -input[range(target.shape[0]), target].mean()

In [15]:
test_near(nlloss(sm_pred, y_train), F.nll_loss(sm_pred, y_train))

In [16]:
def nlloss2(input, target):
    return -input.gather(1, target.view(-1, 1)).mean()

In [17]:
test_near(nlloss2(sm_pred, y_train), F.nll_loss(sm_pred, y_train))

In [18]:
loss = F.nll_loss(sm_pred, y_train)

Note that the formula 

$$\log \left ( \frac{a}{b} \right ) = \log(a) - \log(b)$$ 

gives a simplification when we compute the log softmax, which was previously defined as `(x.exp()/(x.exp().sum(-1,keepdim=True))).log()`

In [19]:
def log_softmax(x):
    return x - x.exp().sum(-1, keepdim=True).log()

In [20]:
test_near(nlloss(log_softmax(pred), y_train), loss)

Then, there is a way to compute the log of the sum of exponentials in a more stable way, called the [LogSumExp trick](https://en.wikipedia.org/wiki/LogSumExp). The idea is to use the following formula:

$$\log \left ( \sum_{j=1}^{n} e^{x_{j}} \right ) = \log \left ( e^{a} \sum_{j=1}^{n} e^{x_{j}-a} \right ) = a + \log \left ( \sum_{j=1}^{n} e^{x_{j}-a} \right )$$

where a is the maximum of the $x_{j}$.

In [23]:
pred.shape

torch.Size([50000, 10])

In [26]:
pred.max(-1)[0][:,None].shape

torch.Size([50000, 1])

In [29]:
def logsumexp(x):
    a = x.max(-1)[0]
    return a + (x - a[:, None]).exp().sum(-1).log()

In [32]:
logsumexp(pred).shape

torch.Size([50000])

In [34]:
log_softmax(pred).shape

torch.Size([50000, 10])

In [36]:
test_near(logsumexp(pred), pred.logsumexp(-1))

So we can use it for our `log_softmax` function.

In [37]:
def log_softmax(x):
    return x - x.logsumexp(-1, keepdim=True)

In [39]:
log_softmax(pred).shape

torch.Size([50000, 10])

In [40]:
test_near(nlloss(log_softmax(pred), y_train), loss)

In [41]:
test_near(F.nll_loss(F.log_softmax(pred, -1), y_train), loss)

In [42]:
test_near(F.cross_entropy(pred, y_train), loss)

## Basic training loop

Basically the training loop repeats over the following steps:
- get the output of the model on a batch of inputs
- compare the output to the labels we have and compute a loss
- calculate the gradients of the loss with respect to every parameter of the model
- update said parameters with those gradients to make them a little bit better

In [44]:
loss_func = F.cross_entropy

In [47]:
def accuracy(out, yb):
    return (torch.argmax(out, dim=-1) == yb).float().mean()

In [49]:
accuracy(pred, y_train)

tensor(0.1061)

In [50]:
bs = 64

In [54]:
xb = x_train[0:bs]

In [55]:
preds = model(xb)
preds[0], preds.shape

(tensor([ 0.1399, -0.1165,  0.1881,  0.1605, -0.0111,  0.0014, -0.1793, -0.0380,
         -0.0465,  0.1599], grad_fn=<SelectBackward>), torch.Size([64, 10]))

In [57]:
yb = y_train[0:bs]
loss_func(xb, yb)

tensor(6.8455)

In [58]:
accuracy(preds, yb)

tensor(0.1094)

In [59]:
lr = 0.5
epochs = 1

In [60]:
for epoch in range(epochs):
    for i in range((n-1)//bs + 1):
        start_i = i * bs
        end_i = start_i + bs
        xb = x_train[start_i: end_i]
        yb = y_train[start_i: end_i]
        loss = loss_func(model(xb), yb)
        
        loss.backward()
        
        with torch.no_grad():
            for l in model.layers:
                if hasattr(l, 'weight'):
                    l.weight -= l.weight.grad * lr
                    l.bias   -= l.bias.grad * lr
                    l.weight.grad.zero_()
                    l.bias.grad.zero_()

In [61]:
loss_func(model(xb), yb)

tensor(0.1633, grad_fn=<NllLossBackward>)

In [62]:
accuracy(model(xb), yb)

tensor(0.9375)

## Using parameters and optim

### Parameters

Use `nn.Module.__setattr__` and move relu to functional: