# AWD-LSTM

In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
#export 
from exp.nb_12 import *

## Data

In [3]:
path = datasets.untar_data(datasets.URLs.IMDB)

In [4]:
il = TextList.from_files(path, include=['train', 'test', 'unsup'])

In [5]:
sd = SplitData.split_by_func(il, partial(random_splitter, p_valid=0.1))

In [6]:
proc_tok, proc_num = TokenizeProcessor(max_workers=8), NumericalizeProcessor()

In [7]:
ll = label_by_func(sd, lambda x: 0, proc_x = [proc_tok, proc_num])

In [8]:
# Cache the processed data and vocab; `with` closes each file handle deterministically
# (a bare `open(...)` passed to `pickle.dump` is never closed explicitly).
with open(path/'ll_lm.pkl', 'wb') as pkl_file: pickle.dump(ll, pkl_file)
with open(path/'vocab_lm.pkl', 'wb') as pkl_file: pickle.dump(proc_num.vocab, pkl_file)

In [9]:
# Reload the cached data and vocab; `with` closes each file handle deterministically.
# NOTE: only unpickle files you created yourself - `pickle.load` can execute arbitrary code.
with open(path/'ll_lm.pkl', 'rb') as pkl_file: ll = pickle.load(pkl_file)
with open(path/'vocab_lm.pkl', 'rb') as pkl_file: vocab = pickle.load(pkl_file)

In [10]:
bs, bptt = 64, 70

In [11]:
data = lm_databunchify(ll, bs, bptt)

## AWD-LSTM
![](pictures/lstm.jpg)

To take advantage of the GPU we do one matrix multiplication and split the output into 4 chunks for the 4 gates instead of doing 4 matrix multiplications.

### LSTM from scratch

In [12]:
class LSTMCell(nn.Module):
    """Single LSTM step built from two linear layers.

    Each linear layer produces all four gate pre-activations at once (`4*nh`
    features); they are summed and split into, in order: input gate, forget
    gate, output gate and cell candidate.
    """
    def __init__(self, ni, nh):
        super().__init__()
        self.ih = nn.Linear(ni, 4*nh)  # input  -> 4 stacked gates
        self.hh = nn.Linear(nh, 4*nh)  # hidden -> 4 stacked gates

    def forward(self, input, state):
        "Run one step: `input` is (bs, ni), `state` is `(h, c)` with (bs, nh) each. Returns `h, (h, c)`."
        h_prev, c_prev = state
        # One matmul per source, then split dim 1 into the four (bs, nh) gate blocks.
        pre_acts = self.ih(input) + self.hh(h_prev)
        in_pre, forget_pre, out_pre, cell_pre = pre_acts.chunk(4, 1)

        in_gate = torch.sigmoid(in_pre)
        forget_gate = torch.sigmoid(forget_pre)
        out_gate = torch.sigmoid(out_pre)
        candidate = cell_pre.tanh()

        c_next = (forget_gate * c_prev) + (in_gate * candidate)
        h_next = out_gate * c_next.tanh()

        return h_next, (h_next, c_next)

In [13]:
class LSTMLayer(nn.Module):
    """Unrolls a recurrent cell over the sequence dimension (dim 1) of its input.

    `cell` is the cell class (or any factory) and `cell_args` are forwarded to
    it, so the layer works with any cell that maps `(input_t, state)` to
    `(output_t, new_state)`.
    """
    def __init__(self, cell, *cell_args):
        super().__init__()
        # Build the cell that was passed in instead of a hard-coded `LSTMCell`.
        self.cell = cell(*cell_args)

    def forward(self, input, state):
        "`input` is (bs, seq_len, ni); returns the stacked per-step outputs (bs, seq_len, nh) and the final state."
        outputs = []
        # `unbind(1)` yields one (bs, ni) slice per time step.
        for step_input in input.unbind(1):
            out, state = self.cell(step_input, state)
            outputs.append(out)
        return torch.stack(outputs, dim=1), state

In [14]:
lstm = LSTMLayer(LSTMCell, 300, 300)

In [15]:
x = torch.randn(64, 70, 300)

In [16]:
h = (torch.zeros(64, 300), torch.zeros(64, 300))

CPU:

In [17]:
%timeit -n 10 y, h1 = lstm(x, h)

80.2 ms ± 1.66 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


GPU:

In [18]:
lstm = lstm.cuda()
x = x.cuda()
h = (h[0].cuda(), h[1].cuda())

In [19]:
def time_fn(f):
    "Call `f`, then wait for any queued CUDA work so a surrounding timer measures the whole computation."
    f()
    # Only synchronize when CUDA is usable; without the guard this helper raises on CPU-only machines.
    if torch.cuda.is_available(): torch.cuda.synchronize()

In [20]:
f = partial(lstm, x, h)
time_fn(f)

In [21]:
%timeit -n 10 time_fn(f)

17 ms ± 68.5 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


### Builtin LSTM version

In [22]:
lstm = nn.LSTM(300, 300, 1, batch_first=True)

In [23]:
x = torch.randn(64, 70, 300)
h = (torch.zeros(1, 64, 300), torch.zeros(1, 64, 300))

CPU:

In [24]:
%timeit -n 10 y, h1 = lstm(x,h)

74.5 ms ± 703 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [25]:
lstm = lstm.cuda()
x = x.cuda()
h = (h[0].cuda(), h[1].cuda())

In [26]:
f = partial(lstm, x, h)
time_fn(f)

In [27]:
%timeit -n 10 time_fn(f)

1.75 ms ± 19 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


The CPU versions have almost the same speed. On the GPU, however, PyTorch uses CuDNN behind the scenes, which greatly optimizes the for loop.

### Jit version

In [28]:
import torch.jit as jit
from torch import Tensor

In [29]:
class LSTMCell(jit.ScriptModule):
    """TorchScript LSTM cell with explicit weight and bias parameters.

    All four gates are stacked along dim 0 of the parameters (`4 * nh` rows);
    after the affine maps the result is split, in order, into input gate,
    forget gate, cell candidate and output gate.
    """
    def __init__(self, ni, nh):
        super().__init__()
        self.ni, self.nh = ni, nh
        n_gates = 4 * nh
        self.w_ih = nn.Parameter(torch.randn(n_gates, ni))
        self.w_hh = nn.Parameter(torch.randn(n_gates, nh))
        self.bias_ih = nn.Parameter(torch.randn(n_gates))
        self.bias_hh = nn.Parameter(torch.randn(n_gates))

    @jit.script_method
    def forward(self, input:Tensor, state:Tuple[Tensor, Tensor])->Tuple[Tensor, Tuple[Tensor, Tensor]]:
        # One step: returns the new hidden state and the new (hidden, cell) pair.
        h_prev, c_prev = state
        # Accumulate input and hidden contributions (left to right, as in a single summed expression).
        pre_acts = input @ self.w_ih.t()
        pre_acts = pre_acts + self.bias_ih
        pre_acts = pre_acts + h_prev @ self.w_hh.t()
        pre_acts = pre_acts + self.bias_hh
        in_pre, forget_pre, cell_pre, out_pre = pre_acts.chunk(4, 1)

        in_gate = torch.sigmoid(in_pre)
        forget_gate = torch.sigmoid(forget_pre)
        candidate = torch.tanh(cell_pre)
        out_gate = torch.sigmoid(out_pre)

        c_next = (forget_gate * c_prev) + (in_gate * candidate)
        h_next = out_gate * torch.tanh(c_next)

        return h_next, (h_next, c_next)

In [30]:
class LSTMLayer(jit.ScriptModule):
    """TorchScript layer that unrolls a recurrent cell along dim 1 of its input.

    `cell` is called with `cell_args` to build the wrapped cell; the cell must
    map `(input_t, state)` to `(output_t, new_state)`.
    """
    def __init__(self, cell, *cell_args):
        super().__init__()
        self.cell = cell(*cell_args)

    @jit.script_method
    def forward(self, input:Tensor, state:Tuple[Tensor, Tensor])->Tuple[Tensor, Tuple[Tensor, Tensor]]:
        # Split the input into one slice per time step and feed them through the cell in order.
        steps = input.unbind(1)
        step_outputs = []
        for t in range(len(steps)):
            step_out, state = self.cell(steps[t], state)
            step_outputs.append(step_out)
        # Re-assemble the per-step outputs along the sequence dimension.
        return torch.stack(step_outputs, dim=1), state

In [31]:
lstm = LSTMLayer(LSTMCell, 300, 300)

In [32]:
x = torch.randn(64, 70, 300)
h = (torch.zeros(64, 300),torch.zeros(64, 300))

In [33]:
%timeit -n 10 y,h1 = lstm(x,h)

76.1 ms ± 3.79 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [34]:
lstm = lstm.cuda()
x = x.cuda()
h = (h[0].cuda(), h[1].cuda())

In [35]:
f = partial(lstm,x,h)
time_fn(f)

In [36]:
%timeit -n 10 time_fn(f)

5.35 ms ± 18.3 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


The CPU version again has about the same speed, but on the GPU the jit version brings our from-scratch implementation much closer to CuDNN (5.35 ms vs. 1.75 ms, down from 17 ms without jit).

### Dropout
#### RNN Dropout

In [37]:
#export
def dropout_mask(x, sz, p):
    "Build a tensor of shape `sz` via `x.new` whose entries are 0 with probability `p` and `1/(1-p)` otherwise."
    keep_prob = 1 - p
    mask = x.new(*sz)
    # Sample keep/drop in place, then rescale the kept entries so the mask has mean 1.
    mask.bernoulli_(keep_prob)
    return mask.div_(keep_prob)

In [38]:
x = torch.randn(10, 10)

The weights that are not nullified are rescaled by a factor of 1/(1-p), so the expected value of the mask stays 1:

In [39]:
mask = dropout_mask(x, (10, 10), 0.5); mask

tensor([[0., 0., 2., 2., 2., 0., 2., 0., 0., 2.],
        [0., 0., 0., 2., 2., 2., 2., 0., 2., 2.],
        [0., 2., 0., 2., 2., 0., 0., 2., 0., 0.],
        [2., 2., 0., 2., 0., 2., 2., 2., 2., 0.],
        [2., 0., 0., 0., 2., 0., 0., 2., 2., 0.],
        [2., 2., 0., 0., 0., 2., 2., 0., 0., 2.],
        [2., 0., 0., 0., 2., 2., 0., 0., 0., 0.],
        [2., 0., 0., 2., 2., 2., 0., 2., 0., 2.],
        [2., 0., 0., 0., 0., 2., 2., 2., 2., 0.],
        [0., 2., 2., 2., 0., 2., 0., 0., 0., 2.]])

Applying the dropout mask is simply done by `x * mask`.
Why don't we use PyTorch's dropout? We do not want to nullify all the coefficients randomly: on the sequence dimension we want to always nullify the same positions.

In [40]:
mask.sum()/mask.nelement()

tensor(1.)

In [41]:
(x*mask).std(), x.std()

(tensor(1.3800), tensor(1.0047))

Inside the RNN, the tensors have the shape (bs, seq_len, n_features), where n_features is the embedding or hidden size. We want to zero the same positions at every time step, so we create a dropout mask of shape (bs, 1, n_features) — one entry per element of the first and third dimensions — and broadcast it along the seq_len dimension:

In [42]:
#export
class RNNDropout(nn.Module):
    "Dropout with probability `p` whose mask is shared along dim 1 of a 3d input."
    def __init__(self, p=0.5):
        super().__init__()
        self.p = p

    def forward(self, x):
        "Return `x` untouched in eval mode or when `p` is 0; otherwise multiply by a broadcast mask."
        dropout_active = self.training and self.p != 0.
        if not dropout_active: return x
        # Size-1 middle dimension: the same mask is broadcast to every position along dim 1.
        mask_shape = (x.size(0), 1, x.size(2))
        return x * dropout_mask(x.data, mask_shape, self.p)

In [43]:
dp = RNNDropout(0.3)
test_input = torch.randn(3, 3, 7)
test_input, dp(test_input).transpose(0,1)  
# transpose to make the seq_len dim come first and visualize that always same fields are nullified

(tensor([[[-3.7710e-01, -2.5349e-01,  2.2629e-01,  6.8743e-01, -9.6609e-01,
           -5.2100e-01, -9.9520e-01],
          [-4.0185e-01, -7.4007e-01, -7.4399e-01, -4.4808e-01, -4.6053e-01,
           -1.2265e+00, -5.5066e-02],
          [ 5.0986e-01, -1.2327e+00, -1.0816e+00,  2.4582e+00, -7.1526e-02,
           -5.3962e-01,  1.8693e+00]],
 
         [[ 5.9532e-01,  6.5484e-01, -2.0241e+00,  1.5686e+00, -8.0004e-02,
            3.3530e-01,  2.8444e-01],
          [-8.5892e-01, -5.0983e-01, -7.2017e-02, -1.6174e+00,  4.4273e-03,
           -3.9406e-01, -1.5108e-01],
          [ 1.2666e-03, -1.9154e+00,  6.2338e-01,  1.1196e+00,  1.0445e+00,
            9.8826e-01,  1.6622e-01]],
 
         [[ 7.6192e-01,  8.0873e-01, -6.6533e-01, -2.0237e-01, -6.9603e-01,
           -5.4338e-01, -1.6098e-01],
          [ 1.1611e+00,  1.4902e+00,  1.1338e+00,  1.4571e+00,  4.7808e-01,
            5.1317e-01, -2.4097e+00],
          [-4.3636e-01,  1.9259e+00, -1.2906e+00,  7.9552e-01,  2.1241e-01,
      

#### Weight Dropout
Weight Dropout is applied to the weights of the inner LSTM hidden-to-hidden matrix. This is a bit hacky if we want to preserve the CuDNN speed and not reimplement the cell from scratch: we add a parameter that contains the raw weights, and we replace the weight matrix in the LSTM at the beginning of each forward pass.

[Use this instead?](https://pytorchnlp.readthedocs.io/en/latest/_modules/torchnlp/nn/weight_drop.html)

In [44]:
#export
import warnings

WEIGHT_HH = 'weight_hh_l0'

class WeightDropout(nn.Module):
    """Wrap `module` and apply dropout with probability `weight_p` to the weights named in `layer_names`.

    For every selected layer a `<layer>_raw` parameter holding the undropped
    weights is registered on this wrapper; on each forward pass the wrapped
    module's entry for that layer is replaced by a freshly dropped-out version
    of the raw weights.

    `weight_p` is a single float (the default `0.` disables dropout).
    `layer_names` may be one name or an iterable of names.
    """
    def __init__(self, module, weight_p=0., layer_names=(WEIGHT_HH,)):
        super().__init__()
        # A bare string would otherwise be iterated character by character.
        if isinstance(layer_names, str): layer_names = [layer_names]
        self.module,self.weight_p,self.layer_names = module,weight_p,list(layer_names)
        for layer in self.layer_names:
            # Keep the original weights under `<layer>_raw` (built from the same underlying data).
            w = getattr(self.module, layer)
            self.register_parameter(f'{layer}_raw', nn.Parameter(w.data))
            self.module._parameters[layer] = F.dropout(w, p=self.weight_p, training=False)

    def _setweights(self):
        "Overwrite each selected weight of the wrapped module with a dropped-out copy of its raw parameter."
        # NOTE(review): this writes straight into `module._parameters`; if the wrapped module caches
        # its weights elsewhere the replacement may not be picked up - TODO confirm for the PyTorch version in use.
        for layer in self.layer_names:
            raw_w = getattr(self, f'{layer}_raw')
            self.module._parameters[layer] = F.dropout(raw_w, p=self.weight_p, training=self.training)

    def forward(self, *args):
        "Refresh the dropped-out weights, then run the wrapped module on `args`."
        self._setweights()
        with warnings.catch_warnings():
            #To avoid the warning that comes because the weights aren't flattened.
            warnings.simplefilter("ignore")
            return self.module.forward(*args)

In [67]:
module = nn.LSTM(5, 2)  # input_sz, hid_sz

In [68]:
dp_module = WeightDropout(module, 0.4)

**Before applying module:**

In [69]:
getattr(dp_module.module, WEIGHT_HH)

Parameter containing:
tensor([[ 0.4233,  0.2870],
        [-0.4958, -0.3826],
        [-0.2724,  0.6627],
        [-0.0984,  0.4298],
        [-0.4701, -0.0752],
        [-0.3281, -0.6622],
        [ 0.1990,  0.1084],
        [ 0.1256, -0.2499]], requires_grad=True)

In [70]:
getattr(dp_module, f'{WEIGHT_HH}_raw')

Parameter containing:
tensor([[ 0.4233,  0.2870],
        [-0.4958, -0.3826],
        [-0.2724,  0.6627],
        [-0.0984,  0.4298],
        [-0.4701, -0.0752],
        [-0.3281, -0.6622],
        [ 0.1990,  0.1084],
        [ 0.1256, -0.2499]], requires_grad=True)

**After applying module:**

In [71]:
test_input = torch.randn(4, 20, 5)  # seq_len, bs, input_sz (the LSTM above was built without batch_first)

In [72]:
h = (torch.zeros(1, 20, 2), torch.zeros(1, 20, 2))

In [73]:
x, h = dp_module(test_input, h)

In [74]:
getattr(dp_module.module, WEIGHT_HH) * (1 - 0.4)

tensor([[ 0.4233,  0.2870],
        [-0.4958, -0.3826],
        [-0.2724,  0.0000],
        [-0.0984,  0.0000],
        [-0.0000, -0.0752],
        [-0.3281, -0.0000],
        [ 0.1990,  0.0000],
        [ 0.1256, -0.2499]], grad_fn=<MulBackward0>)

In [75]:
x, h = dp_module(test_input, h)

In [76]:
getattr(dp_module.module, WEIGHT_HH) * (1 - 0.4)

tensor([[ 0.0000,  0.2870],
        [-0.0000, -0.3826],
        [-0.2724,  0.6627],
        [-0.0984,  0.4298],
        [-0.4701, -0.0752],
        [-0.3281, -0.6622],
        [ 0.1990,  0.0000],
        [ 0.1256, -0.0000]], grad_fn=<MulBackward0>)

Every time the `dp_module` is called different fields of the weight matrix are nullified.

Original is unchanged

In [78]:
getattr(dp_module, f'{WEIGHT_HH}_raw')

Parameter containing:
tensor([[ 0.4233,  0.2870],
        [-0.4958, -0.3826],
        [-0.2724,  0.6627],
        [-0.0984,  0.4298],
        [-0.4701, -0.0752],
        [-0.3281, -0.6622],
        [ 0.1990,  0.1084],
        [ 0.1256, -0.2499]], requires_grad=True)

#### Embedding Dropout
Applies dropout to full rows of the embedding matrix (zeroes embedding for specific words).

In [61]:
class EmbeddingDropout(nn.Module):
    """Applies dropout in the embedding layer by zeroing out whole rows of the embedding matrix.

    `emb` is the wrapped `nn.Embedding` and `embed_p` the probability of
    dropping a row (i.e. the full vector of one word). Kept rows are rescaled
    by `1/(1-embed_p)` through `dropout_mask`.
    """
    def __init__(self, emb, embed_p):
        super().__init__()
        self.emb, self.embed_p = emb, embed_p
        # Keep `None` when the embedding has no padding index: `F.embedding` accepts `None`,
        # whereas -1 would be read as "last row" and stop that row from receiving gradients.
        self.pad_idx = self.emb.padding_idx

    def forward(self, words, scale=None):
        "Look up `words` in the (possibly row-dropped) embedding matrix, optionally multiplied by `scale`."
        if self.training and self.embed_p != 0:
            size = (self.emb.weight.size(0), 1)  # one mask entry per row (word)
            mask = dropout_mask(self.emb.weight.data, size, self.embed_p)
            # Each mask entry is either 0 or 1/(1-embed_p); broadcasting zeroes full rows.
            masked_embed = self.emb.weight * mask  # some words are zeroed

        else: masked_embed = self.emb.weight
        # Out-of-place on purpose: without dropout `masked_embed` IS the embedding parameter,
        # and an in-place `mul_` would either raise (leaf requiring grad) or permanently rescale it.
        if scale: masked_embed = masked_embed * scale

        return F.embedding(words, masked_embed, self.pad_idx, self.emb.max_norm,
                           self.emb.norm_type, self.emb.scale_grad_by_freq, self.emb.sparse)

```
Docstring:
A simple lookup table that looks up embeddings in a fixed dictionary and size.

This module is often used to retrieve word embeddings using indices.
The input to the module is a list of indices, and the embedding matrix,
and the output is the corresponding word embeddings.
```

In [62]:
enc = nn.Embedding(100, 7, padding_idx=1)  # 100 embeddings of size 7

In [63]:
enc_dp = EmbeddingDropout(enc, 0.5)

In [64]:
test_input = torch.randint(0, 100, (3,))

In [65]:
test_input

tensor([66, 40, 73])

In [66]:
enc_dp(test_input)

tensor([[-0.0000,  0.0000, -0.0000, -0.0000,  0.0000, -0.0000, -0.0000],
        [-0.0876,  0.5553,  1.6470, -3.2124,  0.8831,  1.6428, -1.9279],
        [-1.2904,  0.6507, -2.7681,  0.0934,  0.6817,  3.0076, -1.8339]],
       grad_fn=<EmbeddingBackward>)

Zeroes embedding for specific words (emb_sz = 7).

### AWD-LSTM Main Model
Regular multilayer LSTM with all those kinds of dropout.