In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from nb_005 import *
from collections import Counter

# Wikitext 2

## Data

Download the dataset [here](https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip) and unzip it so that the `wiki.*.tokens` files are in the folder `data/wikitext`.

In [3]:
# End-of-sentence marker appended to every line of tokens, and the folder holding the dataset files.
EOS, PATH = '<eos>', Path('data/wikitext')

Small helper function to read the tokens.

In [4]:
def read_file(filename, path=None, eos=None):
    """Read a tokenized text file into an array of token lists, one list per line.

    Each line is split on whitespace and terminated with the end-of-sentence
    marker, so a blank line becomes `[eos]`.

    Args:
        filename: name of the file, relative to `path`.
        path: folder (a `Path`) containing the file; defaults to the notebook-level `PATH`.
        eos: end-of-sentence token; defaults to the notebook-level `EOS`.

    Returns:
        A 1-D numpy array of dtype object whose items are lists of str tokens.
    """
    path = PATH if path is None else path
    eos = EOS if eos is None else eos
    with open(path/filename, encoding='utf8') as f:
        sents = [line.split() + [eos] for line in f]
    # Lines have different lengths: fill an object array explicitly instead of letting
    # np.array guess a shape (ragged input is rejected by recent numpy releases, and
    # equal-length lines would silently collapse into a 2-D array).
    tokens = np.empty(len(sents), dtype=object)
    for i, sent in enumerate(sents): tokens[i] = sent
    return tokens

In [5]:
# Load the train / validation / test splits, in that order.
trn_tok, val_tok, tst_tok = [
    read_file(name) for name in ('wiki.train.tokens', 'wiki.valid.tokens', 'wiki.test.tokens')
]

In [6]:
len(trn_tok), len(val_tok), len(tst_tok)

(36718, 3760, 4358)

In [7]:
' '.join(trn_tok[4][:20])

'The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II'

In [8]:
# Token frequencies over the whole training set (every token of every line, '<eos>' included).
cnt = Counter(word for sent in trn_tok for word in sent)
# Display the ten most frequent tokens with their counts.
cnt.most_common(10)

[('the', 113161),
 (',', 99913),
 ('.', 73388),
 ('of', 56889),
 ('<unk>', 54625),
 ('and', 50603),
 ('in', 39453),
 ('to', 39190),
 ('<eos>', 36718),
 ('a', 34237)]

Give an id to each token and add the pad token (just in case we need it).

In [9]:
# Vocabulary list: index 0 is the padding token, followed by every training token
# ordered from most to least frequent.
itos = ['<pad>'] + [tok for tok,_ in cnt.most_common()]

In [10]:
# Number of entries in the vocabulary, counting the '<pad>' entry.
vocab_size = len(itos); vocab_size

33279

Create the mapping from token to id, then numericalize our datasets.

In [11]:
# Tokens missing from the training vocabulary map to the id of '<unk>'. The id is looked up
# instead of being hard-coded, so it stays correct if the frequency ranking changes.
# Note: a defaultdict stores every missing key it is asked for, so `stoi` can grow beyond `itos`.
unk_idx = itos.index('<unk>')
stoi = collections.defaultdict(lambda: unk_idx, {w:i for i,w in enumerate(itos)})

In [12]:
def numericalize(sents):
    "Map each list of tokens in `sents` to a list of ids (through `stoi`); returns a 1-D object array."
    # The id lists have different lengths: fill an object array explicitly, since np.array
    # on ragged input is rejected by recent numpy releases.
    ids = np.empty(len(sents), dtype=object)
    for i, s in enumerate(sents): ids[i] = [stoi[w] for w in s]
    return ids

trn_ids = numericalize(trn_tok)
val_ids = numericalize(val_tok)
tst_ids = numericalize(tst_tok)

## Model

### 1. Dropout

We want to use the AWD-LSTM from [Stephen Merity](https://arxiv.org/abs/1708.02182). First, we'll need several different kinds of dropout. Dropout consists of replacing some coefficients by 0 with probability p. To ensure that the average of the weights remains constant, we scale the coefficients that aren't nullified by a factor `1/(1-p)`.

In [13]:
def dropout_mask(x, sz, p):
    """Return a dropout mask of the same type as `x`, size `sz`, with probability `p` to cancel an element.

    Kept elements hold `1/(1-p)` and cancelled elements hold 0, so multiplying by the
    mask leaves the expected value unchanged.
    """
    size, keep = tuple(sz), 1 - p
    # With p == 1 every element is cancelled: return zeros instead of computing 0/0 (NaN).
    if keep == 0: return x.new_zeros(size)
    return x.new_empty(size).bernoulli_(keep).div_(keep)

In [14]:
# Sample a 10x10 mask with p=0.5: every entry is either 0 or 1/(1-0.5) = 2.
x = torch.randn(10,10)
dropout_mask(x, (10,10), 0.5)

tensor([[2., 2., 0., 2., 0., 0., 2., 0., 2., 0.],
        [2., 0., 0., 2., 0., 2., 0., 0., 0., 2.],
        [2., 0., 2., 2., 0., 0., 2., 0., 0., 2.],
        [0., 0., 2., 0., 0., 2., 2., 0., 0., 0.],
        [2., 0., 2., 0., 2., 0., 0., 2., 2., 0.],
        [0., 2., 2., 2., 0., 0., 2., 0., 2., 2.],
        [0., 0., 2., 0., 0., 0., 2., 0., 0., 0.],
        [2., 0., 2., 0., 0., 2., 2., 2., 2., 2.],
        [2., 2., 2., 2., 0., 0., 0., 2., 2., 0.],
        [0., 2., 2., 2., 2., 2., 0., 0., 2., 0.]])

Once we have a dropout mask `m`, applying the dropout to `x` is simply done by `x = x * m`. We create our own dropout mask and don't rely on PyTorch's dropout because we want to nullify the coefficients on the batch dimension but not the token dimension (i.e. the same coefficients are replaced by zero for each word in the sentence).

Inside an RNN, a tensor x will have three dimensions: seq_len, bs, vocab_size, so we create a dropout mask for the last two dimensions and broadcast it over the first dimension.

In [15]:
class RNNDropout(nn.Module):
    "Dropout with probability `p` whose mask is shared along the first dimension of the input."

    def __init__(self, p=0.5):
        super().__init__()
        self.p = p

    def forward(self, x):
        # Pass the input through untouched outside of training or when p is zero.
        active = self.training and self.p
        if not active: return x
        # One mask over the last two dimensions, broadcast over the first one.
        mask_size = (1, x.size(1), x.size(2))
        mask = dropout_mask(x.data, mask_size, self.p)
        return mask * x

In [16]:
# Apply RNNDropout (p=0.5) to a random (2, 5, 10) tensor and display input and output together.
dp_test = RNNDropout(0.5)
x = torch.randn(2,5,10)
x, dp_test(x)

(tensor([[[-1.9901, -0.9491, -0.2538,  0.2669, -1.0387, -0.6803, -0.1309,
            0.5409, -0.1089,  0.1139],
          [-1.5815, -0.7023,  1.0417, -0.6517,  0.1800,  1.4703,  1.1635,
            1.0649,  1.6631, -0.0459],
          [ 1.8676, -0.8832, -0.8476,  0.3524,  2.6596,  1.2432, -0.3587,
            1.3670, -0.5917,  1.0211],
          [ 0.3662,  0.0283,  0.4290,  2.7126,  0.8004,  0.8621,  0.6163,
            1.2706,  0.0633,  0.1964],
          [ 0.7371,  0.2741,  0.4601, -0.0003,  0.2714, -0.2649,  0.7640,
            1.8827, -1.1130,  1.5036]],
 
         [[-1.2806, -0.0421,  0.3034,  0.3624,  0.2683, -0.0867, -1.6087,
           -0.4795,  1.5125, -0.0765],
          [ 0.1915,  1.5651, -0.7167, -0.0130, -0.9117,  0.0379,  0.2068,
           -1.2578,  0.3432,  1.0319],
          [ 0.8134, -0.8821, -0.1735,  1.4034,  1.6443, -0.6728,  0.2942,
           -0.1264,  0.0781, -0.5670],
          [ 1.3625,  0.7245,  1.8641, -1.8053,  0.3111, -0.0901,  0.5542,
           -0.5308,

In [17]:
def noop(x):
    "Identity function: hand `x` back untouched."
    return x

In [18]:
class WeightDropout(nn.Module):
    "A module that warps another layer in which some weights will be replaced by 0 during training."
    
    def __init__(self, module, dropout, layer_names=['weight_hh_l0']):
        super().__init__()
        self.module,self.dropout,self.layer_names = module,dropout,layer_names
    
    def _setweights(self):
        for layer in self.layer_names:
            raw_w = getattr(self, f'{layer}_raw')
            w1 = F.dropout(raw_w, p=self.dropout, training=self.training)
            setattr(self.module, layer, w1)
    
    def forward(self, *args):
        self._setweights()
        return self.module.forward(*args)
    
    def reset(self):
        for layer in self.layer_names:
            #Makes a copy of the weights of the selected layers.
            w = getattr(self.module, layer)
            self.register_parameter(f'{layer}_raw', nn.Parameter(w.data))
            del self.module._parameters[layer]
        if hasattr(self.module, 'reset'): self.module.reset()
    
    def update_raw(self):
        for layer in self.layer_names:
            w = getattr(self.module, layer)
            mask = w != 0.
            self.raw_weights[layer][mask] = w[mask] * (1-self.dropout)

In [19]:
# Wrap a small LSTM so that its 'weight_hh_l0' weight gets dropout with p=0.5.
module = nn.LSTM(20, 20)
dp_module = WeightDropout(module, 0.5)
# Move 'weight_hh_l0' out of the LSTM into the wrapper's 'weight_hh_l0_raw' parameter.
dp_module.reset()
# SGD over the wrapper's parameters with a learning rate of 10.
opt = optim.SGD(dp_module.parameters(), 10)
# Switch to training mode so that the weight dropout is active.
dp_module.train()

WeightDropout(
  (module): LSTM(20, 20)
)

In [20]:
# NOTE(review): `w_raw` is only assigned in a later cell, so this line raises NameError
# when the notebook is run top to bottom on a fresh kernel.
w = F.dropout(w_raw, p=0.5, training=True)

NameError: name 'w_raw' is not defined

In [21]:
w

NameError: name 'w' is not defined

In [22]:
# Random input of shape (2, 5, 20) and a pair of zero-initialised states of shape (1, 5, 20).
x = torch.randn(2,5,20)
x.requires_grad_(requires_grad=True)
h = (torch.zeros(1,5,20), torch.zeros(1,5,20))
# Run the wrapped LSTM five times, feeding each output and state back in as the next input.
# NOTE(review): the recorded output of this cell is a RuntimeError about the number of RNN
# parameters — presumably because reset() removed 'weight_hh_l0' from the LSTM; confirm.
for _ in range(5): x,h = dp_module(x,h)

RuntimeError: got an incorrect number of RNN parameters

In [64]:
getattr(dp_module.module, 'weight_hh_l0'),getattr(dp_module,'weight_hh_l0_raw')

(tensor([[ 0.2024, -0.1610,  0.0000,  ..., -0.1056,  0.0583, -0.0000],
         [-0.0000,  0.0000,  0.4104,  ...,  0.2743,  0.0000,  0.0000],
         [-0.0000,  0.1541,  0.0000,  ..., -0.2783,  0.0000,  0.0008],
         ...,
         [ 0.2867,  0.0000,  0.0668,  ...,  0.0000,  0.0000,  0.0000],
         [-0.0588, -0.0000,  0.0000,  ...,  0.0000, -0.2593, -0.0000],
         [ 0.3162,  0.0000,  0.3668,  ..., -0.0000, -0.0000, -0.0000]],
        grad_fn=<DropoutBackward>), Parameter containing:
 tensor([[ 0.1012, -0.0805,  0.0896,  ..., -0.0528,  0.0291, -0.1451],
         [-0.1924,  0.2114,  0.2052,  ...,  0.1372,  0.2105,  0.1657],
         [-0.1455,  0.0771,  0.0892,  ..., -0.1391,  0.1880,  0.0004],
         ...,
         [ 0.1434,  0.2104,  0.0334,  ...,  0.2112,  0.1248,  0.1113],
         [-0.0294, -0.0133,  0.1843,  ...,  0.0759, -0.1296, -0.1526],
         [ 0.1581,  0.0990,  0.1834,  ..., -0.1259, -0.0425, -0.0921]],
        requires_grad=True))

In [65]:
# Ten random integer targets in [0, 20), one per row of the flattened output below.
target = torch.randint(0,20,(10,)).long()
# Flatten `x` into rows of 20 values and compute a loss on them
# (presumably only to get gradients flowing back to the raw weights — confirm).
loss = F.nll_loss(x.view(-1,20), target)
loss.backward()
# Apply one optimizer update.
opt.step()

In [66]:
# Fetch the weight currently set on the LSTM and the raw parameter stored on the wrapper,
# then display the gradient attached to each of them.
w, w_raw = getattr(dp_module.module, 'weight_hh_l0'),getattr(dp_module,'weight_hh_l0_raw')
w.grad, w_raw.grad

(None, tensor([[ 0.0003,  0.0005,  0.0002,  ...,  0.0001,  0.0005,  0.0001],
         [-0.0000, -0.0003, -0.0001,  ..., -0.0000, -0.0008,  0.0002],
         [ 0.0001,  0.0002, -0.0000,  ...,  0.0001,  0.0001, -0.0001],
         ...,
         [-0.0002, -0.0000,  0.0000,  ...,  0.0000, -0.0001, -0.0000],
         [-0.0014,  0.0001,  0.0001,  ..., -0.0000, -0.0009,  0.0001],
         [-0.0000, -0.0001, -0.0001,  ...,  0.0000, -0.0000,  0.0000]]))

In [29]:
getattr(dp_module.module, 'weight_hh_l0'),getattr(dp_module,'weight_hh_l0_raw')

(Parameter containing:
 tensor([[ 0.1554, -0.0616,  0.2184,  ...,  0.2113,  0.0702,  0.1316],
         [-0.1992, -0.0237,  0.0526,  ...,  0.1810,  0.1886, -0.2164],
         [ 0.1710, -0.0467, -0.2082,  ...,  0.1273, -0.1579,  0.1089],
         ...,
         [ 0.0842, -0.0625, -0.1341,  ...,  0.1033,  0.1231,  0.0102],
         [-0.1119, -0.0186,  0.1200,  ...,  0.0714,  0.0843, -0.0027],
         [-0.1073,  0.1865,  0.1759,  ...,  0.0214, -0.2209,  0.0449]],
        requires_grad=True), Parameter containing:
 tensor([[ 0.1554, -0.0616,  0.2184,  ...,  0.2113,  0.0702,  0.1316],
         [-0.1992, -0.0237,  0.0526,  ...,  0.1810,  0.1886, -0.2164],
         [ 0.1710, -0.0467, -0.2082,  ...,  0.1273, -0.1579,  0.1089],
         ...,
         [ 0.0842, -0.0625, -0.1341,  ...,  0.1033,  0.1231,  0.0102],
         [-0.1119, -0.0186,  0.1200,  ...,  0.0714,  0.0843, -0.0027],
         [-0.1073,  0.1865,  0.1759,  ...,  0.0214, -0.2209,  0.0449]],
        requires_grad=True))

In [30]:
opt.param_groups

[{'params': [Parameter containing:
   tensor([[ 0.1554, -0.0616,  0.2184,  ...,  0.2113,  0.0702,  0.1316],
           [-0.1992, -0.0237,  0.0526,  ...,  0.1810,  0.1886, -0.2164],
           [ 0.1710, -0.0467, -0.2082,  ...,  0.1273, -0.1579,  0.1089],
           ...,
           [ 0.0842, -0.0625, -0.1341,  ...,  0.1033,  0.1231,  0.0102],
           [-0.1119, -0.0186,  0.1200,  ...,  0.0714,  0.0843, -0.0027],
           [-0.1073,  0.1865,  0.1759,  ...,  0.0214, -0.2209,  0.0449]],
          requires_grad=True), Parameter containing:
   tensor([[ 0.0747,  0.0088,  0.1997,  ..., -0.0098, -0.1459, -0.1907],
           [-0.1819,  0.2078, -0.0633,  ...,  0.0808, -0.0304, -0.1768],
           [ 0.1626, -0.0114, -0.0725,  ...,  0.0376,  0.0046,  0.0529],
           ...,
           [-0.1112, -0.0902,  0.0758,  ..., -0.2081, -0.2276, -0.0936],
           [ 0.0706, -0.0816,  0.2260,  ...,  0.0985,  0.2174,  0.1962],
           [-0.1049, -0.1724, -0.0193,  ...,  0.1149,  0.1946,  0.1333]],
  

In [31]:
class WeightDrop(torch.nn.Module):
    """Wrap `module` and apply dropout to some of its weights before every forward pass.

    At construction each selected weight is removed from the wrapped module's parameters
    and re-registered on that module as `<name>_raw`. Every forward pass then sets a
    dropped-out version of the raw weight back on the module under the original name.

    Args:
        module: the layer to wrap.
        weights: names of the weights of `module` that receive dropout.
        dropout: dropout probability.
        variational: if True, draw one mask value per row of the weight (shape
            `(rows, 1)`, expanded to the weight's shape) instead of one per element.
    """

    def __init__(self, module, weights=['weight_hh_l0'], dropout=0, variational=False):
        super().__init__()
        self.module = module
        # Copy the names so an instance never aliases the shared default list.
        self.weights = list(weights)
        self.dropout = dropout
        self.variational = variational
        self._setup()

    def widget_demagnetizer_y2k_edition(*args, **kwargs):
        # We need to replace flatten_parameters with a nothing function
        # It must be a function rather than a lambda as otherwise pickling explodes
        # We can't write boring code though, so ... WIDGET DEMAGNETIZER Y2K EDITION!
        # (╯°□°）╯︵ ┻━┻
        return

    def _setup(self):
        "Disable `flatten_parameters` on RNN modules and move every selected weight to `<name>_raw`."
        # Terrible temporary solution to an issue regarding compacting weights re: CUDNN RNN
        if issubclass(type(self.module), torch.nn.RNNBase):
            self.module.flatten_parameters = self.widget_demagnetizer_y2k_edition

        for name_w in self.weights:
            print('Applying weight drop of {} to {}'.format(self.dropout, name_w))
            w = getattr(self.module, name_w)
            del self.module._parameters[name_w]
            self.module.register_parameter(name_w + '_raw', nn.Parameter(w.data))

    def _setweights(self):
        "Set each selected weight of the wrapped module from its raw copy (dropout only in training mode)."
        for name_w in self.weights:
            raw_w = getattr(self.module, name_w + '_raw')
            if self.variational:
                # One mask value per row, created with the dtype and device of the weight.
                mask = raw_w.new_ones((raw_w.size(0), 1))
                # Follow the module's mode so that evaluation uses the full weights.
                mask = torch.nn.functional.dropout(mask, p=self.dropout, training=self.training)
                w = mask.expand_as(raw_w) * raw_w
            else:
                w = torch.nn.functional.dropout(raw_w, p=self.dropout, training=self.training)
            setattr(self.module, name_w, w)

    def forward(self, *args):
        "Refresh the dropped-out weights, then run the wrapped module on `args`."
        self._setweights()
        return self.module.forward(*args)

In [32]:
# Checks for WeightDrop. Every tensor and module is moved with .cuda(), so this cell needs a GPU.
x = torch.autograd.Variable(torch.randn(2, 1, 10)).cuda()
h0 = None
# Linear layer whose 'weight' gets dropout with p=0.9; run it twice on the same input and
# sum the output along the first dimension for each run.
lin = WeightDrop(torch.nn.Linear(10, 10), ['weight'], dropout=0.9)
lin.cuda()
run1 = [x.sum() for x in lin(x).data]
run2 = [x.sum() for x in lin(x).data]

print('All items should be different')
print('Run 1:', run1)
print('Run 2:', run2)

print('Testing WeightDrop with LSTM')

# Same check with an LSTM whose 'weight_hh_l0' gets dropout with p=0.9.
wdrnn = WeightDrop(torch.nn.LSTM(10, 10), ['weight_hh_l0'], dropout=0.9)
wdrnn.cuda()

# [0] selects the output sequence of the LSTM (the second item is the final state).
run1 = [x.sum() for x in wdrnn(x, h0)[0].data]
run2 = [x.sum() for x in wdrnn(x, h0)[0].data]

print('First timesteps should be equal, all others should differ')
print('Run 1:', run1)
print('Run 2:', run2)

print('---')

Applying weight drop of 0.9 to weight
All items should be different
Run 1: [tensor(-1.2833, device='cuda:0'), tensor(-4.5258, device='cuda:0')]
Run 2: [tensor(3.3188, device='cuda:0'), tensor(-6.7075, device='cuda:0')]
Testing WeightDrop with LSTM
Applying weight drop of 0.9 to weight_hh_l0


  result = self.forward(*input, **kwargs)


First timesteps should be equal, all others should differ
Run 1: [tensor(-0.2832, device='cuda:0'), tensor(-0.4045, device='cuda:0')]
Run 2: [tensor(-0.2832, device='cuda:0'), tensor(-0.3857, device='cuda:0')]
---


In [33]:
# Rebuild `dp_module` with the WeightDrop wrapper, which moves the weights in its
# constructor, so no reset() call is needed here.
module = nn.LSTM(10, 20)
dp_module = WeightDrop(module, dropout=0.5)
#dp_module.reset()
# SGD over the wrapper's parameters with a learning rate of 10.
opt = optim.SGD(dp_module.parameters(), 10)

Applying weight drop of 0.5 to weight_hh_l0


In [34]:
# NOTE(review): this raises AttributeError. WeightDrop removes 'weight_hh_l0' from the LSTM's
# parameters and only sets it again inside forward(), which has not run yet on this instance.
# It also registers 'weight_hh_l0_raw' on the wrapped module, not on the wrapper itself.
getattr(dp_module.module, 'weight_hh_l0'),getattr(dp_module,'weight_hh_l0_raw')

AttributeError: 'LSTM' object has no attribute 'weight_hh_l0'

In [35]:
# Random input of shape (2, 5, 10) and a pair of zero-initialised states of shape (1, 5, 20),
# matching the LSTM(10, 20) wrapped in `dp_module`.
x = torch.randn(2,5,10)
x.requires_grad_(requires_grad=True)
h = (torch.zeros(1,5,20), torch.zeros(1,5,20))
# Single forward pass through the wrapped LSTM.
out,h = dp_module(x,h)

In [57]:
lstm = nn.LSTM(5, 3)  # Input dim is 5, hidden (output) dim is 3
inputs = torch.randn(7, 2, 5)  # make a sequence of length 7, with a batch of 2 and 5 features

# initialize the hidden state: a pair of tensors of shape (1, 2, 3).
hidden = (torch.randn(1, 2, 3),
          torch.randn(1, 2, 3))
# Run the whole sequence through the LSTM in a single call.
# `out` holds the output for every step; `hidden` is replaced by the state the LSTM returns.
out, hidden = lstm(inputs.view(7, 2, -1), hidden)