In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from fastai.text import *

In [3]:
EOS = '<eos>'
PATH=Path('../data/wikitext')

Small helper function to read the tokens.

In [4]:
def read_file(filename):
    """Read a whitespace-tokenized text file located under `PATH`.

    Every line of the file is split on whitespace and terminated with the `EOS` marker.

    Returns a 1-D NumPy object array holding one list of tokens per line.
    """
    tokens = []
    with open(PATH/filename, encoding='utf8') as f:
        for line in f:
            tokens.append(line.split() + [EOS])
    # Fill an object array element by element: letting `np.array` infer the shape fails on
    # ragged input with recent NumPy and collapses equal-length lines into a 2-D array.
    arr = np.empty(len(tokens), dtype=object)
    for i,toks in enumerate(tokens): arr[i] = toks
    return arr

In [5]:
trn_tok = read_file('wiki.train.tokens')
val_tok = read_file('wiki.valid.tokens')
tst_tok = read_file('wiki.test.tokens')

In [6]:
len(trn_tok), len(val_tok), len(tst_tok)

(36718, 3760, 4358)

In [7]:
' '.join(trn_tok[4][:20])

'The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II'

In [8]:
cnt = Counter(word for sent in trn_tok for word in sent)
cnt.most_common(10)

[('the', 113161),
 (',', 99913),
 ('.', 73388),
 ('of', 56889),
 ('<unk>', 54625),
 ('and', 50603),
 ('in', 39453),
 ('to', 39190),
 ('<eos>', 36718),
 ('a', 34237)]

Give an id to each token and add the pad token (just in case we need it).

In [9]:
# Ids follow the frequency rank of each token, with '<pad>' placed first so it gets id 0.
itos = ['<pad>'] + [tok for tok,_ in cnt.most_common()]

In [10]:
vocab_size = len(itos); vocab_size

33279

Create the mapping from token to id, then numericalize our datasets.

In [11]:
# Tokens missing from the vocabulary fall back to the id of '<unk>'. The id is looked up
# instead of being hard-coded, so it stays right if the frequency ranking changes.
unk_id = itos.index('<unk>')
stoi = collections.defaultdict(lambda : unk_id, {w:i for i,w in enumerate(itos)})

In [12]:
# One list of ids per line. The lines have different lengths, so ask for an object array
# explicitly: recent NumPy versions refuse to infer one from ragged nested lists.
trn_ids = np.array([[stoi[w] for w in s] for s in trn_tok], dtype=object)
val_ids = np.array([[stoi[w] for w in s] for s in val_tok], dtype=object)
tst_ids = np.array([[stoi[w] for w in s] for s in tst_tok], dtype=object)

## Testing WeightDropout

Create a bunch of parameters for deterministic tests.

In [122]:
# Reference LSTM whose freshly initialised parameters are snapshotted below.
module = nn.LSTM(20, 20)
# Random input of size (2,5,20) and 10 random integer targets in [0,20), shared by the tests that follow.
tst_input = torch.randn(2,5,20)
tst_output = torch.randint(0,20,(10,)).long()
# Clone every parameter so that later cells can rebuild a module with identical weights.
save_params = {}
for n,p in module._parameters.items(): save_params[n] = p.clone()

### Old WeightDropout

In [131]:
module = nn.LSTM(20, 20)
for n,p in save_params.items(): module._parameters[n] = nn.Parameter(p.clone())
dp_module = WeightDrop(module, 0.5)
opt = optim.SGD(dp_module.parameters(), 10)
dp_module.train()

WeightDrop(
  (module): LSTM(20, 20)
)

In [132]:
torch.manual_seed(7)

<torch._C.Generator at 0x267b3a7b390>

In [133]:
x = tst_input.clone()
x.requires_grad_(requires_grad=True)
h = (torch.zeros(1,5,20), torch.zeros(1,5,20))
for _ in range(5): x,h = dp_module(x,h)

In [134]:
getattr(dp_module.module, 'weight_hh_l0'),getattr(dp_module.module,'weight_hh_l0_raw')

(tensor([[-0.0000,  0.2960,  0.0000,  ...,  0.0000, -0.0000, -0.2676],
         [ 0.0000,  0.0000,  0.1761,  ..., -0.1233,  0.3515,  0.2500],
         [ 0.0000, -0.0000, -0.2828,  ...,  0.0000, -0.0000,  0.0000],
         ...,
         [-0.0000,  0.0000, -0.0000,  ...,  0.0000,  0.3393,  0.0921],
         [ 0.0000, -0.0000,  0.1991,  ...,  0.4160, -0.0000,  0.0000],
         [-0.1975,  0.0776, -0.0000,  ..., -0.0000,  0.0000,  0.0000]],
        grad_fn=<DropoutBackward>), Parameter containing:
 tensor([[-0.0017,  0.1480,  0.0863,  ...,  0.0488, -0.1722, -0.1338],
         [ 0.2046,  0.0867,  0.0880,  ..., -0.0616,  0.1757,  0.1250],
         [ 0.0520, -0.1550, -0.1414,  ...,  0.0677, -0.2110,  0.1627],
         ...,
         [-0.2192,  0.0924, -0.1362,  ...,  0.1746,  0.1697,  0.0461],
         [ 0.0708, -0.1189,  0.0996,  ...,  0.2080, -0.1703,  0.0059],
         [-0.0987,  0.0388, -0.1416,  ..., -0.0332,  0.0853,  0.1414]],
        requires_grad=True))

In [127]:
target = tst_output.clone()
loss = F.nll_loss(x.view(-1,20), target)
loss.backward()
opt.step()

In [128]:
w, w_raw = getattr(dp_module.module, 'weight_hh_l0'),getattr(dp_module.module,'weight_hh_l0_raw')
w.grad, w_raw.grad

(None, tensor([[-0.0000, -0.0001, -0.0001,  ..., -0.0000, -0.0000,  0.0001],
         [-0.0000, -0.0000, -0.0000,  ..., -0.0001, -0.0002,  0.0000],
         [ 0.0002, -0.0000,  0.0001,  ..., -0.0001,  0.0002,  0.0001],
         ...,
         [ 0.0001,  0.0001,  0.0002,  ..., -0.0001,  0.0000, -0.0000],
         [-0.0002,  0.0000, -0.0017,  ..., -0.0001,  0.0001,  0.0003],
         [-0.0001,  0.0001, -0.0003,  ...,  0.0001, -0.0004,  0.0002]]))

In [129]:
getattr(dp_module.module, 'weight_hh_l0'),getattr(dp_module.module,'weight_hh_l0_raw')

(tensor([[-0.0000,  0.2960,  0.0000,  ...,  0.0000, -0.0000, -0.2676],
         [ 0.0000,  0.0000,  0.1761,  ..., -0.1233,  0.3515,  0.2500],
         [ 0.0000, -0.0000, -0.2828,  ...,  0.0000, -0.0000,  0.0000],
         ...,
         [-0.0000,  0.0000, -0.0000,  ...,  0.0000,  0.3393,  0.0921],
         [ 0.0000, -0.0000,  0.1991,  ...,  0.4160, -0.0000,  0.0000],
         [-0.1975,  0.0776, -0.0000,  ..., -0.0000,  0.0000,  0.0000]],
        grad_fn=<DropoutBackward>), Parameter containing:
 tensor([[-0.0014,  0.1486,  0.0869,  ...,  0.0492, -0.1719, -0.1344],
         [ 0.2048,  0.0869,  0.0884,  ..., -0.0610,  0.1774,  0.1247],
         [ 0.0502, -0.1547, -0.1419,  ...,  0.0682, -0.2130,  0.1622],
         ...,
         [-0.2202,  0.0917, -0.1387,  ...,  0.1755,  0.1695,  0.0464],
         [ 0.0726, -0.1192,  0.1170,  ...,  0.2085, -0.1711,  0.0032],
         [-0.0979,  0.0376, -0.1388,  ..., -0.0343,  0.0894,  0.1396]],
        requires_grad=True))

### New WeightDropout

In [135]:
class WeightDropout(nn.Module):
    "A module that wraps another layer and applies dropout to some of its weights during training."

    def __init__(self, module, dropout, layer_names=['weight_hh_l0']):
        super().__init__()
        self.module = module
        self.dropout = dropout
        self.layer_names = layer_names
        for name in self.layer_names:
            # Register a `<name>_raw` parameter on the wrapper, built from the data of the selected weight.
            weight = getattr(self.module, name)
            self.register_parameter(f'{name}_raw', nn.Parameter(weight.data))

    def _setweights(self):
        "Overwrites each selected weight of the wrapped module with `F.dropout` applied to its raw parameter."
        for name in self.layer_names:
            raw = getattr(self, f'{name}_raw')
            dropped = F.dropout(raw, p=self.dropout, training=self.training)
            self.module._parameters[name] = dropped

    def forward(self, *args):
        "Refreshes the dropped-out weights, then runs the wrapped module on `args`."
        self._setweights()
        return self.module.forward(*args)

    def reset(self):
        "Calls `reset` on the wrapped module when it defines one."
        if not hasattr(self.module, 'reset'): return
        self.module.reset()

In [136]:
module = nn.LSTM(20, 20)
for n,p in save_params.items(): module._parameters[n] = nn.Parameter(p.clone())
dp_module = WeightDropout(module, 0.5)
opt = optim.SGD(dp_module.parameters(), 10)
dp_module.train()

WeightDropout(
  (module): LSTM(20, 20)
)

In [137]:
torch.manual_seed(7)

<torch._C.Generator at 0x267b3a7b390>

In [138]:
x = tst_input.clone()
x.requires_grad_(requires_grad=True)
h = (torch.zeros(1,5,20), torch.zeros(1,5,20))
for _ in range(5): x,h = dp_module(x,h)

In [139]:
getattr(dp_module.module, 'weight_hh_l0'),getattr(dp_module,'weight_hh_l0_raw')

(tensor([[-0.0000,  0.2960,  0.0000,  ...,  0.0000, -0.0000, -0.2676],
         [ 0.0000,  0.0000,  0.1761,  ..., -0.1233,  0.3515,  0.2500],
         [ 0.0000, -0.0000, -0.2828,  ...,  0.0000, -0.0000,  0.0000],
         ...,
         [-0.0000,  0.0000, -0.0000,  ...,  0.0000,  0.3393,  0.0921],
         [ 0.0000, -0.0000,  0.1991,  ...,  0.4160, -0.0000,  0.0000],
         [-0.1975,  0.0776, -0.0000,  ..., -0.0000,  0.0000,  0.0000]],
        grad_fn=<DropoutBackward>), Parameter containing:
 tensor([[-0.0017,  0.1480,  0.0863,  ...,  0.0488, -0.1722, -0.1338],
         [ 0.2046,  0.0867,  0.0880,  ..., -0.0616,  0.1757,  0.1250],
         [ 0.0520, -0.1550, -0.1414,  ...,  0.0677, -0.2110,  0.1627],
         ...,
         [-0.2192,  0.0924, -0.1362,  ...,  0.1746,  0.1697,  0.0461],
         [ 0.0708, -0.1189,  0.0996,  ...,  0.2080, -0.1703,  0.0059],
         [-0.0987,  0.0388, -0.1416,  ..., -0.0332,  0.0853,  0.1414]],
        requires_grad=True))

In [140]:
target = tst_output.clone()
loss = F.nll_loss(x.view(-1,20), target)
loss.backward()
opt.step()

In [141]:
w, w_raw = getattr(dp_module.module, 'weight_hh_l0'),getattr(dp_module,'weight_hh_l0_raw')
w.grad, w_raw.grad

(None, tensor([[-0.0000, -0.0001, -0.0001,  ..., -0.0000, -0.0000,  0.0001],
         [-0.0000, -0.0000, -0.0000,  ..., -0.0001, -0.0002,  0.0000],
         [ 0.0002, -0.0000,  0.0001,  ..., -0.0001,  0.0002,  0.0001],
         ...,
         [ 0.0001,  0.0001,  0.0002,  ..., -0.0001,  0.0000, -0.0000],
         [-0.0002,  0.0000, -0.0017,  ..., -0.0001,  0.0001,  0.0003],
         [-0.0001,  0.0001, -0.0003,  ...,  0.0001, -0.0004,  0.0002]]))

In [143]:
getattr(dp_module.module, 'weight_hh_l0'),getattr(dp_module,'weight_hh_l0_raw')

(tensor([[-0.0000,  0.2960,  0.0000,  ...,  0.0000, -0.0000, -0.2676],
         [ 0.0000,  0.0000,  0.1761,  ..., -0.1233,  0.3515,  0.2500],
         [ 0.0000, -0.0000, -0.2828,  ...,  0.0000, -0.0000,  0.0000],
         ...,
         [-0.0000,  0.0000, -0.0000,  ...,  0.0000,  0.3393,  0.0921],
         [ 0.0000, -0.0000,  0.1991,  ...,  0.4160, -0.0000,  0.0000],
         [-0.1975,  0.0776, -0.0000,  ..., -0.0000,  0.0000,  0.0000]],
        grad_fn=<DropoutBackward>), Parameter containing:
 tensor([[-0.0014,  0.1486,  0.0869,  ...,  0.0492, -0.1719, -0.1344],
         [ 0.2048,  0.0869,  0.0884,  ..., -0.0610,  0.1774,  0.1247],
         [ 0.0502, -0.1547, -0.1419,  ...,  0.0682, -0.2130,  0.1622],
         ...,
         [-0.2202,  0.0917, -0.1387,  ...,  0.1755,  0.1695,  0.0464],
         [ 0.0726, -0.1192,  0.1170,  ...,  0.2085, -0.1711,  0.0032],
         [-0.0979,  0.0376, -0.1388,  ..., -0.0343,  0.0894,  0.1396]],
        requires_grad=True))

## Testing EmbeddingDropout

Create a bunch of parameters for deterministic tests.

In [299]:
enc = nn.Embedding(100,20, padding_idx=0)
tst_input = torch.randint(0,100,(25,)).long()
save_params = enc.weight.clone()

### Old EmbeddingDropout

In [300]:
enc = nn.Embedding(100,20, padding_idx=0)
enc.weight = nn.Parameter(save_params.clone())
enc_dp = EmbeddingDropout(enc)

In [301]:
torch.manual_seed(7)

<torch._C.Generator at 0x267b3a7b390>

In [302]:
x = tst_input.clone()
enc_dp(x, dropout=0.5)

tensor([[ 2.2839,  0.2721,  0.0570,  0.0656,  1.4972,  0.2437,  1.8152, -2.6273,
          1.2273,  1.1763,  0.5037,  4.9464,  3.2505,  3.5982,  1.2784, -0.7848,
         -1.1731, -1.4113,  0.7130, -2.2369],
        [ 0.3309, -1.4314,  2.4850, -2.6664,  0.5921, -0.1873,  3.9878, -1.4402,
          1.9089,  0.2984,  3.2444,  0.3894, -3.1906,  1.1718, -2.1071, -2.0537,
          0.5844,  2.9225,  2.0132,  1.9562],
        [-2.8601,  0.7630,  0.6362,  2.8698,  1.2215,  0.0995,  2.7534, -0.8016,
         -0.7123, -1.2020, -0.9800,  3.4296, -0.7056, -0.7452, -1.9312,  3.8816,
         -2.9207,  3.0680,  0.9727, -0.3739],
        [-0.2838, -3.5902,  0.4280, -0.5449,  0.6160, -1.7927,  0.9662, -3.0062,
         -0.4531,  1.5771,  1.3366,  0.9028, -1.7587, -1.4735, -2.4497, -0.2795,
          1.9607,  0.1777,  0.2684,  1.2090],
        [ 0.0000, -0.0000, -0.0000, -0.0000,  0.0000, -0.0000,  0.0000, -0.0000,
          0.0000, -0.0000, -0.0000,  0.0000, -0.0000, -0.0000,  0.0000,  0.0000,
      

### New EmbeddingDropout

In [303]:
def dropout_mask(x, sz, p):
    """Returns a dropout mask of the same type as x, size sz, with probability p to cancel an element.

    Kept elements are scaled by 1/(1-p). When p is 1 every element is cancelled and the mask is all zeros.
    """
    keep = 1 - p
    mask = x.new_empty(tuple(sz)).bernoulli_(keep)
    # With p == 1 the rescaling would compute 0/0 and fill the mask with NaN; the zeros are already the answer.
    return mask if keep == 0 else mask / keep

In [305]:
class EmbeddingDropout1(nn.Module):
    "Applies dropout in the embedding layer by zeroing out whole rows of the embedding matrix (one row per token)."

    def __init__(self, emb, dropout):
        super().__init__()
        self.emb,self.dropout = emb,dropout
        # Keep `None` when the embedding has no padding index: `F.embedding` reads a negative index
        # relative to the end of the table, so -1 would silently treat the last token as padding.
        self.pad_idx = self.emb.padding_idx

    def forward(self, words, dropout=0.1, scale=None):
        """Looks up `words` in the (possibly row-dropped) embedding matrix.

        The drop probability is the `dropout` given at construction. The `dropout` argument of this
        method is not used; it is only kept so existing calls keep working. A truthy `scale`
        multiplies the embedding matrix before the lookup.
        """
        if self.training and self.dropout != 0:
            # One mask value per row: a dropped token loses its entire vector.
            size = (self.emb.weight.size(0),1)
            mask = dropout_mask(self.emb.weight.data, size, self.dropout)
            masked_emb_weight = mask * self.emb.weight
        else: masked_emb_weight = self.emb.weight
        if scale: masked_emb_weight = scale * masked_emb_weight
        return F.embedding(words, masked_emb_weight, self.pad_idx, self.emb.max_norm,
                           self.emb.norm_type, self.emb.scale_grad_by_freq, self.emb.sparse)

In [306]:
enc = nn.Embedding(100,20, padding_idx=0)
enc.weight = nn.Parameter(save_params.clone())
enc_dp = EmbeddingDropout1(enc, 0.5)

In [307]:
torch.manual_seed(7)

<torch._C.Generator at 0x267b3a7b390>

In [308]:
x = tst_input.clone()
enc_dp(x)

tensor([[ 2.2839,  0.2721,  0.0570,  0.0656,  1.4972,  0.2437,  1.8152, -2.6273,
          1.2273,  1.1763,  0.5037,  4.9464,  3.2505,  3.5982,  1.2784, -0.7848,
         -1.1731, -1.4113,  0.7130, -2.2369],
        [ 0.3309, -1.4314,  2.4850, -2.6664,  0.5921, -0.1873,  3.9878, -1.4402,
          1.9089,  0.2984,  3.2444,  0.3894, -3.1906,  1.1718, -2.1071, -2.0537,
          0.5844,  2.9225,  2.0132,  1.9562],
        [-2.8601,  0.7630,  0.6362,  2.8698,  1.2215,  0.0995,  2.7534, -0.8016,
         -0.7123, -1.2020, -0.9800,  3.4296, -0.7056, -0.7452, -1.9312,  3.8816,
         -2.9207,  3.0680,  0.9727, -0.3739],
        [-0.2838, -3.5902,  0.4280, -0.5449,  0.6160, -1.7927,  0.9662, -3.0062,
         -0.4531,  1.5771,  1.3366,  0.9028, -1.7587, -1.4735, -2.4497, -0.2795,
          1.9607,  0.1777,  0.2684,  1.2090],
        [ 0.0000, -0.0000, -0.0000, -0.0000,  0.0000, -0.0000,  0.0000, -0.0000,
          0.0000, -0.0000, -0.0000,  0.0000, -0.0000, -0.0000,  0.0000,  0.0000,
      

## Testing RNN model

Creating a bunch of parameters for deterministic testing.

In [332]:
# Reference model built with the existing `get_language_model` (defined outside this notebook).
tst_model = get_language_model(500, 20, 100, 2, 0, bias=True)
# Clone every entry of its state dict so both the old and the new model can start from the same weights.
save_parameters = {}
for n,p in tst_model.state_dict().items(): save_parameters[n] = p.clone()
# Random integer input of size (10,5) and 50 random integer targets, all drawn from [0,500).
tst_input = torch.randint(0, 500, (10,5)).long()
tst_output = torch.randint(0, 500, (50,)).long()

### Old RNN model

In [358]:
tst_model = get_language_model(500, 20, 100, 2, 0, bias=True, dropout=0.4, dropoute=0.1, dropouth=0.2, 
                               dropouti=0.6, wdrop=0.5)
state_dict = OrderedDict()
for n,p in save_parameters.items(): state_dict[n] = p.clone()
tst_model.load_state_dict(state_dict)
opt = optim.SGD(tst_model.parameters(), lr=10)

In [359]:
torch.manual_seed(7)

<torch._C.Generator at 0x267b3a7b390>

In [360]:
x = tst_input.clone()
z = tst_model(x)
z

(tensor([[ 0.0005, -0.0192, -0.0097,  ...,  0.0120, -0.0285,  0.0068],
         [-0.0041, -0.0030, -0.0108,  ...,  0.0085, -0.0356,  0.0088],
         [-0.0015, -0.0161, -0.0064,  ...,  0.0086, -0.0198, -0.0061],
         ...,
         [ 0.0008, -0.0327, -0.0184,  ...,  0.0314, -0.0460, -0.0368],
         [-0.0005, -0.0168, -0.0430,  ...,  0.0091, -0.0292, -0.0235],
         [ 0.0047, -0.0315, -0.0145,  ...,  0.0085, -0.0557, -0.0512]],
        grad_fn=<ViewBackward>),
 [tensor([[[-0.0070,  0.0093, -0.0127,  ..., -0.0127,  0.0019, -0.0122],
           [-0.0068,  0.0088, -0.0119,  ..., -0.0070,  0.0131, -0.0127],
           [-0.0110,  0.0003, -0.0091,  ..., -0.0199,  0.0036, -0.0174],
           [-0.0074,  0.0093, -0.0167,  ..., -0.0118,  0.0063, -0.0096],
           [-0.0122,  0.0024, -0.0095,  ..., -0.0059,  0.0143, -0.0146]],
  
          [[-0.0131,  0.0122, -0.0341,  ..., -0.0250,  0.0075, -0.0314],
           [-0.0305,  0.0160, -0.0145,  ..., -0.0261,  0.0096, -0.0248],
           

In [361]:
y = tst_output.clone()
loss = F.nll_loss(z[0], y)
loss.backward()
opt.step()

In [362]:
tst_model[0].rnns[0].module._parameters['weight_hh_l0_raw']

Parameter containing:
tensor([[-0.0771, -0.0582, -0.0595,  ..., -0.0704,  0.0910, -0.0049],
        [ 0.0232,  0.0282,  0.0036,  ...,  0.0123,  0.0911, -0.0684],
        [ 0.0752, -0.0123,  0.0452,  ..., -0.0631, -0.0344,  0.0208],
        ...,
        [-0.0841, -0.0302,  0.0260,  ...,  0.0346, -0.0877, -0.0890],
        [-0.0016,  0.0072,  0.0175,  ...,  0.0442,  0.0095,  0.0981],
        [ 0.0968,  0.0940, -0.0924,  ..., -0.0098,  0.0201, -0.0284]],
       requires_grad=True)

### New RNN model

In [363]:
class RNNDropout(nn.Module):
    "Dropout with probability `p` whose mask has size 1 along the first dimension, so it is broadcast over it."

    def __init__(self, p=0.5):
        super().__init__()
        self.p = p

    def forward(self, x):
        "Returns `x` untouched outside of training or when `p` is zero, else `x` multiplied by a fresh mask."
        if self.training and self.p:
            mask = dropout_mask(x.data, (1, x.size(1), x.size(2)), self.p)
            return mask * x
        return x

In [364]:
def repackage_var1(h):
    "Detaches h from its history. A nested list/tuple of tensors is returned as nested tuples of detached tensors."
    if isinstance(h, torch.Tensor): return h.detach()
    # Recurse into this function itself rather than relying on a similarly named helper from elsewhere.
    return tuple(repackage_var1(v) for v in h)

In [365]:
class RNNCore(nn.Module):
    "AWD-LSTM/QRNN inspired by https://arxiv.org/abs/1708.02182"

    initrange=0.1

    def __init__(self, vocab_sz, emb_sz, n_hid, n_layers, pad_token, bidir=False,
                 hidden_p=0.2, input_p=0.6, embed_p=0.1, weight_p=0.5, qrnn=False):
        "Builds the embedding, `n_layers` recurrent layers and the dropout modules applied around them."
        super().__init__()
        #`bs` is the batch size the hidden state is sized for; `forward` resets it when the batch size changes.
        self.bs,self.qrnn,self.ndir = 1, qrnn,(2 if bidir else 1)
        self.emb_sz,self.n_hid,self.n_layers = emb_sz,n_hid,n_layers
        self.encoder = nn.Embedding(vocab_sz, emb_sz, padding_idx=pad_token)
        self.dp_encoder = EmbeddingDropout1(self.encoder, embed_p)
        #The first layer takes `emb_sz` inputs and the last one outputs `emb_sz`; the others use `n_hid`.
        if self.qrnn:
            #Using QRNN requires cupy: https://github.com/cupy/cupy
            from .torchqrnn.qrnn import QRNNLayer
            self.rnns = [QRNNLayer(emb_sz if l == 0 else n_hid, (n_hid if l != n_layers - 1 else emb_sz)//self.ndir,
                save_prev_x=True, zoneout=0, window=2 if l == 0 else 1, output_gate=True) for l in range(n_layers)]
            if weight_p != 0.:
                for rnn in self.rnns:
                    rnn.linear = WeightDropout(rnn.linear, weight_p, layer_names=['weight'])
        else:
            self.rnns = [nn.LSTM(emb_sz if l == 0 else n_hid, (n_hid if l != n_layers - 1 else emb_sz)//self.ndir,
                1, bidirectional=bidir) for l in range(n_layers)]
            if weight_p != 0.: self.rnns = [WeightDropout(rnn, weight_p) for rnn in self.rnns]
        self.rnns = torch.nn.ModuleList(self.rnns)
        self.encoder.weight.data.uniform_(-self.initrange, self.initrange)
        self.dropouti = RNNDropout(input_p)
        self.dropouths = nn.ModuleList([RNNDropout(hidden_p) for l in range(n_layers)])

    def forward(self, input):
        """Runs `input`, a 2-D tensor whose sizes are read as (sequence length, batch size), through every layer.

        Returns two lists with one entry per layer: the raw layer outputs, and the same outputs after
        hidden dropout (which is not applied to the last layer).
        """
        sl,bs = input.size()
        #Also reset when no hidden state exists yet: `self.bs` starts at 1, so a first batch of
        #size 1 would otherwise skip the reset and fail on the missing `self.hidden`.
        if bs!=self.bs or not hasattr(self, 'hidden'):
            self.bs=bs
            self.reset()
        raw_output = self.dropouti(self.dp_encoder(input))
        new_hidden,raw_outputs,outputs = [],[],[]
        for l, (rnn,drop) in enumerate(zip(self.rnns, self.dropouths)):
            with warnings.catch_warnings():
                #To avoid the warning that comes because the weights aren't flattened.
                warnings.simplefilter("ignore")
                raw_output, new_h = rnn(raw_output, self.hidden[l])
            new_hidden.append(new_h)
            raw_outputs.append(raw_output)
            if l != self.n_layers - 1: raw_output = drop(raw_output)
            outputs.append(raw_output)
        #Keep the values of the new hidden state but cut them from the graph of this batch.
        self.hidden = repackage_var1(new_hidden)
        return raw_outputs, outputs

    def one_hidden(self, l):
        "Returns a zeroed hidden state of size (ndir, bs, nh) for layer `l`, created from `self.weights`."
        nh = (self.n_hid if l != self.n_layers - 1 else self.emb_sz)//self.ndir
        return self.weights.new(self.ndir, self.bs, nh).zero_()

    def reset(self):
        "Resets the layers that define `reset` and replaces the hidden state of every layer with zeros."
        [r.reset() for r in self.rnns if hasattr(r, 'reset')]
        self.weights = next(self.parameters()).data
        if self.qrnn: self.hidden = [self.one_hidden(l) for l in range(self.n_layers)]
        else: self.hidden = [(self.one_hidden(l), self.one_hidden(l)) for l in range(self.n_layers)]

In [366]:
class LinearDecoder1(nn.Module):
    "Decoder to go on top of a RNNCore module: dropout on the last layer's output followed by a linear layer."

    initrange=0.1

    def __init__(self, n_out, n_hid, output_p, tie_encoder=None, bias=True):
        super().__init__()
        self.decoder = nn.Linear(n_hid, n_out, bias=bias)
        self.decoder.weight.data.uniform_(-self.initrange, self.initrange)
        if bias: self.decoder.bias.data.zero_()
        # When an encoder is given, the decoder reuses its weight matrix.
        if tie_encoder: self.decoder.weight = tie_encoder.weight
        self.dropout = RNNDropout(output_p)

    def forward(self, input):
        "Decodes the last entry of `outputs`, flattened over its first two dimensions; passes both lists through."
        raw_outputs, outputs = input
        last = self.dropout(outputs[-1])
        n_rows = last.size(0) * last.size(1)
        decoded = self.decoder(last.view(n_rows, last.size(2)))
        return decoded, raw_outputs, outputs

In [367]:
class SequentialRNN1(nn.Sequential):
    "A sequential module that forwards `reset` to every child defining it."

    def reset(self):
        "Calls `reset` on each direct child that has such an attribute."
        for child in filter(lambda c: hasattr(c, 'reset'), self.children()):
            child.reset()

In [368]:
def get_language_model1(vocab_sz, emb_sz, n_hid, n_layers, pad_token, tie_weights=True, qrnn=False, bias=True,
                 output_p=0.4, hidden_p=0.2, input_p=0.6, embed_p=0.1, weight_p=0.5):
    "Creates a full AWD-LSTM: a `RNNCore` followed by a `LinearDecoder1`, wrapped in a `SequentialRNN1`."
    core = RNNCore(vocab_sz, emb_sz, n_hid=n_hid, n_layers=n_layers, pad_token=pad_token, qrnn=qrnn,
                   hidden_p=hidden_p, input_p=input_p, embed_p=embed_p, weight_p=weight_p)
    # With `tie_weights`, the decoder is handed the core's embedding so it can share its weights.
    tied = core.encoder if tie_weights else None
    decoder = LinearDecoder1(vocab_sz, emb_sz, output_p, tie_encoder=tied, bias=bias)
    return SequentialRNN1(core, decoder)

The new model has weights that are organized a bit differently.

In [369]:
# Re-key the saved weights of the old model to the names used by the new one.
save_parameters1 = {}
for n,p in save_parameters.items():
    if n == '0.encoder_with_dropout.embed.weight':
        # The embedding held by the dropout wrapper is now stored under `dp_encoder.emb`.
        save_parameters1['0.dp_encoder.emb.weight'] = p.clone()
    elif 'weight_hh_l0' in n:
        # A hidden-to-hidden weight feeds two entries: the same name without its last four characters...
        save_parameters1[n[:-4]] = p.clone()
        # ...and the same name without its second-to-last path component.
        splits = n.split('.')
        splits.remove(splits[-2])
        n1 = '.'.join(splits)
        save_parameters1[n1] = p.clone()
    else:
        # Every other entry keeps its name.
        save_parameters1[n] = p.clone()

In [370]:
tst_model = get_language_model1(500, 20, 100, 2, 0)
tst_model.load_state_dict(save_parameters1)
opt = optim.SGD(tst_model.parameters(), lr=10)

In [371]:
torch.manual_seed(7)

<torch._C.Generator at 0x267b3a7b390>

In [372]:
x = tst_input.clone()
z = tst_model(x)
z

(tensor([[ 0.0005, -0.0192, -0.0097,  ...,  0.0120, -0.0285,  0.0068],
         [-0.0041, -0.0030, -0.0108,  ...,  0.0085, -0.0356,  0.0088],
         [-0.0015, -0.0161, -0.0064,  ...,  0.0086, -0.0198, -0.0061],
         ...,
         [ 0.0008, -0.0327, -0.0184,  ...,  0.0314, -0.0460, -0.0368],
         [-0.0005, -0.0168, -0.0430,  ...,  0.0091, -0.0292, -0.0235],
         [ 0.0047, -0.0315, -0.0145,  ...,  0.0085, -0.0557, -0.0512]],
        grad_fn=<ThAddmmBackward>),
 [tensor([[[-0.0070,  0.0093, -0.0127,  ..., -0.0127,  0.0019, -0.0122],
           [-0.0068,  0.0088, -0.0119,  ..., -0.0070,  0.0131, -0.0127],
           [-0.0110,  0.0003, -0.0091,  ..., -0.0199,  0.0036, -0.0174],
           [-0.0074,  0.0093, -0.0167,  ..., -0.0118,  0.0063, -0.0096],
           [-0.0122,  0.0024, -0.0095,  ..., -0.0059,  0.0143, -0.0146]],
  
          [[-0.0131,  0.0122, -0.0341,  ..., -0.0250,  0.0075, -0.0314],
           [-0.0305,  0.0160, -0.0145,  ..., -0.0261,  0.0096, -0.0248],
        

In [373]:
y = tst_output.clone()
loss = F.nll_loss(z[0], y)
loss.backward()
opt.step()

In [374]:
tst_model[0].rnns[0]._parameters['weight_hh_l0_raw']

Parameter containing:
tensor([[-0.0771, -0.0582, -0.0595,  ..., -0.0704,  0.0910, -0.0049],
        [ 0.0232,  0.0282,  0.0036,  ...,  0.0123,  0.0911, -0.0684],
        [ 0.0752, -0.0123,  0.0452,  ..., -0.0631, -0.0344,  0.0208],
        ...,
        [-0.0841, -0.0302,  0.0260,  ...,  0.0346, -0.0877, -0.0890],
        [-0.0016,  0.0072,  0.0175,  ...,  0.0442,  0.0095,  0.0981],
        [ 0.0968,  0.0940, -0.0924,  ..., -0.0098,  0.0201, -0.0284]],
       requires_grad=True)

## Regularization

We'll keep the same parameters as before.

### Old reg

In [274]:
tst_model = get_language_model(500, 20, 100, 2, 0, bias=True, dropout=0.4, dropoute=0.1, dropouth=0.2, 
                               dropouti=0.6, wdrop=0.5)
state_dict = OrderedDict()
for n,p in save_parameters.items(): state_dict[n] = p.clone()
tst_model.load_state_dict(state_dict)
opt = optim.SGD(tst_model.parameters(), lr=10, weight_decay=1)

In [275]:
torch.manual_seed(7)

<torch._C.Generator at 0x267b3a7b390>

In [276]:
x = tst_input.clone()
z = tst_model(x)
y = tst_output.clone()
loss = F.nll_loss(z[0], y)

In [277]:
loss = seq2seq_reg(z[0], z[1:], loss, 2, 1)
loss.item()

0.01827934943139553

In [278]:
loss.backward()
nn.utils.clip_grad_norm_(tst_model.parameters(), 0.1)
opt.step()

In [279]:
tst_model[0].rnns[0].module._parameters['weight_hh_l0_raw']

Parameter containing:
tensor([[-0.8528,  0.3268, -0.1945,  ..., -0.7425, -0.5671, -0.5097],
        [ 0.0168, -0.8864,  0.7456,  ...,  0.1414, -0.6313, -0.8001],
        [-0.4075,  0.5643,  0.0247,  ...,  0.3236, -0.4726, -0.4496],
        ...,
        [-0.6785,  0.6084, -0.0387,  ...,  0.4916, -0.6366,  0.7781],
        [ 0.0696,  0.3158,  0.6227,  ...,  0.6792, -0.3027,  0.7779],
        [-0.8144,  0.5334,  0.0538,  ...,  0.3347, -0.5188,  0.1718]],
       requires_grad=True)

### New reg

In [284]:
from dataclasses import dataclass

In [289]:
@dataclass
class RNNTrainer(Callback):
    "Callback that keeps the extra outputs of the RNN, adds the AR and TAR penalties to the loss and clips gradients."
    model:nn.Module
    bptt:int
    clip:float=None
    alpha:float=0.
    beta:float=0.

    def on_loss_begin(self, last_output, **kwargs):
        "Stores the second and third elements of `last_output` and returns only the first (the true output)."
        true_output, self.raw_out, self.out = last_output[0], last_output[1], last_output[2]
        return true_output

    def on_backward_begin(self, last_loss, last_input, last_output, **kwargs):
        "Adds the AR (`alpha`) and TAR (`beta`) terms to `last_loss` in place and returns it."
        #NOTE: rescaling the learning rate by last_input.size(0) / self.bptt is not applied here.
        if self.alpha != 0.:
            #AR: penalty on the mean squared value of the last stored output.
            ar_term = self.alpha * self.out[-1].pow(2).mean()
            last_loss += ar_term.sum()
        if self.beta != 0.:
            h = self.raw_out[-1]
            if len(h)>1:
                #TAR: penalty on the mean squared difference between consecutive slices of the last raw output.
                tar_term = self.beta * (h[1:] - h[:-1]).pow(2).mean()
                last_loss += tar_term.sum()
        return last_loss

    def on_backward_end(self, **kwargs):
        "Clips the gradient norm of the model's parameters to `clip` when `clip` is set."
        if not self.clip: return
        nn.utils.clip_grad_norm_(self.model.parameters(), self.clip)

In [290]:
# Re-key the saved weights of the old model to the names used by the new one.
save_parameters1 = {}
for n,p in save_parameters.items(): 
    if 'weight_hh_l0' not in n and n!='0.encoder_with_dropout.embed.weight':  save_parameters1[n] = p.clone()
    # EmbeddingDropout1 stores its embedding as `emb`, so the key must be `dp_encoder.emb.weight`
    # (`dp_encoder.embed.weight` does not exist in the new model and breaks `load_state_dict`).
    elif n=='0.encoder_with_dropout.embed.weight': save_parameters1['0.dp_encoder.emb.weight'] = p.clone()
    else: 
        # A hidden-to-hidden weight feeds two entries: the same name without its last four characters...
        save_parameters1[n[:-4]] = p.clone()
        # ...and the same name without its second-to-last path component.
        splits = n.split('.')
        splits.remove(splits[-2])
        n1 = '.'.join(splits)
        save_parameters1[n1] = p.clone()

In [291]:
tst_model = get_language_model1(500, 20, 100, 2, 0)
tst_model.load_state_dict(save_parameters1)
opt = optim.SGD(tst_model.parameters(), lr=10, weight_decay=1)

In [292]:
torch.manual_seed(7)

<torch._C.Generator at 0x267b3a7b390>

In [293]:
cb = RNNTrainer(tst_model, 10, 0.1, 2, 1)

In [294]:
x = tst_input.clone()
z = tst_model(x)
y = tst_output.clone()
z = cb.on_loss_begin(z)
loss = F.nll_loss(z, y)
loss = cb.on_backward_begin(loss, x, z)
loss.item()

0.01827934943139553

In [295]:
loss.backward()
cb.on_backward_end()
opt.step()

In [297]:
tst_model[0].rnns[0]._parameters['weight_hh_l0_raw']

Parameter containing:
tensor([[-0.8528,  0.3268, -0.1945,  ..., -0.7425, -0.5671, -0.5097],
        [ 0.0168, -0.8864,  0.7456,  ...,  0.1414, -0.6313, -0.8001],
        [-0.4075,  0.5643,  0.0247,  ...,  0.3236, -0.4726, -0.4496],
        ...,
        [-0.6785,  0.6084, -0.0387,  ...,  0.4916, -0.6366,  0.7781],
        [ 0.0696,  0.3158,  0.6227,  ...,  0.6792, -0.3027,  0.7779],
        [-0.8144,  0.5334,  0.0538,  ...,  0.3347, -0.5188,  0.1718]],
       requires_grad=True)