In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai.io import *
from fastai.conv_learner import *
from fastai.column_data import *

## Setup

We're going to download the collected works of Nietzsche to use as our data for this class.

In [2]:
from torchtext import vocab, data
from fastai.nlp import LanguageModelData
#from fastai.lm_rnn import *
PATH='data/nietzsche/'
TRN_PATH = 'trn/'
VAL_PATH = 'val/'
TRN = f'{PATH}{TRN_PATH}'
VAL = f'{PATH}{VAL_PATH}'
%ls {PATH}

nietzsche.txt  [0m[01;34mtrn[0m/  [01;34mval[0m/


In [3]:
get_data("https://s3.amazonaws.com/text-datasets/nietzsche.txt", f'{PATH}nietzsche.txt')
text = open(f'{PATH}nietzsche.txt').read()
print('corpus length:', len(text))

corpus length: 600893


Make the first 80% of the text to be the training set, and the later 20% to be the validation set.
No need to be a random sample, as it is more likely that a test set will come from a separate corpus.

sed -n [1,7947p] nietzsche.txt > trn/trn.txt
sed -n [7950,9935p] nietzsche.txt > val/val.txt

In [4]:
#%ls {TRN} {VAL}

In [5]:
list('abc')    # We will use the function list as it naturally tokenized into characters.

['a', 'b', 'c']

In [6]:
TEXT = data.Field(lower=True, tokenize=list)
bs=64; bptt=8; n_fac=42; n_hidden=256

FILES = dict(train=TRN_PATH, validation=VAL_PATH, test=VAL_PATH)
md = LanguageModelData.from_text_files(PATH, TEXT, **FILES, bs=bs, bptt=bptt, min_freq=3)

print("minibatches:", len(md.trn_dl), "tokens:", md.nt, len(md.trn_ds), "total tokens:", len(md.trn_ds[0].text))

minibatches: 942 tokens: 55 1 total tokens: 482908


In [7]:
TEXT.vocab.itos; TEXT.vocab.stoi

defaultdict(<function torchtext.vocab._default_unk_index>,
            {' ': 2,
             '!': 34,
             '"': 27,
             "'": 38,
             '(': 36,
             ')': 37,
             ',': 20,
             '-': 25,
             '.': 26,
             '0': 53,
             '1': 40,
             '2': 41,
             '3': 44,
             '4': 47,
             '5': 48,
             '6': 49,
             '7': 51,
             '8': 50,
             '9': 52,
             ':': 31,
             ';': 30,
             '<eos>': 0,
             '<pad>': 1,
             '<unk>': 0,
             '=': 42,
             '?': 35,
             '[': 45,
             ']': 46,
             '_': 43,
             'a': 6,
             'b': 23,
             'c': 14,
             'd': 13,
             'e': 3,
             'f': 16,
             'g': 19,
             'h': 11,
             'i': 5,
             'j': 33,
             'k': 28,
             'l': 12,
             'm': 17,
            

In [8]:
chars = sorted(list(set(text)))
vocab_size = len(chars)+1
print('total chars:', vocab_size)

total chars: 85


Sometimes it's useful to have a zero value in the dataset, e.g. for padding

In [9]:
chars.insert(0, "\0")

''.join(chars[1:-6])

'\n !"\'(),-.0123456789:;=?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxy'

Map from chars to indices and back again

In [10]:
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

*idx* will be the data we use from now own - it simply converts all the characters to their index (based on the mapping above)

In [11]:
idx = [char_indices[c] for c in text]

idx[:10]

[40, 42, 29, 30, 25, 27, 29, 1, 1, 1]

In [12]:
''.join(indices_char[i] for i in idx[:70])  #To confirm

'PREFACE\n\n\nSUPPOSING that Truth is a woman--what then? Is there not gro'

## Three char model

### Create inputs

Create a list of every 4th character, starting at the 0th, 1st, 2nd, then 3rd characters

In [13]:
cs=3
c1_dat = [idx[i]   for i in range(0, len(idx)-1-cs, cs)]
c2_dat = [idx[i+1] for i in range(0, len(idx)-1-cs, cs)]
c3_dat = [idx[i+2] for i in range(0, len(idx)-1-cs, cs)]
c4_dat = [idx[i+3] for i in range(0, len(idx)-1-cs, cs)]

Our inputs

In [14]:
x1 = np.stack(c1_dat[:-2])
x2 = np.stack(c2_dat[:-2])
x3 = np.stack(c3_dat[:-2])

Our output

In [15]:
y = np.stack(c4_dat[:-2])

The first 4 inputs and outputs

In [16]:
x1[:4], x2[:4], x3[:4]

(array([40, 30, 29,  1]), array([42, 25,  1, 43]), array([29, 27,  1, 45]))

In [17]:
y[:4]

array([30, 29,  1, 40])

In [18]:
x1.shape, y.shape

((200295,), (200295,))

### Create and train model

Pick a size for our hidden state

In [19]:
n_hidden = 256   # Activations

The number of latent factors to create (i.e. the size of the embedding matrix)

In [20]:
n_fac = 42

In [21]:
class Char3Model(nn.Module):
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)

        # The 'green arrow' from our diagram - the layer operation from input to hidden
        self.l_in = nn.Linear(n_fac, n_hidden)

        # The 'orange arrow' from our diagram - the layer operation from hidden to hidden
        self.l_hidden = nn.Linear(n_hidden, n_hidden)   #THis is a squared Matrix
        
        # The 'blue arrow' from our diagram - the layer operation from hidden to output
        self.l_out = nn.Linear(n_hidden, vocab_size)
        
    def forward(self, c1, c2, c3):
        in1 = F.relu(self.l_in(self.e(c1)))
        in2 = F.relu(self.l_in(self.e(c2)))
        in3 = F.relu(self.l_in(self.e(c3)))
        
        h = V(torch.zeros(in1.size()).cuda())
        h = F.tanh(self.l_hidden(h+in1))
        h = F.tanh(self.l_hidden(h+in2))
        h = F.tanh(self.l_hidden(h+in3))
        
        return F.log_softmax(self.l_out(h))

In [22]:
md = ColumnarModelData.from_arrays('.', [-1], np.stack([x1,x2,x3], axis=1), y, bs=512)

In [23]:
m = Char3Model(vocab_size, n_fac).cuda()

In [24]:
it = iter(md.trn_dl) #grab the iterator for the training set
*xs,yt = next(it)   #grab a minibatch
t = m(*V(xs))  #invoke the model as a function passing the tensor as Variable

In [25]:
xs[0].size()

torch.Size([512])

In [26]:
t  #for each one the probabilities of the characters (out of the 85 in the vocabulary)

Variable containing:
-4.9729 -4.2282 -4.4192  ...  -4.7960 -4.4218 -4.6513
-4.8627 -4.3578 -4.1951  ...  -4.7975 -4.4635 -4.3725
-4.5687 -4.2328 -4.6828  ...  -4.7378 -4.4484 -4.5939
          ...             ⋱             ...          
-4.9320 -4.1820 -4.2490  ...  -4.9470 -4.3115 -4.4838
-4.6505 -4.5073 -4.4544  ...  -4.6559 -4.6166 -4.5106
-4.5403 -4.4430 -4.3921  ...  -4.5420 -4.6572 -4.4357
[torch.cuda.FloatTensor of size 512x85 (GPU 0)]

In [27]:
opt = optim.Adam(m.parameters(), 1e-2)    #pytorch optimizer

In [28]:
fit(m, md, 1, opt, F.nll_loss)

A Jupyter Widget

epoch:   0, train_loss: 2.129707, val_loss: 5.804311        



In [29]:
set_lrs(opt, 0.001)

In [30]:
fit(m, md, 1, opt, F.nll_loss)

A Jupyter Widget

epoch:   0, train_loss: 1.865353, val_loss: 5.064150         



### Test model

In [31]:
def get_next(inp):
    idxs = T(np.array([char_indices[c] for c in inp]))
    p = m(*VV(idxs))
    i = np.argmax(to_np(p))
    return chars[i]

In [32]:
get_next('y. ')  #pass it 3 characters

'T'

In [33]:
get_next('ppl')

'e'

In [34]:
get_next(' th')

'e'

In [35]:
get_next('and')

' '

## Our first RNN!

### Create inputs

This is the size of our unrolled RNN.

In [36]:
cs=8    # 8 characters

For each of 0 through 7, create a list of every 8th character with that starting point. These will be the 8 inputs to out model.

In [37]:
c_in_dat = [[idx[i+j] for i in range(cs)] for j in range(len(idx)-cs-1)]

Then create a list of the next character in each of these series. This will be the labels for our model.

In [38]:
c_out_dat = [idx[j+cs] for j in range(len(idx)-cs-1)]

In [39]:
xs = np.stack(c_in_dat, axis=0)

In [40]:
xs.shape

(600884, 8)

In [41]:
y = np.stack(c_out_dat)

So each column below is one series of 8 characters from the text.

In [42]:
xs[:cs,:cs]

array([[40, 42, 29, 30, 25, 27, 29,  1],
       [42, 29, 30, 25, 27, 29,  1,  1],
       [29, 30, 25, 27, 29,  1,  1,  1],
       [30, 25, 27, 29,  1,  1,  1, 43],
       [25, 27, 29,  1,  1,  1, 43, 45],
       [27, 29,  1,  1,  1, 43, 45, 40],
       [29,  1,  1,  1, 43, 45, 40, 40],
       [ 1,  1,  1, 43, 45, 40, 40, 39]])

...and this is the next character after each sequence.

In [43]:
y[:cs]

array([ 1,  1, 43, 45, 40, 40, 39, 43])

### Create and train model

In [44]:
val_idx = get_cv_idxs(len(idx)-cs-1)

In [45]:
md = ColumnarModelData.from_arrays('.', val_idx, xs, y, bs=512)

In [46]:
class CharLoopModel(nn.Module):
    # This is an RNN!
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.l_in = nn.Linear(n_fac, n_hidden)
        self.l_hidden = nn.Linear(n_hidden, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        
    def forward(self, *cs):
        bs = cs[0].size(0)
        h = V(torch.zeros(bs, n_hidden).cuda())
        for c in cs:
            inp = F.relu(self.l_in(self.e(c)))
            h = F.tanh(self.l_hidden(h+inp))
        
        return F.log_softmax(self.l_out(h), dim=-1)

In [47]:
m = CharLoopModel(vocab_size, n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-2)

In [48]:
fit(m, md, 1, opt, F.nll_loss)

A Jupyter Widget

epoch:   0, train_loss: 1.983100, val_loss: 1.974822        



In [49]:
set_lrs(opt, 0.001)

In [50]:
fit(m, md, 1, opt, F.nll_loss)

A Jupyter Widget

epoch:   0, train_loss: 1.683762, val_loss: 1.686666        



Adding things together may loose information... concatenation is better in those cases...

In [51]:
class CharLoopConcatModel(nn.Module):
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.l_in = nn.Linear(n_fac+n_hidden, n_hidden)
        self.l_hidden = nn.Linear(n_hidden, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        
    def forward(self, *cs):
        bs = cs[0].size(0)
        h = V(torch.zeros(bs, n_hidden).cuda())
        for c in cs:
            inp = torch.cat((h, self.e(c)), 1)   #now concatenate instead of adding
            inp = F.relu(self.l_in(inp))
            h = F.tanh(self.l_hidden(inp))
        
        return F.log_softmax(self.l_out(h), dim=-1)

In [52]:
m = CharLoopConcatModel(vocab_size, n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [53]:
it = iter(md.trn_dl)
*xs,yt = next(it)
t = m(*V(xs))

In [54]:
fit(m, md, 1, opt, F.nll_loss)

A Jupyter Widget

epoch:   0, train_loss: 1.837089, val_loss: 1.823028        



In [55]:
set_lrs(opt, 1e-4)

In [56]:
fit(m, md, 1, opt, F.nll_loss)

A Jupyter Widget

epoch:   0, train_loss: 1.733860, val_loss: 1.730605        



### Test model

In [57]:
def get_next(inp):
    idxs = T(np.array([char_indices[c] for c in inp]))
    p = m(*VV(idxs))
    i = np.argmax(to_np(p))
    return chars[i]

In [58]:
get_next('for thos')

'e'

In [59]:
get_next('part of ')

't'

In [60]:
get_next('queens a')

'n'

## 4. RNN with pytorch

In [61]:
class CharRnn(nn.Module):
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.RNN(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        
    def forward(self, *cs):
        bs = cs[0].size(0)
        h = V(torch.zeros(1, bs, n_hidden))
        inp = self.e(torch.stack(cs))
        outp,h = self.rnn(inp, h)    #Pytorch returns all the hidden states in h
        
        return F.log_softmax(self.l_out(outp[-1]), dim=-1)   #outp[-1] because we only care for the last one

In [62]:
m = CharRnn(vocab_size, n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [63]:
it = iter(md.trn_dl)
*xs,yt = next(it)

In [64]:
t = m.e(V(torch.stack(xs)))
t.size()

torch.Size([8, 512, 42])

In [65]:
ht = V(torch.zeros(1, 512,n_hidden))
outp, hn = m.rnn(t, ht)
outp.size(), hn.size()

(torch.Size([8, 512, 256]), torch.Size([1, 512, 256]))

In [66]:
t = m(*V(xs)); t.size()

torch.Size([512, 85])

In [67]:
fit(m, md, 4, opt, F.nll_loss)

A Jupyter Widget

epoch:   0, train_loss: 1.862778, val_loss: 1.843346        
epoch:   1, train_loss: 1.676988, val_loss: 1.672065        
epoch:   2, train_loss: 1.588341, val_loss: 1.597062        
epoch:   3, train_loss: 1.540265, val_loss: 1.546221        



In [68]:
set_lrs(opt, 1e-4)

In [69]:
fit(m, md, 2, opt, F.nll_loss)

A Jupyter Widget

epoch:   0, train_loss: 1.470753, val_loss: 1.509117        
epoch:   1, train_loss: 1.455564, val_loss: 1.504176        



### Test model

In [70]:
def get_next(inp):
    idxs = T(np.array([char_indices[c] for c in inp]))
    p = m(*VV(idxs))
    i = np.argmax(to_np(p))
    return chars[i]

In [71]:
get_next('for thos')

'e'

In [72]:
def get_next_n(inp, n):
    res = inp
    for i in range(n):
        c = get_next(inp)
        res += c
        inp = inp[1:]+c
    return res

In [73]:
get_next_n('for thos', 40)

'for those of the same the same the same the same'

## Multi-output model

### Setup

Let's take non-overlapping sets of characters this time

In [74]:
c_in_dat = [[idx[i+j] for i in range(cs)] for j in range(0, len(idx)-cs-1, cs)]

Then create the exact same thing, offset by 1, as our labels

In [75]:
c_out_dat = [[idx[i+j] for i in range(cs)] for j in range(1, len(idx)-cs, cs)]

In [76]:
xs = np.stack(c_in_dat)
xs.shape

(75111, 8)

In [77]:
ys = np.stack(c_out_dat)
ys.shape

(75111, 8)

In [78]:
xs[:cs,:cs]

array([[40, 42, 29, 30, 25, 27, 29,  1],
       [ 1,  1, 43, 45, 40, 40, 39, 43],
       [33, 38, 31,  2, 73, 61, 54, 73],
       [ 2, 44, 71, 74, 73, 61,  2, 62],
       [72,  2, 54,  2, 76, 68, 66, 54],
       [67,  9,  9, 76, 61, 54, 73,  2],
       [73, 61, 58, 67, 24,  2, 33, 72],
       [ 2, 73, 61, 58, 71, 58,  2, 67]])

In [79]:
ys[:cs,:cs]

array([[42, 29, 30, 25, 27, 29,  1,  1],
       [ 1, 43, 45, 40, 40, 39, 43, 33],
       [38, 31,  2, 73, 61, 54, 73,  2],
       [44, 71, 74, 73, 61,  2, 62, 72],
       [ 2, 54,  2, 76, 68, 66, 54, 67],
       [ 9,  9, 76, 61, 54, 73,  2, 73],
       [61, 58, 67, 24,  2, 33, 72,  2],
       [73, 61, 58, 71, 58,  2, 67, 68]])

### Create and train model

In [80]:
val_idx = get_cv_idxs(len(xs)-cs-1)

In [81]:
md = ColumnarModelData.from_arrays('.', val_idx, xs, ys, bs=512)

In [82]:
class CharSeqRnn(nn.Module):
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.RNN(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        
    def forward(self, *cs):
        bs = cs[0].size(0)
        h = V(torch.zeros(1, bs, n_hidden))
        inp = self.e(torch.stack(cs))
        outp,h = self.rnn(inp, h)
        return F.log_softmax(self.l_out(outp), dim=-1)

In [83]:
m = CharSeqRnn(vocab_size, n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [84]:
it = iter(md.trn_dl)
*xst,yt = next(it)

In [85]:
def nll_loss_seq(inp, targ):
    sl,bs,nh = inp.size()
    targ = targ.transpose(0,1).contiguous().view(-1)  #contiguos to avoid a pytorch error
    return F.nll_loss(inp.view(-1,nh), targ)

In [86]:
fit(m, md, 4, opt, nll_loss_seq)

A Jupyter Widget

epoch:   0, train_loss: 2.588916, val_loss: 2.403093        
epoch:   1, train_loss: 2.285038, val_loss: 2.194815        
epoch:   2, train_loss: 2.134854, val_loss: 2.083499        
epoch:   3, train_loss: 2.043043, val_loss: 2.011978        



In [87]:
set_lrs(opt, 1e-4)

In [88]:
fit(m, md, 1, opt, nll_loss_seq)

A Jupyter Widget

epoch:   0, train_loss: 1.990274, val_loss: 1.994731        



### Identity init!

In [89]:
m = CharSeqRnn(vocab_size, n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-2)

In [90]:
m.rnn.weight_hh_l0.data.copy_(torch.eye(n_hidden))   #eye is the identity matrix


    1     0     0  ...      0     0     0
    0     1     0  ...      0     0     0
    0     0     1  ...      0     0     0
       ...          ⋱          ...       
    0     0     0  ...      1     0     0
    0     0     0  ...      0     1     0
    0     0     0  ...      0     0     1
[torch.cuda.FloatTensor of size 256x256 (GPU 0)]

In [91]:
fit(m, md, 4, opt, nll_loss_seq)

A Jupyter Widget

epoch:   0, train_loss: 2.388373, val_loss: 2.205485       
epoch:   1, train_loss: 2.116079, val_loss: 2.048377        
epoch:   2, train_loss: 2.003238, val_loss: 1.980319        
epoch:   3, train_loss: 1.949075, val_loss: 1.931356       



In [92]:
set_lrs(opt, 1e-3)

In [93]:
fit(m, md, 4, opt, nll_loss_seq)

A Jupyter Widget

epoch:   0, train_loss: 1.853807, val_loss: 1.867541        
epoch:   1, train_loss: 1.840845, val_loss: 1.860377        
epoch:   2, train_loss: 1.836762, val_loss: 1.853967        
epoch:   3, train_loss: 1.826497, val_loss: 1.848431        



## Stateful model

## 6.1 Setup
JH: Lets just put the data in the format that torchtext expects it to be.
The first 80% of the rows is training, the last 20% is validation test.
In practice, it is better to have a more reealistic validation of the model by having a selected piece( eg the last 20%) vs a randon sampling.

In [94]:
from torchtext import vocab, data

from fastai.nlp import *
from fastai.lm_rnn import *

PATH='data/nietzsche/'

TRN_PATH = 'trn/'
VAL_PATH = 'val/'
TRN = f'{PATH}{TRN_PATH}'
VAL = f'{PATH}{VAL_PATH}'

%ls {PATH}

nietzsche.txt  [0m[01;34mtrn[0m/  [01;34mval[0m/


In [95]:
%ls {PATH}trn

trn.txt


In [None]:
Create a torchtext Field which provides the right thing. By using list we get the character by character.


In [123]:
TEXT = data.Field(lower=True, tokenize=list)

Each mini batch will contain a list of characters.
`n_fac` is the size of the embedding.
Then create the `FILES` dictionary.
Follow it by creating a language model
`min_freq` is likely redundant now
torchtext did something 'smart'.. they can't shuffle the data because it must be continuos....
But we can do a little randomizing bptt a little bit every time.
5% of the time it will be cut in half, and make it slightly bigger or smaller to (eg bptt=8) on average 
This is to create some randomization

In [131]:

bs=64; bptt=8; n_fac=42; n_hidden=256

FILES = dict(train=TRN_PATH, validation=VAL_PATH, test=VAL_PATH)
md = LanguageModelData.from_text_files(PATH, TEXT, **FILES, bs=bs, bptt=bptt, min_freq=3)

print("Length of data loader:", len(md.trn_dl), "number of tokens:",  md.nt, "Number of texts:", len(md.trn_ds), 
      "Length of the text:", len(md.trn_ds[0].text))

Length of data loader: 942 number of tokens: 55 Number of texts: 1 Length of the text: 482908


Notice that after getting the model, TEXT also contains now the .vocab
which has lots of interesting items, e.g. vocab.freqs, vocab.itos, vocab.load_vectors
For example TEXT.vocab.freqs gives the frequency of every character.

In [132]:
#TEXT.vocab.itos

## 6.2 RNN

`repackage_var()` forget the history of this variable, it is backprop thru time
after the for loop, throw away the history of operations and just save the state

In [97]:
class CharSeqStatefulRnn(nn.Module):
    def __init__(self, vocab_size, n_fac, bs):
        self.vocab_size = vocab_size
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.RNN(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.init_hidden(bs)
        
    def forward(self, cs):
        bs = cs[0].size(0)
        if self.h.size(1) != bs: self.init_hidden(bs)
        outp,h = self.rnn(self.e(cs), self.h)
        self.h = repackage_var(h)
        return F.log_softmax(self.l_out(outp), dim=-1).view(-1, self.vocab_size)
    
    def init_hidden(self, bs): self.h = V(torch.zeros(1, bs, n_hidden))

In [133]:
import torch
print(torch.__version__)

0.3.0.post4


In [98]:
m = CharSeqStatefulRnimport torch
print(torch.__version__)n(md.nt, n_fac, 512).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [99]:
fit(m, md, 4, opt, F.nll_loss)

A Jupyter Widget

epoch:   0, train_loss: 1.887392, val_loss: 1.858555         
epoch:   1, train_loss: 1.715481, val_loss: 1.702934         
epoch:   2, train_loss: 1.639839, val_loss: 1.648319         
epoch:   3, train_loss: 1.589087, val_loss: 1.601511         



In [100]:
set_lrs(opt, 1e-4)

fit(m, md, 4, opt, F.nll_loss)

A Jupyter Widget

epoch:   0, train_loss: 1.499632, val_loss: 1.558928         
epoch:   1, train_loss: 1.504284, val_loss: 1.557328         
epoch:   2, train_loss: 1.502765, val_loss: 1.547309         
epoch:   3, train_loss: 1.498325, val_loss: 1.546128         



### 6.3 RNN loop

In [101]:
# From the pytorch source.. but nobody uses it because of gradient explosions...
#so we replace it with the following, a GRU cell

def RNNCell(input, hidden, w_ih, w_hh, b_ih, b_hh):
    return F.tanh(F.linear(input, w_ih, b_ih) + F.linear(hidden, w_hh, b_hh))

In [102]:
class CharSeqStatefulRnn2(nn.Module):
    def __init__(self, vocab_size, n_fac, bs):
        super().__init__()
        self.vocab_size = vocab_size
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.RNNCell(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.init_hidden(bs)
        
    def forward(self, cs):
        bs = cs[0].size(0)
        if self.h.size(1) != bs: self.init_hidden(bs)
        outp = []
        o = self.h
        for c in cs: 
            o = self.rnn(self.e(c), o)
            outp.append(o)
        outp = self.l_out(torch.stack(outp))
        self.h = repackage_var(o)
        return F.log_softmax(outp, dim=-1).view(-1, self.vocab_size)
    
    def init_hidden(self, bs): self.h = V(torch.zeros(1, bs, n_hidden))

In [103]:
m = CharSeqStatefulRnn2(md.nt, n_fac, 512).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [104]:
fit(m, md, 4, opt, F.nll_loss)

A Jupyter Widget

epoch:   0, train_loss: 1.906457, val_loss: 1.866394         
epoch:   1, train_loss: 1.722901, val_loss: 1.712402         
epoch:   2, train_loss: 1.639810, val_loss: 1.644468         
epoch:   3, train_loss: 1.588668, val_loss: 1.604960         



### GRU

In [105]:
class CharSeqStatefulGRU(nn.Module):
    def __init__(self, vocab_size, n_fac, bs):
        super().__init__()
        self.vocab_size = vocab_size
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.GRU(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.init_hidden(bs)
        
    def forward(self, cs):
        bs = cs[0].size(0)
        if self.h.size(1) != bs: self.init_hidden(bs)
        outp,h = self.rnn(self.e(cs), self.h)
        self.h = repackage_var(h)
        return F.log_softmax(self.l_out(outp), dim=-1).view(-1, self.vocab_size)
    
    def init_hidden(self, bs): self.h = V(torch.zeros(1, bs, n_hidden))

In [106]:
# From the pytorch source code - for reference

def GRUCell(input, hidden, w_ih, w_hh, b_ih, b_hh):
    gi = F.linear(input, w_ih, b_ih)
    gh = F.linear(hidden, w_hh, b_hh)
    i_r, i_i, i_n = gi.chunk(3, 1)
    h_r, h_i, h_n = gh.chunk(3, 1)

    resetgate = F.sigmoid(i_r + h_r)
    inputgate = F.sigmoid(i_i + h_i)
    newgate = F.tanh(i_n + resetgate * h_n)
    return newgate + inputgate * (hidden - newgate)

In [107]:
m = CharSeqStatefulGRU(md.nt, n_fac, 512).cuda()

opt = optim.Adam(m.parameters(), 1e-3)

In [108]:
fit(m, md, 6, opt, F.nll_loss)

A Jupyter Widget

epoch:   0, train_loss: 1.768728, val_loss: 1.744324         
epoch:   1, train_loss: 1.584175, val_loss: 1.591795         
epoch:   2, train_loss: 1.502281, val_loss: 1.531339         
epoch:   3, train_loss: 1.456857, val_loss: 1.504328         
epoch:   4, train_loss: 1.406893, val_loss: 1.473464         
epoch:   5, train_loss: 1.378161, val_loss: 1.470457         



In [109]:
set_lrs(opt, 1e-4)

In [110]:
fit(m, md, 3, opt, F.nll_loss)

A Jupyter Widget

epoch:   0, train_loss: 1.298640, val_loss: 1.428991         
epoch:   1, train_loss: 1.306565, val_loss: 1.427103         
epoch:   2, train_loss: 1.303479, val_loss: 1.421371         



### 6.5 Putting it all together: LSTM

In [111]:
from fastai import sgdr

n_hidden=512  #doubled the size of the hidden layer, because added 0.5 dropout.

In [112]:
class CharSeqStatefulLSTM(nn.Module):
    def __init__(self, vocab_size, n_fac, bs, nl):
        super().__init__()
        self.vocab_size,self.nl = vocab_size,nl
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.LSTM(n_fac, n_hidden, nl, dropout=0.5)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.init_hidden(bs)
        
    def forward(self, cs):
        bs = cs[0].size(0)
        if self.h[0].size(1) != bs: self.init_hidden(bs)
        outp,h = self.rnn(self.e(cs), self.h)
        self.h = repackage_var(h)
        return F.log_softmax(self.l_out(outp), dim=-1).view(-1, self.vocab_size)
    
    def init_hidden(self, bs):
        self.h = (V(torch.zeros(self.nl, bs, n_hidden)),
                  V(torch.zeros(self.nl, bs, n_hidden)))

In [None]:
`layerOptimizer' is a fastai class, last parameter is weight decay. It is for differential

In [113]:
m = CharSeqStatefulLSTM(md.nt, n_fac, 512, 2).cuda()
lo = LayerOptimizer(optim.Adam, m, 1e-2, 1e-5)    

In [None]:
lo.opt #  after this lo.opt is the optimizer

In [114]:
os.makedirs(f'{PATH}models', exist_ok=True)

In [115]:
fit(m, md, 2, lo.opt, F.nll_loss)

A Jupyter Widget

epoch:   0, train_loss: 1.849966, val_loss: 1.756922         
epoch:   1, train_loss: 1.727244, val_loss: 1.649027         



updating the learning rate with a cosine annealing for the optimizer

In [116]:
on_end = lambda sched, cycle: save_model(m, f'{PATH}models/cyc_{cycle}')  #define a callback to save the model

cb = [CosAnneal(lo, len(md.trn_dl), cycle_mult=2, on_cycle_end=on_end)]
fit(m, md, 2**4-1, lo.opt, F.nll_loss, callbacks=cb)

A Jupyter Widget

epoch:   0, train_loss: 1.557380, val_loss: 1.485322         
epoch:   1, train_loss: 1.595018, val_loss: 1.519544         
epoch:   2, train_loss: 1.472433, val_loss: 1.430348         
epoch:   3, train_loss: 1.614906, val_loss: 1.536453         
epoch:   4, train_loss: 1.543604, val_loss: 1.484156         
epoch:   5, train_loss: 1.453893, val_loss: 1.416796         
epoch:   6, train_loss: 1.397511, val_loss: 1.383618         
epoch:   7, train_loss: 1.593104, val_loss: 1.521154         
epoch:   8, train_loss: 1.570064, val_loss: 1.498114         
epoch:   9, train_loss: 1.525592, val_loss: 1.479909         
epoch:  10, train_loss: 1.492517, val_loss: 1.445707         
epoch:  11, train_loss: 1.443015, val_loss: 1.414220         
epoch:  12, train_loss: 1.397706, val_loss: 1.383487         
epoch:  13, train_loss: 1.352059, val_loss: 1.360667         
epoch:  14, train_loss: 1.318361, val_loss: 1.344329         



In [117]:
on_end = lambda sched, cycle: save_model(m, f'{PATH}models/cyc_{cycle}')
cb = [CosAnneal(lo, len(md.trn_dl), cycle_mult=2, on_cycle_end=on_end)]
fit(m, md, 2**6-1, lo.opt, F.nll_loss, callbacks=cb)

A Jupyter Widget

epoch:   0, train_loss: 1.311759, val_loss: 1.340474         
epoch:   1, train_loss: 1.315544, val_loss: 1.340099         
epoch:   2, train_loss: 1.308893, val_loss: 1.340897         
epoch:   3, train_loss: 1.304848, val_loss: 1.342674         
epoch:   4, train_loss: 1.300481, val_loss: 1.337599         
epoch:   5, train_loss: 1.298000, val_loss: 1.336658         
epoch:   6, train_loss: 1.294822, val_loss: 1.332269         
epoch:   7, train_loss: 1.289802, val_loss: 1.333266         
epoch:   8, train_loss: 1.291578, val_loss: 1.327400         
epoch:   9, train_loss: 1.283353, val_loss: 1.332305         
epoch:  10, train_loss: 1.275799, val_loss: 1.323329         
epoch:  11, train_loss: 1.274466, val_loss: 1.325439         
epoch:  12, train_loss: 1.262642, val_loss: 1.319677         
epoch:  13, train_loss: 1.263883, val_loss: 1.321480         
epoch:  14, train_loss: 1.263231, val_loss: 1.319603         
epoch:  15, train_loss: 1.266542, val_loss: 1.325303         
epoch:  

### 6.6 Test

In [118]:
def get_next(inp):
    idxs = TEXT.numericalize(inp)
    p = m(VV(idxs.transpose(0,1)))
    r = torch.multinomial(p[-1].exp(), 1)
    return TEXT.vocab.itos[to_np(r)[0]]

In [119]:
get_next('for thos')

'e'

In [120]:
def get_next_n(inp, n):
    res = inp
    for i in range(n):
        c = get_next(inp)
        res += c
        inp = inp[1:]+c
    return res

In [121]:
print(get_next_n('for thos', 400))

for those loft known to the extent;with the corties and above him. ly were!..237.[] idness? everythings, which knows; the faith as it always, groeth, spice of fuit--he wish andnolyself-politics--seem proaching man, amultime. he does by means what it connarreign advocates of prescribed, heart--(         2.    the se"in external.""140. the learning,above all it.30. the complex--in how la"!a2, thedue and he 
