In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai.io import *
from fastai.conv_learner import *

from fastai.column_data import *

In [None]:
%system PATH



## Setup

We're going to download the collected works of Nietzsche to use as our data for this class.

In [1]:
PATH='..\..\data/cervantes/'

In [2]:
#get_data("https://s3.amazonaws.com/text-datasets/nietzsche.txt", f'{PATH}nietzsche.txt')
text = open(f'{PATH}donquijote.txt',encoding='UTF8').read()
print('corpus length:', len(text))

corpus length: 2065877


In [3]:
2065877-400000


1665877

In [4]:
text_train=text[:1600000]
text_test=text[1600000:]
text_simple=text[:160000]

In [5]:
len(text_simple)

160000

In [6]:
chars = sorted(list(set(text_simple)))
vocab_size = len(chars)+1
print('total chars:', vocab_size)

total chars: 75


Sometimes it's useful to have a zero value in the dataset, e.g. for padding

In [7]:
chars.insert(0, "\0")

''.join(chars[1:-6])

'\n !"\'(),-.:;?ABCDEFGHIJLMNOPQRSTUVXYZabcdefghijlmnopqrstuvxyz¡«»¿ÁÉá'

Map from chars to indices and back again

In [8]:
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = {i: c for i, c in enumerate(chars)}

In [9]:
indices_char

{0: '\x00',
 1: '\n',
 2: ' ',
 3: '!',
 4: '"',
 5: "'",
 6: '(',
 7: ')',
 8: ',',
 9: '-',
 10: '.',
 11: ':',
 12: ';',
 13: '?',
 14: 'A',
 15: 'B',
 16: 'C',
 17: 'D',
 18: 'E',
 19: 'F',
 20: 'G',
 21: 'H',
 22: 'I',
 23: 'J',
 24: 'L',
 25: 'M',
 26: 'N',
 27: 'O',
 28: 'P',
 29: 'Q',
 30: 'R',
 31: 'S',
 32: 'T',
 33: 'U',
 34: 'V',
 35: 'X',
 36: 'Y',
 37: 'Z',
 38: 'a',
 39: 'b',
 40: 'c',
 41: 'd',
 42: 'e',
 43: 'f',
 44: 'g',
 45: 'h',
 46: 'i',
 47: 'j',
 48: 'l',
 49: 'm',
 50: 'n',
 51: 'o',
 52: 'p',
 53: 'q',
 54: 'r',
 55: 's',
 56: 't',
 57: 'u',
 58: 'v',
 59: 'x',
 60: 'y',
 61: 'z',
 62: '¡',
 63: '«',
 64: '»',
 65: '¿',
 66: 'Á',
 67: 'É',
 68: 'á',
 69: 'é',
 70: 'í',
 71: 'ñ',
 72: 'ó',
 73: 'ú',
 74: 'ü'}

*idx* will be the data we use from now on - it simply converts all the characters to their index (based on the mapping above)

In [10]:
idx = [char_indices[c] for c in text_simple]

idx[:10]

[2, 28, 54, 46, 49, 42, 54, 38, 2, 52]

In [11]:
''.join(indices_char[i] for i in idx[:70])

' Primera parte del ingenioso hidalgo don Quijote de la Mancha   Capítu'

## Three char model

### Create inputs

Create a list of every 4th character, starting at the 0th, 1st, 2nd, then 3rd characters

c1_data = a los caracteres 1,4,7,10 ..... (empieza en 1 y va de 3 en 3)
<br>
c2_data = a los caracteres en posición 2,5,8 ....
<br>
c3_data = a las posiciones 3,6,9 ...
<br>
c4_data = a las posiciones 4,7,10...

In [12]:
cs=3
c1_dat = [idx[i]   for i in range(0, len(idx)-cs, cs)]
c2_dat = [idx[i+1] for i in range(0, len(idx)-cs, cs)]
c3_dat = [idx[i+2] for i in range(0, len(idx)-cs, cs)]
c4_dat = [idx[i+3] for i in range(0, len(idx)-cs, cs)]

In [13]:
c4_dat

[46,
 54,
 52,
 56,
 41,
 2,
 44,
 46,
 51,
 46,
 48,
 2,
 50,
 57,
 51,
 2,
 2,
 2,
 50,
 38,
 2,
 52,
 57,
 2,
 46,
 54,
 2,
 42,
 54,
 38,
 42,
 38,
 51,
 46,
 72,
 60,
 47,
 40,
 46,
 41,
 2,
 49,
 51,
 46,
 48,
 1,
 50,
 57,
 51,
 2,
 2,
 2,
 50,
 38,
 18,
 57,
 48,
 38,
 41,
 48,
 25,
 40,
 8,
 42,
 57,
 2,
 49,
 42,
 51,
 57,
 54,
 38,
 54,
 54,
 8,
 51,
 38,
 57,
 51,
 46,
 52,
 53,
 2,
 58,
 2,
 2,
 41,
 44,
 41,
 48,
 2,
 2,
 50,
 2,
 2,
 56,
 48,
 51,
 38,
 54,
 2,
 56,
 57,
 1,
 40,
 2,
 38,
 2,
 44,
 44,
 40,
 54,
 51,
 2,
 38,
 48,
 2,
 2,
 44,
 49,
 2,
 40,
 53,
 2,
 54,
 54,
 1,
 48,
 40,
 2,
 55,
 68,
 50,
 45,
 8,
 57,
 51,
 60,
 57,
 54,
 56,
 2,
 55,
 68,
 41,
 8,
 38,
 42,
 55,
 51,
 58,
 54,
 55,
 38,
 73,
 52,
 51,
 50,
 41,
 38,
 41,
 57,
 2,
 55,
 51,
 50,
 55,
 40,
 55,
 70,
 2,
 55,
 54,
 1,
 54,
 55,
 42,
 57,
 38,
 42,
 38,
 18,
 54,
 56,
 41,
 48,
 40,
 40,
 70,
 2,
 60,
 41,
 58,
 38,
 42,
 40,
 61,
 2,
 1,
 48,
 41,
 52,
 38,
 38,
 43,
 55,
 55,
 40,
 2,

In [14]:
c2_dat

[28,
 49,
 38,
 38,
 42,
 42,
 46,
 42,
 51,
 2,
 41,
 44,
 41,
 2,
 46,
 56,
 41,
 48,
 25,
 40,
 2,
 16,
 70,
 48,
 52,
 49,
 51,
 29,
 2,
 38,
 2,
 2,
 2,
 50,
 40,
 50,
 2,
 42,
 46,
 51,
 42,
 43,
 51,
 2,
 41,
 44,
 41,
 2,
 46,
 56,
 41,
 48,
 25,
 40,
 2,
 50,
 50,
 57,
 54,
 42,
 38,
 38,
 45,
 2,
 2,
 60,
 50,
 39,
 2,
 2,
 46,
 51,
 40,
 41,
 49,
 2,
 2,
 2,
 40,
 1,
 42,
 51,
 57,
 58,
 70,
 57,
 45,
 38,
 51,
 42,
 51,
 41,
 48,
 61,
 42,
 38,
 46,
 42,
 8,
 41,
 44,
 38,
 46,
 38,
 54,
 70,
 43,
 40,
 60,
 38,
 51,
 51,
 42,
 54,
 33,
 2,
 48,
 41,
 38,
 51,
 68,
 58,
 38,
 57,
 40,
 50,
 51,
 55,
 52,
 72,
 48,
 2,
 55,
 51,
 42,
 2,
 42,
 55,
 2,
 42,
 38,
 51,
 48,
 2,
 39,
 51,
 2,
 50,
 47,
 2,
 55,
 46,
 50,
 8,
 48,
 50,
 38,
 49,
 51,
 42,
 71,
 46,
 54,
 48,
 2,
 49,
 44,
 8,
 51,
 57,
 38,
 48,
 2,
 42,
 52,
 56,
 2,
 2,
 2,
 40,
 50,
 10,
 48,
 42,
 51,
 42,
 38,
 51,
 48,
 38,
 55,
 51,
 42,
 42,
 54,
 8,
 38,
 38,
 41,
 58,
 48,
 51,
 38,
 2,
 55,
 46,
 56,
 

Our inputs

In [15]:
x1 = np.stack(c1_dat)
x2 = np.stack(c2_dat)
x3 = np.stack(c3_dat)

In [16]:
x1

array([ 2, 46, 54, ...,  2, 55, 42])

Our output

In [17]:
y = np.stack(c4_dat)

The first 4 inputs and outputs

In [18]:
x1[:4], x2[:4], x3[:4]

(array([ 2, 46, 54, 52]), array([28, 49, 38, 38]), array([54, 42,  2, 54]))

In [19]:
y[:4]

array([46, 54, 52, 56])

In [20]:
x1.shape, y.shape

((53333,), (53333,))

### Create and train model

Pick a size for our hidden state

In [21]:
n_hidden = 256

The number of latent factors to create (i.e. the size of the embedding matrix)

In [22]:
n_fac = 42

In [26]:
m = nn.Linear(20, 30)
input = torch.randn(128, 20)
output = m(input)
print(output.size())


NameError: name 'nn' is not defined

In [69]:
class Char3Model(nn.Module):
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)

        # The 'green arrow' from our diagram - the layer operation from input to hidden
        self.l_in = nn.Linear(n_fac, n_hidden)

        # The 'orange arrow' from our diagram - the layer operation from hidden to hidden
        self.l_hidden = nn.Linear(n_hidden, n_hidden)
        
        # The 'blue arrow' from our diagram - the layer operation from hidden to output
        self.l_out = nn.Linear(n_hidden, vocab_size)
        
    def forward(self, c1, c2, c3):
        in1 = F.relu(self.l_in(self.e(c1)))
        in2 = F.relu(self.l_in(self.e(c2)))
        in3 = F.relu(self.l_in(self.e(c3)))
        
        h = V(torch.zeros(in1.size()))
        h = torch.tanh(self.l_hidden(h+in1))
        h = torch.tanh(self.l_hidden(h+in2))
        h = torch.tanh(self.l_hidden(h+in3))
        
        return F.log_softmax(self.l_out(h))

#### Modelo LSTM propio

In [138]:
class Char3LSTMModel(nn.Module):
    def __init__(self,vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.l_in = nn.Linear(n_fac, n_hidden)
        
        # Matriz de pesos para input gate
        self.l_xi = nn.Linear(n_hidden, n_hidden)
        
        
        # Matriz de pesos para forget gate f(t)
        self.l_xf = nn.Linear(n_hidden, n_hidden)
        
        
        # Matriz dea  gate g(t)
        self.l_xg = nn.Linear(n_hidden, n_hidden)
        
        
        # Matriz de pesos para output gate i(t)
        self.l_xo = nn.Linear(n_hidden, n_hidden)
        
        # The 'blue arrow' from our diagram - the layer operation from hidden to output
        self.l_out = nn.Linear(n_hidden, vocab_size)
        
        
        
    def forward(self,*cs):
        #Embeddings
        
        #in1 = F.relu(self.l_in(self.e(c1)))
        #in2 = F.relu(self.l_in(self.e(c2)))
        #in3 = F.relu(self.l_in(self.e(c3)))
        #pdb.set_trace()
        
        bs = cs[0].size()[0]
        self.h = V(torch.zeros([bs,n_hidden]))
        self.c = V(torch.ones([bs,n_hidden]))
         
        for car in cs:
            
            inc = F.relu(self.l_in(self.e(car)))
            self.h,self.c = self.cellLSTM(inc,self.h,self.c)
            
          
        return F.log_softmax(self.l_out(self.h))
    
    def cellLSTM(self,c_emb,h,c):
        it = F.logsigmoid(self.l_xi(c_emb+self.h))
        ft = F.logsigmoid(self.l_xf(c_emb+self.h))
        gt = torch.tanh(self.l_xg(c_emb+self.h))
        c = torch.mul(ft,self.c) + torch.mul(it,gt)
        self.h =torch.mul(F.logsigmoid(self.l_xo(c_emb+self.h)),torch.tanh(self.c))
        return self.h,self.c
    

In [139]:
md = ColumnarModelData.from_arrays('.', [-1], np.stack([x1,x2,x3], axis=1), y, bs=512)

In [140]:
a=np.stack([x1,x2,x3])

In [141]:
len(a[0])

53333

In [142]:
#m = Char3Model(vocab_size, n_fac)
m=Char3LSTMModel(vocab_size, n_fac)

In [143]:
it = iter(md.trn_dl)
*xs,yt = next(it)
t = m(*V(xs))

In [144]:
xs[0].size(0)

512

In [145]:
opt = optim.Adam(m.parameters(), 1e-3)

In [146]:
fit(m, md, 1, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1, style=ProgressStyle(description_width='initial…

epoch      trn_loss   val_loss                                                 
    0      2.437601   2.199118  



[2.199118137359619]

In [117]:
set_lrs(opt, 0.001)


In [150]:
m(T([[41,1,3]]))

tensor([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0.

In [118]:
fit(m, md, 1, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1, style=ProgressStyle(description_width='initial…

epoch      trn_loss   val_loss                                                 
    0      2.152131   1.01025   



[1.0102496147155762]

### Test model

In [130]:
def get_next(inp):
    pdb.set_trace()
    idxs = T(np.array([char_indices[c] for c in inp]))
    #pdb.set_trace()
    p = m(*VV(idxs))
    i = np.argmax(to_np(p))
    #i = np.argmax(p.view(-1).detach().numpy())
    return chars[i]

In [131]:
get_next('de ')

> <ipython-input-130-225367365578>(3)get_next()
-> idxs = T(np.array([char_indices[c] for c in inp]))
(Pdb) n
> <ipython-input-130-225367365578>(5)get_next()
-> p = m(*VV(idxs))
(Pdb) idxs
tensor([41, 42,  2])
(Pdb) V(idxs)
tensor([41, 42,  2])
(Pdb) VV(idxs)
tensor([41, 42,  2])
(Pdb) *VV(idxs)
*** SyntaxError: can't use starred expression here
(Pdb) c


IndexError: tuple index out of range

In [None]:
get_next('err')

In [None]:
get_next(' de')

In [None]:
get_next('and')

## Our first RNN!

### Create inputs

This is the size of our unrolled RNN.

In [None]:
cs=8

For each of 0 through 7, create a list of every 8th character with that starting point. These will be the 8 inputs to our model.

In [None]:
c_in_dat = [[idx[i+j] for i in range(cs)] for j in range(len(idx)-cs)]

In [None]:
c_in_dat

Then create a list of the next character in each of these series. This will be the labels for our model.

In [None]:
c_out_dat = [idx[j+cs] for j in range(len(idx)-cs)]

In [None]:
c_out_dat

In [None]:
xs = np.stack(c_in_dat, axis=0)

In [None]:
xs

In [None]:
xs.shape

In [None]:
y = np.stack(c_out_dat)

In [None]:
y

So each column below is one series of 8 characters from the text.

In [None]:
xs[:cs,:cs]

...and this is the next character after each sequence.

In [None]:
y[:cs]

### Create and train model

In [None]:
val_idx = get_cv_idxs(len(idx)-cs-1)

In [None]:
val_idx

In [None]:
md = ColumnarModelData.from_arrays('.', val_idx, xs, y, bs=512)

In [None]:
class CharLoopModel(nn.Module):
    # This is an RNN!
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.l_in = nn.Linear(n_fac, n_hidden)
        self.l_hidden = nn.Linear(n_hidden, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        
    def forward(self, *cs):
        bs = cs[0].size(0)
        h = V(torch.zeros(bs, n_hidden))
        for c in cs:
            inp = F.relu(self.l_in(self.e(c)))
            h = F.tanh(self.l_hidden(h+inp))
        
        return F.log_softmax(self.l_out(h), dim=-1)

In [None]:
m = CharLoopModel(vocab_size, n_fac)
opt = optim.Adam(m.parameters(), 1e-2)

In [None]:
fit(m, md, 1, opt, F.nll_loss)

In [None]:
set_lrs(opt, 0.001)

In [None]:
fit(m, md, 1, opt, F.nll_loss)

In [None]:
class CharLoopConcatModel(nn.Module):
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.l_in = nn.Linear(n_fac+n_hidden, n_hidden)
        self.l_hidden = nn.Linear(n_hidden, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        
    def forward(self, *cs):
        bs = cs[0].size(0)
        h = V(torch.zeros(bs, n_hidden))
        for c in cs:
            inp = torch.cat((h, self.e(c)), 1)
            inp = F.relu(self.l_in(inp))
            h = torch.tanh(self.l_hidden(inp))#   F.tanh(self.l_hidden(inp))
            #h = F.tanh(self.l_hidden(inp))
        
        return F.log_softmax(self.l_out(h), dim=-1)

In [None]:
m = CharLoopConcatModel(vocab_size, n_fac)
opt = optim.Adam(m.parameters(), 1e-3)

In [None]:
it = iter(md.trn_dl)
*xs,yt = next(it)
t = m(*V(xs))

In [None]:
fit(m, md, 1, opt, F.nll_loss)

In [None]:
set_lrs(opt, 1e-4)

In [None]:
fit(m, md, 1, opt, F.nll_loss)

### Test model

In [None]:
def get_next(inp):
    idxs = T(np.array([char_indices[c] for c in inp]))
    p = m(*VV(idxs))
    i = np.argmax(to_np(p))
    return chars[i]

In [None]:
get_next('en un lu')

In [None]:
get_next('part of ')

In [None]:
get_next('queens a')

## RNN with pytorch

In [None]:
class CharRnn(nn.Module):
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.RNN(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        
    def forward(self, *cs):
        bs = cs[0].size(0)
        h = V(torch.zeros(1, bs, n_hidden))
        inp = self.e(torch.stack(cs))
        outp,h = self.rnn(inp, h)
        
        return F.log_softmax(self.l_out(outp[-1]), dim=-1)

In [None]:
m = CharRnn(vocab_size, n_fac)
opt = optim.Adam(m.parameters(), 1e-3)

In [None]:
it = iter(md.trn_dl)
*xs,yt = next(it)

In [None]:
t = m.e(V(torch.stack(xs)))
t.size()

In [None]:
ht = V(torch.zeros(1, 512,n_hidden))
outp, hn = m.rnn(t, ht)
outp.size(), hn.size()

In [None]:
t = m(*V(xs)); t.size()

In [None]:
fit(m, md, 4, opt, F.nll_loss)

In [None]:
set_lrs(opt, 1e-4)

In [None]:
fit(m, md, 2, opt, F.nll_loss)

### Test model

In [None]:
def get_next(inp):
    idxs = T(np.array([char_indices[c] for c in inp]))
    p = m(*VV(idxs))
    i = np.argmax(to_np(p))
    return chars[i]

In [None]:
get_next('for thos')

In [None]:
def get_next_n(inp, n):
    res = inp
    for i in range(n):
        c = get_next(inp)
        res += c
        inp = inp[1:]+c
    return res

In [None]:
get_next_n('for thos', 40)

## Multi-output model

### Setup

Let's take non-overlapping sets of characters this time

In [None]:
c_in_dat = [[idx[i+j] for i in range(cs)] for j in range(0, len(idx)-cs-1, cs)]

Then create the exact same thing, offset by 1, as our labels

In [None]:
c_out_dat = [[idx[i+j] for i in range(cs)] for j in range(1, len(idx)-cs, cs)]

In [None]:
xs = np.stack(c_in_dat)
xs.shape

In [None]:
ys = np.stack(c_out_dat)
ys.shape

In [None]:
xs[:cs,:cs]

In [None]:
ys[:cs,:cs]

### Create and train model

In [None]:
val_idx = get_cv_idxs(len(xs)-cs-1)

In [None]:
md = ColumnarModelData.from_arrays('.', val_idx, xs, ys, bs=512)

In [None]:
class CharSeqRnn(nn.Module):
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.RNN(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        
    def forward(self, *cs):
        bs = cs[0].size(0)
        h = V(torch.zeros(1, bs, n_hidden))
        inp = self.e(torch.stack(cs))
        outp,h = self.rnn(inp, h)
        return F.log_softmax(self.l_out(outp), dim=-1)

In [None]:
m = CharSeqRnn(vocab_size, n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [None]:
it = iter(md.trn_dl)
*xst,yt = next(it)

In [None]:
def nll_loss_seq(inp, targ):
    sl,bs,nh = inp.size()
    targ = targ.transpose(0,1).contiguous().view(-1)
    return F.nll_loss(inp.view(-1,nh), targ)

In [None]:
fit(m, md, 4, opt, nll_loss_seq)

In [None]:
set_lrs(opt, 1e-4)

In [None]:
fit(m, md, 1, opt, nll_loss_seq)

### Identity init!

In [None]:
m = CharSeqRnn(vocab_size, n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-2)

In [None]:
m.rnn.weight_hh_l0.data.copy_(torch.eye(n_hidden))

In [None]:
fit(m, md, 4, opt, nll_loss_seq)

In [None]:
set_lrs(opt, 1e-3)

In [None]:
fit(m, md, 4, opt, nll_loss_seq)

## Stateful model

### Setup

In [None]:
from torchtext import vocab, data

from fastai.nlp import *
from fastai.lm_rnn import *

PATH='data/nietzsche/'

TRN_PATH = 'trn/'
VAL_PATH = 'val/'
TRN = f'{PATH}{TRN_PATH}'
VAL = f'{PATH}{VAL_PATH}'

# Note: The student needs to practice her shell skills and prepare her own dataset before proceeding:
# - trn/trn.txt (first 80% of nietzsche.txt)
# - val/val.txt (last 20% of nietzsche.txt)

%ls {PATH}

In [None]:
%ls {PATH}trn

In [None]:
TEXT = data.Field(lower=True, tokenize=list)
bs=64; bptt=8; n_fac=42; n_hidden=256

FILES = dict(train=TRN_PATH, validation=VAL_PATH, test=VAL_PATH)
md = LanguageModelData.from_text_files(PATH, TEXT, **FILES, bs=bs, bptt=bptt, min_freq=3)

len(md.trn_dl), md.nt, len(md.trn_ds), len(md.trn_ds[0].text)

### RNN

In [None]:
class CharSeqStatefulRnn(nn.Module):
    def __init__(self, vocab_size, n_fac, bs):
        self.vocab_size = vocab_size
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.RNN(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.init_hidden(bs)
        
    def forward(self, cs):
        bs = cs[0].size(0)
        if self.h.size(1) != bs: self.init_hidden(bs)
        outp,h = self.rnn(self.e(cs), self.h)
        self.h = repackage_var(h)
        return F.log_softmax(self.l_out(outp), dim=-1).view(-1, self.vocab_size)
    
    def init_hidden(self, bs): self.h = V(torch.zeros(1, bs, n_hidden))

In [None]:
m = CharSeqStatefulRnn(md.nt, n_fac, 512).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [None]:
fit(m, md, 4, opt, F.nll_loss)

In [None]:
set_lrs(opt, 1e-4)

fit(m, md, 4, opt, F.nll_loss)

### RNN loop

In [None]:
# From the pytorch source

def RNNCell(input, hidden, w_ih, w_hh, b_ih, b_hh):
    return F.tanh(F.linear(input, w_ih, b_ih) + F.linear(hidden, w_hh, b_hh))

In [None]:
class CharSeqStatefulRnn2(nn.Module):
    def __init__(self, vocab_size, n_fac, bs):
        super().__init__()
        self.vocab_size = vocab_size
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.RNNCell(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.init_hidden(bs)
        
    def forward(self, cs):
        bs = cs[0].size(0)
        if self.h.size(1) != bs: self.init_hidden(bs)
        outp = []
        o = self.h
        for c in cs: 
            o = self.rnn(self.e(c), o)
            outp.append(o)
        outp = self.l_out(torch.stack(outp))
        self.h = repackage_var(o)
        return F.log_softmax(outp, dim=-1).view(-1, self.vocab_size)
    
    def init_hidden(self, bs): self.h = V(torch.zeros(1, bs, n_hidden))

In [None]:
m = CharSeqStatefulRnn2(md.nt, n_fac, 512).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [None]:
fit(m, md, 4, opt, F.nll_loss)

### GRU

In [None]:
??nn.LSTMCell

In [None]:
class CharSeqStatefulGRU(nn.Module):
    def __init__(self, vocab_size, n_fac, bs):
        super().__init__()
        self.vocab_size = vocab_size
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.GRU(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.init_hidden(bs)
        
    def forward(self, cs):
        bs = cs[0].size(0)
        if self.h.size(1) != bs: self.init_hidden(bs)
        outp,h = self.rnn(self.e(cs), self.h)
        self.h = repackage_var(h)
        return F.log_softmax(self.l_out(outp), dim=-1).view(-1, self.vocab_size)
    
    def init_hidden(self, bs): self.h = V(torch.zeros(1, bs, n_hidden))

In [None]:
# From the pytorch source code - for reference

def GRUCell(input, hidden, w_ih, w_hh, b_ih, b_hh):
    gi = F.linear(input, w_ih, b_ih)
    gh = F.linear(hidden, w_hh, b_hh)
    i_r, i_i, i_n = gi.chunk(3, 1)
    h_r, h_i, h_n = gh.chunk(3, 1)

    resetgate = F.sigmoid(i_r + h_r)
    inputgate = F.sigmoid(i_i + h_i)
    newgate = F.tanh(i_n + resetgate * h_n)
    return newgate + inputgate * (hidden - newgate)

In [None]:
m = CharSeqStatefulGRU(md.nt, n_fac, 512).cuda()

opt = optim.Adam(m.parameters(), 1e-3)

In [None]:
fit(m, md, 6, opt, F.nll_loss)

In [None]:
set_lrs(opt, 1e-4)

In [None]:
fit(m, md, 3, opt, F.nll_loss)

### Putting it all together: LSTM

In [None]:
from fastai import sgdr

n_hidden=512

In [None]:
class CharSeqStatefulLSTM(nn.Module):
    def __init__(self, vocab_size, n_fac, bs, nl):
        super().__init__()
        self.vocab_size,self.nl = vocab_size,nl
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.LSTM(n_fac, n_hidden, nl, dropout=0.5)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.init_hidden(bs)
        
    def forward(self, cs):
        bs = cs[0].size(0)
        if self.h[0].size(1) != bs: self.init_hidden(bs)
        outp,h = self.rnn(self.e(cs), self.h)
        self.h = repackage_var(h)
        return F.log_softmax(self.l_out(outp), dim=-1).view(-1, self.vocab_size)
    
    def init_hidden(self, bs):
        self.h = (V(torch.zeros(self.nl, bs, n_hidden)),
                  V(torch.zeros(self.nl, bs, n_hidden)))

In [None]:
m = CharSeqStatefulLSTM(md.nt, n_fac, 512, 2).cuda()
lo = LayerOptimizer(optim.Adam, m, 1e-2, 1e-5)

In [None]:
os.makedirs(f'{PATH}models', exist_ok=True)

In [None]:
fit(m, md, 2, lo.opt, F.nll_loss)

In [None]:
on_end = lambda sched, cycle: save_model(m, f'{PATH}models/cyc_{cycle}')
cb = [CosAnneal(lo, len(md.trn_dl), cycle_mult=2, on_cycle_end=on_end)]
fit(m, md, 2**4-1, lo.opt, F.nll_loss, callbacks=cb)

In [None]:
on_end = lambda sched, cycle: save_model(m, f'{PATH}models/cyc_{cycle}')
cb = [CosAnneal(lo, len(md.trn_dl), cycle_mult=2, on_cycle_end=on_end)]
fit(m, md, 2**6-1, lo.opt, F.nll_loss, callbacks=cb)

### Test

In [None]:
def get_next(inp):
    idxs = TEXT.numericalize(inp)
    p = m(VV(idxs.transpose(0,1)))
    r = torch.multinomial(p[-1].exp(), 1)
    return TEXT.vocab.itos[to_np(r)[0]]

In [None]:
get_next('for thos')

In [None]:
def get_next_n(inp, n):
    res = inp
    for i in range(n):
        c = get_next(inp)
        res += c
        inp = inp[1:]+c
    return res

In [None]:
print(get_next_n('for thos', 400))