In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai.io import *
from fastai.conv_learner import *

from fastai.column_data import *

## Setup

We're going to download the collected works of Nietzsche to use as our data for this class.

In [2]:
PATH='data/nietzsche/'

In [366]:
get_data("https://s3.amazonaws.com/text-datasets/nietzsche.txt", f'{PATH}nietzsche.txt')
text = open(f'{PATH}nietzsche.txt').read()
print('corpus length:', len(text))

nietzsche.txt: 606KB [00:00, 649KB/s]                             

corpus length: 600893





                                                           

In [4]:
text[:400]

'PREFACE\n\n\nSUPPOSING that Truth is a woman--what then? Is there not ground\nfor suspecting that all philosophers, in so far as they have been\ndogmatists, have failed to understand women--that the terrible\nseriousness and clumsy importunity with which they have usually paid\ntheir addresses to Truth, have been unskilled and unseemly methods for\nwinning a woman? Certainly she has never allowed herself '

In [5]:
chars = sorted(list(set(text)))
vocab_size = len(chars)+1
print('total chars:', vocab_size)

total chars: 85


Sometimes it's useful to have a zero value in the dataset, e.g. for padding

In [6]:
chars.insert(0, "\0")

''.join(chars[1:-6])

'\n !"\'(),-.0123456789:;=?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxy'

Map from chars to indices and back again

In [7]:
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

*idx* will be the data we use from now own - it simply converts all the characters to their index (based on the mapping above)

In [8]:
idx = [char_indices[c] for c in text]

idx[:10]

[40, 42, 29, 30, 25, 27, 29, 1, 1, 1]

In [9]:
''.join(indices_char[i] for i in idx[:70])

'PREFACE\n\n\nSUPPOSING that Truth is a woman--what then? Is there not gro'

## Three char model

### Create inputs

Create a list of every 4th character, starting at the 0th, 1st, 2nd, then 3rd characters

In [10]:
cs=3
c1_dat = [idx[i]   for i in range(0, len(idx)-cs, cs)]
c2_dat = [idx[i+1] for i in range(0, len(idx)-cs, cs)]
c3_dat = [idx[i+2] for i in range(0, len(idx)-cs, cs)]
c4_dat = [idx[i+3] for i in range(0, len(idx)-cs, cs)]

Our inputs

In [11]:
x1 = np.stack(c1_dat)
x2 = np.stack(c2_dat)
x3 = np.stack(c3_dat)

Our output

In [12]:
y = np.stack(c4_dat)

The first 4 inputs and outputs

In [13]:
x1[:4], x2[:4], x3[:4]

(array([40, 30, 29,  1]), array([42, 25,  1, 43]), array([29, 27,  1, 45]))

In [14]:
y[:4]

array([30, 29,  1, 40])

In [15]:
x1.shape, y.shape

((200297,), (200297,))

### Create and train model

Pick a size for our hidden state

In [16]:
n_hidden = 256

The number of latent factors to create (i.e. the size of the embedding matrix)

In [17]:
n_fac = 42

In [18]:
class Char3Model(nn.Module):
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.input = nn.Linear(n_fac, n_hidden)
        self.hidden = nn.Linear(n_hidden, n_hidden)
        self.output = nn.Linear(n_hidden, vocab_size)
        
    def forward(self, c1, c2, c3):
        e1 = self.e(c1)
        a1 = F.relu(self.input(e1))
        h1 = F.tanh(self.hidden(a1))
        
        e2 = self.e(c2)
        a2 = F.relu(self.input(e2))
        h2 = F.tanh(self.hidden(h1+a2))
        
        e3 = self.e(c3)
        a3 = F.relu(self.input(e3))
        h3 = F.tanh(self.hidden(h2+a3))
        
        o1 = self.output(h3)
        
        return F.log_softmax(o1)

In [19]:
md = ColumnarModelData.from_arrays('.', [-1], np.stack([x1,x2,x3], axis=1), y, bs=512)

In [20]:
m = Char3Model(vocab_size, n_fac).cuda()

In [21]:
it = iter(md.trn_dl)
*xs,yt = next(it)
t = m(*V(xs))

In [22]:
opt = optim.Adam(m.parameters(), 1e-2)

In [23]:
fit(m, md, 1, opt, F.nll_loss)

epoch      trn_loss   val_loss                              
    0      2.089592   0.355989  



[0.35598898]

In [None]:
??fit

In [None]:
set_lrs(opt, 0.001)

In [None]:
fit(m, md, 1, opt, F.nll_loss)

### Test model

In [None]:
def get_next(inp):
    idxs = T(np.array([char_indices[c] for c in inp]))
    p = m(*VV(idxs))
    i = np.argmax(to_np(p))
    return chars[i]

In [None]:
get_next('y. ')

In [None]:
get_next('ppl')

In [None]:
get_next(' th')

In [None]:
get_next('and')

## Our first RNN!

### Create inputs

This is the size of our unrolled RNN.

In [308]:
cs=8

For each of 0 through 7, create a list of every 8th character with that starting point. These will be the 8 inputs to out model.

In [25]:
c_in_dat = [[idx[i+j] for i in range(cs)] for j in range(len(idx)-cs)]

Then create a list of the next character in each of these series. This will be the labels for our model.

In [26]:
c_out_dat = [idx[j+cs] for j in range(len(idx)-cs)]

In [27]:
xs = np.stack(c_in_dat, axis=0)

In [28]:
xs.shape

(600885, 8)

In [29]:
y = np.stack(c_out_dat)

So each column below is one series of 8 characters from the text.

In [30]:
xs[:cs,:cs]

array([[40, 42, 29, 30, 25, 27, 29,  1],
       [42, 29, 30, 25, 27, 29,  1,  1],
       [29, 30, 25, 27, 29,  1,  1,  1],
       [30, 25, 27, 29,  1,  1,  1, 43],
       [25, 27, 29,  1,  1,  1, 43, 45],
       [27, 29,  1,  1,  1, 43, 45, 40],
       [29,  1,  1,  1, 43, 45, 40, 40],
       [ 1,  1,  1, 43, 45, 40, 40, 39]])

...and this is the next character after each sequence.

In [31]:
y[:cs]

array([ 1,  1, 43, 45, 40, 40, 39, 43])

### Create and train model

In [32]:
val_idx = get_cv_idxs(len(idx)-cs-1)

In [37]:
md = ColumnarModelData.from_arrays('.', val_idx, xs, y, bs=512)

In [40]:
first_batch = next(iter(md.trn_dl))

In [42]:
len(first_batch)

9

In [58]:
first_batch[0].size(0)

512

In [34]:

e = torch.nn.Embedding(vocab_size, n_fac)
inp = torch.nn.Linear(n_fac, n_hidden)
hidden = torch.nn.Linear(n_hidden, n_hidden)
output = torch.nn.Linear(n_hidden, cs)


# bs = cs[0].size(0)
# h = V(torch.zeros(bs, n_hidden).cuda())
# #         h = torch.autograd.Variable(torch.zeros((n_hidden))).cuda()
# for c in cs:
#     e1 = self.e(c)
#     in1 = F.relu(self.input(e1))
#     h = F.tanh(self.hidden(h+in1))
# o = self.output(h)
# res = F.log_softmax(o, dim=-1)

In [73]:
class CharLoopModel(nn.Module):
    # This is an RNN!
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = torch.nn.Embedding(vocab_size, n_fac)
        self.input = torch.nn.Linear(n_fac, n_hidden)
        self.hidden = torch.nn.Linear(n_hidden, n_hidden)
        self.output = torch.nn.Linear(n_hidden, vocab_size)
        
    def forward(self, *cs):
        bs = cs[0].size(0)
        h = V(torch.zeros(bs, n_hidden).cuda())
#         h = torch.autograd.Variable(torch.zeros((n_hidden))).cuda()
        for c in cs:
            e1 = self.e(c)
            in1 = F.relu(self.input(e1))
            h = F.tanh(self.hidden(h+in1))
        o = self.output(h)
        return F.log_softmax(o, dim=-1)

In [70]:
m = CharLoopModel(vocab_size, n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-2)

In [71]:
fit(m, md, 1, opt, F.nll_loss)

  0%|          | 0/939 [00:00<?, ?it/s]torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
  0%|          | 1/939 [00:00<06:06,  2.56it/s, loss=4.5]torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
  0%|          | 1/939 [00:00<06:27,  2.42it/s, loss=4.07]torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
  0%|          | 1/939 [00:00<06:37,  2.36it/s, loss=3.74]torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
  0%|          | 1/939 [00:00<06:59,  2.24it/s, loss

  4%|▎         | 33/939 [00:01<00:54, 16.71it/s, loss=2.8] torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
  4%|▎         | 33/939 [00:02<00:55, 16.37it/s, loss=2.8]torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
  4%|▍         | 36/939 [00:02<00:51, 17.58it/s, loss=2.79]torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
  4%|▍         | 36/939 [00:02<00:51, 17.48it/s, loss=2.78]torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
  4%|▍         | 36/939 [00:0

  7%|▋         | 67/939 [00:03<00:43, 20.01it/s, loss=2.56]torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
  7%|▋         | 67/939 [00:03<00:43, 19.88it/s, loss=2.56]torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
  7%|▋         | 67/939 [00:03<00:44, 19.74it/s, loss=2.55]torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
  7%|▋         | 67/939 [00:03<00:44, 19.67it/s, loss=2.55]torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
  8%|▊         | 72/939 [00:

 11%|█         | 99/939 [00:03<00:33, 25.17it/s, loss=2.44]torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
 11%|█         | 99/939 [00:03<00:33, 25.00it/s, loss=2.43]torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
 11%|█         | 99/939 [00:03<00:33, 24.78it/s, loss=2.43]torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
 11%|█         | 105/939 [00:04<00:32, 26.03it/s, loss=2.43]torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
torch.Size([512, 256])
 11%|█         | 105/939 [0

KeyboardInterrupt: 

In [None]:
set_lrs(opt, 0.001)

In [None]:
fit(m, md, 1, opt, F.nll_loss)

In [85]:
torch.cat((torch.ones((10, 2)), torch.ones((10, 2))), dim=1)


    1     1     1     1
    1     1     1     1
    1     1     1     1
    1     1     1     1
    1     1     1     1
    1     1     1     1
    1     1     1     1
    1     1     1     1
    1     1     1     1
    1     1     1     1
[torch.FloatTensor of size 10x4]

### FastAi original: Concat embedding with hidden

In [115]:
class CharLoopConcatModel(nn.Module):
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = torch.nn.Embedding(vocab_size, n_fac)
        self.input = torch.nn.Linear(n_fac+n_hidden, n_hidden)
        self.hidden = torch.nn.Linear(n_hidden, n_hidden)
        self.output = torch.nn.Linear(n_hidden, vocab_size)
        
    def forward(self, *cs):
        bs = cs[0].size(0)
        h = torch.autograd.Variable(torch.zeros(bs, n_hidden).cuda())
        for c in cs:
            e1 = self.e(c)
            cat = torch.cat([h, e1], dim=1)
            in1 = F.relu(self.input(cat))
            h = F.tanh(self.hidden(in1))
        o = self.output(h)
        return F.log_softmax(o, dim=-1)

In [116]:
m = CharLoopConcatModel(vocab_size, n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [117]:
it = iter(md.trn_dl)
*xs,yt = next(it)
t = m(*V(xs))

In [118]:
fit(m, md, 1, opt, F.nll_loss)

epoch      trn_loss   val_loss                              
    0      1.822172   1.790493  



[1.7904927]

In [119]:
set_lrs(opt, 1e-4)

In [120]:
fit(m, md, 1, opt, F.nll_loss)

epoch      trn_loss   val_loss                              
    0      1.704583   1.710664  



[1.7106643]

### My Model: concat hidden with hidden

In [121]:
class CharLoopConcatModel(nn.Module):
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = torch.nn.Embedding(vocab_size, n_fac)
        self.input = torch.nn.Linear(n_fac, n_hidden)
        self.hidden = torch.nn.Linear(n_hidden*2, n_hidden)
        self.output = torch.nn.Linear(n_hidden, vocab_size)
        
    def forward(self, *cs):
        bs = cs[0].size(0)
        h = torch.autograd.Variable(torch.zeros(bs, n_hidden).cuda())
        for c in cs:
            e1 = self.e(c)
            in1 = F.relu(self.input(e1))
            cat = torch.cat([h, in1], dim=1)
            h = F.tanh(self.hidden(cat))
        o = self.output(h)
        return F.log_softmax(o, dim=-1)

In [122]:
m = CharLoopConcatModel(vocab_size, n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [123]:
it = iter(md.trn_dl)
*xs,yt = next(it)
t = m(*V(xs))

In [124]:
fit(m, md, 1, opt, F.nll_loss)

epoch      trn_loss   val_loss                              
    0      1.811987   1.796126  



[1.7961255]

In [125]:
set_lrs(opt, 1e-4)

In [126]:
fit(m, md, 1, opt, F.nll_loss)

epoch      trn_loss   val_loss                              
    0      1.730774   1.731683  



[1.7316834]

### Test model

In [127]:
def get_next(inp):
    idxs = T(np.array([char_indices[c] for c in inp]))
    p = m(*VV(idxs))
    i = np.argmax(to_np(p))
    return chars[i]

In [128]:
get_next('for thos')

'e'

In [129]:
get_next('part of ')

't'

In [130]:
get_next('queens a')

'n'

## RNN with pytorch

#### Andrew playing around with dimensions

In [321]:
teste = nn.Embedding(vocab_size, n_fac)

In [322]:
*testx, testy = next(iter(md.trn_dl))
testx1 = [c.cpu() for c in testx]

In [323]:
testy = torch.autograd.Variable(testy.cpu())

In [324]:
stack1 = torch.stack(testx1)

In [325]:
testinp = teste(torch.autograd.Variable((stack1)))
testinp.size()

torch.Size([8, 512, 42])

In [326]:
testrnn = nn.RNN(n_fac, n_hidden)

In [329]:
bs = testx1[0].size(0)
testh = torch.autograd.Variable(torch.zeros((1, bs, n_hidden)))

In [330]:
outp, h = testrnn(testinp, testh)

In [331]:
outp.size()

torch.Size([8, 512, 256])

In [332]:
h.size()

torch.Size([1, 512, 256])

In [333]:
testout = nn.Linear(n_hidden, vocab_size)

In [334]:
# res = F.log_softmax(testout(outp[-1]))
res = F.log_softmax(testout(outp))

In [335]:
print('Res size:', res.size())
print('Y size:', testy.size())

Res size: torch.Size([8, 512, 85])
Y size: torch.Size([512, 8])


In [336]:
F.cross_entropy(res, testy)

ValueError: Expected 2 or 4 dimensions (got 3)

### This is testing for output model. Y size should be torch.Size([512, 8])

In [290]:
F.nll_loss(res, testy)

Variable containing:
 4.4597
[torch.FloatTensor of size 1]

In [356]:
# sl,bs,nh = res.size() # sequence length (# of timestamps), batch_size, hidden state
sl,bs,vs = res.size() # sequence length (# of timestamps), batch_size, vocab_size
# I believe vc should actually be batch_size

In [357]:
flat_testy = testy.transpose(0,1).contiguous().view(-1); flat_testy.size()

torch.Size([4096])

In [358]:
flat_res = res.view(-1, vs); flat_res.size()

torch.Size([4096, 85])

In [359]:
# def stack_nll_loss(inp, targ):
F.nll_loss(flat_res, flat_testy)

Variable containing:
 2.0824
[torch.FloatTensor of size 1]

#### Actial rnn code

In [300]:
class CharRnn(nn.Module):
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.RNN(n_fac, n_hidden)
        self.out = nn.Linear(n_hidden, vocab_size)
        
    def forward(self, *cs):
        bs = cs[0].size(0)
        h = torch.autograd.Variable(torch.zeros((1, bs, n_hidden)).cuda())
        e1 = self.e(torch.stack(cs))
        outp, h = self.rnn(e1, h)
        return F.log_softmax(self.out(outp[-1]))
        

In [301]:
m = CharRnn(vocab_size, n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [302]:
it = iter(md.trn_dl)
*xs,yt = next(it)

In [303]:
t = m.e(V(torch.stack(xs)))
t.size()

torch.Size([8, 512, 42])

In [304]:
ht = V(torch.zeros(1, 512,n_hidden))
outp, hn = m.rnn(t, ht)
outp.size(), hn.size()

(torch.Size([8, 512, 256]), torch.Size([1, 512, 256]))

In [305]:
t = m(*V(xs)); t.size()

torch.Size([512, 85])

In [None]:
fit(m, md, 4, opt, F.nll_loss)

In [None]:
set_lrs(opt, 1e-4)

In [None]:
fit(m, md, 2, opt, F.nll_loss)

### Test model

In [None]:
def get_next(inp):
    idxs = T(np.array([char_indices[c] for c in inp]))
    p = m(*VV(idxs))
    i = np.argmax(to_np(p))
    return chars[i]

In [None]:
get_next('for thos')

In [None]:
def get_next_n(inp, n):
    res = inp
    for i in range(n):
        c = get_next(inp)
        res += c
        inp = inp[1:]+c
    return res

In [None]:
get_next_n('for thos', 40)

## Multi-output model

### Setup

Let's take non-overlapping sets of characters this time

In [311]:
cs=8

In [312]:
c_in_dat = [[idx[i+j] for i in range(cs)] for j in range(0, len(idx)-cs-1, cs)]

Then create the exact same thing, offset by 1, as our labels

In [313]:
c_out_dat = [[idx[i+j] for i in range(cs)] for j in range(1, len(idx)-cs, cs)]

In [314]:
xs = np.stack(c_in_dat)
xs.shape

(75111, 8)

In [315]:
ys = np.stack(c_out_dat)
ys.shape

(75111, 8)

In [316]:
xs[:cs,:cs]

array([[40, 42, 29, 30, 25, 27, 29,  1],
       [ 1,  1, 43, 45, 40, 40, 39, 43],
       [33, 38, 31,  2, 73, 61, 54, 73],
       [ 2, 44, 71, 74, 73, 61,  2, 62],
       [72,  2, 54,  2, 76, 68, 66, 54],
       [67,  9,  9, 76, 61, 54, 73,  2],
       [73, 61, 58, 67, 24,  2, 33, 72],
       [ 2, 73, 61, 58, 71, 58,  2, 67]])

In [317]:
ys[:cs,:cs]

array([[42, 29, 30, 25, 27, 29,  1,  1],
       [ 1, 43, 45, 40, 40, 39, 43, 33],
       [38, 31,  2, 73, 61, 54, 73,  2],
       [44, 71, 74, 73, 61,  2, 62, 72],
       [ 2, 54,  2, 76, 68, 66, 54, 67],
       [ 9,  9, 76, 61, 54, 73,  2, 73],
       [61, 58, 67, 24,  2, 33, 72,  2],
       [73, 61, 58, 71, 58,  2, 67, 68]])

### Create and train model

In [318]:
val_idx = get_cv_idxs(len(xs)-cs-1)

In [319]:
md = ColumnarModelData.from_arrays('.', val_idx, xs, ys, bs=512)

In [None]:
class CharSeqRnn(nn.Module):
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.RNN(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        
    def forward(self, *cs):

In [None]:
m = CharSeqRnn(vocab_size, n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [None]:
it = iter(md.trn_dl)
*xst,yt = next(it)

In [None]:
def nll_loss_seq(inp, targ):
    sl,bs,nh = inp.size()
    targ = targ.transpose(0,1).contiguous().view(-1)
    return F.nll_loss(inp.view(-1,nh), targ)

In [None]:
fit(m, md, 4, opt, nll_loss_seq)

In [None]:
set_lrs(opt, 1e-4)

In [None]:
fit(m, md, 1, opt, nll_loss_seq)

### Identity init!

In [None]:
m = CharSeqRnn(vocab_size, n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-2)

In [None]:
m.rnn.weight_hh_l0.data.copy_(torch.eye(n_hidden))

In [None]:
fit(m, md, 4, opt, nll_loss_seq)

In [None]:
set_lrs(opt, 1e-3)

In [None]:
fit(m, md, 4, opt, nll_loss_seq)

## Stateful model

### Setup

In [367]:
from torchtext import vocab, data

from fastai.nlp import *
from fastai.lm_rnn import *

PATH='data/nietzsche/'

TRN_PATH = 'trn/'
VAL_PATH = 'val/'
TRN = f'{PATH}{TRN_PATH}'
VAL = f'{PATH}{VAL_PATH}'

%ls {PATH}

nietzsche.txt  [0m[01;34mtrn[0m/  [01;34mval[0m/


wc -l nietzsche.txt
head -n 8000 nietzsche.txt > trn/nietzsche.txt
tail -n 1934 nietzsche.txt > val/nietzsche.txt

In [368]:
%ls {PATH}trn

nietzsche.txt


In [369]:
TEXT = data.Field(lower=True, tokenize=list)
bs=64; bptt=8; n_fac=42; n_hidden=256

FILES = dict(train=TRN_PATH, validation=VAL_PATH, test=VAL_PATH)
md = LanguageModelData.from_text_files(PATH, TEXT, **FILES, bs=bs, bptt=bptt, min_freq=3)

len(md.trn_dl), md.nt, len(md.trn_ds), len(md.trn_ds[0].text)

(947, 55, 1, 485751)

In [373]:
b1 = next(iter(md.trn_dl))

In [380]:
for b in b1:
    print(b.size())

torch.Size([10, 64])
torch.Size([640])


In [382]:
(len(md.trn_ds[0].text) / bs) / bptt

948.732421875

947 - number_tokens / batch_size / bptt

### RNN

In [370]:
class CharSeqStatefulRnn(nn.Module):
    def __init__(self, vocab_size, n_fac, bs):
        self.vocab_size = vocab_size
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.RNN(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.init_hidden(bs)
        
    def forward(self, cs):
        bs = cs[0].size(0)
        if self.h.size(1) != bs: self.init_hidden(bs)
        outp,h = self.rnn(self.e(cs), self.h)
        self.h = repackage_var(h)
        return F.log_softmax(self.l_out(outp), dim=-1).view(-1, self.vocab_size)
    
    def init_hidden(self, bs): self.h = V(torch.zeros(1, bs, n_hidden))

In [371]:
m = CharSeqStatefulRnn(md.nt, n_fac, 512).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [372]:
fit(m, md, 4, opt, F.nll_loss)

epoch      trn_loss   val_loss                               
    0      1.859614   1.853634  
    1      1.688205   1.70605                                
    2      1.602531   1.630715                               
    3      1.548255   1.600085                               



[1.6000848]

In [None]:
set_lrs(opt, 1e-4)

fit(m, md, 4, opt, F.nll_loss)

### RNN loop

In [None]:
# From the pytorch source

def RNNCell(input, hidden, w_ih, w_hh, b_ih, b_hh):
    return F.tanh(F.linear(input, w_ih, b_ih) + F.linear(hidden, w_hh, b_hh))

In [None]:
class CharSeqStatefulRnn2(nn.Module):
    def __init__(self, vocab_size, n_fac, bs):
        super().__init__()
        self.vocab_size = vocab_size
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.RNNCell(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.init_hidden(bs)
        
    def forward(self, cs):
        bs = cs[0].size(0)
        if self.h.size(1) != bs: self.init_hidden(bs)
        outp = []
        o = self.h
        for c in cs: 
            o = self.rnn(self.e(c), o)
            outp.append(o)
        outp = self.l_out(torch.stack(outp))
        self.h = repackage_var(o)
        return F.log_softmax(outp, dim=-1).view(-1, self.vocab_size)
    
    def init_hidden(self, bs): self.h = V(torch.zeros(1, bs, n_hidden))

In [None]:
m = CharSeqStatefulRnn2(md.nt, n_fac, 512).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [None]:
fit(m, md, 4, opt, F.nll_loss)

### GRU

In [None]:
class CharSeqStatefulGRU(nn.Module):
    def __init__(self, vocab_size, n_fac, bs):
        super().__init__()
        self.vocab_size = vocab_size
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.GRU(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.init_hidden(bs)
        
    def forward(self, cs):
        bs = cs[0].size(0)
        if self.h.size(1) != bs: self.init_hidden(bs)
        outp,h = self.rnn(self.e(cs), self.h)
        self.h = repackage_var(h)
        return F.log_softmax(self.l_out(outp), dim=-1).view(-1, self.vocab_size)
    
    def init_hidden(self, bs): self.h = V(torch.zeros(1, bs, n_hidden))

In [None]:
# From the pytorch source code - for reference

def GRUCell(input, hidden, w_ih, w_hh, b_ih, b_hh):
    gi = F.linear(input, w_ih, b_ih)
    gh = F.linear(hidden, w_hh, b_hh)
    i_r, i_i, i_n = gi.chunk(3, 1)
    h_r, h_i, h_n = gh.chunk(3, 1)

    resetgate = F.sigmoid(i_r + h_r)
    inputgate = F.sigmoid(i_i + h_i)
    newgate = F.tanh(i_n + resetgate * h_n)
    return newgate + inputgate * (hidden - newgate)

In [None]:
m = CharSeqStatefulGRU(md.nt, n_fac, 512).cuda()

opt = optim.Adam(m.parameters(), 1e-3)

In [None]:
fit(m, md, 6, opt, F.nll_loss)

In [None]:
set_lrs(opt, 1e-4)

In [None]:
fit(m, md, 3, opt, F.nll_loss)

### Putting it all together: LSTM

In [None]:
from fastai import sgdr

n_hidden=512

In [None]:
class CharSeqStatefulLSTM(nn.Module):
    def __init__(self, vocab_size, n_fac, bs, nl):
        super().__init__()
        self.vocab_size,self.nl = vocab_size,nl
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.LSTM(n_fac, n_hidden, nl, dropout=0.5)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.init_hidden(bs)
        
    def forward(self, cs):
        bs = cs[0].size(0)
        if self.h[0].size(1) != bs: self.init_hidden(bs)
        outp,h = self.rnn(self.e(cs), self.h)
        self.h = repackage_var(h)
        return F.log_softmax(self.l_out(outp), dim=-1).view(-1, self.vocab_size)
    
    def init_hidden(self, bs):
        self.h = (V(torch.zeros(self.nl, bs, n_hidden)),
                  V(torch.zeros(self.nl, bs, n_hidden)))

In [None]:
m = CharSeqStatefulLSTM(md.nt, n_fac, 512, 2).cuda()
lo = LayerOptimizer(optim.Adam, m, 1e-2, 1e-5)

In [None]:
os.makedirs(f'{PATH}models', exist_ok=True)

In [None]:
fit(m, md, 2, lo.opt, F.nll_loss)

In [None]:
on_end = lambda sched, cycle: save_model(m, f'{PATH}models/cyc_{cycle}')
cb = [CosAnneal(lo, len(md.trn_dl), cycle_mult=2, on_cycle_end=on_end)]
fit(m, md, 2**4-1, lo.opt, F.nll_loss, callbacks=cb)

In [None]:
on_end = lambda sched, cycle: save_model(m, f'{PATH}models/cyc_{cycle}')
cb = [CosAnneal(lo, len(md.trn_dl), cycle_mult=2, on_cycle_end=on_end)]
fit(m, md, 2**6-1, lo.opt, F.nll_loss, callbacks=cb)

### Test

In [None]:
def get_next(inp):
    idxs = TEXT.numericalize(inp)
    p = m(VV(idxs.transpose(0,1)))
    r = torch.multinomial(p[-1].exp(), 1)
    return TEXT.vocab.itos[to_np(r)[0]]

In [None]:
get_next('for thos')

In [None]:
def get_next_n(inp, n):
    res = inp
    for i in range(n):
        c = get_next(inp)
        res += c
        inp = inp[1:]+c
    return res

In [None]:
print(get_next_n('for thos', 400))