In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai.io import *
from fastai.conv_learner import *

from fastai.column_data import *

## Setup

We're going to download the collected works of Nietzsche to use as our data for this class.

In [2]:
# Directory prefix (relative to the notebook) under which the corpus file is stored.
PATH='data/'

In [3]:
# Download the corpus to PATH (get_data is presumably provided by the fastai star-imports).
get_data("https://s3.amazonaws.com/text-datasets/nietzsche.txt", f'{PATH}nietzsche.txt')
# Read the whole file inside a context manager so the handle is closed
# deterministically instead of being left for the garbage collector.
with open(f'{PATH}nietzsche.txt') as corpus_file:
    text = corpus_file.read()
print('corpus length:', len(text))

corpus length: 600893


In [4]:
# Distinct characters of the corpus, in sorted order.
chars = sorted(set(text))
# Count one extra slot for the null character that is inserted into `chars` further down.
vocab_size = 1 + len(chars)
print('total chars:', vocab_size)

total chars: 85


Sometimes it's useful to have a zero value in the dataset, e.g. for padding

In [5]:
# Reserve index 0 for a null character (useful e.g. for padding).
# Guarded so that re-running this cell does not insert a second "\0", which
# would shift every character index and leave vocab_size out of sync with len(chars).
if not chars or chars[0] != "\0":
    chars.insert(0, "\0")

In [6]:
''.join(chars[1:-6])

'\n !"\'(),-.0123456789:;=?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxy'

Map from chars to indices and back again

In [7]:
# Lookup tables over `chars`: character -> index, and index -> character.
char_indices = {ch: pos for pos, ch in enumerate(chars)}
indices_char = dict(enumerate(chars))

*idx* will be the data we use from now on - it simply converts all the characters to their index (based on the mapping above)

In [8]:
# Encode the entire corpus as a list of integer character ids.
idx = [char_indices[ch] for ch in text]

In [9]:
idx[:10]

[40, 42, 29, 30, 25, 27, 29, 1, 1, 1]

In [10]:
''.join(indices_char[i] for i in idx[:70])

'PREFACE\n\n\nSUPPOSING that Truth is a woman--what then? Is there not gro'

## Three char model

### Create inputs

Create a list of every 3rd character (the step is `cs=3`), starting at the 0th, 1st, 2nd, then 3rd characters

In [11]:
# Context size: three characters are used to predict the fourth.
cs = 3
# Walk the text in steps of `cs`. For each window start, offset 0..2 are the three
# input characters and offset 3 is the character to predict.
c1_dat, c2_dat, c3_dat, c4_dat = [
    [idx[start + offset] for start in range(0, len(idx) - 1 - cs, cs)]
    for offset in range(cs + 1)
]

Our inputs

In [12]:
# Input arrays: the first, second and third character of every window
# (the last two windows are dropped).
x1, x2, x3 = (np.stack(column[:-2]) for column in (c1_dat, c2_dat, c3_dat))

Our output

In [13]:
# Target array: the character that follows each three-character input
# (trimmed by the same two trailing windows as the inputs).
y = np.stack(c4_dat[:-2])

The first 4 inputs and outputs

In [14]:
x1[:4], x2[:4], x3[:4]

(array([40, 30, 29,  1]), array([42, 25,  1, 43]), array([29, 27,  1, 45]))

In [15]:
y[:4]

array([30, 29,  1, 40])

In [16]:
x1.shape, y.shape

((200295,), (200295,))

The number of latent factors to create (i.e. the size of the embedding matrix)

In [17]:
# Width of each character embedding; passed as the second argument of nn.Embedding in the models below.
n_fac = 42

Create inputs and embedding outputs for each of our 3 character inputs

### Create and train model

Pick a size for our hidden state

In [18]:
# Number of hidden units; the model classes below read this as a global.
n_hidden = 256

In [19]:
class Char3Model(nn.Module):
    """Predict the next character from the previous three, with the recurrence unrolled by hand.

    The three inputs share one embedding and one input layer, and the same
    hidden-to-hidden layer is applied at every step. The hidden width is read
    from the global ``n_hidden``.

    Args:
        vocab_size: number of distinct character ids (embedding rows and output classes).
        n_fac: size of each character embedding.
    """

    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)

        # The 'green arrow' from our diagram - the layer operation from input to hidden
        self.l_in = nn.Linear(n_fac, n_hidden)

        # The 'orange arrow' from our diagram - the layer operation from hidden to hidden
        self.l_hidden = nn.Linear(n_hidden, n_hidden)

        # The 'blue arrow' from our diagram - the layer operation from hidden to output
        self.l_out = nn.Linear(n_hidden, vocab_size)

    def forward(self, c1, c2, c3):
        """Return log-probabilities over the vocabulary for the character following c1, c2, c3."""
        in1 = F.relu(self.l_in(self.e(c1)))
        in2 = F.relu(self.l_in(self.e(c2)))
        in3 = F.relu(self.l_in(self.e(c3)))

        # Zero initial hidden state. No explicit .cuda() here: device placement is
        # left to V(), exactly as CharRnn in this notebook already does, so the
        # model is not tied to a GPU being present.
        h = V(torch.zeros(in1.size()))
        h = torch.tanh(self.l_hidden(h+in1))
        h = torch.tanh(self.l_hidden(h+in2))
        h = torch.tanh(self.l_hidden(h+in3))

        # Normalise over the class (last) axis explicitly rather than relying on
        # log_softmax's deprecated implicit dimension choice.
        return F.log_softmax(self.l_out(h), dim=-1)

In [20]:
# Model data built from the three input columns (stacked into shape (n, 3)) and the target y.
# NOTE(review): the second argument looks like the validation row indices; [-1] would leave a
# one-row validation set, so the validation loss reported when training this model is
# presumably not meaningful — confirm against ColumnarModelData.from_arrays.
md = ColumnarModelData.from_arrays('.', [-1], np.stack([x1,x2,x3], axis=1), y, bs=512)

In [21]:
# Instantiate the three-character model and move its parameters to the GPU.
m = Char3Model(vocab_size, n_fac).cuda()

In [22]:
# Smoke test: pull one mini-batch from the training loader and run a forward pass.
it = iter(md.trn_dl)
# Presumably the batch is (input columns..., target): the last item goes to yt, the rest to xs.
*xs,yt = next(it)
t = m(*V(xs))

In [191]:
# Adam over all model parameters with a learning rate of 1e-2.
opt = optim.Adam(m.parameters(), 1e-2)

In [192]:
fit(m, md, 1, opt, F.nll_loss)

A Jupyter Widget

[ 0.       2.09627  6.52849]                                 



In [193]:
set_lrs(opt, 0.001)

In [194]:
fit(m, md, 1, opt, F.nll_loss)

A Jupyter Widget

[ 0.       1.84525  6.52312]                                 



### Test model

In [195]:
def get_next(inp):
    """Return the single most likely next character after the string `inp`.

    Each character of `inp` is mapped to its index and passed to the current
    global model `m` as a separate positional argument; the arg-max of the
    model output is mapped back to a character via `chars`.
    """
    encoded = np.array([char_indices[ch] for ch in inp])
    log_probs = m(*VV(T(encoded)))
    best = np.argmax(to_np(log_probs))
    return chars[best]

In [196]:
get_next('y. ')

'T'

In [197]:
get_next('ppl')

'e'

In [198]:
get_next(' th')

'e'

In [199]:
get_next('and')

' '

## Our first RNN!

### Create inputs

This is the size of our unrolled RNN.

In [23]:
# Sequence length of the unrolled RNN: 8 input characters are used to predict the next one.
cs=8

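For each of 0 through 7, create a list of every 8th character with that starting point. These will be the 8 inputs to our model. (The cell after this one then rebuilds `c_in_dat` with one overlapping 8-character sequence per starting position, and that is the version used below.)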

In [24]:
# Non-overlapping variant: builds cs lists, where list n holds the n-th character of
# every consecutive cs-character chunk of the text.
# NOTE(review): the next cell rebinds c_in_dat, so this result is never used downstream.
c_in_dat = [[idx[i+n] for i in range(0, len(idx)-1-cs, cs)]
            for n in range(cs)]

In [25]:
# One input sequence per starting position: sequence j is the cs characters
# idx[j], ..., idx[j+cs-1], so consecutive sequences overlap by cs-1 characters.
c_in_dat = [idx[start:start + cs] for start in range(len(idx) - cs - 1)]

Then create a list of the next character in each of these series. This will be the labels for our model.

In [26]:
# Labels for the non-overlapping variant: the character right after each cs-character chunk.
# NOTE(review): the next cell rebinds c_out_dat, so this result is never used downstream.
c_out_dat = [idx[i+cs] for i in range(0, len(idx)-1-cs, cs)]

In [27]:
# Label for sequence j is the character immediately after it, idx[j+cs];
# over all j that is the contiguous run idx[cs], ..., idx[len(idx)-2].
c_out_dat = idx[cs:len(idx) - 1]

In [28]:
# Stack the sequences along axis 1, giving shape (cs, n_sequences):
# row n holds the n-th character of every sequence.
xs = np.stack(c_in_dat, axis=1)

In [29]:
xs.shape

(8, 600884)

In [30]:
# Labels as a 1-D array, one entry per sequence (aligned with the columns of xs).
y = np.stack(c_out_dat)

So each column below is one series of 8 characters from the text.

In [31]:
[xs[n][:cs] for n in range(cs)]

[array([40, 42, 29, 30, 25, 27, 29,  1]),
 array([42, 29, 30, 25, 27, 29,  1,  1]),
 array([29, 30, 25, 27, 29,  1,  1,  1]),
 array([30, 25, 27, 29,  1,  1,  1, 43]),
 array([25, 27, 29,  1,  1,  1, 43, 45]),
 array([27, 29,  1,  1,  1, 43, 45, 40]),
 array([29,  1,  1,  1, 43, 45, 40, 40]),
 array([ 1,  1,  1, 43, 45, 40, 40, 39])]

...and this is the next character after each sequence.

In [32]:
y[:cs]

array([ 1,  1, 43, 45, 40, 40, 39, 43])

### Create and train model

In [33]:
# Validation row indices chosen among the len(idx)-cs-1 sequences.
# NOTE(review): get_cv_idxs is a fastai helper; its split fraction and seed are not visible here — confirm.
val_idx = get_cv_idxs(len(idx)-cs-1)

In [34]:
# np.stack(xs, axis=1) turns the (cs, n) array back into (n, cs): one row per
# sequence and one column per character position, with val_idx rows held out.
md = ColumnarModelData.from_arrays('.', val_idx, np.stack(xs, axis=1), y, bs=512)

In [35]:
class CharLoopModel(nn.Module):
    """Hand-written RNN: loop over any number of input characters, adding each to the hidden state.

    The hidden width is read from the global ``n_hidden``.

    Args:
        vocab_size: number of distinct character ids (embedding rows and output classes).
        n_fac: size of each character embedding.
    """

    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.l_in = nn.Linear(n_fac, n_hidden)
        self.l_hidden = nn.Linear(n_hidden, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)

    def forward(self, *cs):
        """Return log-probabilities for the character following the sequence `cs`.

        Each element of `cs` is one time step: a batch of character ids.
        """
        bs = cs[0].size(0)
        # Zero initial hidden state. No explicit .cuda(): device placement is left
        # to V(), as CharRnn in this notebook already does, so a GPU is not required.
        h = V(torch.zeros(bs, n_hidden))
        for c in cs:
            inp = F.relu(self.l_in(self.e(c)))
            # Input and previous hidden state are combined by addition.
            h = torch.tanh(self.l_hidden(h+inp))

        # Explicit class axis instead of log_softmax's deprecated implicit dimension.
        return F.log_softmax(self.l_out(h), dim=-1)

In [36]:
# Fresh loop model on the GPU, with an Adam optimizer (learning rate 1e-2) over its parameters.
m = CharLoopModel(vocab_size, n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-2)

In [284]:
fit(m, md, 1, opt, F.nll_loss)

A Jupyter Widget

[ 0.       2.00491  1.97759]                                 



In [285]:
set_lrs(opt, 0.001)

In [286]:
fit(m, md, 1, opt, F.nll_loss)

A Jupyter Widget

[ 0.       1.71448  1.71059]                                 



In [37]:
class CharLoopConcatModel(nn.Module):
    """Hand-written RNN that concatenates the hidden state with each input embedding.

    Unlike the additive loop model, the input layer here takes
    ``n_fac + n_hidden`` features. The hidden width is read from the global
    ``n_hidden``.

    Args:
        vocab_size: number of distinct character ids (embedding rows and output classes).
        n_fac: size of each character embedding.
    """

    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.l_in = nn.Linear(n_fac+n_hidden, n_hidden)
        self.l_hidden = nn.Linear(n_hidden, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)

    def forward(self, *cs):
        """Return log-probabilities for the character following the sequence `cs`.

        Each element of `cs` is one time step: a batch of character ids.
        """
        bs = cs[0].size(0)
        # Zero initial hidden state. No explicit .cuda(): device placement is left
        # to V(), as CharRnn in this notebook already does, so a GPU is not required.
        h = V(torch.zeros(bs, n_hidden))
        for c in cs:
            # Join previous hidden state and current embedding along the feature axis.
            inp = torch.cat((h, self.e(c)), 1)
            inp = F.relu(self.l_in(inp))
            h = torch.tanh(self.l_hidden(inp))

        # Explicit class axis instead of log_softmax's deprecated implicit dimension.
        return F.log_softmax(self.l_out(h), dim=-1)

In [38]:
# Fresh concat model on the GPU, with an Adam optimizer (learning rate 1e-3) over its parameters.
m = CharLoopConcatModel(vocab_size, n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [39]:
# Smoke test: pull one mini-batch from the training loader and run a forward pass.
it = iter(md.trn_dl)
# NOTE(review): this rebinds `xs`, which until now held the (cs, n) dataset array;
# re-running the earlier cell that builds `md` from `xs` after this point would use the batch instead.
*xs,yt = next(it)
t = m(*V(xs))

In [40]:
fit(m, md, 1, opt, F.nll_loss)

A Jupyter Widget

[ 0.       1.83095  1.80892]                                 



In [44]:
set_lrs(opt, 1e-4)

In [45]:
fit(m, md, 1, opt, F.nll_loss)

A Jupyter Widget

[ 0.       1.55442  1.56399]                                 



### Test model

In [46]:
def get_next(inp):
    """Return the most likely next character after the string `inp`, using the current global model `m`.

    NOTE(review): behaves the same as the earlier get_next definition in this notebook;
    `m` is looked up at call time, so redefining the function here is redundant.
    """
    # Character ids of the prompt, one model argument per character.
    idxs = T(np.array([char_indices[c] for c in inp]))
    p = m(*VV(idxs))
    # Index of the highest-scoring class, mapped back to its character.
    i = np.argmax(to_np(p))
    return chars[i]

In [47]:
get_next('for thos')

'e'

In [48]:
get_next('part of ')

't'

In [49]:
get_next('queens a')

'n'

## RNN with pytorch

In [95]:
class CharRnn(nn.Module):
    """Same next-character model, with the hand-written loop replaced by ``nn.RNN``.

    The hidden width is read from the global ``n_hidden``.

    Args:
        vocab_size: number of distinct character ids (embedding rows and output classes).
        n_fac: size of each character embedding.
    """

    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.RNN(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)

    def forward(self, *cs):
        """Return log-probabilities for the character following the sequence `cs`.

        Each element of `cs` is one time step: a batch of character ids.
        """
        bs = cs[0].size(0)
        # Zero initial hidden state, with a leading axis of size 1 as nn.RNN expects.
        h = V(torch.zeros(1, bs, n_hidden))
        # Stack the time steps on a new leading axis before embedding.
        inp = self.e(torch.stack(cs))
        # Only the per-step outputs are needed; the final hidden state is discarded.
        outp, _ = self.rnn(inp, h)

        # Classify from the last time step, normalising over the class (last) axis
        # explicitly instead of relying on log_softmax's deprecated implicit dimension.
        return F.log_softmax(self.l_out(outp[-1]), dim=-1)

In [104]:
# Fresh nn.RNN-based model on the GPU, with an Adam optimizer (learning rate 1e-3) over its parameters.
m = CharRnn(vocab_size, n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [83]:
# Pull one mini-batch to inspect intermediate tensor shapes in the cells that follow.
it = iter(md.trn_dl)
# NOTE(review): rebinds `xs` (previously the dataset array) to the batch's input columns.
*xs,yt = next(it)

In [84]:
# Embed the stacked batch by hand to check the shape that will be fed to the RNN.
t = m.e(V(torch.stack(xs)))
t.size()

torch.Size([8, 512, 42])

In [85]:
# Run the RNN layer by hand with a zero initial state; 512 is the batch size (bs) used for md.
ht = V(torch.zeros(1, 512,n_hidden))
outp, hn = m.rnn(t, ht)
outp.size(), hn.size()

(torch.Size([8, 512, 256]), torch.Size([1, 512, 256]))

In [91]:
# Full forward pass on the batch; the result has one row per sample and one column per vocabulary entry.
t = m(*V(xs)); t.size()

torch.Size([512, 85])

In [105]:
fit(m, md, 4, opt, F.nll_loss)

A Jupyter Widget

[ 0.       1.87536  1.86075]                                 
[ 1.      1.6791  1.6703]                                    
[ 2.       1.59159  1.59625]                                 
[ 3.       1.53684  1.54857]                                 



In [106]:
set_lrs(opt, 1e-4)

In [107]:
fit(m, md, 2, opt, F.nll_loss)

A Jupyter Widget

[ 0.       1.47029  1.50936]                                 
[ 1.       1.45206  1.50379]                                 



### Test model

In [108]:
def get_next(inp):
    """Return the most likely next character after the string `inp`, using the current global model `m`.

    NOTE(review): behaves the same as the earlier get_next definitions in this notebook;
    `m` is looked up at call time, so redefining the function here is redundant.
    """
    # Character ids of the prompt, one model argument per character.
    idxs = T(np.array([char_indices[c] for c in inp]))
    p = m(*VV(idxs))
    # Index of the highest-scoring class, mapped back to its character.
    i = np.argmax(to_np(p))
    return chars[i]

In [109]:
get_next('for thos')

'e'

In [113]:
def get_next_n(inp, n):
    """Extend `inp` by `n` characters, predicting one character at a time.

    After each prediction the context window slides forward by one character
    (oldest character dropped, prediction appended), so its length stays fixed.
    Returns the original prompt followed by the generated characters.
    """
    window = inp
    pieces = [inp]
    for _ in range(n):
        predicted = get_next(window)
        pieces.append(predicted)
        window = window[1:] + predicted
    return ''.join(pieces)

In [118]:
get_next_n('for thos', 40)

'for those the same the strength the strength the'