In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai.io import *
from fastai.conv_learner import *

from fastai.column_data import *
import numpy as np

In [2]:
PATH = "data/"

In [3]:
text = open(f'{PATH}folketinget/20091.txt').read()

Truncate the text when just testing stuff.

In [4]:
text = text[:1000000]

In [5]:
print(len(text))

1000000


In [6]:
text[:400]

'Mødet er åbnet.I dag er der følgende anmeldelser: Thor Pedersen (V), Mogens Lykketoft (S), Søren Espersen (DF), Holger K. Nielsen (SF) og Helge Adam Møller (KF):Lovforslag nr. L 53 (Forslag til lov om ændring af valg til Folketinget. (Ændring af reglerne om boliger til folketingsmedlemmer m.v.)).Ministeren for sundhed og forebyggelse (Jakob Axel Nielsen): Lovforslag nr. L 54 (Forslag til lov om æn'

In [49]:
chars = sorted(list(set(text)))

In [50]:
chars.insert(0, "\0")

In [51]:
''.join(chars)

"\x00\t\n !'(),-./0123456789:;?ABCDEFGHIJKLMNOPRSTUVWY[]abcdefghijklmnopqrstuvwxyz§«²»½ÅÆØåæéø–’…"

In [52]:
vocab_size = len(chars)
vocab_size

91

Mapping between chars and indices, and reverse.

In [55]:
char_indices = { c: i for i, c in enumerate(chars)}

In [56]:
indices_char = { i: c for i, c in enumerate(chars)}

In [57]:
char_indices['A']

25

In [58]:
[char_indices[c] for c in text[:10]]

[37, 87, 53, 54, 69, 3, 54, 67, 3, 84]

In [59]:
[indices_char[i] for i in [30, 31, 32]]

['F', 'G', 'H']

Let's use indices as data set.

In [60]:
idx = [ char_indices[c] for c in text]

In [61]:
idx[:10]

[37, 87, 53, 54, 69, 3, 54, 67, 3, 84]

In [62]:
[indices_char[i] for i in idx[:14]]

['M', 'ø', 'd', 'e', 't', ' ', 'e', 'r', ' ', 'å', 'b', 'n', 'e', 't']

In [63]:
cs = 3
c1_dat = [idx[i    ] for i in range(0, len(idx) - cs, cs)]
c2_dat = [idx[i + 1] for i in range(0, len(idx) - cs, cs)]
c3_dat = [idx[i + 2] for i in range(0, len(idx) - cs, cs)]
c4_dat = [idx[i + 3] for i in range(0, len(idx) - cs, cs)]

The first characters are:

In [64]:
[indices_char[c] for c in c1_dat[:4]]

['M', 'e', 'e', 'å']

In [65]:
x1 = np.stack(c1_dat)
x2 = np.stack(c2_dat)
x3 = np.stack(c3_dat)

In [66]:
y = np.stack(c4_dat)

In [67]:
x1[:4], x2[:4], x3[:4]

(array([37, 54, 54, 84]), array([87, 69, 67, 51]), array([53,  3,  3, 63]))

In [68]:
x1.shape, y.shape

((333333,), (333333,))

# Create and train model

In [25]:
n_hidden = 256

In [26]:
n_fac = 42

In [27]:
class Char3Model(nn.Module):
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.l_in = nn.Linear(n_fac, n_hidden)
        self.l_hidden = nn.Linear(n_hidden, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        
    def forward(self, c1, c2, c3):
        in1 = F.relu(self.l_in(self.e(c1)))
        in2 = F.relu(self.l_in(self.e(c2)))
        in3 = F.relu(self.l_in(self.e(c3)))
        
        h = V(torch.zeros(in1.size())).cuda()
        h = F.tanh(self.l_hidden(h + in1))
        h = F.tanh(self.l_hidden(h + in2))
        h = F.tanh(self.l_hidden(h + in3))
        
        return F.log_softmax(self.l_out(h))

In [28]:
Char3Model(vocab_size, n_fac)

Char3Model(
  (e): Embedding(91, 42)
  (l_in): Linear(in_features=42, out_features=256, bias=True)
  (l_hidden): Linear(in_features=256, out_features=256, bias=True)
  (l_out): Linear(in_features=256, out_features=91, bias=True)
)

In [29]:
md = ColumnarModelData.from_arrays('.', [-1], np.stack([x1, x2, x3], axis=1), y, bs=512)

In [30]:
m = Char3Model(vocab_size, n_fac).cuda()

In [31]:
it = iter(md.trn_dl)
*xs, yt = next(it)
len(xs)

3

In [32]:
yt


 54
 68
 61
 54
 58
 10
 69
 55
 71
 53
 68
 69
 54
 69
 61
 58
 68
 63
 54
 68
 54
  3
 69
 59
 54
 65
 56
 67
 60
 61
 63
 54
 10
 67
 62
 68
  3
 85
  3
 55
 54
  3
 54
 63
 54
 60
 63
 85
 67
 63
 53
  3
 68
 56
 69
 62
 57
 53
 63
 64
 85
 63
 58
 50
 54
  3
 61
 70
  3
  3
 61
 54
  8
  3
 54
 63
 63
 71
 63
 56
 53
 64
 50
 53
 55
 58
 58
 54
 63
 56
 69
 63
 67
 55
  3
  3
 70
  3
  3
 67
 28
 58
 74
 53
 54
 10
  3
  3
 55
 54
 53
  3
 50
  8
 50
  3
  3
  3
 63
 61
 54
 50
 58
 54
 87
 59
 56
 61
 54
 53
 63
 50
 50
 54
  3
  3
 54
 69
 58
 84
 69
 87
 51
 58
 28
  3
  3
 60
 54
 63
 63
 68
 53
 69
 64
 62
 61
 56
 69
 56
 54
  3
 71
 54
 50
 55
 54
 87
 59
 50
 84
 61
 53
 54
  3
  3
  3
 57
 62
 68
  3
 67
 54
  3
  3
 58
 67
 64
 60
 50
  3
 69
 62
 61
 68
  3
 10
 67
 55
 10
 58
 58
  3
 54
  3
 28
 54
 71
  3
 50
 50
 63
 51
 58
 71
 58
 58
 55
 53
  3
  3
  3
 71
 58
 54
 50
 64
 54
  3
 64
  3
  3
 87
  3
 55
  3
 69
  3
  3
  3
 55
 61
 56
 85
 68
 54
 58
 50
 53
 58

In [33]:
t = m(*V(xs))

In [34]:
opt = optim.Adam(m.parameters(), 1e-2)

In [35]:
fit(m, md, 1, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                               
    0      1.843705   0.425638  



[array([0.42564])]

In [36]:
set_lrs(opt, 1e-3)

In [37]:
fit(m, md, 1, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                               
    0      1.615591   0.222878  



[array([0.22288])]

In [38]:
def get_next(inp):
    idxs = T(np.array([char_indices[c] for c in inp]))
    p = m(*VV(idxs))
    idx = np.argmax(to_np(p))
    return indices_char[idx]

In [39]:
get_next('Fol')

'k'

In [40]:
get_next('Soc')

'i'

In [41]:
get_next('Min')

'i'

In [42]:
def get_more(inp, length):
    next = inp
    while len(next) < length:
        next += get_next(next[-3:])
    return next     
        

In [72]:
get_more('skat', 50)

'skatte er det er det er det er det er det er det e'

In [100]:
class Char3Model2(nn.Module):
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.l_in = nn.Linear(n_fac, n_hidden)
        self.l_hidden = nn.Linear(n_hidden, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.h = V(torch.zeros(n_hidden)).cuda()
        
    def forward(self, c1, c2, c3):
        in1 = F.relu(self.l_in(self.e(c1)))
        in2 = F.relu(self.l_in(self.e(c2)))
        in3 = F.relu(self.l_in(self.e(c3)))
        
        self.h = F.tanh(self.l_hidden(self.h + in1))
        self.h = F.tanh(self.l_hidden(self.h + in2))
        self.h = F.tanh(self.l_hidden(self.h + in3))
        
        return F.log_softmax(self.l_out(self.h))
        

In [106]:
m = Char3Model2(vocab_size, n_fac).cuda()

In [107]:
fit(m, md, 1, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

  0%|          | 0/652 [00:00<?, ?it/s]


RuntimeError: Trying to backward through the graph a second time, but the buffers have already been freed. Specify retain_graph=True when calling backward the first time.