In [1]:
# Quietly install/upgrade fastbook, then run its notebook setup helper.
!pip install -Uqq fastbook
import fastbook
fastbook.setup_book()

[K     |████████████████████████████████| 727kB 2.7MB/s 
[K     |████████████████████████████████| 51kB 5.5MB/s 
[K     |████████████████████████████████| 1.1MB 14.6MB/s 
[K     |████████████████████████████████| 51kB 5.6MB/s 
[K     |████████████████████████████████| 194kB 18.4MB/s 
[K     |████████████████████████████████| 51kB 6.2MB/s 
[K     |████████████████████████████████| 92kB 7.3MB/s 
[K     |████████████████████████████████| 40kB 4.5MB/s 
[K     |████████████████████████████████| 51kB 5.6MB/s 
[K     |████████████████████████████████| 61kB 6.3MB/s 
[K     |████████████████████████████████| 2.6MB 16.8MB/s 
[?25hMounted at /content/gdrive


In [2]:
from fastbook import *


## Training a language model from scratch

Getting the data to quickly prototype the models

In [3]:
# Human Numbers dataset: fetch it and keep the path of the local copy in `path`.

from fastai.text.all import *
path = untar_data(URLs.HUMAN_NUMBERS)

In [4]:
# Use the dataset folder as the base for displaying paths, so listings show short relative names.
Path.BASE_PATH = path

In [5]:
path.ls()

(#2) [Path('train.txt'),Path('valid.txt')]

In [6]:
# Collect every line of the training file, then of the validation file, into one list.
lines = L()
for fname in ('train.txt', 'valid.txt'):
  with open(path/fname) as f:
    lines += L(*f.readlines())
lines

(#9998) ['one \n','two \n','three \n','four \n','five \n','six \n','seven \n','eight \n','nine \n','ten \n'...]

In [7]:
# Strip each line and glue everything into one stream, with ' . ' separating consecutive numbers.
text = ' . '.join(map(str.strip, lines))
text[:100]

'one . two . three . four . five . six . seven . eight . nine . ten . eleven . twelve . thirteen . fo'

In [8]:
# Tokenize: split the text on single spaces.

tokens = text.split(' ')
tokens[:10]

['one', '.', 'two', '.', 'three', '.', 'four', '.', 'five', '.']

In [9]:
# Numericalize, step 1: build the vocabulary.

# The vocab is the list of unique tokens.
vocab = L(*tokens).unique()


In [10]:
vocab

(#30) ['one','.','two','three','four','five','six','seven','eight','nine'...]

In [11]:
# Numericalize, step 2: map each token to its position in the vocab and encode the token stream.
word2idx = {word: idx for idx, word in enumerate(vocab)}
nums = L(word2idx[tok] for tok in tokens)
nums

(#63095) [0,1,2,1,3,1,4,1,5,1...]

## First language Model from scratch

We are going to predict each word based on the previous three words.

Let's attempt it with plain Python.

In [12]:
# Each sample pairs three consecutive tokens (input) with the token that follows them (target);
# the window start advances three tokens at a time.
L((tokens[i:i+3], tokens[i+3]) for i in range(0, len(tokens)-4, 3))

(#21031) [(['one', '.', 'two'], '.'),(['.', 'three', '.'], 'four'),(['four', '.', 'five'], '.'),(['.', 'six', '.'], 'seven'),(['seven', '.', 'eight'], '.'),(['.', 'nine', '.'], 'ten'),(['ten', '.', 'eleven'], '.'),(['.', 'twelve', '.'], 'thirteen'),(['thirteen', '.', 'fourteen'], '.'),(['.', 'fifteen', '.'], 'sixteen')...]

Now let's do it with tensors of numericalised values, which is what the model will actually use.

In [13]:
# Each sample pairs a tensor of three consecutive token ids with the id of the token that follows,
# stepping three ids at a time.
seqs = L((tensor(nums[i:i+3]), nums[i+3]) for i in range(0, len(nums) - 4,3))
seqs

(#21031) [(tensor([0, 1, 2]), 1),(tensor([1, 3, 1]), 4),(tensor([4, 1, 5]), 1),(tensor([1, 6, 1]), 7),(tensor([7, 1, 8]), 1),(tensor([1, 9, 1]), 10),(tensor([10,  1, 11]), 1),(tensor([ 1, 12,  1]), 13),(tensor([13,  1, 14]), 1),(tensor([ 1, 15,  1]), 16)...]

In [14]:
# Batch the samples with the DataLoaders class: the first 80% is the training set, the rest is validation.

bs = 64
cut = int(len(seqs)*0.8)
# Pass the `bs` variable instead of repeating the literal 64, so the loaders always agree
# with the batch size that later cells (and the stateful models) read from `bs`.
dls = DataLoaders.from_dsets(seqs[:cut], seqs[cut:], bs=bs, shuffle=False)

In [15]:
# Iterating over a DataLoaders object yields the individual loaders it holds.
for d in dls : print(d)

<fastai.data.core.TfmdDL object at 0x7ffb552d1240>
<fastai.data.core.TfmdDL object at 0x7ffb552d1550>


Now we can create a neural network that takes three words as input and returns a prediction of the probability of each possible next word in the vocab.

We will use three standard linear layers, with some tweaks:

The first linear layer will use only the first word's embedding as activations.

The second will use the second word's embedding plus the first layer's output.

The third will use the third word's embedding plus the second layer's output.

Each layer uses the same weight matrix: the activations change as data moves through the layers, but the layer weights themselves do not change from layer to layer.

Since the layer weights do not change, it is more like the same layer repeated.

## Language model in Pytorch

In [16]:
class LMModel1(Module):
  "Predict the next word from three input words by applying one shared hidden layer three times."
  def __init__(self, vocab_sz, n_hidden):
    self.i_h = nn.Embedding(vocab_sz, n_hidden)  # input -> hidden
    self.h_h = nn.Linear(n_hidden, n_hidden)     # hidden -> hidden
    self.h_o = nn.Linear(n_hidden, vocab_sz)     # hidden -> output

  def forward(self, x):
    # Look up the embedding of each of the three input words (columns 0, 1 and 2 of the batch).
    emb1, emb2, emb3 = self.i_h(x[:,0]), self.i_h(x[:,1]), self.i_h(x[:,2])
    # Each step adds the next word's embedding to the running activations and re-applies h_h.
    hidden = F.relu(self.h_h(emb1))
    hidden = F.relu(self.h_h(hidden + emb2))
    hidden = F.relu(self.h_h(hidden + emb3))
    return self.h_o(hidden)

In [17]:
# Train LMModel1 (64 hidden units) for 4 epochs with fit_one_cycle at learning rate 1e-3, tracking accuracy.
learn = Learner(dls, LMModel1(len(vocab), 64), loss_func=F.cross_entropy, metrics=accuracy)
learn.fit_one_cycle(4, 1e-3)

epoch,train_loss,valid_loss,accuracy,time
0,1.824297,1.970941,0.467554,00:02
1,1.386973,1.823242,0.467554,00:02
2,1.417556,1.654497,0.494414,00:02
3,1.37644,1.650849,0.494414,00:02


To check whether this model is any good, let's find the most common target token in the validation set.

In [18]:
# Baseline: how often does the single most common target token occur in the validation set?
n, counts = 0, torch.zeros(len(vocab))  # n: targets seen so far; counts[i]: occurrences of vocab[i]
for x, y in dls.valid:
  n += y.shape[0]
  for i in range_of(vocab): counts[i] += (y==i).long().sum()
idx = torch.argmax(counts)
# Shows the index of the most frequent target, the token itself, and its share of all targets.
idx, vocab[idx.item()], counts[idx].item()/n

# One might have expected '.', but 'thousand' turns out to be the most frequent target.

# Always predicting that token is the baseline to beat. Next, let's refactor the model with a loop.

(tensor(29), 'thousand', 0.15165200855716662)

In [19]:
# The duplicated, hand-written steps can be replaced by a for loop.

class LMModel2(Module):
  "Three-step recurrent network written as a loop over the input positions."
  def __init__(self, vocab_sz, n_hidden):
    self.i_h = nn.Embedding(vocab_sz, n_hidden)  # input -> hidden
    self.h_h = nn.Linear(n_hidden, n_hidden)     # hidden -> hidden
    self.h_o = nn.Linear(n_hidden, vocab_sz)     # hidden -> output

  def forward(self, x):
    hidden = 0  # the hidden state starts from zero on every call
    for step in range(3):
      # Add the embedding of the current word, then apply the shared hidden layer.
      hidden = F.relu(self.h_h(hidden + self.i_h(x[:,step])))
    return self.h_o(hidden)

In [20]:
# Train LMModel2 with the same settings as the hand-unrolled model: 4 epochs, learning rate 1e-3.
learn = Learner(dls, LMModel2(len(vocab), 64), loss_func = F.cross_entropy, metrics=accuracy)
learn.fit_one_cycle(4, 1e-3)

epoch,train_loss,valid_loss,accuracy,time
0,1.816274,1.964143,0.460185,00:02
1,1.423805,1.739964,0.473259,00:02
2,1.430327,1.685172,0.485382,00:02
3,1.38839,1.657033,0.470406,00:02


We see that there is a set of activations being updated each time through the loop, stored in the variable `h`. This is called the hidden state.

A neural network that is defined using a loop like this is called a recurrent neural network (RNN).

## Improving the RNN

Looking at the code for our RNN, one thing that seems problematic is that we are initialising the hidden state to zero for every new input sequence.

But if we order the samples correctly, those sample sequences will be read in order by the model, exposing the model to long stretches of the original sequence.

Another thing to look at is having more signal: why just predict the fourth word when we can also predict the second and third words?

## Maintaining the state of RNN

We can stop re-initialising the hidden state to zero on every forward pass: we initialise it once in `__init__` and add a separate `reset` method to zero it again.

This creates another problem: our neural network will then be as deep as the document.

For example, if there were 10,000 tokens in our dataset, we would be creating a 10,000-layer neural network.

This is because, if the hidden state is never reset, every pass through the loop stacks on top of all the previous ones.

The problem with 10,000 layers is that when you get to the 10,000th word, you still have to calculate the derivatives all the way back to the first layer.

It is unlikely that you will be able to store even one mini-batch on the GPU.

The solution to this is to tell PyTorch that we do not want to backpropagate through the entire network; instead, we keep just the last three layers of gradients.

we use `detach` to achieve this


In [21]:
class LMModel3(Module):
  "Recurrent model that keeps its hidden state between calls and truncates the gradient history after each one."
  def __init__(self, vocab_sz, n_hidden):
    self.i_h = nn.Embedding(vocab_sz, n_hidden)  # input -> hidden
    self.h_h = nn.Linear(n_hidden, n_hidden)     # hidden -> hidden
    self.h_o = nn.Linear(n_hidden, vocab_sz)     # hidden -> output
    self.h = 0                                   # hidden state, carried over from one call to the next

  def forward(self, x):
    hidden = self.h
    for step in range(3):
      hidden = F.relu(self.h_h(hidden + self.i_h(x[:,step])))
    out = self.h_o(hidden)
    # Keep the values of the hidden state but drop its gradient history.
    self.h = hidden.detach()
    return out

  def reset(self):
    "Set the hidden state back to zero."
    self.h = 0

    # This model has the same activations whatever sequence length we pick, because the hidden state
    # remembers the last activation from the previous batch.
    # Because of the hidden state, the only thing that will be different is the gradient computed at each step.


This approach is called BPTT: backpropagation through time.


Earlier we were using `LMDataLoader` to ensure that samples are seen one after the other; this time we will do it ourselves.

In [22]:
# m: how many complete groups of `bs` samples fit in the dataset.
m = len(seqs)//bs
m,bs,len(seqs)

(328, 64, 21031)

In [23]:
def group_chunks(ds, bs):
  "Reorder `ds` so that position `j` of each consecutive group of `bs` items walks through the `j`-th contiguous chunk of `ds`."
  chunk_len = len(ds)//bs
  # Item `pos` of chunk `row` lives at index `pos + chunk_len*row`; leftover items past `chunk_len*bs` are dropped.
  return L(ds[pos + chunk_len*row] for pos in range(chunk_len) for row in range(bs))

In [24]:
# Same 80/20 split as before, but each part is reordered with group_chunks first.
# shuffle=False preserves that order; drop_last=True is passed so that only full batches of `bs` are used.
cut = int(len(seqs) * 0.8)
dls = DataLoaders.from_dsets(
    group_chunks(seqs[:cut], bs),
    group_chunks(seqs[cut:], bs),
    bs = bs, drop_last=True, shuffle=False
)

In [25]:
# We use a small tweak to the training loop, via a callback: ModelResetter
# (presumably it calls the model's `reset` method between phases; confirm in the fastai docs).

learn = Learner(dls, LMModel3(len(vocab), 64), loss_func=F.cross_entropy, metrics=accuracy, cbs=ModelResetter)
learn.fit_one_cycle(10,3e-3)

epoch,train_loss,valid_loss,accuracy,time
0,1.677074,1.827367,0.467548,00:02
1,1.282722,1.870913,0.388942,00:02
2,1.090705,1.651793,0.4625,00:02
3,1.005092,1.613794,0.516587,00:02
4,0.965975,1.560775,0.551202,00:02
5,0.916182,1.595857,0.560577,00:02
6,0.897657,1.539733,0.574279,00:02
7,0.836274,1.585141,0.583173,00:02
8,0.805877,1.629808,0.586779,00:02
9,0.795096,1.651267,0.588942,00:02


## Creating more signal

We create more signal by using more targets and comparing them with intermediate predictions.

We will predict the next word after every single word.

In [26]:
# Instead of 3, we use sequences of length 16.
sl = 16
# Input: `sl` consecutive token ids. Target: the same window shifted one token ahead,
# so every position has a next-token target.
seqs = L((tensor(nums[i:i+sl]), tensor(nums[i+1:i+sl+1])) for i in range(0,len(nums)-sl-1,sl))
cut = int(len(seqs) * 0.8)
dls = DataLoaders.from_dsets(group_chunks(seqs[:cut],bs), group_chunks(seqs[cut:], bs), bs=bs, drop_last=True, shuffle=False )

In [27]:
# Look at the first sample, decoding its input and target ids back to words.

[L(vocab[o] for o in s) for s in seqs[0]]

[(#16) ['one','.','two','.','three','.','four','.','five','.'...],
 (#16) ['.','two','.','three','.','four','.','five','.','six'...]]

In [28]:
class LMModel4(Module):
  "Stateful recurrent model that emits a next-token prediction after every input position."
  def __init__(self,vocab_sz,n_hidden):
    self.i_h = nn.Embedding(vocab_sz, n_hidden)  # input -> hidden
    self.h_h = nn.Linear(n_hidden,n_hidden)      # hidden -> hidden
    self.h_o = nn.Linear(n_hidden,vocab_sz)      # hidden -> output
    self.h = 0                                   # hidden state, carried over from one call to the next

  def forward(self, x):
    outs = []
    # Loop over the sequence length of the batch itself (dimension 1 of `x`) rather than the
    # notebook-level `sl`, so the model keeps working if `sl` is changed or not defined.
    for i in range(x.shape[1]):
      self.h = self.h + self.i_h(x[:,i])
      self.h = F.relu(self.h_h(self.h))
      outs.append(self.h_o(self.h))
    # Keep the values of the hidden state but drop its gradient history.
    self.h = self.h.detach()
    # One prediction per position, stacked along dimension 1.
    return torch.stack(outs, dim=1)

  def reset(self):
    "Set the hidden state back to zero."
    self.h = 0

In [29]:
def loss_func(inp, targ):
  "Cross-entropy over every position: flatten `inp` to (N, n_classes) and `targ` to (N,) first."
  # Read the number of classes from the last dimension of `inp` instead of the notebook-level
  # `vocab`, so the function depends only on its arguments.
  return F.cross_entropy(inp.view(-1, inp.shape[-1]), targ.view(-1))

In [30]:
# Train LMModel4 for 15 epochs at learning rate 3e-3, using the flattening loss_func defined for per-position targets.
learn = Learner(dls, LMModel4(len(vocab), 64), loss_func=loss_func, metrics = accuracy, cbs = ModelResetter)
learn.fit_one_cycle(15, 3e-3)

epoch,train_loss,valid_loss,accuracy,time
0,3.285931,3.072032,0.212565,00:00
1,2.330371,1.969522,0.425781,00:00
2,1.742317,1.841378,0.441488,00:01
3,1.47012,1.810857,0.494303,00:00
4,1.297998,1.867457,0.480062,00:00
5,1.175093,1.773528,0.501709,00:00
6,1.071012,1.714759,0.512044,00:00
7,0.976119,1.711606,0.545573,00:00
8,0.88883,1.707983,0.562663,00:01
9,0.821714,1.632179,0.581787,00:00


## Multi layer RNNs

For a multi-layer RNN, we pass the activations from our recurrent network into a second recurrent network.

In [31]:
class LMModel5(Module):
  "Multi-layer recurrent language model built on `nn.RNN`."
  def __init__(self, vocab_sz, n_hidden,n_layers):
    self.i_h = nn.Embedding(vocab_sz, n_hidden)
    self.rnn = nn.RNN(n_hidden, n_hidden, n_layers, batch_first=True)
    self.h_o = nn.Linear(n_hidden, vocab_sz)
    # Initial hidden state; its batch dimension comes from the notebook-level `bs`.
    self.h = torch.zeros(n_layers, bs, n_hidden)

  def forward(self, x):
    embedded = self.i_h(x)
    # nn.RNN returns its outputs together with the updated hidden state.
    activations, last_hidden = self.rnn(embedded, self.h)
    # Keep the values of the hidden state but drop its gradient history.
    self.h = last_hidden.detach()
    return self.h_o(activations)

  def reset(self):
    "Zero the stored hidden state in place."
    self.h.zero_()

In [32]:
# Callbacks are passed to Learner through `cbs`; the misspelled `cb=` keyword does not attach
# ModelResetter, so the hidden state would not be reset the way it is for the other models.
learn = Learner(dls, LMModel5(len(vocab), 64,2), loss_func=CrossEntropyLossFlat(), metrics=accuracy, cbs=ModelResetter)
learn.fit_one_cycle(15, 3e-3)

epoch,train_loss,valid_loss,accuracy,time
0,3.04179,2.549923,0.455729,00:01
1,2.128606,1.716949,0.469808,00:01
2,1.700232,1.870062,0.340169,00:01
3,1.513695,1.767692,0.423584,00:01
4,1.358153,1.714393,0.503906,00:01
5,1.229992,1.793889,0.510173,00:01
6,1.110304,1.937699,0.532959,00:01
7,1.000071,2.01861,0.544759,00:01
8,0.907561,1.978048,0.553874,00:01
9,0.829595,2.01686,0.560954,00:01


## Exploding and disappearing activation

The accuracy is lower than the previous accuracy.

This happens because when you repeatedly multiply a number by something slightly more than 1 or slightly less than 1, the number either explodes or completely disappears.

Since computers store numbers as floating point, the numbers become less accurate the further they move from zero.

In order to reduce this problem, we use LSTM (long short-term memory) and GRU layers.


## LSTM

LSTM stands for long short-term memory. It has two hidden states instead of one. In the plain RNN we have been using, the single hidden state is responsible for two things:

Having the right information for the output layer, and retaining memory of everything that happened in the sentence.

However, as it turns out, RNNs are not very good at retaining memory, which is why the LSTM introduces a second hidden state to act as long short-term memory.

In [33]:
class LSTMCell(Module):
  "LSTM cell with one linear layer per gate, each fed the hidden state concatenated with the input."
  def __init__(self, ni,nh):
    # Every gate sees [hidden state, input] (ni + nh features) and produces nh features.
    self.forget_gate = nn.Linear(ni + nh,nh)
    self.input_gate = nn.Linear(ni + nh, nh)
    self.cell_gate = nn.Linear(ni + nh, nh)
    self.output_gate = nn.Linear(ni +nh,nh)

  def forward(self,input, state):
    # `state` is the pair (hidden state, cell state); the original read an undefined name `static`.
    h,c = state
    # Concatenate along the feature dimension: `torch.stack` would add a new dimension (and needs
    # equal shapes), whereas the gates expect ni + nh features.
    h = torch.cat([h, input], dim=1)
    # Forget gate: decide how much of the old cell state to keep.
    forget = torch.sigmoid(self.forget_gate(h))
    c = c * forget
    # Input gate and cell gate: decide what new information is written into the cell state.
    inp = torch.sigmoid(self.input_gate(h))
    cell = torch.tanh(self.cell_gate(h))
    c = c + inp * cell
    # Output gate: decide how much of the cell state is exposed as the new hidden state.
    out = torch.sigmoid(self.output_gate(h))
    h = out * torch.tanh(c)
    # Return the output and the updated (hidden state, cell state) pair.
    return h, (h,c)

In [34]:
# Refactored version: the four gate layers are fused into two larger linear layers.

class LSTMCell(Module):
    "LSTM cell that computes all four gates from one fused input projection and one fused hidden projection."
    def __init__(self, ni, nh):
        self.ih = nn.Linear(ni, 4*nh)  # input -> pre-activations of all four gates
        self.hh = nn.Linear(nh, 4*nh)  # hidden -> pre-activations of all four gates

    def forward(self, input, state):
        h_prev, c_prev = state
        # One big multiplication for all the gates is better than 4 smaller ones
        pre_acts = self.ih(input) + self.hh(h_prev)
        # Split dimension 1 into the four gate pre-activations.
        in_pre, forget_pre, out_pre, cell_pre = pre_acts.chunk(4, 1)
        ingate = torch.sigmoid(in_pre)
        forgetgate = torch.sigmoid(forget_pre)
        outgate = torch.sigmoid(out_pre)
        cellgate = cell_pre.tanh()

        c_next = (forgetgate*c_prev) + (ingate*cellgate)
        h_next = outgate * c_next.tanh()
        return h_next, (h_next, c_next)

In [35]:
# Small example tensor to show what `chunk` does.
t = torch.arange(0,10); t

tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [36]:
# Split the tensor into two pieces.
t.chunk(2)

(tensor([0, 1, 2, 3, 4]), tensor([5, 6, 7, 8, 9]))

In [41]:
class LMModel6(Module):
  "Multi-layer language model built on `nn.LSTM`, keeping both LSTM states between calls."
  def __init__(self, vocab_sz, n_hidden, n_layers):
    self.i_h = nn.Embedding(vocab_sz, n_hidden)
    self.rnn = nn.LSTM(n_hidden, n_hidden, n_layers, batch_first=True)
    self.h_o = nn.Linear(n_hidden, vocab_sz)
    # The LSTM carries two state tensors; both start at zero and are sized by the notebook-level `bs`.
    self.h = [torch.zeros(n_layers, bs, n_hidden), torch.zeros(n_layers, bs, n_hidden)]

  def forward(self, x):
    res, new_state = self.rnn(self.i_h(x), self.h)
    # Keep the values of both states but drop their gradient history.
    self.h = [s.detach() for s in new_state]
    return self.h_o(res)

  def reset(self):
    "Zero both stored state tensors in place."
    for state in self.h: state.zero_()

In [42]:
# Train the two-layer LSTM model for 15 epochs at learning rate 1e-2.
learn = Learner(dls, LMModel6(len(vocab), 64,2),
                 loss_func=CrossEntropyLossFlat(),
                 metrics=accuracy, cbs=ModelResetter)
learn.fit_one_cycle(15, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,3.026113,2.772102,0.153076,00:01
1,2.216185,2.089064,0.269124,00:01
2,1.613937,1.826248,0.47876,00:01
3,1.316856,2.086804,0.501546,00:01
4,1.093758,2.008277,0.610026,00:01
5,0.854747,1.814557,0.663005,00:01
6,0.612584,1.935545,0.707845,00:01
7,0.42109,1.706476,0.74349,00:01
8,0.274809,1.597815,0.762777,00:01
9,0.180654,1.717633,0.791585,00:01


## Regularisation 
using dropouts

In [43]:
class Dropout(Module):
  "Dropout: during training, zero each activation with probability `p` and rescale the survivors."
  # The constructor must be named `__init__`; with the original `__init`, `p` was never stored.
  def __init__(self, p):
    self.p = p
  def forward(self, x):
    # Outside training the layer is the identity.
    if not self.training:
      return x
    # Each mask element is 1 with probability 1-p, else 0. Use the stored `self.p`:
    # a bare `p` is not defined inside this method.
    mask = x.new(*x.shape).bernoulli_(1-self.p)
    # Dividing by 1-p keeps the expected value of the activations unchanged.
    return x * mask.div_(1-self.p)

## Training a weight-tied regularized LSTM

In [44]:
# Combining dropout with activation regularisation and temporal activation regularisation.

class LMModel7(Module):
  "LSTM language model with dropout on the LSTM output and an output layer that shares the embedding weights."
  def __init__(self, vocab_sz, n_hidden, n_layers, p):
    self.i_h = nn.Embedding(vocab_sz,n_hidden)
    self.rnn = nn.LSTM(n_hidden,n_hidden, n_layers, batch_first=True)
    self.drop = nn.Dropout(p)
    self.h_o = nn.Linear(n_hidden,vocab_sz)
    # Weight tying: the output layer reuses the embedding's weight matrix.
    self.h_o.weight = self.i_h.weight
    # Two zero-initialised LSTM state tensors, sized by the notebook-level `bs`.
    self.h = [torch.zeros(n_layers, bs, n_hidden) for _ in range(2)]
  
  def forward(self, x):
    raw, h = self.rnn(self.i_h(x), self.h)
    out = self.drop(raw)
    # Keep the values of both states but drop their gradient history.
    self.h = [h_.detach() for h_ in h]
    # Returns the predictions plus the LSTM output before and after dropout
    # (presumably consumed by RNNRegularizer; confirm in the fastai docs).
    return self.h_o(out), raw, out
  
  def reset(self):
    "Zero both stored state tensors in place."
    for h in self.h:
      h.zero_()

In [45]:
# Build the learner by hand, attaching both callbacks explicitly: ModelResetter and RNNRegularizer(alpha=2, beta=1).
learn = Learner(dls, LMModel7(len(vocab), 64, 2,0.5), loss_func=CrossEntropyLossFlat(),
                 metrics=accuracy, cbs=[ModelResetter , RNNRegularizer(alpha=2, beta=1)])

In [46]:
# Shortcut for the learner above: TextLearner is used here without passing the callbacks explicitly
# (presumably it adds them by default; confirm in the fastai docs).
# NOTE(review): dropout is 0.4 here but 0.5 in the hand-built learner; confirm which is intended.

learn = TextLearner(dls, LMModel7(len(vocab), 64, 2, 0.4),
                    loss_func=CrossEntropyLossFlat(), metrics=accuracy)

In [47]:
# Train for 15 epochs at learning rate 1e-2, with weight decay 0.1.
learn.fit_one_cycle(15, 1e-2, wd=0.1)

epoch,train_loss,valid_loss,accuracy,time
0,2.491892,1.93219,0.512288,00:02
1,1.591766,1.156541,0.644287,00:02
2,0.912455,0.704656,0.785889,00:01
3,0.539007,0.689711,0.796468,00:01
4,0.365283,0.531121,0.840658,00:02
5,0.255364,0.517289,0.843587,00:01
6,0.202057,0.422045,0.871419,00:01
7,0.170183,0.422867,0.876546,00:01
8,0.148466,0.441083,0.866618,00:01
9,0.13616,0.428933,0.872721,00:01
