# Language Model from scratch

Here we build our own vocabulary and batch iterator by hand, using only PyTorch — no torchtext or any other data-loading library.


In [None]:
!wget -q https://github.com/pytorch/examples/raw/master/word_language_model/data/wikitext-2/train.txt
!wget -q https://github.com/pytorch/examples/raw/master/word_language_model/data/wikitext-2/test.txt
!wget -q https://github.com/pytorch/examples/raw/master/word_language_model/data/wikitext-2/valid.txt

In [None]:
!ls

sample_data  test.txt  train.txt  valid.txt


In [None]:
import os
import torch
from io import open
from pathlib import Path
import  torch.nn as nn
import time
import math
import os

## Data

In [None]:
class Dictionary(object):
    """Two-way vocabulary: ``word2idx`` (dict) and ``idx2word`` (list).

    Indices are handed out in first-seen order, starting at 0.
    """
    def __init__(self):
        self.word2idx = dict()
        self.idx2word = list()

    def add_word(self, word):
        """Register ``word`` if it is new and return its index either way."""
        idx = self.word2idx.get(word)
        if idx is None:
            # The next free index is the current vocabulary size.
            idx = len(self.idx2word)
            self.idx2word.append(word)
            self.word2idx[word] = idx
        return idx

    def __len__(self):
        """Number of distinct words registered so far."""
        return len(self.idx2word)
    
class Corpus(object):
    """Holds the tokenized train / validation / test splits.

    A single ``Dictionary`` is shared by all three splits, so a word maps to
    the same index regardless of which file it came from.
    """
    def __init__(self, path):
        """Tokenize ``train.txt``, ``valid.txt`` and ``test.txt`` under ``path``.

        Args:
            path: directory holding the three files; must support the ``/``
                operator (e.g. ``pathlib.Path``).
        """
        self.dictionary = Dictionary()

        # Files are read in train -> valid -> test order. Vocabulary indices
        # are assigned in first-seen order, so this order determines the ids.
        self.train = self.tokenize(path / "train.txt")
        # Each attribute is built from the file of the same name, so the
        # validation split is never the held-out test file.
        self.valid = self.tokenize(path / "valid.txt")
        self.test = self.tokenize(path / "test.txt")

    def tokenize(self, path):
        """Convert a text file into a 1-D ``torch.long`` tensor of word ids.

        Every line is split on whitespace and terminated with an ``<eos>``
        token. Unseen words are added to the shared dictionary on the fly.

        Args:
            path: path of a UTF-8 encoded text file.

        Returns:
            1-D ``torch.long`` tensor with one id per token, in file order.
        """
        ids = []
        with open(path, "r", encoding="utf8") as f:
            for line in f:
                for word in line.split() + ["<eos>"]:
                    # add_word returns the existing index for known words.
                    ids.append(self.dictionary.add_word(word))
        return torch.tensor(ids, dtype=torch.long)

In [None]:
class RNNModel(nn.Module):
    """Embedding encoder -> stacked recurrent layers -> linear decoder over the vocabulary."""

    def __init__(self, rnn_type,
                 ntoken,
                 ninp,
                 nhid,
                 nlayers,
                 dropout=0.5):
        """Build the network.

        Args:
            rnn_type: name of the recurrent class to fetch from ``torch.nn``
                via ``getattr`` (e.g. ``"LSTM"``).
            ntoken: vocabulary size (embedding rows and decoder outputs).
            ninp: embedding dimension fed into the recurrent layers.
            nhid: hidden units per recurrent layer.
            nlayers: number of stacked recurrent layers.
            dropout: dropout probability used on the embeddings, between the
                recurrent layers, and on the recurrent output.
        """
        super(RNNModel, self).__init__()
        self.rnn_type = rnn_type
        self.nhid = nhid
        self.nlayers = nlayers
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.rnn = getattr(nn, rnn_type)(ninp, nhid, nlayers, dropout=dropout)
        self.decoder = nn.Linear(nhid, ntoken)
        self.init_weights()

    def init_weights(self):
        """Uniform(-0.1, 0.1) for encoder/decoder weights; zero decoder bias."""
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, input, hidden):
        """Score every vocabulary entry at every position of ``input``.

        Args:
            input: integer tensor of token ids; the recurrent layer is built
                without ``batch_first``, so the layout is presumably
                (seq_len, batch) — confirm against the caller.
            hidden: recurrent state as produced by ``init_hidden`` or by a
                previous call.

        Returns:
            ``(decoded, hidden)``: ``decoded`` keeps the first two dimensions
            of the recurrent output and has ``ntoken`` scores in the last one;
            ``hidden`` is the updated recurrent state.
        """
        emb = self.drop(self.encoder(input))
        output, hidden = self.rnn(emb, hidden)
        output = self.drop(output)
        # No debug printing here: forward runs once per batch, so any print
        # would flood the notebook output during training.
        dim0, dim1, width = output.shape
        decoded = self.decoder(output.view(dim0 * dim1, width))
        return decoded.view(dim0, dim1, decoded.size(1)), hidden

    def init_hidden(self, bsz):
        """Return an all-zero initial state for a batch of ``bsz`` sequences.

        The state is created from an existing parameter, so it shares that
        parameter's dtype and device. For ``rnn_type == 'LSTM'`` a tuple of two
        tensors is returned, otherwise a single tensor; each has shape
        (nlayers, bsz, nhid).
        """
        weight = next(self.parameters())
        if self.rnn_type == 'LSTM':
            return (weight.new_zeros(self.nlayers, bsz, self.nhid),
                    weight.new_zeros(self.nlayers, bsz, self.nhid))
        else:
            return weight.new_zeros(self.nlayers, bsz, self.nhid)
    

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Tokenize the train/valid/test text files found in the current working directory.
corpus = Corpus(Path("./"))

In [None]:
def batchify(data, bsz):
    """Reshape a 1-D token stream into ``bsz`` parallel columns.

    Trailing tokens that do not fill a complete row of ``bsz`` are dropped.
    The remainder is cut into ``bsz`` consecutive chunks, and each chunk
    becomes one column of the result, which is moved to ``device``.
    """
    usable = (data.size(0) // bsz) * bsz
    trimmed = data.narrow(0, 0, usable)
    # view() lays the chunks out as rows; transposing turns them into columns.
    columns = trimmed.view(bsz, -1).t().contiguous()
    return columns.to(device)

In [None]:
# Number of parallel token streams (columns) per split.
train_batch_size = 20
eval_batch_size = 10

train_data = batchify(corpus.train, train_batch_size)
# Validation and test share the evaluation batch size.
val_data, test_data = (
    batchify(split, eval_batch_size) for split in (corpus.valid, corpus.test)
)

In [None]:
# Inspect the dimensions of the batchified training tensor.
train_data.shape 

(torch.Size([104431, 20]), torch.Size([2088628]))

In [None]:
# Vocabulary size: number of distinct tokens in the dictionary shared by all splits.
ntokens = len(corpus.dictionary)

In [None]:
# Two-layer LSTM language model: 300-dim embeddings, 200 hidden units per layer, dropout 0.2.
model = RNNModel("LSTM", ntokens, ninp = 300, nhid=200, nlayers=2,dropout=0.2).to(device)

In [None]:
# Loss between the model's per-token vocabulary scores and the target token ids.
criterion = nn.CrossEntropyLoss()

In [None]:
# Number of time steps per chunk handed to the model. It must equal the stride
# used when stepping through the data in train()/evaluate(); a smaller value
# silently skips the tokens between consecutive chunks. 35 also matches the
# batch count in the recorded training log (104431 // 35 == 2983).
bptt = 35


def get_batch(source, i):
    """Slice one input/target chunk starting at row ``i`` of ``source``.

    Args:
        source: batchified tensor whose first dimension is time.
        i: index of the first row of the chunk.

    Returns:
        ``(data, target)``: ``data`` is ``source[i:i+seq_len]`` and ``target``
        is the same window shifted one row ahead, so each target is the token
        that follows the matching input. ``seq_len`` is ``bptt``, shortened
        near the end of ``source`` so the shifted window stays in range.
        ``target`` keeps the same 2-D layout as ``data``.
    """
    seq_len = min(bptt, len(source) - 1 - i)
    data = source[i:i + seq_len]
    target = source[i + 1:i + 1 + seq_len]
    return data, target
# Preview the first chunk. Only the first 3 time steps are shown (and only the
# first 10 batch columns of the inputs), so the displayed sample stays small
# whatever chunk length get_batch returns.
data, target = get_batch(train_data, 0)
data[:3, :10], target[:3]

(tensor([[    0,   284, 15178,   280,   348,   128,   289,  9493,    16,     1],
         [    1,   357,    43,  2977,   530, 23080,    13,    78,    17,     0],
         [    2,  1496,  7369,   115,  4782,    37, 22196,   252, 26998,     0]],
        device='cuda:0'),
 tensor([[    1,   357,    43,  2977,   530, 23080,    13,    78,    17,     0,
           4312,     0,   151,    22, 18215,    17,    17,    46,    43,  2015],
         [    2,  1496,  7369,   115,  4782,    37, 22196,   252, 26998,     0,
          28680,     1,   496,  2193,  1037,     9,  4072,   380,    27, 33001],
         [    3,   449,   310,     9,    13,  8034,  3107,   639,    13, 27958,
            638,     1,   168,    17,    43,  2786,    15,   160,   152,  3072]],
        device='cuda:0'))

In [None]:
# https://discuss.pytorch.org/t/help-clarifying-repackage-hidden-in-word-language-model/226/7
def repackage_hidden(h):
    """Return the hidden state cut off from its autograd history.

    A tensor is detached directly. Any other iterable is rebuilt as a tuple
    whose items are processed recursively.
    """
    if not isinstance(h, torch.Tensor):
        return tuple(map(repackage_hidden, h))
    return h.detach()

In [None]:
def train():
    """Run one epoch of truncated-BPTT training with plain SGD.

    Works on the notebook-level globals ``model``, ``train_data``,
    ``criterion`` and ``corpus``. It steps through the data in chunks of
    ``bptt`` rows and reads the current ``lr`` and ``epoch`` set by the
    training loop. Progress is printed every 200 batches; nothing is returned.
    """
    log_interval = 200
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    # Size the hidden state from the data (columns == batch size) so it cannot
    # drift out of sync with how train_data was batchified.
    hidden = model.init_hidden(train_data.size(1))
    # Stride by bptt, the chunk length get_batch returns, so consecutive
    # chunks tile the data without gaps.
    for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)
        # The loss takes (N, C) scores with (N,) class ids, so flatten both.
        loss = criterion(output.view(-1, ntokens), targets.reshape(-1))
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.25)
        # Manual SGD step: p <- p - lr * grad, outside of autograd tracking.
        with torch.no_grad():
            for p in model.parameters():
                if p.grad is not None:
                    p.add_(p.grad, alpha=-lr)

        total_loss += loss.item()

        if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // bptt, lr,
                elapsed * 1000 / log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()

In [None]:
def evaluate(data_source):
    """Compute the average per-time-step loss of ``model`` on ``data_source``.

    Args:
        data_source: batchified tensor, time along the first dimension and
            batch along the second.

    Returns:
        Sum of (chunk length * mean chunk loss) over all chunks, divided by
        ``len(data_source) - 1``, the number of rows that have a target.
    """
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0.
    ntokens = len(corpus.dictionary)
    # Size the hidden state from the data itself so any batchified split works.
    hidden = model.init_hidden(data_source.size(1))
    with torch.no_grad():
        # Stride by bptt, the chunk length get_batch returns, so every row is
        # scored once and the normalisation below is correct.
        for i in range(0, data_source.size(0) - 1, bptt):
            data, targets = get_batch(data_source, i)
            output, hidden = model(data, hidden)
            output_flat = output.view(-1, ntokens)
            # The loss takes (N, C) scores with (N,) class ids.
            total_loss += len(data) * criterion(output_flat, targets.reshape(-1)).item()
            hidden = repackage_hidden(hidden)
    return total_loss / (len(data_source) - 1)

In [None]:
# Initial SGD learning rate; divided by 4 whenever validation loss fails to improve.
lr = 20
best_val_loss = None

# At any point you can hit Ctrl + C to break out of training early.
try:
    for epoch in range(1, 40+1):
        epoch_start_time = time.time()
        train()
        val_loss = evaluate(val_data)
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                           val_loss, math.exp(val_loss)))
        print('-' * 89)
        # Save the weights if the validation loss is the best we've seen so far.
        # Compare against None explicitly so a loss of 0.0 still counts as a value.
        if best_val_loss is None or val_loss < best_val_loss:
            with open("madel.pth", 'wb') as f:
                # Store only the state_dict: pickling the whole module ties the
                # checkpoint to this notebook's class definition and triggers
                # PyTorch's source-retrieval warning.
                torch.save(model.state_dict(), f)
            best_val_loss = val_loss
        else:
            # Anneal the learning rate if no improvement has been seen in the validation dataset.
            lr /= 4.0
except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')

# Restore the best saved weights. A checkpoint only exists once at least one
# epoch has finished; otherwise keep the in-memory model as it is.
if best_val_loss is not None:
    with open("madel.pth", 'rb') as f:
        model.load_state_dict(torch.load(f))
    # Keep the RNN weights in one contiguous chunk for the fastest forward pass.
    model.rnn.flatten_parameters()

# Run on test data.
test_loss = evaluate(test_data)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, math.exp(test_loss)))
print('=' * 89)

| epoch   1 |   200/ 2983 batches | lr 20.00 | ms/batch 21.31 | loss  7.62 | ppl  2034.50
| epoch   1 |   400/ 2983 batches | lr 20.00 | ms/batch 20.33 | loss  6.83 | ppl   923.59
| epoch   1 |   600/ 2983 batches | lr 20.00 | ms/batch 20.32 | loss  6.47 | ppl   642.53
| epoch   1 |   800/ 2983 batches | lr 20.00 | ms/batch 20.34 | loss  6.27 | ppl   526.87
| epoch   1 |  1000/ 2983 batches | lr 20.00 | ms/batch 20.36 | loss  6.11 | ppl   452.09
| epoch   1 |  1200/ 2983 batches | lr 20.00 | ms/batch 20.32 | loss  6.04 | ppl   418.94
| epoch   1 |  1400/ 2983 batches | lr 20.00 | ms/batch 20.32 | loss  5.92 | ppl   372.74
| epoch   1 |  1600/ 2983 batches | lr 20.00 | ms/batch 20.30 | loss  5.94 | ppl   378.09
| epoch   1 |  1800/ 2983 batches | lr 20.00 | ms/batch 20.30 | loss  5.78 | ppl   325.18
| epoch   1 |  2000/ 2983 batches | lr 20.00 | ms/batch 20.29 | loss  5.75 | ppl   313.59
| epoch   1 |  2200/ 2983 batches | lr 20.00 | ms/batch 20.29 | loss  5.65 | ppl   282.97
| epoch   

  "type " + obj.__name__ + ". It won't be checked "


| epoch   2 |   200/ 2983 batches | lr 20.00 | ms/batch 20.85 | loss  5.53 | ppl   252.97
| epoch   2 |   400/ 2983 batches | lr 20.00 | ms/batch 20.83 | loss  5.51 | ppl   247.14
| epoch   2 |   600/ 2983 batches | lr 20.00 | ms/batch 21.04 | loss  5.34 | ppl   207.61
| epoch   2 |   800/ 2983 batches | lr 20.00 | ms/batch 20.74 | loss  5.35 | ppl   211.40
| epoch   2 |  1000/ 2983 batches | lr 20.00 | ms/batch 20.72 | loss  5.33 | ppl   206.64
| epoch   2 |  1200/ 2983 batches | lr 20.00 | ms/batch 20.71 | loss  5.31 | ppl   202.71
| epoch   2 |  1400/ 2983 batches | lr 20.00 | ms/batch 20.90 | loss  5.31 | ppl   202.72
| epoch   2 |  1600/ 2983 batches | lr 20.00 | ms/batch 21.10 | loss  5.37 | ppl   215.36
| epoch   2 |  1800/ 2983 batches | lr 20.00 | ms/batch 21.35 | loss  5.24 | ppl   188.17
| epoch   2 |  2000/ 2983 batches | lr 20.00 | ms/batch 21.49 | loss  5.25 | ppl   190.15
| epoch   2 |  2200/ 2983 batches | lr 20.00 | ms/batch 21.63 | loss  5.15 | ppl   172.70
| epoch   

# The End