<a href="https://colab.research.google.com/github/gmum/natural-language-processing-classes/blob/master/lab-9-unsupervised-lm-training/notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Lab 9 - Training unsupervised Language Models

## Exercise (1 pt)

Fill in the gaps in the training loop and the evaluation loop in the code below, and train a language model that reaches a perplexity of **111** or lower on the test set.

Remember to:
-  use gradient clipping with value 0.25
-  reuse the hidden state from the previous batch in the next batch, so that information is kept for longer. To do this, instead of re-initializing the hidden state with zeros for every batch, detach the hidden state from the computation that produced it. Without detaching, the model would try to backpropagate all the way to the start of the dataset. Use `repackage_hidden` to deal with that problem.

In [1]:
# Use GPU support!
# This cell picks a PyTorch wheel matching the current interpreter and CUDA
# runtime and installs it with pip. It uses IPython `!` shell escapes, so it
# only runs inside a notebook kernel.
# NOTE(review): depends on the `wheel.pep425tags` module and on a pinned
# torch-0.4.1 wheel URL; confirm both are still available in the target runtime.

from os.path import exists
from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
# Wheel platform tag assembled as '<impl><version>-<abi>' for the wheel file name.
platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())
# Shell pipeline: list cached shared libraries, keep the cudart entries and
# rewrite the trailing '.X.Y' version suffix into 'cuXY'.
cuda_output = !ldconfig -p|grep cudart.so|sed -e 's/.*\.\([0-9]*\)\.\([0-9]*\)$/cu\1\2/'
# Use the detected CUDA tag only when an NVIDIA device node exists, else 'cpu'.
accelerator = cuda_output[0] if exists('/dev/nvidia0') else 'cpu'

# Install the torch 0.4.1 wheel for the selected accelerator/platform, plus torchvision.
!pip install -q http://download.pytorch.org/whl/{accelerator}/torch-0.4.1-{platform}-linux_x86_64.whl torchvision

import time
import os
import math
import torch
import torch.nn as nn
from torch.autograd import Variable

# Seed PyTorch's random number generator so runs are repeatable.
torch.manual_seed(1)

# Use dataset from https://drive.google.com/drive/folders/1e-BUHYY61Vy9AGNuh2nungslO-mYuVox?usp=sharing
# Mount Google Drive at /content/gdrive (Colab only); the corpus path used
# later in this notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [0]:
class Dictionary(object):
    """Two-way vocabulary: word -> integer id and integer id -> word.

    Ids are handed out in order of first appearance, starting at 0.
    """

    def __init__(self):
        self.word2idx = {}  # word -> id
        self.idx2word = []  # list position is the id, value is the word

    def add_word(self, word):
        """Return the id of `word`, registering it under the next free id if unseen."""
        try:
            return self.word2idx[word]
        except KeyError:
            new_id = len(self.idx2word)
            self.idx2word.append(word)
            self.word2idx[word] = new_id
            return new_id

    def __len__(self):
        """Number of distinct words registered so far."""
        return len(self.idx2word)


class Corpus(object):
    """Tokenised train/valid/test splits that share one vocabulary.

    Attributes:
        dictionary: the `Dictionary` filled while tokenising the three files.
        train, valid, test: 1-D `torch.long` tensors of word ids, read from
            'ptb.train.txt', 'ptb.valid.txt' and 'ptb.test.txt' inside `path`.
    """

    def __init__(self, path):
        """Tokenise the three PTB files found in directory `path`."""
        self.dictionary = Dictionary()
        self.train = self.tokenize(os.path.join(path, 'ptb.train.txt'))
        self.valid = self.tokenize(os.path.join(path, 'ptb.valid.txt'))
        self.test = self.tokenize(os.path.join(path, 'ptb.test.txt'))

    def tokenize(self, path):
        """Tokenize a text file into a 1-D tensor of word ids.

        Each line is split on whitespace and terminated with an '<eos>' token.
        Unseen words are added to `self.dictionary` as they are encountered.

        Args:
            path: path of the text file to read.

        Returns:
            A `torch.long` tensor with one id per token, in file order.

        Raises:
            FileNotFoundError: if `path` does not exist.
        """
        # Explicit check instead of `assert`, which is stripped under `python -O`.
        if not os.path.exists(path):
            raise FileNotFoundError('Corpus file not found: {}'.format(path))

        # Single pass: add_word returns the id of both new and known words, so
        # the file does not need to be read a second time to look the ids up.
        ids = []
        with open(path, 'r', encoding='utf8') as f:
            for line in f:
                for word in line.split() + ['<eos>']:
                    ids.append(self.dictionary.add_word(word))

        return torch.tensor(ids, dtype=torch.long)
      
      
def batchify(data, bsz, verbose=False, device=None):
    """Reshape a 1-D token stream into `bsz` parallel columns.

    Args:
        data: 1-D tensor of token ids.
        bsz: number of columns (independent sequences) to split the stream into.
        verbose: when True, print the resulting size and the first 50 words of
            column 0. This path looks words up in the module-level `corpus`.
        device: where to place the result. Defaults to CUDA when it is
            available and to the CPU otherwise, so the notebook also runs on a
            CPU-only runtime.

    Returns:
        A contiguous tensor of shape (data.size(0) // bsz, bsz) on `device`;
        column j holds the j-th consecutive slice of the stream.
    """
    # Work out how cleanly we can divide the dataset into bsz parts.
    nbatch = data.size(0) // bsz
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    # See https://pytorch.org/docs/stable/torch.html#torch.narrow for more explanation
    data = data.narrow(0, 0, nbatch * bsz)
    # Evenly divide the data across the bsz batches.
    # .t() is transposition: https://pytorch.org/docs/stable/torch.html#torch.t
    # .contiguous() leaves the values untouched; it only makes sure the
    # transposed result is stored in one contiguous chunk of memory.
    data = data.view(bsz, -1).t().contiguous()
    if verbose:
        print(data.size())
        for el in data[:50, 0]:
            print(corpus.dictionary.idx2word[el.item()])

    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    return data.to(device)

# use path to where you store the datasets
# Building the Corpus reads ptb.train.txt / ptb.valid.txt / ptb.test.txt from
# this directory and fills the shared dictionary.
corpus = Corpus('/content/gdrive/My Drive/nlp-classes/labs/lab-9')

# Number of parallel columns used for training and for evaluation.
batch_size = 20
eval_batch_size = 10
# verbose=True prints the batched size and the first 50 training words.
train_data = batchify(corpus.train, batch_size, verbose=True)
val_data = batchify(corpus.valid, eval_batch_size)
test_data = batchify(corpus.test, eval_batch_size)
# Vocabulary size; passed to the model as the number of tokens.
ntokens = len(corpus.dictionary)

In [0]:
class LSTMModel(nn.Module):
    """Container module with an encoder, a recurrent module, and a decoder.

    Args:
        ntoken: vocabulary size (rows of the embedding, width of the output).
        ninp: embedding dimension fed to the LSTM.
        nhid: LSTM hidden size.
        nlayers: number of stacked LSTM layers.
        dropout: dropout probability used on the embeddings, on the LSTM
            output and passed to `nn.LSTM`.
    """

    def __init__(self, ntoken, ninp, nhid, nlayers, dropout=0.5):
        super(LSTMModel, self).__init__()
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp)  # token ids -> embeddings
        self.lstm = nn.LSTM(ninp, nhid, nlayers, dropout=dropout)
        self.decoder = nn.Linear(nhid, ntoken)  # hidden states -> vocabulary scores

        self.init_weights()

        self.nhid = nhid
        self.nlayers = nlayers

    def init_weights(self):
        """Uniformly initialise encoder/decoder weights in [-0.1, 0.1]; zero the decoder bias."""
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.fill_(0)
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, input, hidden):
        """Run one chunk through the network.

        Args:
            input: tensor of token ids; the LSTM is built without
                `batch_first`, so the first axis is time and the second is batch.
            hidden: `(h, c)` tuple as returned by `init_hidden` or by a
                previous call.

        Returns:
            `(decoded, hidden)`: `decoded` holds the decoder scores with shape
            (output.size(0), output.size(1), ntoken); `hidden` is the LSTM
            state after the chunk.
        """
        emb = self.drop(self.encoder(input))
        output, hidden = self.lstm(emb, hidden)
        output = self.drop(output)
        steps, bsz, width = output.size()
        # Flatten time and batch so the linear decoder sees a 2-D input, then
        # restore the (time, batch, vocabulary) layout.
        decoded = self.decoder(output.view(steps * bsz, width))
        return decoded.view(steps, bsz, decoded.size(1)), hidden

    def init_hidden(self, bsz):
        """Return a zero `(h, c)` state, each of shape (nlayers, bsz, nhid).

        The tensors are created directly on the device (and with the dtype) of
        the model's parameters, so the state always matches wherever the model
        currently lives instead of assuming CUDA.
        """
        weight = next(self.parameters())
        return (weight.new_zeros(self.nlayers, bsz, self.nhid),
                weight.new_zeros(self.nlayers, bsz, self.nhid))

# One-layer LSTM with 150-dim embeddings, 150 hidden units and dropout 0.2.
model = LSTMModel(ntokens, 150, 150, 1, 0.2)
# Move the parameters to the GPU (requires a CUDA runtime).
model.cuda()
print(model)

In [0]:
# Cross-entropy loss used for both training and evaluation.
criterion = nn.CrossEntropyLoss()
criterion.cuda()
# Maximum number of time steps per chunk returned by get_batch.
seq_len = 30
# train() prints a progress line every `log_interval` batches.
log_interval = 100

def repackage_hidden(h):
    """Detach hidden states from the graph that produced them.

    Args:
        h: a tensor, or an arbitrarily nested iterable of tensors (for an LSTM,
            the `(h, c)` tuple).

    Returns:
        The same structure with every tensor replaced by a detached tensor that
        shares its data but carries no autograd history; iterables come back as
        tuples.
    """
    # isinstance (not an exact type comparison) so Tensor subclasses such as
    # nn.Parameter are detached as a whole rather than iterated row by row.
    if isinstance(h, torch.Tensor):
        return h.detach()
    return tuple(repackage_hidden(v) for v in h)


def get_batch(source, i):
    """Cut one input chunk and its next-token targets out of `source`.

    Args:
        source: batched data as produced by `batchify` (first axis is time).
        i: index of the first row of the chunk.

    Returns:
        `(data, target)`: `data` is rows `i .. i + s_len - 1` of `source` and
        `target` is the same rows shifted one step ahead, flattened to 1-D.
        `s_len` is the module-level `seq_len`, shortened near the end of
        `source` so that the targets stay in range.
    """
    s_len = min(seq_len, len(source) - 1 - i)
    # Plain tensors: the Variable wrapper is a deprecated no-op since torch 0.4.
    data = source[i:i + s_len]
    target = source[i + 1:i + 1 + s_len].view(-1)
    return data, target


def evaluate(data_source):
    """Compute the loss of `model` on `data_source`.

    Uses the module-level `model`, `corpus`, `eval_batch_size` and `seq_len`.

    Args:
        data_source: batched data as produced by `batchify`.

    Returns:
        The accumulated `total_loss` divided by `len(data_source)`, as a
        Python float.
    """
    model.eval()  # Turn on evaluation mode which disables dropout.
    total_loss = 0
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(eval_batch_size)
    for i in range(0, data_source.size(0) - 1, seq_len):
        # Exercise gap: the code written here must add this chunk's
        # contribution to `total_loss` and, per the exercise text, carry
        # `hidden` over to the next chunk via repackage_hidden.
        """ Write your code here """

    # float() accepts both a Python number and a one-element tensor, so the
    # result does not depend on how the loop accumulated `total_loss`
    # (indexing a 0-dim tensor with [0] is deprecated).
    return float(total_loss) / len(data_source)


def train():
    """Run one training epoch over `train_data`, logging progress.

    Uses the module-level `model`, `corpus`, `train_data`, `batch_size`,
    `seq_len` and `log_interval`; the progress line also reads the
    module-level `epoch` and `lr`.
    """
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, seq_len)):
        # Exercise gap: the code written here must bind `loss` for this batch
        # and update the model. Per the exercise text, clip gradients at 0.25
        # and carry `hidden` across batches via repackage_hidden.
        """ Write your code here """

        # .item() yields a Python float, so the running sum below is a plain
        # number (indexing a 0-dim tensor with [0] is deprecated).
        total_loss += loss.item()

        if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // seq_len, lr,
                elapsed * 1000 / log_interval, cur_loss, math.exp(cur_loss)))
            # Restart the running loss and the timer for the next log window.
            total_loss = 0
            start_time = time.time()

# Loop over epochs.
# `lr` and `epoch` are module-level on purpose: train() reads both for its
# progress line, and `lr` is annealed here between epochs.
lr = 20
best_val_loss = None
epochs = 40

for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train()
    val_loss = evaluate(val_data)
    print('-' * 89)
    print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
          'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                     val_loss, math.exp(val_loss)))
    print('-' * 89)

    # Anneal the learning rate if no improvement has been seen in the validation dataset.
    # `is None` (rather than truthiness) so a loss of exactly 0.0 still counts
    # as a recorded best value.
    if best_val_loss is None or val_loss < best_val_loss:
        best_val_loss = val_loss
    else:
        lr /= 4.0

# Run on test data.
# Reports the final loss and its exponential (perplexity) on the held-out test split.
test_loss = evaluate(test_data)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, math.exp(test_loss)))
print('=' * 89)