## RNN Language Model

Sources

[1] 
[2] https://mlexplained.com/2018/02/15/language-modeling-tutorial-in-torchtext-practical-torchtext-part-2/



In [1]:
import torch
import torchtext


We have the following datasets available for this task:

- Penn Treebank (originally created for POS tagging)
- WikiText

Before loading our dataset, we define how it will be tokenized and preprocessed. To do this, `torchtext` uses `data.Field`. By default a `Field` splits text on whitespace; pass `tokenize='spacy'` to use [`spaCy`](https://spacy.io/api/tokenizer) tokenization instead.

We also set `init_token` and `eos_token`, the special tokens that mark the beginning and the end of a sequence.

In [2]:
from torchtext import data

# Field describing how raw text is preprocessed before it is turned into indices.
# NOTE(review): no `tokenize=` argument is passed, so `tokenizer_language` only has
# an effect if the tokenizer torchtext falls back to is language-aware — confirm.
TEXT = data.Field(
    tokenizer_language='en',
    lower=True,  # lowercase the text before building the vocabulary
    init_token='<sos>',  # special token marking the start of a sequence
    eos_token='<eos>',  # special token marking the end of a sequence
    batch_first=True,  # request batch-major tensors; the model's RNN is built with batch_first=True too
)

Now, we can load our dataset

In [3]:
from torchtext.datasets import WikiText2
 
# Load the three WikiText2 splits, all processed through the TEXT field.
train, valid, test = WikiText2.splits(TEXT)

# Build the vocabulary from the training split only and attach GloVe vectors to it;
# TEXT.vocab.vectors is later copied into the model's embedding layer.
TEXT.build_vocab(train, vectors="glove.6B.300d")

print(f"We have {len(TEXT.vocab)} tokens in our vocabulary")

We have 28914 tokens in our vocabulary


## Iterator


In [59]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

BATCH_SIZE = 32  # forwarded as batch_size below
BPTT_LEN = 30  # forwarded as bptt_len below

# One BPTT iterator per split. The batches they yield expose `.text` and `.target`,
# which the training cells below read as model input and prediction target.
# NOTE(review): repeat=False presumably makes each iterator stop after one pass — confirm.
train_iter, valid_iter, test_iter = data.BPTTIterator.splits(
    (train, valid, test),
    batch_size=BATCH_SIZE,
    bptt_len=BPTT_LEN, # this is where we specify the sequence length
    device=device,
    repeat=False)

In [156]:
import torch.nn as nn

class RNNDecoder(nn.Module):
    """
    Recurrent language model: embedding -> RNN -> linear projection onto the vocabulary.

    Arguments:
        vocab_size: number of tokens in the vocabulary (size of the output layer).
        embedding_dim: dimensionality of the token embeddings.
        pad_idx: index of the padding token; passed to nn.Embedding as padding_idx.
        hidden_size: size of the recurrent hidden state.
        cell_class: recurrent layer class to instantiate (nn.GRU by default);
            it is constructed with batch_first=True.
    """

    def __init__(self, vocab_size, embedding_dim, pad_idx, hidden_size,
                 cell_class=nn.GRU):
        super().__init__()
        # Use the constructor argument, not a notebook-level global, so the
        # class works on a fresh kernel and for any padding index.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)

        self.rnn = cell_class(embedding_dim, hidden_size, batch_first=True)

        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, input):
        """
        Computes next-token logits for every position of every sequence.

        Arguments:
            input: LongTensor of token indices, [batch, seq_len].

        Returns:
            Tensor of unnormalised scores, [batch, seq_len, vocab_size].
        """
        # input = [batch, seq_len]
        emb = self.embedding(input)
        # emb = [batch, seq_len, embedding_dim]
        outputs, _ = self.rnn(emb)
        # outputs = [batch, seq_len, hidden_size] (one hidden state per time step)
        out = self.fc(outputs)
        # out = [batch, seq_len, vocab_size]

        return out

Create the Language Model

In [157]:
HIDDEN_DIM = 256

# Derive the model dimensions from the vocabulary in this cell, so that it runs
# on a fresh kernel instead of depending on names assigned by a later cell.
vocab_size = TEXT.vocab.vectors.shape[0]
embedding_dim = TEXT.vocab.vectors.shape[1]
PAD_IDX = TEXT.vocab.stoi["<pad>"]

model = RNNDecoder(vocab_size, embedding_dim, hidden_size=HIDDEN_DIM, pad_idx=PAD_IDX)

model.to(device)

RNNDecoder(
  (embedding): Embedding(28914, 300, padding_idx=1)
  (rnn): GRU(300, 256, batch_first=True)
  (fc): Linear(in_features=256, out_features=28914, bias=True)
)

In [158]:
vocab_size = TEXT.vocab.vectors.shape[0]
embedding_dim = TEXT.vocab.vectors.shape[1]
PAD_IDX = TEXT.vocab.stoi["<pad>"]
UNK_IDX = TEXT.vocab.stoi["<unk>"]
EOS_IDX = TEXT.vocab.stoi["<eos>"]
SOS_IDX = TEXT.vocab.stoi["<sos>"]

# Initialise the embedding layer from the pretrained vectors. The weight is a
# leaf parameter that requires grad, so writing into it in place is only legal
# with autograd disabled; outside no_grad the indexed assignment raises.
with torch.no_grad():
    model.embedding.weight.copy_(TEXT.vocab.vectors)
    # Give <unk> a random vector, created on the same device as the weight.
    model.embedding.weight[UNK_IDX] = torch.randn(
        embedding_dim, device=model.embedding.weight.device
    )



## Training 

In [172]:

# Smoke test on a single training batch: run the model once and look at the loss.
criterion = nn.CrossEntropyLoss()
batch = next(iter(train_iter))

logits = model(batch.text)

# CrossEntropyLoss wants one row of scores per token and a flat vector of targets.
preds = logits.view(-1, vocab_size)
trg = batch.target.view(-1)

criterion(preds, trg)

tensor(10.2706, device='cuda:0', grad_fn=<NllLossBackward>)

In [None]:
import torch
from sklearn.metrics import accuracy_score, f1_score


def train(model, iterator, optimizer, criterion):
    """
    Trains the language model for one full epoch.

    Arguments:
        model: module mapping token indices to per-token logits whose last
            dimension is the vocabulary size.
        iterator: iterable of batches exposing `.text` (input tokens) and
            `.target` (the tokens to predict), as read by the other cells.
        optimizer: optimizer updating the model's parameters.
        criterion: loss taking (logits [N, vocab], targets [N]).

    Returns:
        (mean loss per batch, mean token-level accuracy per batch)

    Raises:
        ValueError: if the iterator yields no batches.
    """
    epoch_loss = 0.0
    epoch_acc = 0.0
    n_batches = 0

    model.train()

    for batch in iterator:
        optimizer.zero_grad()

        logits = model(batch.text)
        # Flatten every position into its own row: one prediction per token.
        logits = logits.reshape(-1, logits.size(-1))
        targets = batch.target.reshape(-1)

        loss = criterion(logits, targets)
        loss.backward()
        optimizer.step()

        # The predicted token is the highest-scoring vocabulary entry.
        acc = (logits.argmax(dim=-1) == targets).float().mean()

        epoch_loss += loss.item()
        epoch_acc += acc.item()
        n_batches += 1

    if n_batches == 0:
        raise ValueError("iterator yielded no batches")

    return epoch_loss / n_batches, epoch_acc / n_batches


def evaluate(model, iterator, criterion):
    """
    Evaluates the language model on the given iterator.

    Arguments:
        model: module mapping token indices to per-token logits whose last
            dimension is the vocabulary size.
        iterator: iterable of batches exposing `.text` (input tokens) and
            `.target` (the tokens to predict).
        criterion: loss taking (logits [N, vocab], targets [N]).

    Returns:
        (mean loss per batch,
         token-level accuracy over all evaluated tokens,
         macro-averaged F1 over the token classes that occur in the targets
         or in the predictions)

    Raises:
        ValueError: if the iterator yields no batches.
    """
    epoch_loss = 0.0
    n_batches = 0
    n_tokens = 0
    n_correct = 0
    # Per-class counters, allocated once the vocabulary size is known.
    true_pos = pred_count = true_count = None

    model.eval()
    with torch.no_grad():
        for batch in iterator:
            logits = model(batch.text)
            num_classes = logits.size(-1)
            # Flatten every position into its own row: one prediction per token.
            logits = logits.reshape(-1, num_classes)
            targets = batch.target.reshape(-1)

            epoch_loss += criterion(logits, targets).item()
            n_batches += 1

            preds = logits.argmax(dim=-1).cpu()
            targets = targets.cpu()
            hits = preds == targets

            if true_pos is None:
                true_pos = torch.zeros(num_classes, dtype=torch.long)
                pred_count = torch.zeros_like(true_pos)
                true_count = torch.zeros_like(true_pos)

            # Accumulate counts instead of storing every prediction.
            true_pos += torch.bincount(targets[hits], minlength=num_classes)
            pred_count += torch.bincount(preds, minlength=num_classes)
            true_count += torch.bincount(targets, minlength=num_classes)

            n_correct += int(hits.sum())
            n_tokens += targets.numel()

    if n_batches == 0:
        raise ValueError("iterator yielded no batches")

    # F1 = 2TP / (2TP + FP + FN) = 2TP / (#predicted + #actual) for each class.
    support = pred_count + true_count
    seen = support > 0
    f1_per_class = 2.0 * true_pos[seen].float() / support[seen].float()
    avg_f1 = f1_per_class.mean().item()
    acc = n_correct / n_tokens

    return epoch_loss / n_batches, acc, avg_f1