Language Modeling Task: assign a probability to how likely a given word is to follow a sequence of words.

In [8]:
import math
import os
from tempfile import TemporaryDirectory
from typing import Tuple
import torch
from torch import nn, Tensor
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.data import dataset
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import pandas as pd
import time

class TransformerModel(nn.Module):
    """Transformer-encoder language model producing next-token logits.

    Data flow: token embedding (scaled by ``sqrt(d_model)``) -> positional
    encoding -> ``nlayers`` stacked encoder layers -> linear projection onto
    the vocabulary. The output is left as unnormalized logits because it is
    meant to be fed to a cross-entropy loss.

    Arguments:
        ntoken: int, vocabulary size (embedding rows and output logits)
        d_model: int, embedding / model width
        nhead: int, number of attention heads in each encoder layer
        d_hid: int, feed-forward width inside each encoder layer
        nlayers: int, number of stacked encoder layers
        dropout: float, dropout probability used by the positional encoding
            and by the encoder layers
    """

    def __init__(self, ntoken: int, d_model: int, nhead: int, d_hid: int,
                 nlayers: int, dropout: float = 0.5):
        super().__init__()
        self.model_type = 'Transformer'
        self.d_model = d_model
        # Adds order information to the (otherwise position-agnostic) embeddings.
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        # One self-attention + feed-forward layer, replicated ``nlayers`` times
        # by the encoder stack below.
        single_layer = TransformerEncoderLayer(d_model, nhead, d_hid, dropout)
        self.transformer_encoder = TransformerEncoder(single_layer, nlayers)
        self.embedding = nn.Embedding(ntoken, d_model)
        # Projects every position back onto the vocabulary (logits, no softmax).
        self.linear = nn.Linear(d_model, ntoken)

        self.init_weights()

    def init_weights(self) -> None:
        """Re-initialise embedding and output weights uniformly in [-0.1, 0.1]; zero the output bias."""
        bound = 0.1
        with torch.no_grad():
            self.embedding.weight.uniform_(-bound, bound)
            self.linear.bias.zero_()
            self.linear.weight.uniform_(-bound, bound)

    def forward(self, src: Tensor, src_mask: Tensor = None) -> Tensor:
        """Compute vocabulary logits for every position of ``src``.

        Arguments:
            src: Tensor of token ids; dimension 0 is the sequence dimension
                (the default mask is built with size ``len(src)``)
            src_mask: optional attention mask; when omitted a square
                subsequent (causal) mask is generated so that position ``i``
                cannot attend to positions after ``i``

        Returns:
            Tensor of logits whose last dimension has size ``ntoken``
        """
        if src_mask is None:
            # Future positions must stay hidden from each prediction.
            src_mask = nn.Transformer.generate_square_subsequent_mask(len(src)).to(src.device)
        scaled_embedding = self.embedding(src) * math.sqrt(self.d_model)
        positioned = self.pos_encoder(scaled_embedding)  # account for word order
        encoded = self.transformer_encoder(positioned, src_mask)
        return self.linear(encoded)

In [9]:
# to inject info about the position of tokens in the sequence
# the encodings have the same dim as the embeddings so they can be summed
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position information to a sequence of embeddings.

    A table ``pe`` of shape ``[max_len, 1, d_model]`` is precomputed: even
    feature columns hold ``sin(position * div_term)`` and odd feature columns
    hold ``cos(position * div_term)``. The table is registered as a buffer, so
    it follows the module across devices but is not a trainable parameter.

    Arguments:
        d_model: int, feature width of the embeddings (odd or even)
        dropout: float, dropout probability applied after the addition
        max_len: int, number of positions precomputed in the table
    """

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        angles = position * div_term  # shape [max_len, ceil(d_model / 2)]
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(angles)
        # There are only d_model // 2 odd columns, which is one fewer than the
        # number of even columns when d_model is odd; slicing keeps the shapes
        # aligned (and is a no-op for even d_model).
        pe[:, 0, 1::2] = torch.cos(angles[:, :d_model // 2])
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """Add the positional table to ``x`` and apply dropout.

        Arguments:
            x: Tensor whose dimension 0 is the sequence position and whose
                last dimension has size ``d_model``; the middle dimension of
                ``pe`` has size 1 and broadcasts over it

        Returns:
            Tensor with the same shape as ``x``
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

Load and Batch Data

In [10]:
def data_process(raw_text_iter: dataset.IterableDataset) -> Tensor:
    """Tokenize and numericalize an iterable of text items into one flat tensor.

    Each item is tokenized with the module-level ``tokenizer`` and mapped to
    ids with the module-level ``vocab``. Items that produce no tokens are
    dropped and the remaining id tensors are concatenated in order.

    Arguments:
        raw_text_iter: iterable yielding strings (e.g. the lines of a file)

    Returns:
        1-D ``torch.long`` Tensor holding all token ids
    """
    encoded_items = []
    for item in raw_text_iter:
        token_ids = torch.tensor(vocab(tokenizer(item)), dtype=torch.long)
        if token_ids.numel() > 0:
            encoded_items.append(token_ids)
    return torch.cat(tuple(encoded_items))

train_file = 'train.txt'
test_file = 'test.txt'

tokenizer = get_tokenizer('basic_english')

# Read the training corpus once; every element is one line of text.
with open(train_file, 'r') as train_iter:
    train_data_raw = train_iter.readlines()

# Keep the first 80% of the lines for training and hold out the rest for validation.
split_ratio = 0.8
train_size = math.floor(len(train_data_raw) * split_ratio)
train_data_raw, val_data_raw = train_data_raw[:train_size], train_data_raw[train_size:]

# Build the vocabulary from the training split only. Building it from the
# whole file would leak held-out validation words into the vocabulary, so
# unseen words would be treated differently in validation (own id) than in
# test (``<unk>``). With this split, anything not seen in training maps to
# ``<unk>`` in both.
vocab = build_vocab_from_iterator(map(tokenizer, train_data_raw), specials=['<unk>'])
vocab.set_default_index(vocab['<unk>'])

train_data = data_process(train_data_raw)
val_data = data_process(val_data_raw)

# Process test data
with open(test_file, 'r') as test_iter:
    test_data = data_process(test_iter)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# it arranges the data into batch_size columns (trimmed if not evenly divisible)
def batchify(data: Tensor, bsz: int) -> Tensor:
    """Reshape a flat token stream into ``bsz`` parallel columns.

    Trailing elements that do not fill a complete row are discarded, and the
    result is moved to the module-level ``device``.

    Arguments:
        data: Tensor, shape ``[N]``
        bsz: int, batch size

    Returns:
        Tensor of shape ``[N // bsz, bsz]``
    """
    rows = data.size(0) // bsz
    usable = data[:rows * bsz]  # drop the remainder that does not fit
    # Lay the stream out as ``bsz`` consecutive chunks, then transpose so each
    # chunk becomes one column.
    columns = usable.view(bsz, rows).transpose(0, 1).contiguous()
    return columns.to(device)

# Training uses 20 parallel columns; validation and test share a batch size of 10.
batch_size, eval_batch_size = 20, 10
train_data = batchify(train_data, batch_size)
val_data, test_data = (
    batchify(split, eval_batch_size) for split in (val_data, test_data)
)

In [11]:
bptt = 35  # maximum number of rows per chunk


def get_batch(source: Tensor, i: int) -> Tuple[Tensor, Tensor]:
    """Cut one input/target pair out of ``source`` starting at row ``i``.

    The input is a chunk of at most ``bptt`` rows; the target is the same
    chunk shifted forward by one row and flattened. The chunk is shortened
    near the end of ``source`` so that the shifted target still fits.

    Arguments:
        source: Tensor indexed along dimension 0 by position
        i: int, index of the first row of the chunk

    Returns:
        tuple ``(data, target)`` where ``data`` is ``source[i:i + seq_len]``
        and ``target`` is ``source[i + 1:i + 1 + seq_len]`` flattened to 1-D
    """
    rows_left = len(source) - 1 - i
    chunk_len = bptt if bptt < rows_left else rows_left
    stop = i + chunk_len
    inputs = source[i:stop]
    shifted = source[i + 1:stop + 1]
    return inputs, shifted.reshape(-1)

In [12]:
# Model hyperparameters
ntokens = len(vocab)  # vocabulary size: embedding rows and output logits
emsize = 200          # embedding dimension (passed to the model as ``d_model``)
d_hid = 200           # feed-forward width inside each encoder layer
nlayers = 2           # number of stacked encoder layers
nhead = 2             # attention heads per encoder layer
dropout = 0.2         # dropout probability
model = TransformerModel(
    ntoken=ntokens,
    d_model=emsize,
    nhead=nhead,
    d_hid=d_hid,
    nlayers=nlayers,
    dropout=dropout,
).to(device)



In [13]:
# Loss over the vocabulary logits, plain SGD, and a scheduler that multiplies
# the learning rate by 0.95 every time ``scheduler.step()`` is called.
criterion = nn.CrossEntropyLoss()
lr = 5.0
optimizer = torch.optim.SGD(params=model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=1.0, gamma=0.95)

def train(model: nn.Module) -> None:
    """Run one pass over ``train_data``, updating ``model`` with clipped SGD steps.

    Relies on module-level state: ``train_data``, ``bptt``, ``ntokens``,
    ``criterion``, ``optimizer``, ``scheduler`` and ``get_batch``. The progress
    line also reads the module-level ``epoch``, so the function is expected to
    be called from inside the epoch loop.

    A progress line (learning rate, ms per batch, mean loss, perplexity) is
    printed every 200 batches. NOTE(review): the first report at batch 200
    sums the losses of batches 0..200 but still divides by 200.

    Arguments:
        model: the network to train; switched to training mode first
    """
    model.train()
    log_interval = 200
    num_batches = len(train_data) // bptt
    running_loss = 0.
    start_time = time.time()

    chunk_starts = range(0, train_data.size(0) - 1, bptt)
    for batch, i in enumerate(chunk_starts):
        data, targets = get_batch(train_data, i)
        logits = model(data).view(-1, ntokens)
        loss = criterion(logits, targets)

        optimizer.zero_grad()
        loss.backward()
        # Cap the gradient norm at 0.5 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        running_loss += loss.item()
        if batch % log_interval != 0 or batch <= 0:
            continue

        lr = scheduler.get_last_lr()[0]
        ms_per_batch = (time.time() - start_time) * 1000 / log_interval
        cur_loss = running_loss / log_interval
        ppl = math.exp(cur_loss)
        print(f'| epoch {epoch:3d} | {batch:5d}/{num_batches:5d} batches | '
              f'lr {lr:02.2f} | ms/batch {ms_per_batch:5.2f} | '
              f'loss {cur_loss:5.2f} | ppl {ppl:8.2f}')
        # Start a fresh reporting window.
        running_loss = 0
        start_time = time.time()

In [14]:
def evaluate(model: nn.Module, eval_data: Tensor) -> Tuple[float, float]:
    """Compute the average loss and next-token accuracy of ``model`` on ``eval_data``.

    Relies on module-level state: ``bptt``, ``ntokens``, ``criterion`` and
    ``get_batch``. Gradients are disabled and the model is put in eval mode.

    Arguments:
        model: the network to evaluate
        eval_data: batchified Tensor indexed along dimension 0 by position;
            needs at least 2 rows so that one input/target pair exists

    Returns:
        tuple ``(loss, accuracy)``: ``loss`` is the per-chunk criterion value
        weighted by chunk length and divided by ``len(eval_data) - 1``;
        ``accuracy`` is the fraction of targets whose arg-max prediction is
        correct

    Raises:
        ValueError: if ``eval_data`` has fewer than 2 rows
    """
    num_positions = len(eval_data) - 1
    if num_positions < 1:
        # Otherwise the loop below is skipped and the final divisions fail
        # with an uninformative ZeroDivisionError.
        raise ValueError(
            f'eval_data must contain at least 2 rows, got {len(eval_data)}; '
            'the dataset may be too small for the chosen batch size')

    model.eval()
    correct = 0
    total = 0
    total_loss = 0.
    with torch.no_grad():
        for i in range(0, num_positions, bptt):
            data, targets = get_batch(eval_data, i)
            seq_len = data.size(0)
            output = model(data)
            output_flat = output.view(-1, ntokens)
            # Weight each chunk's loss by its length so that a shorter final
            # chunk does not count as much as a full one.
            total_loss += seq_len * criterion(output_flat, targets).item()

            _, predicted = torch.max(output_flat, 1)
            correct += (predicted == targets).sum().item()
            total += targets.size(0)
    return total_loss / num_positions, correct / total



# Train for a fixed number of epochs, keep the weights with the lowest
# validation loss, then report loss / perplexity / accuracy on the test set.
best_val_loss = float('inf')
epochs = 3

# The checkpoint lives in a temporary directory that is deleted when the
# ``with`` block exits, so the best weights are reloaded before leaving it.
with TemporaryDirectory() as tempdir:
    best_model_params_path = os.path.join(tempdir, "best_model_params.pt")

    for epoch in range(1, epochs + 1):
        epoch_start_time = time.time()
        train(model)
        # evaluate() returns (loss, accuracy); only the loss is used here.
        val_loss,_ = evaluate(model, val_data)
        val_ppl = math.exp(val_loss)
        elapsed = time.time() - epoch_start_time
        print('-' * 89)
        print(f'| end of epoch {epoch:3d} | time: {elapsed:5.2f}s | '
            f'valid loss {val_loss:5.2f} | valid ppl {val_ppl:8.2f}')
        print('-' * 89)

        # Checkpoint only when the validation loss improves.
        # NOTE(review): if no epoch ever improves on ``inf`` (e.g. the loss is
        # NaN), nothing is saved and the ``torch.load`` below will fail.
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), best_model_params_path)

        # Advance the learning-rate schedule once per epoch.
        scheduler.step()
    model.load_state_dict(torch.load(best_model_params_path)) # load best model states

# Final evaluation of the restored best model on the test set.
test_loss,accuracy = evaluate(model, test_data)
test_ppl = math.exp(test_loss)
print('=' * 89)
print(f'| End of training | test loss {test_loss:5.2f} | '
      f'test ppl {test_ppl:8.2f}')
print('=' * 89)

print(f'Accuracy: {accuracy*100}%')

| epoch   1 |   200/ 2335 batches | lr 5.00 | ms/batch 846.72 | loss  8.03 | ppl  3057.10
| epoch   1 |   400/ 2335 batches | lr 5.00 | ms/batch 847.97 | loss  6.80 | ppl   895.62
| epoch   1 |   600/ 2335 batches | lr 5.00 | ms/batch 852.85 | loss  6.48 | ppl   654.83
| epoch   1 |   800/ 2335 batches | lr 5.00 | ms/batch 855.21 | loss  6.33 | ppl   563.05
| epoch   1 |  1000/ 2335 batches | lr 5.00 | ms/batch 845.76 | loss  6.16 | ppl   471.50
| epoch   1 |  1200/ 2335 batches | lr 5.00 | ms/batch 845.50 | loss  6.15 | ppl   470.81
| epoch   1 |  1400/ 2335 batches | lr 5.00 | ms/batch 849.73 | loss  6.09 | ppl   442.77
| epoch   1 |  1600/ 2335 batches | lr 5.00 | ms/batch 836.72 | loss  6.06 | ppl   429.03
| epoch   1 |  1800/ 2335 batches | lr 5.00 | ms/batch 843.18 | loss  6.00 | ppl   404.87
| epoch   1 |  2000/ 2335 batches | lr 5.00 | ms/batch 871.05 | loss  5.99 | ppl   400.50
| epoch   1 |  2200/ 2335 batches | lr 5.00 | ms/batch 867.92 | loss  5.94 | ppl   378.39
----------