# Pytorch Text - Language modeling
Notebook for following along with Pytorch Text interpretation tutorial, starting with nn.transformer and torchtext [Pytorch](https://pytorch.org/tutorials/beginner/transformer_tutorial.html) website tutorial.

### Choices for data

<br>

### Libaries and Modules
Importing the necessary libaries and modules for the notebook.

In [1]:
#Import cell
import captum
import copy
import json
import matplotlib as mpl
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import math
import numpy as np
import os, sys
import pandas as pd
import pickle as pk
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.models as models
import torchvision.transforms as transforms

from typing import Tuple
from torch import nn, Tensor
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.data import dataset
from torchtext.datasets import WikiText2
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

#device = torch.device('cuda' if torch.cuda.is_available else 'cpu')
device = 'cpu' #Cuda having issues on PC, so manual setting to cpu
print(f"Device: {device}")


print("Imports complete")

Device: cpu
Imports complete


<br>

### Importing and preparing data sets
Importing and preparing the data for the models.

In [2]:
#Gather datasets and prepare them for consumption
train_iter = WikiText2(split='train') #"consumed to make vocab"
tokenizer = get_tokenizer('basic_english')
vocab = build_vocab_from_iterator(map(tokenizer, train_iter), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

def data_process(raw_text_iter: dataset.IterableDataset) -> Tensor:
    """Converts raw text into a flat Tensors."""
    data = [torch.tensor(vocab(tokenizer(item)), dtype=torch.long) for item in raw_text_iter]
    return torch.cat(tuple(filter(lambda t:t.numel() > 0, data)))


In [3]:
#Importing data sets
train_iter, val_iter, test_iter = WikiText2()
train_data = data_process(train_iter)
val_data = data_process(val_iter)
test_data = data_process(test_iter)

print("Data sets successfully imported.")

Data sets successfully imported.


In [4]:
#Loader definitions
def batchify(data: Tensor, bsz: int) -> Tensor:
    """Divides the data into bsz seperate sequences, removing extra elements that wouldn't
        cleanly fit.   
    Args: 
        data: Tensor, shape [N]
        bsz: int, batch size
        
    Returns:
        Tensor of shap [N // bsz, bsz]"""
    seq_len = data.size(0) // bsz
    data = data[:seq_len * bsz]
    data = data.view(bsz, seq_len).t().contiguous()
    return data.to(device)

batch_size = 20
eval_batch_size = 10
train_data = batchify(train_data, eval_batch_size)
val_data = batchify(val_data, eval_batch_size)
test_data = batchify(test_data, eval_batch_size)
print(f"Loaders defined, running on device: {device}")

Loaders defined, running on device: cpu


In [5]:
#Setting seed value
torch.manual_seed(1247)

<torch._C.Generator at 0x1f28b749030>

<br>

### Class Definitions
<b>Classes:</b><br>
<ul>
    <li>TransformerModel - Language interpretting model.</li>
    <li>PositionalEncoding - Injects information about the relative or absolute position of tokens in the sequence.</li>
</ul>

In [6]:
#Class definition cell
class TransformerModel(nn.Module):
    def __init__(self, ntoken: int, d_model: int, nhead: int, d_hid: int,
                nlayers: int, dropout: float = 0.5) -> None:
        super().__init__()
        self.model_type = "Transformer"
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        encoder_layers = TransformerEncoderLayer(d_model, nhead, d_hid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.encoder = nn.Embedding(ntoken, d_model)
        self.d_model = d_model
        self.decoder = nn.Linear(d_model, ntoken)
        self.init_weights()
        return None
    
    def init_weights(self) -> None:
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)
        return None
    
    def forward(self, src: Tensor, src_mask: Tensor) -> Tensor:
        """Args: 
            src: Tensor, shape [seq_len, batch_size]
            src_mask: Tensor, shape [seq_len, seq_len]
        Returns:
            output Tensor, shape [seq_len, btch_size, ntoken]"""
        src = self.encoder(src) * math.sqrt(self.d_model)
        src = self.pos_encoder(src)
        print(src[0])
        output = self.transformer_encoder(src, src_mask)
        print(output[0])
        output = self.decoder(output)
        return output
        
        
class PositionalEncoding(nn.Module):
    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000) -> None:
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0)/d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position*div_term)
        pe[:, 0, 1::2] = torch.cos(position*div_term)
        self.register_buffer('pe', pe)
        return None
    
    def forward(self, x: Tensor) -> Tensor:
        """Args: 
            x: Tensors, shape[seq_len, batch_size, embedding_dim]"""
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)
        
print("Classes defined.")

Classes defined.


<br>

### Calculation functions
<b>Functions:</b><br>
<ul>
    <li>get_batch - generates a pair of input-target sequences for the transformer model. It subdivides the source data into chunks of length bptt. For language modelling, the model needs the following words as Target.</li>
    <li>generate_square_subsequent_mask - generates an upper triangular matrix of -inf with zeros on the diagonal.
</ul>

In [7]:
#Calculation functions cell
bptt = 35

def get_batch(source: Tensor, i: int) -> Tuple[Tensor, Tensor]:
    """Args:
        source: Tensor, shape [full_seq_len, batch_size]
        i: int
    Returns:
        tuple (data, target), where data has shape [seq)len, batch)size]
            and target has shape [seq_len * batch_size]"""
    seq_len = min(bptt, len(source) - 1 - i)
    data = source[i:i+seq_len]
    target = source[i+1:i+1+seq_len].reshape(-1)
    return data, target

def generate_square_subsequent_mask(sz: int) -> Tensor:
        """Generates an upper-triangular matrix of -inf, with zeros on diag."""
        return torch.triu(torch.ones(sz, sz) * float('inf'), diagonal=1)

print("Calculation functions defined.")

Calculation functions defined.


<br>

### Plotting functions
<b>Functions:</b>
<ul>
    <li></li>
</ul>

In [8]:
#Plotting functions Cell

print("Plotting functions defined.")

Plotting functions defined.


<br>

### Main code
#### Instantiating the model

In [9]:
ntokens = len(vocab)
emsize = 200
d_hid = 200
nlayers = 2
nhead = 2
dropout = 0.2
model = TransformerModel(ntokens, emsize, nhead, d_hid,
    nlayers, dropout).to(device)

#### Running the model
Here CrossEntropyLoss with a stochastic gradient descent optimizer will be used, with an initial learning rate of 5.0.

In [10]:
criterion = nn.CrossEntropyLoss()
lr = 5.0
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

In [11]:
def train(model: nn.Module) -> None:
    model.train()
    total_loss = 0.
    log_interval = 200
    start_time = time.time()
    src_mask = generate_square_subsequent_mask(bptt).to(device)
    
    num_batches = len(train_data)//bptt
    for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)):
        data, targets = get_batch(train_data, i)
        seq_len = data.size(0)
        if seq_len != bptt: #only on last batch
            src = src_mask[:seq_len, :seq_len]
        output = model(data, src_mask)
        loss = criterion(output.view(-1, ntokens), targets)
        
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()
        
        total_loss += loss.item()
        if batch%log_interval == 0 and batch > 0:
            lr = scheduler.get_last_lr()[0]
            ms_per_batch = (time.time() - start_time)*1000/log_interval
            cur_loss = total_loss/log_interval
            ppl = math.exp(cur_loss)
            print(f'| epoch {epoch:3d} | {batch:5d}/{num_batches:5d} batches | ' 
                  f'lr {lr:02.2f} | ms/batch {ms_per_batch:5.2f} | ' 
                  f'loss {cur_loss:5.2f} | ppl {ppl:8.2f}')
            total_loss = 0
            start_time = time.time()
    return None
    
def evaluate(model: nn.Module, eval_data: Tensor) -> float:
    model.eval()
    total_loss = 0.
    src_mask = generate_square_subsequent_mask(bptt).to(device)
    with torch.no_grad():
        for i in range(0, eval_data.size(0) - 1, bptt):
            data, targets = get_batch(eval_data, i)
            seq_len = data.size(0)
            if seq_len != bptt:
                src_mask = src_mask[:seq_len, :seq_len]
            output = model(data, src_mask)
            output_flat = output.view(-1, ntokens)
            total_loss += seq_len * criterion(output_flat, targets).item()
    print(output)
    return total_loss / (len(eval_data) - 1)

In [12]:
src_mask = generate_square_subsequent_mask(bptt).to(device)
with torch.no_grad():
    data, targets = get_batch(val_data, 0)
    seq_len = data.size(0)
    output = model(data, src_mask)

print(len(output), output[0][0][0])

tensor([[-0.8809,  0.0000,  0.7953,  ...,  0.0911, -1.3320,  2.4423],
        [-0.2253,  0.8223,  0.0000,  ...,  2.4156,  0.0000, -0.1998],
        [ 1.6933,  1.8127, -1.0218,  ...,  0.0000,  0.7724,  0.0000],
        ...,
        [-0.3732,  1.3096, -1.0327,  ...,  0.8822,  1.5489,  2.9052],
        [-0.0000,  0.6479,  0.0000,  ...,  2.2962,  1.1345,  2.6929],
        [ 0.9261, -0.3708, -0.9554,  ...,  0.0000, -1.7135,  0.4621]])
tensor([[nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        ...,
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan]])
35 tensor(nan)


In [13]:
evaluate(model, val_data[0:21])

tensor([[-0.7047,  0.5901,  0.6362,  ...,  0.0729, -1.0656,  1.9538],
        [-0.1803,  0.6579,  0.3755,  ...,  1.9325,  1.3520, -0.1598],
        [ 1.3546,  1.4502, -0.8175,  ...,  1.4399,  0.6179,  1.1222],
        ...,
        [-0.2986,  1.0477, -0.8262,  ...,  0.7058,  1.2391,  2.3242],
        [-0.7235,  0.5183,  1.0280,  ...,  1.8370,  0.9076,  2.1543],
        [ 0.7409, -0.2966, -0.7643,  ...,  2.0375, -1.3708,  0.3697]])
tensor([[nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        ...,
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan]])
tensor([[[nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         ...,
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  .

nan

In [17]:
model.transformer_encoder

TransformerEncoder(
  (layers): ModuleList(
    (0): TransformerEncoderLayer(
      (self_attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=200, out_features=200, bias=True)
      )
      (linear1): Linear(in_features=200, out_features=200, bias=True)
      (dropout): Dropout(p=0.2, inplace=False)
      (linear2): Linear(in_features=200, out_features=200, bias=True)
      (norm1): LayerNorm((200,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((200,), eps=1e-05, elementwise_affine=True)
      (dropout1): Dropout(p=0.2, inplace=False)
      (dropout2): Dropout(p=0.2, inplace=False)
    )
    (1): TransformerEncoderLayer(
      (self_attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=200, out_features=200, bias=True)
      )
      (linear1): Linear(in_features=200, out_features=200, bias=True)
      (dropout): Dropout(p=0.2, inplace=False)
      (linear2): Linear(in_features=200, out_features=

In [14]:
best_val_loss = float('inf')
epochs = 3
best_model = None

for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train(model)
    val_loss = evaluate(model, val_data)
    val_ppl = math.exp(val_loss)
    elapsed = time.time() - epoch_start_time
    print('-'*90)
    print(f'| end of epoch {epoch:3d} | time: {elapsed:5.2f} | '
         f'valid loss {val_loss:5.2f} | valid ppl {val_ppl:8.2f}')
    print('-'*90)
    
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model = copy.deepcopy(model)
        
    scheduler.step()

tensor([[-0.8809,  0.0000,  0.7953,  ...,  0.0911, -1.3320,  2.4423],
        [-0.8758,  1.0793,  0.0000,  ...,  0.0000,  0.3032,  2.0383],
        [ 0.1468,  1.2471, -0.0000,  ...,  1.0128, -0.7385,  0.2797],
        ...,
        [-1.1711,  0.5210, -0.0000,  ...,  2.2163,  1.4608,  0.5281],
        [ 1.7513,  0.2725, -0.3854,  ...,  1.2965, -0.8597,  1.4617],
        [-0.6098,  0.0000,  0.0000,  ...,  0.9786,  1.5738,  0.0000]],
       grad_fn=<SelectBackward0>)
tensor([[nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        ...,
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan]], grad_fn=<SelectBackward0>)
tensor([[nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        ...,
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan,

KeyboardInterrupt: 

<br>