# Transformers

In [26]:
import torch
from torch import nn
import torch.nn.functional as f
import numpy as np

In [27]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
nn_Softargmax = nn.Softmax

## Multi-head attention

In [28]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads, p, d_input=None):
        """
        Multi-Head Attention Layer
        
        d_model (int):  
        num_heads (int):
        p (int):
        d_input (int): 
        
        """
        super().__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        
        if d_input is None:
            d_xq = d_xk = d_xv = d_model
        else:
            d_xq, d_xk, d_xv = d_input
            
        # Make sure that the embedding dimension of model is a multiple of the number of heads
        assert d_model % self.num_heads == 0
        
        self.d_k = d_model // self.num_heads
        
        # These are still of dimension d_model. They will be split into number of heads
        self.W_q = nn.Linear(d_xq, d_model, bias=False)
        self.W_k = nn.Linear(d_xk, d_model, bias=False)
        self.W_v = nn.Linear(d_xv, d_model, bias=False)
        
        # Outputs of all sub-layers need to be of dimension d_model
        self.W_h = nn.Linear(d_model, d_model)
        
    def scaled_dot_product_attention(self, Q, K, V):
        batch_size = Q.size(0)
        k_length = K.size(-2)
        
        # Scaling by d_k so that the soft(arg)max doesn't saturate
        Q = Q / np.sqrt(self.d_k) # (bs, n_heads, q_length, dim_per_head)
        scores = torch.matmul(Q, K.transpose(2,3)) # (bs, n_heads, q_length, k_lengt)
        
        A = nn_Softargmax(dim=-1)(scores) # (bs, n_heads, q_length, k_length)
        
        # Get the weighted average of the values
        H = torch.matmul(A, V) # (bs, n_heads, q_length, dim_per_head)
        
        return H, A
    
    def split_heads(self, x, batch_size):
        """
        Split the last dimension into (heads X depth)
        Return after transpose to put in shape (batch_size X num_heads X seq_length X d_k)
        """
        return x.view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
    
    def group_heads(self, x, batch_size):
        """
        Combine the heads again to get (batch_size X seq_length X (num_heads times d_k))
        """
        return x.transpose(1, 2).contiguous().view(batch_size, -1, self.num_heads * self.d_k)
    
    def forward(self, X_q, X_k, X_v):
        """
        Returns:
            H (bs, q_length, dim)
            A ()
        """
        batch_size, seq_length, dim = X_q.size()
        
        # After transforming, split into num_heads
        Q = self.split_heads(self.W_q(X_q), batch_size) # (bs, n_heads, q_length, dim_per_head)
        K = self.split_heads(self.W_k(X_k), batch_size) # (bs, n_heads, k_length, dim_per_head)
        V = self.split_heads(self.W_v(X_v), batch_size) # (bs, n_heads, v_length, dim_per_head)
        
        # Calculate the attention weights for each of the heads
        H_cat, A = self.scaled_dot_product_attention(Q, K, V)
        
        # Put all the heads back together by concat
        H_cat = self.group_heads(H_cat, batch_size) # (bs, q_length, dim)
        
        # Final linear layer
        H = self.W_h(H_cat) # (bs, q_length, dim)
        
        return H, A

### Just a sanity checks

In [29]:
temp_mha = MultiHeadAttention(d_model=512, num_heads=8, p=0)
def print_out(Q, K, V):
    temp_out, temp_attn = temp_mha.scaled_dot_product_attention(Q, K, V)
    print('Attention weights are: ', temp_attn.squeeze())
    print('Output is:', temp_out.squeeze())

To check our self attention works - if the query matches with one of the key values, it should have all the attention focused there, with the value returned as the value at that index.

In [30]:
test_K = torch.tensor(
    [[10, 0, 0],
    [0, 10, 0],
    [0, 0, 10],
    [0, 0, 10]]
).float()[None, None]

test_V = torch.tensor(
    [[1, 0, 0],
     [10, 0, 0],
     [100, 5, 0],
     [1000, 6, 0]
    ]
).float()[None, None]

test_Q = torch.tensor(
    [[0, 10, 0]]
).float()[None, None]

print_out(test_Q, test_K, test_V)

Attention weights are:  tensor([3.7266e-06, 9.9999e-01, 3.7266e-06, 3.7266e-06])
Output is: tensor([1.0004e+01, 4.0993e-05, 0.0000e+00])


We can see that if focuses on the second key and returns the second value.

If we give a query that matches two keys exactly, it should return the averaged value of the two values of those two keys.

In [31]:
test_Q = torch.tensor([[0, 0, 10]]).float()
print_out(test_Q, test_K, test_V)

Attention weights are:  tensor([1.8633e-06, 1.8633e-06, 5.0000e-01, 5.0000e-01])
Output is: tensor([549.9979,   5.5000,   0.0000])


We see that it focuses equally on the 3rd and 4th key and reutnrs the average of their values.

In [32]:
# Now giving all the queries at the same time
test_Q = torch.tensor(
    [[0, 0, 10],
     [0, 10, 0],
     [10, 10, 0],
    ]
).float()[None, None]

print_out(test_Q, test_K, test_V)

Attention weights are:  tensor([[1.8633e-06, 1.8633e-06, 5.0000e-01, 5.0000e-01],
        [3.7266e-06, 9.9999e-01, 3.7266e-06, 3.7266e-06],
        [5.0000e-01, 5.0000e-01, 1.8633e-06, 1.8633e-06]])
Output is: tensor([[5.5000e+02, 5.5000e+00, 0.0000e+00],
        [1.0004e+01, 4.0993e-05, 0.0000e+00],
        [5.5020e+00, 2.0497e-05, 0.0000e+00]])


### 1D convolution with kernel_size=1

This is basically an MLP with 1 hidden layer and ReLU activation applied to each and every element in the set

In [33]:
class CNN1d(nn.Module):
    def __init__(self, d_model, hidden_dim, p):
        super().__init__()
        self.klconvL1 = nn.Linear(d_model, hidden_dim)
        self.klconvL2 = nn.Linear(hidden_dim, d_model)
        self.activation = nn.ReLU()
        
    def forward(self, x):
        x = self.klconvL1(x)
        x = self.activation(x)
        x = self.klconvL2(x)
        return x        

## Transformer encoder

Now we have all components for our Transformer Encoder block shown below.

In [34]:
class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, conv_hidden_dim, p=0.1):
        super().__init__()
        
        self.mha = MultiHeadAttention(d_model, num_heads, p)
        self.cnn = CNN1d(d_model, conv_hidden_dim, p)
        
        self.layernorm1 = nn.LayerNorm(normalized_shape=d_model, eps=1e-6)
        self.layernorm2 = nn.LayerNorm(normalized_shape=d_model, eps=1e-6)
        
    def forward(self, x):
        
        # Multi-head attention
        attn_output, _ = self.mha(x, x, x) # (batch_size, input_seq_len, d_model)
        
        # Layer norm after adding the residual connection
        out1 = self.layernorm1(x + attn_output) # (batch_size, input_seq_len, d_model)
        
        # Feed forward
        cnn_output = self.cnn(out1) # (batch_size, input_seq_len, d_model)
        
        # Second layer norm after adding residual connection
        out2 = self.layernorm2(out1 + cnn_output) # (batch_size, input_seq_len, d_model)
        
        return out2

### Encoder
#### Blocks of N Encoder Layers + Positional encoding + Inut embedding

Self attention by itself does not have any recurrence or convlutions so to make it sensitive to position we must provide additional positional encodings. These are calculated as follows:

$$E(p, 2i) = \sin(p/10000^{2i/d})$$
$$E(p, 2i+1) = \cos(p/10000^(2i/d)$$

In [35]:
def create_sinusoidal_embeddings(nb_p, dim, E):
    E.requires_grad = False
    
    theta = np.array([
        [p / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)]
        for p in range(nb_p)
    ])
    
    E[:, 0::2] = torch.FloatTensor(np.sin(theta[:, 0::2]))
    E[:, 1::2] = torch.FloatTensor(np.cos(theta[:, 1::2]))
    
    E = E.to(device)
    
class Embeddings(nn.Module):
    def __init__(self, d_model, vocab_size, max_position_embeddings, p):
        super().__init__()
        self.word_embeddings = nn.Embedding(vocab_size, d_model, padding_idx=1)
        self.position_embeddings = nn.Embedding(max_position_embeddings, d_model)
        create_sinusoidal_embeddings(
            nb_p=max_position_embeddings,
            dim=d_model,
            E=self.position_embeddings.weight
        )
        
        self.LayerNorm = nn.LayerNorm(d_model, eps=1e-12)
        
    def forward(self, input_ids):
        print(f'[Embeddings] input_idx: {input_ids.size()}')
        seq_length = input_ids.size(1)
        position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
        position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
        
        # Get word embeddings for each input id
        word_embeddings = self.word_embeddings(input_ids)
        
        # Get position embeddings for each position id
        position_embeddings = self.position_embeddings(position_ids)
        
        # Add them both
        embeddings = word_embeddings + position_embeddings
        
        # Layer norm
        embeddings = self.LayerNorm(embeddings)
        
        return embeddings

In [36]:
class Encoder(nn.Module):
    def __init__(self, num_layers, d_model, num_heads, ff_hidden_dim, input_vocab_size, maximum_position_encoding, p=0.1):
        super().__init__()
        
        self.d_model = d_model
        self.num_layers = num_layers
        
        self.embedding = Embeddings(d_model, input_vocab_size, maximum_position_encoding, p)
        
        self.enc_layers = nn.ModuleList()
        for _ in range(num_layers):
            self.enc_layers.append(EncoderLayer(d_model, num_heads, ff_hidden_dim, p))
            
    def forward(self, x):
        x = self.embedding(x) # Transform to (batch_size, input_seq_length, d_model)
        
        for i in range(self.num_layers):
            x = self.enc_layers[i](x)
            
        return x # (batch_size, input_seq_len, d_model)

In [37]:
from torchtext.datasets import AG_NEWS
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

In [38]:
tokenizer = get_tokenizer('basic_english')
train_data = AG_NEWS(split='train')

In [39]:
def yield_tokens(data_iter):
    for _, text in data_iter:
        yield tokenizer(text)

In [40]:
vocab = build_vocab_from_iterator(yield_tokens(train_data), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

In [41]:
text_pipeline = lambda x: vocab(tokenizer(x))
label_pipeline = lambda x: int(x) - 1

In [42]:
from torch.utils.data import DataLoader

def collate_batch(batch):
    label_list, text_list, offsets = [], [], [0]
    for (_label, _text) in batch:
         label_list.append(label_pipeline(_label))
         processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
         text_list.append(processed_text)
         offsets.append(processed_text.size(0))
    label_list = torch.tensor(label_list, dtype=torch.int64)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text_list = torch.cat(text_list)
    return label_list.to(device), text_list.to(device), offsets.to(device)

In [43]:
num_class = len(set([label for (label, text) in train_data]))

vocab_size = len(vocab)

conv_hidden_dim = 64
print(f'# class: {num_class}')
print(f'vocab size: {vocab_size}')
print(f'conv_hidden_dim: {conv_hidden_dim}')

# class: 4
vocab size: 95811
conv_hidden_dim: 64


In [44]:
class TransformerClassifier(nn.Module):
    def __init__(self, num_layers, d_model, num_heads, conv_hidden_dim, input_vocab_size, num_answers):
        super().__init__()
        self.encoder = Encoder(num_layers, 
            d_model, 
            num_heads, 
            conv_hidden_dim, 
            input_vocab_size, 
            maximum_position_encoding=10000
        )
        self.dense = nn.Linear(d_model, num_answers)

    def forward(self, x):
        x = self.encoder(x)
        x, _ = torch.max(x, dim=1)
        x = self.dense(x)
        return x

In [45]:
model = TransformerClassifier(num_layers=1, d_model=32, num_heads=2,
                              conv_hidden_dim=64, input_vocab_size=vocab_size, num_answers=2)

model.to(device)

TransformerClassifier(
  (encoder): Encoder(
    (embedding): Embeddings(
      (word_embeddings): Embedding(95811, 32, padding_idx=1)
      (position_embeddings): Embedding(10000, 32)
      (LayerNorm): LayerNorm((32,), eps=1e-12, elementwise_affine=True)
    )
    (enc_layers): ModuleList(
      (0): EncoderLayer(
        (mha): MultiHeadAttention(
          (W_q): Linear(in_features=32, out_features=32, bias=False)
          (W_k): Linear(in_features=32, out_features=32, bias=False)
          (W_v): Linear(in_features=32, out_features=32, bias=False)
          (W_h): Linear(in_features=32, out_features=32, bias=True)
        )
        (cnn): CNN1d(
          (klconvL1): Linear(in_features=32, out_features=64, bias=True)
          (klconvL2): Linear(in_features=64, out_features=32, bias=True)
          (activation): ReLU()
        )
        (layernorm1): LayerNorm((32,), eps=1e-06, elementwise_affine=True)
        (layernorm2): LayerNorm((32,), eps=1e-06, elementwise_affine=True)
   

In [46]:
import time

def train(dataloader, criterion, optimizer, epoch):
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 500
    start_time = time.time()

    for idx, (label, text, offsets) in enumerate(dataloader):
        optimizer.zero_grad()
        print(f'text shape: {text.size()}')
        print(f'text : {text}')
        print(f'label shape: {label.size()}')
        print(f'label: {label}')
        predicted_label = model(text)
        loss = criterion(predicted_label, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        total_acc += (predicted_label.argmax(1) == label).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches '
                  '| accuracy {:8.3f}'.format(epoch, idx, len(dataloader),
                                              total_acc/total_count))
            total_acc, total_count = 0, 0
            start_time = time.time()

def evaluate(dataloader, criterion):
    model.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for idx, (label, text, offsets) in enumerate(dataloader):
            predicted_label = model(text)
            loss = criterion(predicted_label, label)
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
    return total_acc/total_count

In [47]:
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset

In [48]:
EPOCHS = 10
LR = 5
BATCH_SIZE = 64

In [49]:
# optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
criterion = nn.CrossEntropyLoss()
total_accu = None
train_iter, test_iter = AG_NEWS()
train_dataset = to_map_style_dataset(train_iter)
test_dataset = to_map_style_dataset(test_iter)
num_train = int(len(train_dataset) * 0.95)

split_train_, split_valid_ = \
    random_split(train_dataset, [num_train, len(train_dataset) - num_train])

train_dataloader = DataLoader(split_train_, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)
valid_dataloader = DataLoader(split_valid_, batch_size=BATCH_SIZE,
                              shuffle=False, collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                             shuffle=False, collate_fn=collate_batch)

In [52]:
from torchtext import legacy

ImportError: cannot import name 'legacy' from 'torchtext' (/Users/mghifary/.pyenv/versions/3.10.5/lib/python3.10/site-packages/torchtext/__init__.py)

In [50]:
for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_dataloader, criterion, optimizer, epoch)
    accu_val = evaluate(valid_dataloader, criterion)
    if total_accu is not None and total_accu > accu_val:
      scheduler.step()
    else:
       total_accu = accu_val
    print('-' * 59)
    print('| end of epoch {:3d} | time: {:5.2f}s | '
          'valid accuracy {:8.3f} '.format(epoch,
                                           time.time() - epoch_start_time,
                                           accu_val))
    print('-' * 59)

text shape: torch.Size([2767])
text : tensor([ 8775, 31823,  2578,  ...,     7,   950,     1])
label shape: torch.Size([64])
label: tensor([0, 3, 1, 2, 1, 1, 2, 1, 2, 3, 0, 0, 1, 1, 2, 3, 2, 1, 0, 2, 1, 3, 3, 0,
        2, 0, 3, 0, 1, 3, 1, 2, 1, 3, 0, 2, 3, 3, 1, 1, 2, 3, 2, 3, 2, 0, 3, 0,
        1, 2, 1, 0, 0, 1, 3, 2, 3, 1, 2, 2, 2, 1, 1, 1])
[Embeddings] input_idx: torch.Size([2767])


IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)

In [None]:
def evaluate(data_loader):
    data_iterator = iter(data_loader)
    nb_batches = len(data_loader)
    model.eval()
    acc = 0 
    for batch in data_iterator:
        x = batch.text.to(device)
        y = batch.label.to(device)
                
        out = model(x)
        acc += (out.argmax(1) == y).cpu().numpy().mean()

    print(f"Eval accuracy: {acc / nb_batches}")
    
def train(train_loader, valid_loader):
    
    for epoch in range(epochs):
        train_iterator, valid_iterator = iter(train_loader), iter(valid_loader)
        nb_batches_train = len(train_loader)
        train_acc = 0
        model.train()
        losses = 0.0

        for batch in train_iterator:
            x = batch.text.to(device)
            y = batch.label.to(device)
            
            out = model(x)  # ①

            loss = f.cross_entropy(out, y)  # ②
            
            model.zero_grad()  # ③

            loss.backward()  # ④
            losses += loss.item()

            optimizer.step()  # ⑤
                        
            train_acc += (out.argmax(1) == y).cpu().numpy().mean()
        
        print(f"Training loss at epoch {epoch} is {losses / nb_batches_train}")
        print(f"Training accuracy: {train_acc / nb_batches_train}")
        print('Evaluating on validation:')
        evaluate(valid_loader)

In [None]:
train(train_loader, valid_loader)

In [None]:
evaluate(test_loader)

In [None]:
max_len = 200
text = data.Field(sequential=True, fix_length=max_len, batch_first=True, lower=True, dtype=torch.long)
label = data.LabelField(sequential=False, dtype=torch.long)
ds_train, ds_test = legacy.datasets.IMDB.splits(text, label, root='./')
print('train : ', len(ds_train))
print('test : ', len(ds_test))
print('train.fields :', ds_train.fields)

NameError: name 'data' is not defined