# Encoder-Only Transformers

In [148]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader

import lightning as L

# Create training dataset

In [149]:
# Create a mapping from vocabs to numbers as nn.Embedding can only take integers
token_to_id = {"Edison": 0,
               "is": 1,
               "handsome": 2,
               "but": 3,
               "very": 4,
               "lazy": 5,
               "Danica": 6,
               "loves": 7,
               "doesn't": 8,
               "work": 9,
               "Bekzod": 10,
               "<PAD>": 11}

# Create a mapping from numbers back to vocabs to interpret the output from the transformer
id_to_token = dict(map(reversed, token_to_id.items()))


In [150]:
# Create the training dataset
# As the input is going to be word embeddings, we only need the corresponding numbers from the mapping
inputs = torch.tensor([[token_to_id["Edison"],
                       token_to_id["is"],
                       token_to_id["handsome"],
                       token_to_id["<PAD>"],
                       token_to_id["<PAD>"],
                       token_to_id["<PAD>"]], 
                       
                      [token_to_id["Bekzod"],
                       token_to_id["very"],
                       token_to_id["lazy"],
                       token_to_id["doesn't"],
                       token_to_id["work"],
                       token_to_id["<PAD>"]],
                       
                      [token_to_id["Edison"],
                       token_to_id["is"],
                       token_to_id["lazy"],
                       token_to_id["but"],
                       token_to_id["handsome"],
                       token_to_id["<PAD>"]],
                        
                      [token_to_id["Edison"],
                       token_to_id["loves"],
                       token_to_id["work"],
                       token_to_id["but"],
                       token_to_id["lazy"],
                       token_to_id["<PAD>"]],
                       
                      [token_to_id["Edison"],
                       token_to_id["is"],
                       token_to_id["lazy"],
                       token_to_id["<PAD>"],
                       token_to_id["<PAD>"],
                       token_to_id["<PAD>"]],
                        
                      [token_to_id["but"],
                       token_to_id["Bekzod"],
                       token_to_id["doesn't"],
                       token_to_id["work"],
                       token_to_id["<PAD>"],
                       token_to_id["<PAD>"]],
                       
                      [token_to_id["Danica"],
                       token_to_id["doesn't"],
                       token_to_id["loves"],
                       token_to_id["<PAD>"],
                       token_to_id["<PAD>"],
                       token_to_id["<PAD>"]]])

# 0-Negative; 1-Positive
labels = torch.tensor([1, 0, 1, 0, 0, 0, 0])

dataset = TensorDataset(inputs, labels)
dataloader = DataLoader(dataset)

# Position Encoding

The formula for the (standard, used in the paper **Attention is all you need**) position encoding is:  
PE_(pos, 2i) = sin(pos / 10000^(2i / d_model))  
PE_(pos, 2i+1) = cos(pos / 10000^(2i / d_model))  


In [151]:
class PositionEncoding(nn.Module):

    def __init__(self, d_model=2, max_len=6):

        super().__init__()

        # pe stands for position encoding
        pe = torch.zeros(max_len, d_model)

        # position is a column matrix (2D) of size [max_len, 1], e.g. [[0.], [1.], [2.]]
        position = torch.arange(start=0, end=max_len, step=1).float().unsqueeze(1)

        # Step is set to 2 because of "2i" in the formula, note that it is a 1D tensor, e.g. [0., 2.] as each position can have multiple embedding values
        embedding_index = torch.arange(start=0, end=d_model, step=2).float()

        # div_term is a row matrix (1D) with the same size as embedding_index
        div_term = torch.tensor(10000.)**(embedding_index / d_model)

        # Note: calculating the sin and cos values in this way only works when d_model is an even number, if d_model is odd, there will be a shape mismatch
        pe[:, 0::2] = torch.sin(position / div_term)
        pe[:, 1::2] = torch.cos(position / div_term)

        self.register_buffer('pe', pe)
    

    def forward(self, word_embeddings):

        # Note: we might not need all the position encodings, as the number of tokens might not hit the maximum length (max_len)
        return word_embeddings + self.pe[:word_embeddings.size(0), :]


# Self-Attention

In [152]:
class Attention(nn.Module):

    def __init__(self, d_model=2):

        super().__init__()

        # Create the weights associated with the query, key and value values
        self.W_q = nn.Linear(in_features=d_model, out_features=d_model, bias=False)
        self.W_k = nn.Linear(in_features=d_model, out_features=d_model, bias=False)
        self.W_v = nn.Linear(in_features=d_model, out_features=d_model, bias=False)

        self.row_dim = 0
        self.col_dim = 1

    def forward(self, encodings_for_q, encodings_for_k, encodings_for_v, mask=None):

        # Create the Q, K and V matrices
        q = self.W_q(encodings_for_q)
        k = self.W_k(encodings_for_k)
        v = self.W_v(encodings_for_v)

        # Calculate the similarity score between the query values and key values
        sims = torch.matmul(q, k.transpose(dim0=self.row_dim, dim1=self.col_dim))

        # Scale the similarity score with the square root of d_model
        scaled_sims = sims / torch.tensor((k.size(self.col_dim))**0.5)

        device = scaled_sims.device
        
        # Mask the scaled similarity scores of the later tokens so that the earlier tokens can't cheat. Note: -1e9 is an approximation of negative infinity
        if mask is not None:
            # Move your mask to mps:0, or mask would live in cpu by default
            mask = mask.to(device)
            scaled_sims = scaled_sims.masked_fill(mask=mask, value=-1e9)

        # Applying the softmax function to the scaled similarites determines the percentages of influence each token (in columns) should have on the others (in rows)
        attention_percents = F.softmax(scaled_sims, dim=self.col_dim)

        # attention_scores are basically the contextualised embeddings
        attention_scores = torch.matmul(attention_percents, v)

        return attention_scores


# Multi-Head Attention

In [153]:
class MultiHeadAttention(nn.Module):

    def __init__(self, d_model=2, head=2):

        super().__init__()

        # Number of heads
        self.head = head

        self.attention = Attention(d_model=d_model)

        # Linear layer to aggregate the attention scores from different heads
        self.fc = nn.Linear(in_features=head*d_model, out_features=d_model)
        
    def forward(self, encodings_for_q, encodings_for_k, encodings_for_v, mask=None):

        # List to store each head's attention values
        agg_attention_values = []
        
        # Create attention heads
        for h in range(self.head):

            attention_values_h = self.attention(encodings_for_q,
                                                encodings_for_k,
                                                encodings_for_v,
                                                mask=mask)

            agg_attention_values.append(attention_values_h)

        # Concatenate them as an input to the linear layer
        agg_attention_values = torch.cat(agg_attention_values, dim=1)

        # Run them through a linear layer
        # Note: this is not the traditional approch of how multi-head attention
        # print("the size of agg_attention_values is: ", agg_attention_values.size())
        fc_layer_out = self.fc(agg_attention_values)

        return fc_layer_out


# Encoder-only Transformer

In [154]:
class EncoderOnlyTransformer(L.LightningModule):

    def __init__(self, num_tokens, d_model, head, max_len):

        super().__init__()

        # Word Embeddings
        self.we = nn.Embedding(num_embeddings=num_tokens, embedding_dim=d_model)

        # Position Encodings
        self.pe = PositionEncoding(d_model=d_model, max_len=max_len)

        # Multi-Head Attention
        self.multi_head_attention = MultiHeadAttention(d_model=d_model, head=head)

        # Classification head, out_features=2 because it's a binary classification task
        self.cls = nn.Linear(in_features=max_len*d_model, out_features=2)

        # Calculate the loss with Cross Entropy; softmax is already included
        self.loss = nn.CrossEntropyLoss()

    # The size of token_ids just needs to be a 1D tensor (without batching), unlike nn.LSTM, which requires the size of the input tensor to be [seq_len, batch_size, input_size]
    def forward(self, token_ids):

        # Create word embeddings
        word_embeddings = self.we(token_ids)

        # Add position encodings to the word embeddings
        position_encoded = self.pe(word_embeddings)

        # Multi-head Attention
        self_attention_values = self.multi_head_attention(position_encoded,
                                                          position_encoded,
                                                          position_encoded,
                                                          mask=None)
        
        # Add residual connections
        # The shape of the residual_connection_values is [max_len, d_model]
        residual_connection_values = position_encoded + self_attention_values

        # Note: We need to concatenate the contextualised embeddings of the tokens in the residual connection values
        # before passing it into the linear head
        cls_input = residual_connection_values.flatten()
        # It will return the logits
        fc_layer_out = self.cls(cls_input)

        return fc_layer_out
    
    def configure_optimizers(self):
        return Adam(self.parameters(), lr=0.1)
    
    def training_step(self, batch, batch_idx):
        
        # input_tokens is a 2D tensor of size [batch_size, seq_len]
        input_tokens, labels = batch
        # print(batch, '\n')
        # print(input_tokens[0])
        # outputs is fc_layer_out, so they share the same size
        outputs = self.forward(input_tokens[0])
        # Cross Entropy loss will automatically apply softmax to the outputs
        loss = self.loss(outputs, labels[0])

        return loss


In [176]:
# Before we train the model, let's see what the model outputs for fun
model = EncoderOnlyTransformer(num_tokens=len(token_to_id), d_model=2, head=2, max_len=6)

model_input = torch.tensor([token_to_id["Bekzod"],
                            token_to_id["doesn't"],
                            token_to_id["work"],
                            token_to_id["<PAD>"],
                            token_to_id["<PAD>"],
                            token_to_id["<PAD>"]])

input_length = model_input.size(dim=0)

# predictions is a 1D tensor of size [2] that contains the raw scores (logits) of the two classes
predictions = model(model_input)
# print(predictions)
# Need to do argmax explicitly since the cross entropy loss is not used in a normal forward pass
predicted_label = torch.argmax(predictions)
print(predicted_label)


tensor(1)


In [177]:
# That means we need to train the model...
trainer = L.Trainer(max_epochs=100)
trainer.fit(model, train_dataloaders=dataloader)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name                 | Type               | Params | Mode 
--------------------------------------------------------------------
0 | we                   | Embedding          | 24     | train
1 | pe                   | PositionEncoding   | 0      | train
2 | multi_head_attention | MultiHeadAttention | 22     | train
3 | cls                  | Linear             | 26     | train
4 | loss                 | CrossEntropyLoss   | 0      | train
--------------------------------------------------------------------
72        Trainable params
0         Non-trainable params
72        Total params
0.000     Total estimated model params size (MB)
/Users/edison/Git/pytorch-playground/myenv/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value

Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=100` reached.


In [179]:
# Run the same code after training
model_input = torch.tensor([token_to_id["Bekzod"],
                            token_to_id["doesn't"],
                            token_to_id["work"],
                            token_to_id["<PAD>"],
                            token_to_id["<PAD>"],
                            token_to_id["<PAD>"]])

input_length = model_input.size(dim=0)

# predictions is a 1D tensor of size [2] that contains the raw scores (logits) of the two classes
predictions = model(model_input)
# print(predictions)
# Need to do argmax explicitly since the cross entropy loss is not used in a normal forward pass
predicted_label = torch.argmax(predictions)
print(predicted_label)


tensor(0)


In [196]:
# Test the model with the entire training dataset
predicted_labels = []

for input in inputs:
    predicted_label = torch.argmax(model(input))
    predicted_labels.append(predicted_label.item())

for pair in zip(inputs, predicted_labels):
    input_id, label = pair
    input_token = list(map(lambda id: id_to_token[id.item()], input_id))
    print(f"{input_token}: {label}\n")

['Edison', 'is', 'handsome', '<PAD>', '<PAD>', '<PAD>']: 1

['Bekzod', 'very', 'lazy', "doesn't", 'work', '<PAD>']: 0

['Edison', 'is', 'lazy', 'but', 'handsome', '<PAD>']: 1

['Edison', 'loves', 'work', 'but', 'lazy', '<PAD>']: 0

['Edison', 'is', 'lazy', '<PAD>', '<PAD>', '<PAD>']: 0

['but', 'Bekzod', "doesn't", 'work', '<PAD>', '<PAD>']: 0

['Danica', "doesn't", 'loves', '<PAD>', '<PAD>', '<PAD>']: 0



In [160]:
# Test it with a sentence that is not included in the training dataset
model_input = torch.tensor([token_to_id["Danica"],
                            token_to_id["loves"],
                            token_to_id["Edison"],
                            token_to_id["but"],
                            token_to_id["doesn't"],
                            token_to_id["work"]])

input_length = model_input.size(dim=0)

# predictions is a 1D tensor of size [2] that contains the raw scores (logits) of the two classes
predictions = model(model_input)
# print(predictions)
# Need to do argmax explicitly since the cross entropy loss is not used in a normal forward pass
predicted_label = torch.argmax(predictions)
print(predicted_label)


tensor(0)
