This notebook demonstrates the implementation of **Learnable Positional Encoding** using PyTorch, integrated into a simple Transformer-based model. A dummy dataset is created to train and evaluate the model. Additionally, we visualize the learned positional encodings and the training loss over epochs.

1. Importing necessary Libraries

In [1]:
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import numpy as np

2. Implementing Learnable Positional Encoding with nn.parameter

In [2]:
class LearnablePositionalEncoding(nn.Module):
    
    def __init__(self, max_len, d_model):
        super().__init__()
        # Initialize positional encodings as a learnable parameter
        self.pos_encoding = nn.Parameter(torch.zeros(1, max_len, d_model))
    
    def forward(self, x):
        return x + self.pos_encoding[:, :x.size(1), :]

d_model = 32  # Dimensionality of the model
max_len = 60  # Maximum sequence length
pos_encoding = LearnablePositionalEncoding(max_len, d_model)
# dummy_input = torch.zeros((1, max_len, d_model))  # Dummy input tensor
# output = pos_encoding(dummy_input)
# print(f'Output shape: {output.shape}')


Output shape: torch.Size([1, 60, 32])


3. Creating a Dummy Dataset

In [3]:
class DummyDataset(Dataset):
    def __init__(self, num_samples=1000, seq_length=10, vocab_size=50):
        """
        Initializes the dataset with random integer sequences.
        
        Args:
            num_samples (int): Number of samples in the dataset
            seq_length (int): Length of each sequence
            vocab_size (int): Size of the vocabulary (number of unique tokens)
        """
        super(DummyDataset, self).__init__()
        self.num_samples = num_samples  # Number of samples in the dataset
        self.seq_length = seq_length  # Length of each sequence
        self.vocab_size = vocab_size  # Size of the vocabulary (number of unique tokens)
        # Generate random integer sequences as dummy data
        self.data = torch.randint(0, vocab_size, (num_samples, seq_length))
    
    def __len__(self):
        return self.num_samples #Returns the total number of samples.
    
    def __getitem__(self, idx):
        """
        Retrieves the input-target pair for a given index.
        
        Args:
            idx (int): Index of the sample
        
        Returns:
            tuple: (input_sequence, target_sequence)
        """
        return self.data[idx, :-1], self.data[idx, 1:]

4. Setting Up the DataLoader

In [4]:
# Parameters
batch_size = 32  
seq_length = 10  
num_samples = 1000  
vocab_size = 50  

# Create Dataset and DataLoader
dataset = DummyDataset(num_samples=num_samples, seq_length=seq_length, vocab_size=vocab_size)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# for X, Y in dataloader:
#     print(f'Input shape: {X.shape}')   
#     print(f'Target shape: {Y.shape}')
#     break


5. Defining the Transformer Model with Learnable Positional Encoding

In [5]:
class SimpleTransformerModel(nn.Module):
    def __init__(self, vocab_size, d_model, max_len, num_heads, num_layers, dropout=0.1):
        """
        Initializes the Transformer model with learnable positional encoding.
        
        Args:
            vocab_size (int): Size of the vocabulary
            d_model (int): Dimensionality of the model
            max_len (int): Maximum sequence length
            num_heads (int): Number of attention heads
            num_layers (int): Number of Transformer encoder layers
            dropout (float): Dropout rate
        """
        super(SimpleTransformerModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)  # Token embedding layer
        self.pos_encoding = LearnablePositionalEncoding(max_len, d_model)  # Positional encoding layer
        # Transformer encoder layers
        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=num_heads, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.fc_out = nn.Linear(d_model, vocab_size)  # Output layer to project back to vocabulary size
    
    def forward(self, src):
        embedded = self.embedding(src)  # (batch_size, seq_length, d_model)
        embedded = self.pos_encoding(embedded)  # Add positional encoding
        embedded = embedded.permute(1, 0, 2)  # (seq_length, batch_size, d_model) for transformer
        encoded = self.transformer_encoder(embedded)  # (seq_length, batch_size, d_model)
        encoded = encoded.permute(1, 0, 2)  # (batch_size, seq_length, d_model)
        output = self.fc_out(encoded)  # (batch_size, seq_length, vocab_size)
        return output

# Example usage
vocab_size = 50  
d_model = 32  
max_len = 60  
num_heads = 4  
num_layers = 2
model = SimpleTransformerModel(vocab_size, d_model, max_len, num_heads, num_layers)
print(model)


SimpleTransformerModel(
  (embedding): Embedding(50, 32)
  (pos_encoding): LearnablePositionalEncoding()
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-1): 2 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=32, out_features=32, bias=True)
        )
        (linear1): Linear(in_features=32, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=2048, out_features=32, bias=True)
        (norm1): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (fc_out): Linear(in_features=32, out_features=50, bias=True)
)




6. Training the Model

In [6]:
def train_model(model, dataloader, epochs=10, learning_rate=0.001):
    #loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    
    loss_history = []
    
    model.train()  
    for epoch in range(epochs):
        total_loss = 0  
        for X, Y in dataloader:
            optimizer.zero_grad()  
            output = model(X)  
            # Reshape output and target for loss computation
            output = output.view(-1, vocab_size)  
            Y = Y.view(-1) 
            loss = criterion(output, Y) 
            loss.backward() 
            optimizer.step()  
            total_loss += loss.item()  
        avg_loss = total_loss / len(dataloader)  
        loss_history.append(avg_loss)  
        print(f'Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}')  
    return loss_history  

# Train the model
epochs = 10  
learning_rate = 0.001  
loss_history = train_model(model, dataloader, epochs=epochs, learning_rate=learning_rate)


Epoch 1/10, Loss: 3.9617
Epoch 2/10, Loss: 3.8970
Epoch 3/10, Loss: 3.8289
Epoch 4/10, Loss: 3.7269
Epoch 5/10, Loss: 3.6386
Epoch 6/10, Loss: 3.5626
Epoch 7/10, Loss: 3.4959
Epoch 8/10, Loss: 3.4390
Epoch 9/10, Loss: 3.3996
Epoch 10/10, Loss: 3.3478
