In [26]:
import math
import torch
from torch import nn
import torch.nn.functional as F

# ABSOLUTE POSITIONAL ENCODING

## In absolute positional encoding, each token gets its own positional embedding, determined only by its position in the sequence

In [55]:
class PositionalEncoding(nn.Module):
    """Fixed sinusoidal (absolute) positional encoding table.

    Column ``2i`` holds ``sin(pos / 10000**(2i / d_model))`` and column
    ``2i + 1`` holds the cosine of the same angle, for every position
    ``pos`` in ``[0, max_sequence_length)``.

    Args:
        d_model: Width of the encoding (number of columns in the table).
        max_sequence_length: Number of positions (rows in the table).
    """

    def __init__(self, d_model, max_sequence_length):
        super().__init__()
        self.max_sequence_length = max_sequence_length
        self.d_model = d_model

    def forward(self):
        """Build the encoding table, printing every intermediate tensor.

        Returns:
            Float tensor of shape ``(max_sequence_length, d_model)``.
        """
        # One frequency per sin/cos column pair: exponents 0, 2, 4, ...
        even_i = torch.arange(0, self.d_model, 2).float()
        print("even_i:",even_i)
        denominator = torch.pow(10000, even_i/self.d_model)
        print("denominator:",denominator)
        # Column vector of positions so it broadcasts against the frequency row.
        position = (torch.arange(self.max_sequence_length).reshape(self.max_sequence_length, 1))
        print("position:",position)
        even_PE = torch.sin(position / denominator)
        print("even_PE:",even_PE)
        odd_PE = torch.cos(position / denominator)
        print("odd_PE:",odd_PE)
        # Stack on a new last axis, then flatten it away to interleave
        # sin/cos as [sin0, cos0, sin1, cos1, ...] along the feature axis.
        stacked = torch.stack([even_PE, odd_PE], dim=2)
        print("stacked:",stacked)
        PE = torch.flatten(stacked, start_dim=1, end_dim=2)
        # With an odd d_model there are ceil(d_model / 2) frequencies, so the
        # interleaved table has d_model + 1 columns; drop the surplus cosine
        # column. For even d_model this slice is a no-op.
        return PE[:, :self.d_model]

In [56]:
# Demo configuration for the sinusoidal table.
d_model, max_sequence_length = 512, 300
batch_size = 64

absolute_pe = PositionalEncoding(d_model=d_model, max_sequence_length=max_sequence_length)

# Batched position ids; the encoder's forward() takes no input, so these are
# not consumed by the call below.
positions = torch.arange(max_sequence_length)[None, :].expand(batch_size, -1)

# Build the table (forward() prints its intermediate tensors as it goes).
positional_encodings = absolute_pe()

even_i: tensor([  0.,   2.,   4.,   6.,   8.,  10.,  12.,  14.,  16.,  18.,  20.,  22.,
         24.,  26.,  28.,  30.,  32.,  34.,  36.,  38.,  40.,  42.,  44.,  46.,
         48.,  50.,  52.,  54.,  56.,  58.,  60.,  62.,  64.,  66.,  68.,  70.,
         72.,  74.,  76.,  78.,  80.,  82.,  84.,  86.,  88.,  90.,  92.,  94.,
         96.,  98., 100., 102., 104., 106., 108., 110., 112., 114., 116., 118.,
        120., 122., 124., 126., 128., 130., 132., 134., 136., 138., 140., 142.,
        144., 146., 148., 150., 152., 154., 156., 158., 160., 162., 164., 166.,
        168., 170., 172., 174., 176., 178., 180., 182., 184., 186., 188., 190.,
        192., 194., 196., 198., 200., 202., 204., 206., 208., 210., 212., 214.,
        216., 218., 220., 222., 224., 226., 228., 230., 232., 234., 236., 238.,
        240., 242., 244., 246., 248., 250., 252., 254., 256., 258., 260., 262.,
        264., 266., 268., 270., 272., 274., 276., 278., 280., 282., 284., 286.,
        288., 290., 292., 294., 

In [57]:
# Show the table built in the previous cell: one row per position, with
# sine and cosine values interleaved along the feature axis.
positional_encodings

tensor([[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  ...,  1.0000e+00,
          0.0000e+00,  1.0000e+00],
        [ 8.4147e-01,  5.4030e-01,  8.2186e-01,  ...,  1.0000e+00,
          1.0366e-04,  1.0000e+00],
        [ 9.0930e-01, -4.1615e-01,  9.3641e-01,  ...,  1.0000e+00,
          2.0733e-04,  1.0000e+00],
        ...,
        [ 9.9287e-01, -1.1921e-01, -5.8070e-01,  ...,  9.9949e-01,
          3.0783e-02,  9.9953e-01],
        [ 4.3614e-01, -8.9988e-01, -9.9991e-01,  ...,  9.9949e-01,
          3.0887e-02,  9.9952e-01],
        [-5.2158e-01, -8.5320e-01, -5.5859e-01,  ...,  9.9948e-01,
          3.0990e-02,  9.9952e-01]])

# RELATIVE POSITIONAL ENCODING

## In relative positional encoding, each token gets as many positional embeddings as there are tokens (one per offset to every other token)

In [52]:
class T5RelativePositionalEncoder(nn.Module):
    """Learned embeddings indexed by the clipped key-minus-query offset.

    Every (query, key) pair is mapped to its offset ``k - q``, the offset is
    clipped to ``[-max_position, max_position]`` (no bucketing is applied),
    and the clipped offset selects one learned vector of size ``num_heads``.

    Args:
        num_heads: Size of the vector learned for each offset.
        max_position: Largest absolute offset that gets its own embedding;
            larger offsets share the embedding of ``+/-max_position``.
    """

    def __init__(self, num_heads, max_position=512):
        super(T5RelativePositionalEncoder, self).__init__()
        self.max_position = max_position
        # Clipped offsets shifted by +max_position span 0 .. 2*max_position,
        # so the table needs exactly 2*max_position + 1 rows. Sizing it as
        # max_position**2 overflowed for max_position <= 2 and allocated far
        # too many rows for large values.
        self.embeddings_table = nn.Embedding(2 * max_position + 1, num_heads)

    def forward(self, seq_len_q, seq_len_k):
        """Look up an embedding for every query/key offset.

        Args:
            seq_len_q: Number of query positions.
            seq_len_k: Number of key positions.

        Returns:
            Tensor of shape ``(seq_len_q, seq_len_k, num_heads)``.
        """
        # Build the indices on the table's device so the lookup still works
        # after the module has been moved off the CPU.
        device = self.embeddings_table.weight.device
        range_vec_q = torch.arange(seq_len_q, device=device)
        print("range_vec_q :",range_vec_q)
        range_vec_k = torch.arange(seq_len_k, device=device)
        print("range_vec_k :",range_vec_k)
        # Entry [q, k] is the signed distance from query q to key k.
        relative_position = range_vec_k[None, :] - range_vec_q[:, None]
        print("relative_position :",relative_position)
        relative_position_clipped = torch.clamp(relative_position, -self.max_position, self.max_position)
        print("relative_position_clipped :",relative_position_clipped)
        # Shift into the non-negative range expected by nn.Embedding.
        final_mat = relative_position_clipped + self.max_position
        print("final_mat :",final_mat)
        embeddings = self.embeddings_table(final_mat)

        return embeddings

In [53]:

# Demo configuration for the relative encoder.
d_model, max_sequence_length = 512, 300
seq_len_q, seq_len_k = 10, 15
batch_size = 64

# NOTE(review): d_model is passed as num_heads, so every offset maps to a
# 512-wide vector — confirm this is intended.
absolute_re = T5RelativePositionalEncoder(num_heads=d_model, max_position=max_sequence_length)

# Batched position ids; not consumed by the encoder call below.
positions = torch.arange(max_sequence_length)[None, :].expand(batch_size, -1)

relative_encodings = absolute_re(seq_len_q, seq_len_k)

range_vec_q : tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
range_vec_k : tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])
relative_position : tensor([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14],
        [-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13],
        [-2, -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12],
        [-3, -2, -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11],
        [-4, -3, -2, -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10],
        [-5, -4, -3, -2, -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9],
        [-6, -5, -4, -3, -2, -1,  0,  1,  2,  3,  4,  5,  6,  7,  8],
        [-7, -6, -5, -4, -3, -2, -1,  0,  1,  2,  3,  4,  5,  6,  7],
        [-8, -7, -6, -5, -4, -3, -2, -1,  0,  1,  2,  3,  4,  5,  6],
        [-9, -8, -7, -6, -5, -4, -3, -2, -1,  0,  1,  2,  3,  4,  5]])
relative_position_clipped : tensor([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14],
        [-1,  0,  1,  2,  3,  4,  5,  6,  7, 

In [54]:
# Show the lookup result from the previous cell: one learned vector for
# every (query position, key position) pair.
relative_encodings

tensor([[[-2.0505e-01,  9.9041e-01, -5.2244e-01,  ..., -1.4293e-01,
          -1.6684e+00, -1.3944e+00],
         [ 3.7672e-02, -3.0596e-01, -2.4705e+00,  ..., -6.5935e-01,
           6.7230e-01, -1.9949e-01],
         [-5.3505e-01, -2.9749e-01,  9.6660e-02,  ..., -8.9415e-01,
          -3.7038e-01,  4.6801e-01],
         ...,
         [ 1.4012e+00,  3.2748e-02,  9.2254e-01,  ...,  2.7417e+00,
           1.4758e+00,  9.2321e-01],
         [ 1.2565e+00, -1.0146e+00,  7.4777e-01,  ...,  1.0888e+00,
           8.8507e-01,  3.6969e-01],
         [ 1.6324e+00,  1.3988e+00, -1.3409e+00,  ..., -3.8702e-01,
           1.2695e+00, -7.6887e-02]],

        [[-2.7509e-01, -1.0364e-02, -6.5098e-01,  ...,  6.7114e-01,
          -8.9518e-01, -4.6780e-01],
         [-2.0505e-01,  9.9041e-01, -5.2244e-01,  ..., -1.4293e-01,
          -1.6684e+00, -1.3944e+00],
         [ 3.7672e-02, -3.0596e-01, -2.4705e+00,  ..., -6.5935e-01,
           6.7230e-01, -1.9949e-01],
         ...,
         [ 2.8737e-02, -2

### But why use relative positional encoding?

- Relative positional encoding can be beneficial in scenarios where the model needs to capture dependencies between tokens that are based on their relative positions rather than their absolute positions. This can be especially useful in tasks where the order of tokens in a sequence is important but not necessarily fixed, such as language modeling or time series forecasting.
- The skewing operation in $S^{rel}$ is designed to model these relative dependencies efficiently. By introducing relative positional embeddings, the model can learn to attend to tokens that are at different distances from the current token, which can help capture long-range dependencies.

In contrast, absolute positional encoding provides a fixed embedding for each position in the sequence, but it doesn't inherently capture relative positional information. The use of relative positional encoding can potentially improve the model's performance on tasks where relative positions matter.