In [None]:
import math

import torch
import torch.nn as nn

class TransformerConfig:
    """
    Plain container for the Transformer hyperparameters.

    Every constructor argument is stored unchanged as an attribute of the
    same name: d_model, num_heads, num_layers, vocab_size, max_len, dropout.
    """
    def __init__(
        self,
        d_model=256,
        num_heads=4,
        num_layers=6,
        vocab_size=10000,
        max_len=5000,
        dropout=0.1,
    ):
        # Architecture sizes.
        self.d_model, self.num_heads, self.num_layers = d_model, num_heads, num_layers
        # Vocabulary size and maximum sequence length.
        self.vocab_size, self.max_len = vocab_size, max_len
        # Dropout probability.
        self.dropout = dropout

class AbsolutePositionalEncoding(nn.Module):
    """
    Implements the sinusoidal positional encoding described in
    "Attention Is All You Need", Vaswani et al. (2017).

    PE(pos, 2i)   = sin(pos / 10000^(2i/d_model))
    PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model))

    The table is computed once in the constructor for `config.max_len`
    positions and stored as a buffer named `pe` with shape
    [1, max_len, d_model].
    """
    def __init__(self, config):
        """
        config: object exposing `max_len`, `d_model` and `dropout`.
        """
        super().__init__()

        self.max_len = config.max_len
        self.d_model = config.d_model
        self.dropout = nn.Dropout(config.dropout)

        pe = torch.zeros(self.max_len, self.d_model)
        # Positions as a column vector [max_len, 1]; float64 keeps the
        # intermediate angles accurate before they are stored into `pe`.
        pos_within_vector = torch.arange(0, self.max_len, dtype=torch.float64).unsqueeze(1)
        # The even dimension indices 0, 2, 4, ... (i.e. the "2i" of the formula).
        every_other_dim = torch.arange(0, self.d_model, 2, dtype=torch.float64)
        # With a = 2i/d_model:
        #   -ln(10000^a)        = -a * ln(10000)
        #   exp(-a * ln(10000)) = 10000^(-a)
        # Use self.d_model here: a bare `d_model` is not defined in this scope
        # and would otherwise resolve to an unrelated global (or fail).
        div_term = torch.exp(-math.log(10000.0) * every_other_dim / self.d_model)
        angles = pos_within_vector * div_term
        pe[:, 0::2] = torch.sin(angles)
        # For an odd d_model there is one fewer odd column than even columns,
        # so trim the angles to the number of odd columns.
        pe[:, 1::2] = torch.cos(angles[:, : self.d_model // 2])
        pe = pe.unsqueeze(0)

        # Register as a buffer (not a parameter)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        x shape: [batch_size, seq_len, d_model]

        Adds the first `seq_len` rows of the encoding table to `x`
        (broadcast over the batch dimension) and applies dropout.
        """
        seq_len = x.size(1)
        x = x + self.pe[:, :seq_len, :]
        return self.dropout(x)

class TransformerEncoder(nn.Module):
    """
    A simple Transformer encoder block.

    The constructor sets up a token embedding of size
    [config.vocab_size, config.d_model] and the sinusoidal positional
    encoding. No forward pass is defined in this class yet.
    """
    def __init__(self, config):
        """
        config: object exposing `vocab_size` and `d_model`, plus whatever
        AbsolutePositionalEncoding reads from it.
        """
        super().__init__()
        self.embed = nn.Embedding(config.vocab_size, config.d_model)
        # The positional-encoding class in this file is named
        # AbsolutePositionalEncoding; `PositionalEncoding` does not exist.
        self.pos_encoding = AbsolutePositionalEncoding(config)

class Transformer(nn.Module):
    """
    Minimal Transformer model, following Vaswani et al. (2017).

    The constructor builds a TransformerEncoder from `config` and stores
    it as `self.encoder`.
    """
    def __init__(self, config):
        super().__init__()
        encoder = TransformerEncoder(config)
        self.encoder = encoder

In [71]:
'''
Absolute Positional Encoding
'''

import torch
import math

# Build a small sinusoidal positional-encoding table step by step.
max_len = 100
d_model = 12
pe = torch.zeros(max_len, d_model)

# Positions as a column vector, and the even dimension indices 0, 2, 4, ...
pos_within_vector = torch.arange(max_len, dtype=torch.float64)[:, None]
every_other_dim = torch.arange(0, d_model, 2, dtype=torch.float64)

# With a = 2i/d_model:
#   -ln(10000^a)        = -a * ln(10000)
#   exp(-a * ln(10000)) = 10000^(-a)
div_term = (-math.log(10000.0) * every_other_dim / d_model).exp()
print("pos_within_vector.shape", pos_within_vector.shape)
print("div_term.shape", div_term.shape)

# Even columns take the sine, odd columns the cosine of the same angles.
pe[:, 0::2] = (pos_within_vector * div_term).sin()
pe[:, 1::2] = (pos_within_vector * div_term).cos()

# Add a leading batch dimension: [1, max_len, d_model].
pe = pe[None]
pe.shape, pe[0]

# pe[0]



pos_within_vector.shape torch.Size([100, 1])
div_term.shape torch.Size([6])


(torch.Size([1, 100, 12]),
 tensor([[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  ...,  1.0000e+00,
           0.0000e+00,  1.0000e+00],
         [ 8.4147e-01,  5.4030e-01,  2.1378e-01,  ...,  1.0000e+00,
           4.6416e-04,  1.0000e+00],
         [ 9.0930e-01, -4.1615e-01,  4.1768e-01,  ...,  9.9999e-01,
           9.2832e-04,  1.0000e+00],
         ...,
         [ 3.7961e-01, -9.2515e-01,  8.8807e-01,  ...,  9.7824e-01,
           4.5008e-02,  9.9899e-01],
         [-5.7338e-01, -8.1929e-01,  7.6926e-01,  ...,  9.7779e-01,
           4.5472e-02,  9.9897e-01],
         [-9.9921e-01,  3.9821e-02,  6.1489e-01,  ...,  9.7734e-01,
           4.5936e-02,  9.9894e-01]]))