In [None]:
import numpy as np
import matplotlib.pyplot as plt

def getPositionEncoding(seq_len, d, n=10000):
    P = np.zeros((seq_len, d))
    for k in range(seq_len):
        for i in np.arange(int(d/2)):
            denominator = np.power(n, 2*i/d)
            P[k, 2*i] = np.sin(k/denominator)
            P[k, 2*i+1] = np.cos(k/denominator)
    return P

P = getPositionEncoding(seq_len=4, d=4, n=100)
print(P)

def plotSinusoid(k, d=512, n=10000):
    x = np.arange(0, 100, 2)
    denominator = np.power(n, 2*x/d)
    y = np.sin(k/denominator)
    plt.plot(x, y)
    plt.title('k = ' + str(k))

fig = plt.figure(figsize=(15, 4))    
for i in range(4):
    plt.subplot(141 + i)
    plotSinusoid(i*4)

P = getPositionEncoding(seq_len=500, d=1024, n=10000)
cax = plt.matshow(P)
plt.gcf().colorbar(cax)

In [6]:
import torch
import torch.nn as nn
import math


class PositionalEncoding(nn.Module):
    def __init__(self, seq_len, d_model, dropout_prob=0.1):
        super(PositionalEncoding, self).__init__()
        """
        pe(pos, 2i) = sin(pos / 10000 ^ (2i / d_model))
        pe(pos, 2i + 1) = cos(pos / 10000 ^ (2i / d_model))
        """
        self.dropout = nn.Dropout(p=dropout_prob)
        
        P = torch.zeros((seq_len, d_model))  # Shape: (seq_len, d_model)

        position = torch.arange(0, seq_len).unsqueeze(1)  # Shape: (seq_len, 1)

        div_term = torch.exp(
            torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model)   # Shape: (d_model / 2)
        )
        
        P[:, 0::2] = torch.sin(position * div_term)  # Shape: (seq_len, d_model)
        P[:, 1::2] = torch.cos(position * div_term)  # Shape: (seq_len, d_model)

        P = P.unsqueeze(0)  # Shape: (1, seq_len, d_model)
        self.register_buffer("pe", P)

    def forward(self, x):
        # Shape: x: (batch_size, seq_len, d_model)
        seq_len = x.size(1)
        x = x + self.pe[:, :seq_len, :]  # Shape: (batch_size, seq_len, d_model)
        return self.dropout(x)

In [7]:
class Embedding(nn.Module):
    def __init__(self, vocab_dim, d_model):
        super(Embedding, self).__init__()

        self.emb = nn.Embedding(vocab_dim, d_model)
        self.d_model = d_model

    def forward(self, x):  
        # Shape: x: (batch_size, seq_len) - input token indices
        return self.emb(x) * math.sqrt(self.d_model)  # Shape: (batch_size, seq_len, d_model)

In [8]:
class EmbeddingWithPositionalEncoding(nn.Module):
    def __init__(self, vocab_dim, d_model, seq_len, dropout_prob=0.1):
        super(EmbeddingWithPositionalEncoding, self).__init__()

        self.embedding = Embedding(vocab_dim, d_model)
        self.positional_encoding = PositionalEncoding(seq_len, d_model, dropout_prob)

    def forward(self, x):
        # Shape: x: (batch_size, seq_len)
        x = self.embedding(x)  # Shape: (batch_size, seq_len, d_model)
        x = self.positional_encoding(x)  # Shape: (batch_size, seq_len, d_model)
        return x

In [9]:
vocab_dim = 50      # Vocabulary size (number of unique tokens)
d_model = 10        # Embedding dimension
seq_len = 30        # Maximum sequence length (for positional encoding)
dropout_prob = 0.1  # Dropout probability
batch_size = 2      # Batch size (number of sequences in a batch)
input_seq_len = 20  # Sequence length for input (actual input length)

# Initialize the combined model
model = EmbeddingWithPositionalEncoding(vocab_dim, d_model, seq_len, dropout_prob)

# Example 2D input vector: Shape (batch_size, input_seq_len)
input_tensor = torch.randint(0, vocab_dim, (batch_size, input_seq_len))  # Shape: (2, 20)

# Forward pass
output = model(input_tensor)
print(f"Input shape: {input_tensor.shape}")  # (2, 20)
print(f"Output shape: {output.shape}")       # (2, 20, 10)

Input shape: torch.Size([2, 20])
Output shape: torch.Size([2, 20, 10])


In [10]:
import copy

def clones(module, N):
    "Produce N identical layers."
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

# clones(nn.Linear(32, 32), 4)

In [188]:
def dot_product_attention(query, key, value, mask=None, dropout=None):
    """
    Shapes: 
    query - (batch_size, seq_len_q, d_k)
    key - (batch_size, seq_len_kv, d_k)
    value - (batch_size, seq_len_kv, d_v)
    mask - (batch_size, seq_ken_q, seq_len_q)
    """
    d_k = math.sqrt(query.size(-1))

    scores = torch.matmul(query, key.transpose(-2, -1)) / d_k

    if mask is not None:
        scores = scores.masked_fill(mask == 0, -1e-9)
    p_attn = scores.softmax(dim=-1)
    if dropout is not None:
        p_attn = dropout(p_attn)
    
    return torch.matmul(p_attn, value), p_attn

# query = torch.rand(32, 128, 64)
# key = torch.rand(32, 128, 64)
# value = torch.rand(32, 128, 64)
# dot_product_attention(query, key, value)[0].size()

In [189]:
import gc

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads, dropout=0.1):
        super(MultiHeadAttention, self).__init__()

        assert d_model % num_heads == 0, "Error, d_model should be divisible to num_heads"

        self.dim_head = d_model // num_heads
        self.h = num_heads
        self.linears = clones(nn.Linear(d_model, d_model), 4) # Q, K, V, Output linear
        self.dropout = nn.Dropout(p=dropout)
        self.attention = None

    def forward(self, query, key, value, mask=None):
        if mask is not None:
            mask = mask.unsqueeze(1)

        n_batches = query.size(0)

        query, key, value = [
            lin(x).view(n_batches, -1, self.h, self.dim_head).transpose(1, 2)
            for lin, x in zip(self.linears, (query, key, value))
        ]

        x, self.attention = dot_product_attention(
            query, key, value, mask, self.dropout
        )

        x = (
            x.transpose(1, 2)
            .contiguous()
            .view(n_batches, -1, self.h * self.dim_head)
        )

        del query
        del key
        del value
        gc.collect()
        return self.linears[-1](x)

In [190]:
# multihead = MultiHeadAttention(d_model=512, num_heads=8)

# q = torch.rand((2, 100, 512))
# k = torch.rand((2, 200, 512))
# v = torch.rand((2, 200, 512))

# multihead.forward(q, k, v).size()

In [191]:
class PointWiseFeedForward(nn.Module):
    def __init__(self, d_model):
        super(PointWiseFeedForward, self).__init__()
        self.feed_forward = nn.Linear(in_features=d_model, out_features=d_model)
        self.norm = nn.LayerNorm(d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        feed_x = self.relu(self.feed_forward(x))
        # norm_x = self.norm(feed_x)
        return x + feed_x

In [None]:
class EncoderBlock(nn.Module):
    def __init__(self, d_model: int, attention: MultiHeadAttention, feed_forward: PointWiseFeedForward, dropout: float = 0.1):
        super(EncoderBlock, self).__init__()

        self.d_model = d_model
        self.attention = attention
        self.feed_forward = feed_forward
        self.dropout = nn.Dropout(p=dropout)
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x: torch.tensor, mask=None):
        z = self.norm(x)
        attention_out = self.dropout(self.attention(query=z, key=z, value=z, mask=mask)) + x
        feed_forward_out = self.feed_forward(attention_out)

        return self.norm(feed_forward_out + attention_out)

In [193]:
class Encoder(nn.Module):
    def __init__(self, d_model, seq_len, num_heads, dropout_prob, vocab_dim, num_blocks=2):
        super(Encoder, self).__init__()

        self.pos_embedding = EmbeddingWithPositionalEncoding(vocab_dim, d_model, seq_len, dropout_prob)

        self.blocks = clones(
            EncoderBlock(
                d_model, 
                MultiHeadAttention(d_model, num_heads, dropout_prob), 
                PointWiseFeedForward(d_model), dropout_prob
            ), 
            num_blocks
        )

    def forward(self, x, mask=None):
        out = self.pos_embedding(x)

        for block in self.blocks:
            out = block(out)
        
        return out
        

vocab_dim = 50      # Vocabulary size (number of unique tokens)
d_model = 64        # Embedding dimension
seq_len = 30        # Maximum sequence length (for positional encoding)
dropout_prob = 0.1  # Dropout probability
batch_size = 2      # Batch size (number of sequences in a batch)
input_seq_len = 20
num_heads = 4

enc = Encoder(d_model, seq_len, num_heads, dropout_prob, vocab_dim)

enc.forward(torch.randint(0, vocab_dim, (batch_size, seq_len))).size()

torch.Size([2, 30, 64])

In [194]:
# for masked multihead attention
def decoder_mask(size):
    attn_shape = (1, size, size)
    subsequent_mask = torch.triu(torch.ones(attn_shape), diagonal=1).type(
        torch.uint8
    )
    return subsequent_mask == 0

In [195]:
class DecoderBlock(nn.Module):
    def __init__(self, d_model, num_heads, dropout_prob, attention: MultiHeadAttention, feed_forward: PointWiseFeedForward, *args, **kwargs):
        super(DecoderBlock, self).__init__(*args, **kwargs)

        self.attention = attention
        self.enc_dec_attention = attention
        self.feed_forward = feed_forward

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(p=dropout_prob)

    def forward(self, x: torch.tensor, in_memory: torch.tensor, enc_mask, dec_mask):
        z = self.norm1(x)
        attention_out = self.dropout(self.attention(query=z, key=z, value=z, mask=dec_mask)) + x

        z = self.norm2(attention_out)
        enc_dec_attention = self.dropout(self.enc_dec_attention(query=attention_out, key=in_memory, value=in_memory, mask=dec_mask)) + attention_out

        z = self.norm3(enc_dec_attention)
        feed_forward_out = self.feed_forward(z)

        return self.dropout(feed_forward_out) + enc_dec_attention

In [196]:
class Decoder(nn.Module):
    def __init__(self, d_model, seq_len, num_heads, dropout_prob, vocab_dim, num_blocks=2, *args, **kwargs):
        super(Decoder, self).__init__(*args, **kwargs)

        self.pos_embedding = EmbeddingWithPositionalEncoding(vocab_dim, d_model, seq_len, dropout_prob)

        self.blocks = clones(
            DecoderBlock(
                d_model, 
                num_heads=2,
                dropout_prob=0.1,
                attention=MultiHeadAttention(d_model, num_heads, dropout_prob), 
                feed_forward=PointWiseFeedForward(d_model)
            ), 
            num_blocks
        )

        self.norm = nn.LayerNorm(d_model)


    def forward(self, x, memory, enc_mask, dec_mask):
        out = self.pos_embedding(x)
        for block in self.blocks:
            x = block(out, memory, enc_mask, dec_mask)
        return self.norm(x)


In [197]:
# Define hyperparameters
vocab_dim = 50
d_model = 64
seq_len = 30
dropout_prob = 0.1
batch_size = 2
num_heads = 4
num_blocks = 2

# Initialize decoder
decoder = Decoder(d_model, seq_len, num_heads, dropout_prob, vocab_dim, num_blocks)

# Create dummy inputs
x = torch.randint(0, vocab_dim, (batch_size, seq_len))  # Input tokens
memory = torch.rand(batch_size, seq_len, d_model)      # Encoder output
enc_mask = torch.ones(batch_size, seq_len, seq_len)    # Encoder mask (all ones for simplicity)
dec_mask = decoder_mask(seq_len)                       # Decoder mask

# Forward pass
output = decoder(x, memory, enc_mask, dec_mask)
print(output.size())  # Expected: (batch_size, seq_len, d_model)


torch.Size([2, 30, 64])


In [198]:
class Generator(nn.Module):
    def __init__(self, vocab_dim, d_model, *args, **kwargs):
        super(Generator, self).__init__(*args, **kwargs)

        self.projection = nn.Linear(d_model, vocab_dim)

    def forward(self, x):
        return self.projection(x)

In [199]:
class Transformer(nn.Module):
    def __init__(self, d_model, num_heads, vocab_dim, seq_len, num_encoder_layers, num_decoder_layers, dropout_prob, *args, **kwargs):
        super(Transformer, self).__init__(*args, **kwargs)

        self.encoder = Encoder(
            d_model=d_model,
            seq_len=seq_len,
            num_heads=num_heads,
            dropout_prob=dropout_prob,
            vocab_dim=vocab_dim,
            num_blocks=num_encoder_layers
        )

        self.decoder = Decoder(
            d_model=d_model,
            seq_len=seq_len,
            num_heads=num_heads,
            dropout_prob=dropout_prob,
            vocab_dim=vocab_dim,
            num_blocks=num_decoder_layers
        )

        self.generator = Generator(
            vocab_dim=vocab_dim,
            d_model=d_model
        )

    def forward(self, enc_input, dec_input, enc_mask, dec_mask):
        memory = self.encoder(enc_input)
        print(memory.size())

        decoder_out = self.decoder(dec_input, memory, enc_mask, dec_mask)
        print(decoder_out.size())

        out = self.generator(decoder_out)
        print(out.size())
        return out


In [200]:
# Define hyperparameters
vocab_dim = 50      # Vocabulary size (number of unique tokens)
d_model = 64        # Embedding dimension
seq_len = 30        # Maximum sequence length
dropout_prob = 0.1  # Dropout probability
batch_size = 2      # Batch size
num_heads = 4       # Number of attention heads
num_encoder_blocks = 2  # Number of encoder layers
num_decoder_blocks = 2  # Number of decoder layers

# Initialize Transformer model
transformer = Transformer(
    vocab_dim=vocab_dim,
    d_model=d_model,
    seq_len=seq_len,
    num_heads=num_heads,
    dropout_prob=dropout_prob,
    num_encoder_layers=num_encoder_blocks,
    num_decoder_layers=num_decoder_blocks,
)

# Create dummy data
src = torch.randint(0, vocab_dim, (batch_size, seq_len))  # Source input
tgt = torch.randint(0, vocab_dim, (batch_size, seq_len))  # Target input
src_mask = torch.ones(batch_size, seq_len, seq_len)      # Encoder mask
tgt_mask = decoder_mask(seq_len)                         # Decoder mask

# Forward pass
output = transformer(src, tgt, src_mask, tgt_mask)

# Output logits
print("Output shape:", output.size())  # Expected: (batch_size, seq_len, vocab_dim)


torch.Size([2, 30, 64])
torch.Size([2, 30, 64])
torch.Size([2, 30, 50])
Output shape: torch.Size([2, 30, 50])


In [201]:
def test_multihead_attention():
    batch_size, seq_len, d_model, num_heads = 2, 30, 64, 4
    mha = MultiHeadAttention(d_model, num_heads)
    query = key = value = torch.rand(batch_size, seq_len, d_model)
    mask = torch.ones(batch_size, seq_len, seq_len)
    out = mha(query, key, value, mask)
    assert out.size() == (batch_size, seq_len, d_model), "Output shape mismatch"


In [202]:
# try to make one forward pass

# Create dummy data
src = torch.randint(0, vocab_dim, (batch_size, seq_len))  # Source input
tgt = torch.randint(0, vocab_dim, (batch_size, seq_len))  # Target input
src_mask = torch.ones(batch_size, seq_len, seq_len)      # Encoder mask
tgt_mask = decoder_mask(seq_len)                         # Decoder mask

# Forward pass
transformer = Transformer(
    vocab_dim=vocab_dim,
    d_model=d_model,
    seq_len=seq_len,
    num_heads=num_heads,
    dropout_prob=dropout_prob,
    num_encoder_layers=num_encoder_blocks,
    num_decoder_layers=num_decoder_blocks,
)

In [203]:
def decoder_mask(size):
    # Generates a causal mask for the decoder
    mask = torch.triu(torch.ones(size, size), diagonal=1).type(torch.bool)
    return ~mask  # Invert: True = allowed, False = blocked


In [204]:
# Training setup
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(transformer.parameters(), lr=0.001)

# Generate synthetic data (identity mapping)
num_batches = 100
for batch in range(num_batches):
    src = torch.randint(0, vocab_dim, (batch_size, seq_len))  # Source input
    tgt = src.clone()  # Target output (identity mapping)
    tgt_input = tgt[:, :-1]  # Remove the last token for input
    tgt_output = tgt[:, 1:]  # Remove the first token for target

    # Masks
    src_mask = torch.ones(batch_size, seq_len, seq_len)
    tgt_mask = decoder_mask(seq_len - 1)

    # Forward pass
    output = transformer(src, tgt_input, src_mask, tgt_mask)

    # Reshape output and target for loss calculation
    loss = criterion(output.view(-1, vocab_dim), tgt_output.view(-1))

    # Backward pass and optimization
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    print(f"Batch {batch+1}/{num_batches}, Loss: {loss.item():.4f}")


torch.Size([2, 30, 64])


RuntimeError: The size of tensor a (29) must match the size of tensor b (4) at non-singleton dimension 1