# Transformer from Scratch - Elias Hossain  
### Graduate Student at Mississippi State University  

This notebook implements a Transformer model from scratch, inspired by the **"Attention Is All You Need"** paper.  
We will follow these key steps:

1. **Prepare Dummy Data**
2. **Tokenization & Embeddings**
3. **Positional Encoding**
4. **Multi-Head Self-Attention**
5. **Feed-Forward Network**
6. **Stack Transformer Blocks**
7. **Training with Loss and Accuracy Computation**
---

### Import Libraries

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np



### Step 1: Create Dummy Data

In [13]:
SRC_VOCAB_SIZE = 10  # Small vocabulary for the source language
TGT_VOCAB_SIZE = 10  # Small vocabulary for the target language
MAX_LEN = 5  # Max length of input/output sequence
EMBED_DIM = 8  # Embedding dimension
BATCH_SIZE = 2  # Small batch size

# Dummy input (random integers representing words in vocab)
source_data = torch.randint(0, SRC_VOCAB_SIZE, (BATCH_SIZE, MAX_LEN))
target_data = torch.randint(0, TGT_VOCAB_SIZE, (BATCH_SIZE, MAX_LEN))

print("Source Data:", source_data)
print("Target Data:", target_data)

Source Data: tensor([[6, 9, 6, 2, 8],
        [5, 8, 9, 4, 4]])
Target Data: tensor([[6, 6, 3, 9, 6],
        [7, 5, 5, 4, 0]])


### Step 2: Token Embeddings

In [14]:
class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size, embed_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
    
    def forward(self, x):
        return self.embedding(x)

source_embedding = TokenEmbedding(SRC_VOCAB_SIZE, EMBED_DIM)
target_embedding = TokenEmbedding(TGT_VOCAB_SIZE, EMBED_DIM)

source_embedded = source_embedding(source_data)
target_embedded = target_embedding(target_data)
print("Token Embedding Shape:", source_embedded.shape)  # Should be (BATCH_SIZE, MAX_LEN, EMBED_DIM)

Token Embedding Shape: torch.Size([2, 5, 8])


### Step 3: Positional Encoding

In [15]:
class PositionalEncoding(nn.Module):
    def __init__(self, embed_dim, max_len=MAX_LEN):
        super().__init__()
        pe = torch.zeros(max_len, embed_dim)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embed_dim, 2).float() * -(np.log(10000.0) / embed_dim))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.pe = pe.unsqueeze(0)
    
    def forward(self, x):
        return x + self.pe[:, :x.size(1), :]

pos_encoder = PositionalEncoding(EMBED_DIM)
source_encoded = pos_encoder(source_embedded)
target_encoded = pos_encoder(target_embedded)
print("Positional Encoding Added:", source_encoded.shape)


Positional Encoding Added: torch.Size([2, 5, 8])


### Step 4: Multi-Head Self-Attention

In [21]:
class MultiHeadSelfAttention(nn.Module):
    def __init__(self, embed_dim, num_heads):
        super().__init__()
        assert embed_dim % num_heads == 0, "Embedding dimension must be divisible by number of heads"
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        
        self.qkv_proj = nn.Linear(embed_dim, embed_dim * 3)
        self.fc_out = nn.Linear(embed_dim, embed_dim)
        self.scale = torch.sqrt(torch.tensor(self.head_dim, dtype=torch.float32))
    
    def forward(self, x):
        B, T, C = x.shape
        qkv = self.qkv_proj(x).chunk(3, dim=-1)  # Split into Q, K, V
        Q, K, V = [t.view(B, T, self.num_heads, self.head_dim).transpose(1, 2) for t in qkv]
        
        scores = (Q @ K.transpose(-2, -1)) / self.scale
        attn = torch.softmax(scores, dim=-1)
        out = (attn @ V).transpose(1, 2).reshape(B, T, C)
        return self.fc_out(out)

### Step 5: Feed-Forward Network

In [22]:
class FeedForwardNetwork(nn.Module):
    def __init__(self, embed_dim, hidden_dim):
        super().__init__()
        self.fc1 = nn.Linear(embed_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, embed_dim)
        self.relu = nn.ReLU()
    
    def forward(self, x):
        return self.fc2(self.relu(self.fc1(x)))

### Step 6: Transformer Model with Output Projection

In [23]:
class TransformerBlock(nn.Module):
    def __init__(self, embed_dim, num_heads, hidden_dim):
        super().__init__()
        self.attn = MultiHeadSelfAttention(embed_dim, num_heads)
        self.norm1 = nn.LayerNorm(embed_dim)
        self.ffn = FeedForwardNetwork(embed_dim, hidden_dim)
        self.norm2 = nn.LayerNorm(embed_dim)
    
    def forward(self, x):
        attn_out = self.attn(x)
        x = self.norm1(x + attn_out)  # Residual Connection
        ffn_out = self.ffn(x)
        return self.norm2(x + ffn_out)  # Residual Connection

class Transformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, embed_dim, num_heads, hidden_dim, num_layers):
        super().__init__()
        self.src_embed = TokenEmbedding(src_vocab_size, embed_dim)
        self.pos_enc = PositionalEncoding(embed_dim)
        self.encoder = nn.ModuleList([TransformerBlock(embed_dim, num_heads, hidden_dim) for _ in range(num_layers)])
        self.fc_out = nn.Linear(embed_dim, tgt_vocab_size)  # Output projection layer
    
    def forward(self, src):
        src = self.pos_enc(self.src_embed(src))
        for layer in self.encoder:
            src = layer(src)
        return self.fc_out(src)  # Project to target vocab size

transformer = Transformer(SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, EMBED_DIM, num_heads=2, hidden_dim=32, num_layers=2)
optimizer = optim.Adam(transformer.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

### Training Step

In [24]:
num_epochs = 10
for epoch in range(num_epochs):
    output = transformer(source_data)
    target_labels = torch.randint(0, TGT_VOCAB_SIZE, (BATCH_SIZE, MAX_LEN))
    loss = criterion(output.view(-1, TGT_VOCAB_SIZE), target_labels.view(-1))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    # Calculate accuracy
    predictions = torch.argmax(output, dim=-1)
    correct = (predictions == target_labels).sum().item()
    total = target_labels.numel()
    accuracy = correct / total * 100
    
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}, Accuracy: {accuracy:.2f}%")

Epoch 1/10, Loss: 2.8160, Accuracy: 0.00%
Epoch 2/10, Loss: 2.7852, Accuracy: 0.00%
Epoch 3/10, Loss: 2.5829, Accuracy: 0.00%
Epoch 4/10, Loss: 2.6864, Accuracy: 0.00%
Epoch 5/10, Loss: 2.1746, Accuracy: 20.00%
Epoch 6/10, Loss: 2.5760, Accuracy: 10.00%
Epoch 7/10, Loss: 2.3655, Accuracy: 20.00%
Epoch 8/10, Loss: 2.9434, Accuracy: 0.00%
Epoch 9/10, Loss: 2.7594, Accuracy: 10.00%
Epoch 10/10, Loss: 2.8197, Accuracy: 0.00%
