In [18]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

In [19]:
# Define the scaled dot-product attention
def scaled_dot_product_attention(query, key, value, mask=None):
    d_k = query.size(-1)
    scores = torch.matmul(query, key.transpose(-2, -1)) / torch.sqrt(torch.tensor(d_k, dtype=torch.float32))
    if mask is not None:
        scores = scores.masked_fill(mask == 0, -1e9)
    attention_weights = torch.nn.functional.softmax(scores, dim=-1)
    output = torch.matmul(attention_weights, value)
    return output, attention_weights

# Define the multi-head attention module
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super().__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        self.d_k = d_model // num_heads

        self.query = nn.Linear(d_model, d_model)
        self.key = nn.Linear(d_model, d_model)
        self.value = nn.Linear(d_model, d_model)
        self.out = nn.Linear(d_model, d_model)

    def forward(self, query, key, value, mask=None):
        batch_size = query.size(0)

        def split_heads(x):
            return x.view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)

        query = split_heads(self.query(query))
        key = split_heads(self.key(key))
        value = split_heads(self.value(value))

        attention_output, _ = scaled_dot_product_attention(query, key, value, mask)
        attention_output = attention_output.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        return self.out(attention_output)

In [20]:
# Define the feedforward network
class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.linear2(self.relu(self.linear1(x)))

In [21]:
# Define positional encoding
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * -(np.log(10000.0) / d_model))
        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        return x + self.pe[:, :x.size(1)]

In [22]:
# Define the Transformer block
class TransformerBlock(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.attention = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = FeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        attn_output = self.dropout(self.attention(x, x, x, mask))
        x = self.norm1(x + attn_output)
        ff_output = self.dropout(self.feed_forward(x))
        x = self.norm2(x + ff_output)
        return x

In [23]:
# Define the Transformer model
class Transformer(nn.Module):
    def __init__(self, input_vocab_size, target_vocab_size, d_model, num_heads, d_ff, num_layers, max_len=100, dropout=0.1):
        super().__init__()
        self.encoder_embedding = nn.Embedding(input_vocab_size, d_model)
        self.decoder_embedding = nn.Embedding(target_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_len)
        self.encoder_layers = nn.ModuleList([TransformerBlock(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([TransformerBlock(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.fc_out = nn.Linear(d_model, target_vocab_size)

    def forward(self, src, trg, src_mask=None, trg_mask=None):
        # Encoder
        src = self.encoder_embedding(src)
        src = self.positional_encoding(src)
        for layer in self.encoder_layers:
            src = layer(src, src_mask)

        # Decoder
        trg = self.decoder_embedding(trg)
        trg = self.positional_encoding(trg)
        for layer in self.decoder_layers:
            trg = layer(trg, trg_mask)

        # Output layer
        return self.fc_out(trg)

In [24]:
# Define hyperparameters
INPUT_VOCAB_SIZE = tokenizer.vocab_size  # Use the tokenizer's vocabulary size
TARGET_VOCAB_SIZE = tokenizer.vocab_size # Use the tokenizer's vocabulary size
D_MODEL = 512
NUM_HEADS = 8
D_FF = 2048
NUM_LAYERS = 6
MAX_LEN = 100

# Instantiate the model with the updated vocabulary size
model = Transformer(INPUT_VOCAB_SIZE, TARGET_VOCAB_SIZE, D_MODEL, NUM_HEADS, D_FF, NUM_LAYERS, MAX_LEN)

In [25]:

# Example input (batch of tokenized sequences)
src = torch.randint(0, INPUT_VOCAB_SIZE, (32, 10))  # Source batch (batch_size=32, seq_len=10)
trg = torch.randint(0, TARGET_VOCAB_SIZE, (32, 10))  # Target batch (batch_size=32, seq_len=10)


In [26]:

# Forward pass
output = model(src, trg)
print(output.shape)  # Should be (batch_size, seq_len, target_vocab_size)


torch.Size([32, 10, 50257])


In [27]:
# Training and Testing

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

# Training loop
def train_model(model, src_data, trg_data, epochs=10):
    model.train()
    for epoch in range(epochs):
        optimizer.zero_grad()
        output = model(src_data, trg_data[:, :-1])
        output = output.reshape(-1, output.shape[-1])
        trg = trg_data[:, 1:].reshape(-1)
        loss = criterion(output, trg)
        loss.backward()
        optimizer.step()
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss.item():.4f}")

In [28]:
# Generate dummy data
src_data = torch.randint(0, INPUT_VOCAB_SIZE, (32, 20))
trg_data = torch.randint(0, TARGET_VOCAB_SIZE, (32, 20))

# Train the model
train_model(model, src_data, trg_data, epochs=5)

Epoch 1/5, Loss: 10.9932
Epoch 2/5, Loss: 10.6207
Epoch 3/5, Loss: 10.3270
Epoch 4/5, Loss: 10.0739
Epoch 5/5, Loss: 9.8452


In [29]:
# Simple test for translation
def translate(model, src_sentence):
    model.eval()
    with torch.no_grad():
        src = torch.tensor(src_sentence).unsqueeze(0)
        trg = torch.zeros((1, MAX_LEN), dtype=torch.long)
        for i in range(1, MAX_LEN):
            output = model(src, trg[:, :i])
            next_word = output.argmax(dim=-1)[:, -1]
            trg[0, i] = next_word
            if next_word.item() == 1:  # Assuming 1 is the <eos> token
                break
        return trg.squeeze().tolist()

In [30]:
# Test translation
src_sentence = [5, 20, 30, 40, 50, 60, 70]  # Example tokenized source sentence
translation = translate(model, src_sentence)
print("Translated Sentence:", translation)


Translated Sentence: [0, 4241, 1693, 25313, 42242, 20346, 11910, 5752, 25313, 25313, 25313, 25313, 25313, 25313, 25313, 25313, 43823, 34602, 36845, 18472, 30890, 13507, 37825, 11602, 29366, 20346, 7740, 35066, 21591, 45528, 18472, 34654, 14232, 24051, 24494, 18706, 48922, 28370, 48922, 34028, 18472, 3412, 15690, 23014, 44888, 17206, 41055, 38348, 40887, 226, 38677, 20121, 35410, 9313, 11374, 35410, 9313, 11374, 7264, 8114, 8323, 8165, 35181, 34602, 40360, 6353, 25542, 25313, 30778, 25745, 48922, 12449, 17953, 34809, 584, 13487, 34602, 25745, 41115, 6990, 8868, 37208, 27739, 45528, 16341, 35657, 31342, 35657, 37722, 621, 24803, 31342, 35657, 23187, 16308, 5421, 23187, 35657, 14776, 36837]


In [31]:
from transformers import AutoTokenizer

# Use a pre-trained tokenizer (e.g., GPT-2 tokenizer)
tokenizer = AutoTokenizer.from_pretrained("gpt2")

In [32]:
# Example sentences
source_sentence = "hello world"
target_sentence = "bonjour le monde"

# Tokenize and encode the sentences
src_tokens = tokenizer.encode(source_sentence, return_tensors="pt")
trg_tokens = tokenizer.encode(target_sentence, return_tensors="pt")

In [33]:
# Training example
train_model(model, src_tokens.repeat(32, 1), trg_tokens.repeat(32, 1), epochs=5)

# Translate a sentence
translated_tokens = translate(model, src_tokens.squeeze().tolist())
translated_sentence = tokenizer.decode(translated_tokens, skip_special_tokens=True)



Epoch 1/5, Loss: 11.1919
Epoch 2/5, Loss: 9.4892
Epoch 3/5, Loss: 7.5151
Epoch 4/5, Loss: 5.8527
Epoch 5/5, Loss: 4.5456


In [34]:
print("Source Sentence:", source_sentence)
print("Translated Sentence:", translated_sentence)

Source Sentence: hello world
Translated Sentence: !our le mondeour le mondeour le mondeour le mondeour le mondeour le mondeour le mondeour le mondeour le mondeondeour le mondeondeondeour le mondeour le mondeour le mondeour le mondeour le mondeour le mondeour le mondeour le mondeour le mondeour le mondeour le mondeour le mondeour le mondeour le monde
