In [1]:
import torch
import torch.nn as nn
from tqdm import tqdm
import os
from tokenizers import ByteLevelBPETokenizer

import components
import utils

In [2]:
# Load the trained tokenizer
tokenizer = ByteLevelBPETokenizer(
    "bpe_tokenizer/vocab.json",
    "bpe_tokenizer/merges.txt"
)

In [3]:
# initialize model
d_model = 512  # Model dimension
num_heads = 8  # Number of attention heads
num_encoder_layers = 6  # Number of encoder layers
num_decoder_layers = 6  # Number of decoder layers
d_ff = 2048  # Dimension of feedforward layers
dropout = 0.1  # Dropout rate

vocab_size = tokenizer.get_vocab_size()  # Vocabulary size from your tokenizer

# Initialize the encoder, decoder, and the full model
encoder = components.Encoder(num_encoder_layers, num_heads, d_model, d_ff, dropout)
decoder = components.Decoder(num_decoder_layers, num_heads, d_model, d_ff, dropout)
src_embed = nn.Sequential(nn.Embedding(vocab_size, d_model), components.PositionalEncoding(d_model, dropout))
tgt_embed = nn.Sequential(nn.Embedding(vocab_size, d_model), components.PositionalEncoding(d_model, dropout))
generator = components.Generator(d_model, vocab_size)

# Initialize the EncoderDecoder model
model = components.EncoderDecoder(encoder, decoder, src_embed, tgt_embed, generator)

In [4]:
# 1. Tokenize the German and English sentences
src_sentences = ["<s> Hier ist ein Beispiel </s>", "<s> Dieses ist auch ein deutsches Beispiel. </s>"]
tgt_sentences = ["<s> Here is an example. </s>", "<s> This is also a German example. </s>"]

In [5]:
# Tokenize the german and english sentences

src_tokenized = tokenizer.encode_batch(src_sentences)
print([x.ids for x in src_tokenized])

tgt_tokenized = tokenizer.encode_batch(tgt_sentences)
print([x.ids for x in tgt_tokenized])

[[32, 87, 34, 8718, 423, 328, 3010, 225, 32, 19, 87, 34], [32, 87, 34, 15908, 423, 561, 328, 14451, 268, 3010, 18, 225, 32, 19, 87, 34]]
[[32, 87, 34, 21873, 326, 293, 2222, 18, 225, 32, 19, 87, 34], [32, 87, 34, 4132, 326, 668, 264, 3084, 2222, 18, 225, 32, 19, 87, 34]]


In [6]:
# get maximum lengths and pad
src_max_len = max(len(ids) for ids in src_tokenized)
tgt_max_len = max(len(ids) for ids in tgt_tokenized)

pad_token_id = tokenizer.token_to_id('<pad>')

src_token_ids = torch.tensor([ids.ids + [pad_token_id] * (src_max_len - len(ids)) for ids in src_tokenized])
tgt_token_ids = torch.tensor([ids.ids + [pad_token_id] * (tgt_max_len - len(ids)) for ids in tgt_tokenized])

In [7]:
print("Padded Source Token IDs:", src_token_ids.shape)
print("Padded Target Token IDs:", tgt_token_ids.shape)


Padded Source Token IDs: torch.Size([2, 16])
Padded Target Token IDs: torch.Size([2, 15])


In [8]:
# 3. embed the source via the dense embedding and the positional encoding. 
src_embed_output = model.src_embed(src_token_ids)
src_embed_output.shape # bs, src_max_len, d_model

x shape before PositionalEncoding: torch.Size([2, 16, 512])
x shape after PositionalEncoding: torch.Size([2, 16, 512])


torch.Size([2, 16, 512])

In [10]:
# 6. Create the source mask
src_mask = (src_token_ids != pad_token_id).unsqueeze(-2) # bs, 1, src_max_len but why though???
src_mask.shape

torch.Size([2, 1, 16])

In [11]:
# 7. Pass the source embedding through the encoder
enc_output = model.encoder(src_embed_output, src_mask)
enc_output.shape # bs, src_max_len, d_model

Input to Encoder, x shape: torch.Size([2, 16, 512]), mask shape: torch.Size([2, 1, 16])
Input to EncoderLayer, x shape: torch.Size([2, 16, 512]), mask shape: torch.Size([2, 1, 16])
query shape before linear: torch.Size([2, 16, 512])
query/key/value shape after linear: torch.Size([2, 16, 512]), torch.Size([2, 16, 512]), torch.Size([2, 16, 512])
query/key/value shape after view and transpose: torch.Size([2, 8, 16, 64]), torch.Size([2, 8, 16, 64]), torch.Size([2, 8, 16, 64])
query shape: torch.Size([2, 8, 16, 64]), key shape: torch.Size([2, 8, 16, 64]), value shape: torch.Size([2, 8, 16, 64])
scores shape after matmul: torch.Size([2, 8, 16, 16])
mask shape: torch.Size([2, 1, 16])


RuntimeError: The size of tensor a (2) must match the size of tensor b (8) at non-singleton dimension 1

In [None]:
# 8.  For the target, shift the target token IDS to create the input for the decoder
tgt_input = tgt_token_ids[:, :-1]
tgt_input.shape # bs, tgt_max_len - 1

In [None]:
# 9. Add positional encoding and embed the shifted target_sequences
tgt_embed_output = model.tgt_embed(tgt_input)
tgt_embed_output.shape # bs, max_tgt_len - 1, d_model

In [None]:
# 10. Create the target mask, including the look-ahead mask
def make_std_mask(tgt, pad):
    "Create a mask to hide padding and future words."
    #print("Target (tgt):", tgt)
    
    # Padding mask
    tgt_padding_mask = (tgt != pad).unsqueeze(1).unsqueeze(2)
    #print("Padding Mask:", tgt_padding_mask)
    
    # Look-ahead mask (subsequent mask)
    tgt_seq_len = tgt.size(-1)
    look_ahead_mask = torch.triu(torch.ones((1, tgt_seq_len, tgt_seq_len), device=tgt.device), diagonal=1).type_as(tgt_padding_mask.data)
    #print("Look-Ahead Mask (Subsequent Mask):", look_ahead_mask)
    
    # Combined mask
    tgt_mask = tgt_padding_mask & (look_ahead_mask == 0)
    #print("Combined Target Mask:", tgt_mask)
    
    return tgt_mask

tgt_mask = make_std_mask(tgt_input, pad_token_id)
tgt_mask.shape # bs, tgt_max_len - 1, tgt_max_len - 1

In [None]:
# 11. Pass the target embedding through the decoder
dec_output = model.decoder(tgt_embed_output, enc_output, src_mask, tgt_mask)
dec_output.shape # bs, tgt_max_len - 1, d_model

In [None]:
# 12. Pass the decoder output through the generator to get logits over the vocabulary
logits = model.generator(dec_output)
logits.shape # bs, tgt_max_len - 1, vocab_size

In [None]:
# 13. The target sequence used for calculating the loss is the actual sequence shifted by one
tgt_y = tgt_token_ids[:, 1:]
tgt_y.shape

In [None]:
# 14. Calculate the loss
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.token_to_id('<pad>'))
loss = criterion(logits.reshape(-1, logits.size(-1)), tgt_y.reshape(-1))

loss

In [None]:
# 15. Obtain the predicted tokens
predicted_token_ids = torch.argmax(logits, dim=-1)
predicted_token_ids

In [None]:
# 16. Convert the predicted tokens into a string
predicted_sentences = []
for token_ids in predicted_token_ids:
    # Remove special tokens if they exist
    token_ids = token_ids.cpu().numpy()
    token_ids = token_ids.tolist()

    # Convert token IDs to words
    sentence = tokenizer.decode(token_ids, skip_special_tokens=True)
    predicted_sentences.append(sentence)

print("Predicted Sentences:", predicted_sentences)
