In [1]:
import torch
from torch import nn
import torch.nn.functional as F

In [2]:
import sacrebleu

In [3]:
from transformers import AutoModel, AutoTokenizer, AutoModelForSeq2SeqLM

In [4]:
from datasets import load_dataset

In [85]:
from tqdm.notebook import tqdm, trange

In [50]:
# Check if CUDA is available
assert torch.cuda.device_count() > 1, "This script requires at least 2 GPUs"

In [54]:
device_ids = [i for i in range(torch.cuda.device_count())]

# Examine pretrained BERT model

In [62]:
tokenizer = AutoTokenizer.from_pretrained("google/bert2bert_L-24_wmt_de_en", pad_token="<pad>", eos_token="</s>", bos_token="<s>")
model = AutoModelForSeq2SeqLM.from_pretrained("google/bert2bert_L-24_wmt_de_en")



In [8]:
de_en_ds_train = load_dataset("stas/wmt14-en-de-pre-processed", split="train")

In [63]:
model = nn.DataParallel(model, device_ids=device_ids)
model = model.to('cuda')

In [10]:
de_en_ds_valid = load_dataset("stas/wmt14-en-de-pre-processed", split="validation")

In [11]:
de_en_ds_test = load_dataset("stas/wmt14-en-de-pre-processed", split="test")

In [12]:
de_en_ds = {'train':de_en_ds_train, 'validation':de_en_ds_valid, 'test':de_en_ds_test}

In [13]:
de_en_ds_train[0]

{'translation': {'de': 'Wiederaufnahme der Sitzungsperiode',
  'en': 'Resumption of the session'}}

In [14]:
source_lang = "de"
target_lang = "en"

In [15]:
de_sentence = de_en_ds['validation'][0]['translation']['de']
en_sentence = de_en_ds['validation'][0]['translation']['en']

## Forward Translation

In [64]:
input_ids = tokenizer(de_sentence, return_tensors="pt", add_special_tokens=False).input_ids

In [91]:
print(input_ids)

tensor([[ 1094, 21755, 10396,  1091, 19586,  1099,  1097,  2041, 19660,  1099,
         25818,  1133,  1091,  1096,  4103,  1241,   119]])


In [65]:
output_ids = model.module.generate(input_ids.to('cuda'))[0]

In [70]:
translate_sentence = tokenizer.decode(output_ids, skip_special_tokens=True)

In [71]:
print("target sentence:", en_sentence, "translated sentence:", translate_sentence)

target sentence: India and Japan prime ministers meet in Tokyo translated sentence: The prime ministers of India and Japan met in tokio.


## Evaluate the Pretrained Model Translation Quality

In [73]:
def translate(texts, tokenizer, model):
    tokens = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=24, add_special_tokens=False).input_ids
    translated_tokens = model.module.generate(tokens.to('cuda'))
    translated_texts = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)
    return translated_texts

In [74]:
# Sample data
src_texts = [
    "Das ist ein Test.",
    "Wie geht es dir?",
    "Ich liebe das Lernen neuer Sprachen."
]
ref_texts = [
    "This is a test.",
    "How are you?",
    "I love learning new languages."
]

In [75]:
pred_texts = translate(src_texts, tokenizer, model)

In [76]:
bleu = sacrebleu.corpus_bleu(pred_texts, [ref_texts])
print(f"BLEU score: {bleu.score}")

BLEU score: 26.604926507725


In [77]:
pred_texts

['that is a test.',
 'How does it work for you?',
 'I love learning to speak more.']

In [86]:
def evaluate_bleu(validation_dataset, tokenizer, model, batch_size=32):
    src_texts = [entry['translation']['de'] for entry in validation_dataset]
    ref_texts = [entry['translation']['en'] for entry in validation_dataset]

    num_batches = len(src_texts) // batch_size + int(len(src_texts) % batch_size > 0)
    all_pred_texts = []
    all_ref_texts = []

    for i in trange(num_batches, desc="Processing Batches"):
        batch_src_texts = src_texts[i * batch_size:(i + 1) * batch_size]
        batch_ref_texts = ref_texts[i * batch_size:(i + 1) * batch_size]
        
        # Translate the current minibatch
        batch_pred_texts = translate(batch_src_texts, tokenizer, model)
        
        # Store predictions and references
        all_pred_texts.extend(batch_pred_texts)
        all_ref_texts.extend(batch_ref_texts)
        
    bleu = sacrebleu.corpus_bleu(all_pred_texts, [all_ref_texts])
    return bleu

In [88]:
with torch.no_grad():
    bleu = evaluate_bleu(de_en_ds_valid, tokenizer, model)
print(f"BLEU score: {bleu.score}")

Processing Batches:   0%|          | 0/68 [00:00<?, ?it/s]

BLEU score: 12.70440937875939


In [89]:
torch.cuda.empty_cache()

In [90]:
model = model.to('cpu')

# Create Customized Transformer Model

In [96]:
import math

In [93]:
vocab_size = len(tokenizer.get_vocab())

In [123]:
batch_size = 8
d_model = 512
n_head = 8
max_len = 1000

In [120]:
embedding = torch.rand(batch_size, max_len, d_model)

In [116]:
class PositionalEmbedding(nn.Module):
    def __init__(self, d_model, max_len):
        super(PositionalEmbedding, self).__init__()
        # b, t, d
        self.encoding = torch.zeros(max_len, d_model)
        pos = torch.arange(0, max_len).unsqueeze(1).float()
        div = torch.exp(-torch.arange(0, d_model, 2).float() * (math.log(10000.0) / d_model))

        self.encoding[:, 0::2] = torch.sin(pos * div)
        self.encoding[:, 1::2] = torch.cos(pos * div)
        self.encoding = self.encoding.unsqueeze(0)
    
    def forward(self, x):
        return x + self.encoding[:, :x.size(1)]

In [117]:
pe = PositionalEmbedding(d_model, max_len)

In [121]:
pe(embedding).size()

torch.Size([8, 1000, 512])

In [125]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, n_head):
        super(MultiHeadAttention, self).__init__()
        assert d_model % n_head == 0, "The dimension of model must be divisible by the number of heads"
        
        self.n_head = n_head
        self.d_model = d_model
        self.d_head = d_model // n_head
        
        self.Wq = nn.Linear(d_model, d_model)
        self.Wk = nn.Linear(d_model, d_model)
        self.Wv = nn.Linear(d_model, d_model)
        self.proj = nn.Linear(d_model, d_model)

    def split_into_heads(self, x, batch_size):
        x = x.contiguous().view(batch_size, -1, self.n_head, self.d_head)
        return x.permute(0, 2, 1, 3)
        
    def forward(self, q, k, v, mask=None):
        q = self.Wq(q)
        k = self.Wk(k)
        v = self.Wk(v)
        batch_size = q.size(0)

        q = self.split_into_heads(q, batch_size)
        k = self.split_into_heads(k, batch_size)
        v = self.split_into_heads(v, batch_size)

        score = torch.matmul(self.q, self.k.transpose(-2, -1)) / math.sqrt(self.d_head)
        
        if mask:
            scores = scores.masked_fill(mask == 0, float('-inf'))
        attention = F.softmax(scores, dim=-1)

        output = torch.matmul(attention, v)
        output = output.transpose(1, 2)
        output = output.contiguous.view(batch_size, -1, self.d_model)
            
        output = self.proj(output)
        return output

In [126]:
mla = MultiHeadAttention(d_model, n_head)

In [127]:
mla(torch.rand(batch_size, max_len, d_model)).size()

TypeError: MultiHeadAttention.forward() missing 2 required positional arguments: 'k' and 'v'

In [None]:
class FeedForward(nn.Module):
    def __init__(self, d_model):
        super(FeedForward, self).__init__()
    def forward(self, x):
        pass

In [None]:
class EncoderBlock(nn.Module):
    def __init__(self, n_head, d_model):
        super(EncoderBlock, self).__init__()
    def forward(self, x):
        pass

In [None]:
class TransformerEncoder(nn.Module):
    def __init__(self, n_head, d_model):
        super(TransformerEncoder, self).__init__()
    def forward(self, x):
        pass

In [None]:
class TransformerDecoder(nn.Module):
    def __init__(self, n_head, d_model):
        super(TransformerDecoder, self).__init__()
    def forward(self, x):
        pass

In [None]:
class Transformer(nn.Module):
    def __init__(self, vocab_size, d_model, n_head, max_len):
        super(Transformer, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pe = PositionalEmbedding(d_model, max_len)
        self.encoder = TransformerEncoder()
        self.decoder = TransformerDecoder()

    def forward(self, x, mask):
        pass