In [1]:
import torch
from torch import nn
import torch.nn.functional as F

In [2]:
import sacrebleu

In [3]:
from transformers import AutoModel, AutoTokenizer, AutoModelForSeq2SeqLM

In [4]:
from datasets import load_dataset

In [5]:
from tqdm.notebook import tqdm, trange

In [6]:
# Check if CUDA is available
assert torch.cuda.device_count() > 1, "This script requires at least 2 GPUs"

In [7]:
device_ids = [i for i in range(torch.cuda.device_count())]

# Examine pretrained BERT model

In [8]:
tokenizer = AutoTokenizer.from_pretrained("google/bert2bert_L-24_wmt_de_en", pad_token="<pad>", eos_token="</s>", bos_token="<s>")
model = AutoModelForSeq2SeqLM.from_pretrained("google/bert2bert_L-24_wmt_de_en")



In [9]:
de_en_ds_train = load_dataset("stas/wmt14-en-de-pre-processed", split="train")

In [10]:
model = nn.DataParallel(model, device_ids=device_ids)
model = model.to('cuda')

In [11]:
de_en_ds_valid = load_dataset("stas/wmt14-en-de-pre-processed", split="validation")

In [12]:
de_en_ds_test = load_dataset("stas/wmt14-en-de-pre-processed", split="test")

In [13]:
de_en_ds = {'train':de_en_ds_train, 'validation':de_en_ds_valid, 'test':de_en_ds_test}

In [14]:
de_en_ds_train[0]

{'translation': {'de': 'Wiederaufnahme der Sitzungsperiode',
  'en': 'Resumption of the session'}}

In [15]:
source_lang = "de"
target_lang = "en"

In [16]:
de_sentence = de_en_ds['validation'][0]['translation']['de']
en_sentence = de_en_ds['validation'][0]['translation']['en']

## Forward Translation

In [17]:
input_ids = tokenizer(de_sentence, return_tensors="pt", add_special_tokens=False).input_ids

In [18]:
print(input_ids)

tensor([[ 1094, 21755, 10396,  1091, 19586,  1099,  1097,  2041, 19660,  1099,
         25818,  1133,  1091,  1096,  4103,  1241,   119]])


In [19]:
output_ids = model.module.generate(input_ids.to('cuda'))[0]

In [20]:
translate_sentence = tokenizer.decode(output_ids, skip_special_tokens=True)

In [21]:
print("target sentence:", en_sentence, "translated sentence:", translate_sentence)

target sentence: India and Japan prime ministers meet in Tokyo translated sentence: The prime ministers of India and Japan met in tokio.


## Evaluate the Pretrained Model Translation Quality

In [22]:
def translate(texts, tokenizer, model):
    tokens = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=24, add_special_tokens=False).input_ids
    translated_tokens = model.module.generate(tokens.to('cuda'))
    translated_texts = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)
    return translated_texts

In [74]:
# Sample data
src_texts = [
    "Das ist ein Test.",
    "Wie geht es dir?",
    "Ich liebe das Lernen neuer Sprachen."
]
ref_texts = [
    "This is a test.",
    "How are you?",
    "I love learning new languages."
]

In [75]:
pred_texts = translate(src_texts, tokenizer, model)

In [76]:
bleu = sacrebleu.corpus_bleu(pred_texts, [ref_texts])
print(f"BLEU score: {bleu.score}")

BLEU score: 26.604926507725


In [77]:
pred_texts

['that is a test.',
 'How does it work for you?',
 'I love learning to speak more.']

In [23]:
def evaluate_bleu(validation_dataset, tokenizer, model, batch_size=32):
    src_texts = [entry['translation']['de'] for entry in validation_dataset]
    ref_texts = [entry['translation']['en'] for entry in validation_dataset]

    num_batches = len(src_texts) // batch_size + int(len(src_texts) % batch_size > 0)
    all_pred_texts = []
    all_ref_texts = []

    for i in trange(num_batches, desc="Processing Batches"):
        batch_src_texts = src_texts[i * batch_size:(i + 1) * batch_size]
        batch_ref_texts = ref_texts[i * batch_size:(i + 1) * batch_size]
        
        # Translate the current minibatch
        batch_pred_texts = translate(batch_src_texts, tokenizer, model)
        
        # Store predictions and references
        all_pred_texts.extend(batch_pred_texts)
        all_ref_texts.extend(batch_ref_texts)
        
    bleu = sacrebleu.corpus_bleu(all_pred_texts, [all_ref_texts])
    return bleu

In [88]:
with torch.no_grad():
    bleu = evaluate_bleu(de_en_ds_valid, tokenizer, model)
print(f"BLEU score: {bleu.score}")

Processing Batches:   0%|          | 0/68 [00:00<?, ?it/s]

BLEU score: 12.70440937875939


In [24]:
torch.cuda.empty_cache()

In [25]:
model = model.to('cpu')

In [26]:
del model

# Create Customized Transformer Model

In [27]:
import math

In [451]:
en_tokenizer = AutoTokenizer.from_pretrained("google/bert2bert_L-24_wmt_de_en", pad_token="<pad>", eos_token="</s>", bos_token="<s>", unk_token="<unk>")
de_tokenizer = AutoTokenizer.from_pretrained("google/bert2bert_L-24_wmt_en_de", pad_token="<pad>", eos_token="</s>", bos_token="<s>", unk_token="<unk>")



In [452]:
vocab_size = len(tokenizer.get_vocab())

In [464]:
en_vocab_size = len(en_tokenizer.get_vocab())
de_vocab_size = len(de_tokenizer.get_vocab())
batch_size = 128
d_model = 512
n_head = 4
max_len = 32
dropout = 0.1
n_enc_layer = 2
n_dec_layer = 2

In [465]:
embedding = torch.rand(batch_size, max_len, d_model)

In [466]:
class PositionalEmbedding(nn.Module):
    def __init__(self, d_model, max_len):
        super(PositionalEmbedding, self).__init__()
        # b, t, d
        self.encoding = torch.zeros(max_len, d_model)
        pos = torch.arange(0, max_len).unsqueeze(1).float()
        div = torch.exp(-torch.arange(0, d_model, 2).float() * (math.log(10000.0) / d_model))

        self.encoding[:, 0::2] = torch.sin(pos * div)
        self.encoding[:, 1::2] = torch.cos(pos * div)
        self.encoding = self.encoding.unsqueeze(0)
    
    def forward(self, x):
        return x + self.encoding[:, :x.size(1)].to(x.device)

In [467]:
pe = PositionalEmbedding(d_model, max_len)

In [468]:
pe(embedding).size()

torch.Size([128, 32, 512])

In [469]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, n_head):
        super(MultiHeadAttention, self).__init__()
        assert d_model % n_head == 0, "The dimension of model must be divisible by the number of heads"
        
        self.n_head = n_head
        self.d_model = d_model
        self.d_head = d_model // n_head
        
        self.Wq = nn.Linear(d_model, d_model)
        self.Wk = nn.Linear(d_model, d_model)
        self.Wv = nn.Linear(d_model, d_model)
        self.proj = nn.Linear(d_model, d_model)

    def split_into_heads(self, x, batch_size):
        x = x.contiguous().view(batch_size, -1, self.n_head, self.d_head)
        return x.permute(0, 2, 1, 3)
        
    def forward(self, v, k, q, mask=None):
        q = self.Wq(q)
        k = self.Wk(k)
        v = self.Wk(v)
        batch_size = q.size(0)

        q = self.split_into_heads(q, batch_size)
        k = self.split_into_heads(k, batch_size)
        v = self.split_into_heads(v, batch_size)

        attention = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_head)
        
        if mask is not None:
            attention = attention.masked_fill(mask, float('-inf'))
        attention = F.softmax(attention, dim=-1)

        output = torch.matmul(attention, v)
        output = output.transpose(1, 2)
        output = output.contiguous().view(batch_size, -1, self.d_model)
            
        output = self.proj(output)
        return output

In [470]:
mla = MultiHeadAttention(d_model, n_head)

In [471]:
mla(embedding, embedding, embedding).size()

torch.Size([128, 32, 512])

In [472]:
class FeedForward(nn.Module):
    def __init__(self, d_model, dropout=0.1):
        super(FeedForward, self).__init__()
        self.linear1 = nn.Linear(d_model, d_model * 2)
        self.linear2 = nn.Linear(d_model * 2, d_model)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        x = self.relu(self.linear1(x))
        x = self.dropout(x)
        x = self.linear2(x)
        return x

In [473]:
ff = FeedForward(d_model, dropout)

In [474]:
ff(embedding).size()

torch.Size([128, 32, 512])

In [475]:
class EncoderBlock(nn.Module):
    def __init__(self, d_model, n_head, dropout=0.1):
        super(EncoderBlock, self).__init__()
        self.mla = MultiHeadAttention(d_model, n_head)
        self.norm1 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.ff = FeedForward(d_model, dropout)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout2 = nn.Dropout(dropout)
    
    def forward(self, src, src_mask=None):
        attn_output = self.mla(src, src, src, src_mask)
        src = src + self.dropout1(attn_output)
        src = self.norm1(src)

        ff_output = self.ff(src)
        src = src + self.dropout2(ff_output)
        src = self.norm2(src)
        return src

In [476]:
eb = EncoderBlock(d_model, n_head, dropout)

In [477]:
eb(embedding).size()

torch.Size([128, 32, 512])

In [478]:
class DecoderBlock(nn.Module):
    def __init__(self, d_model, n_head, dropout=0.1):
        super(DecoderBlock, self).__init__()
        self.mla1 = MultiHeadAttention(d_model, n_head)
        self.norm1 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.mla2 = MultiHeadAttention(d_model, n_head)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout2 = nn.Dropout(dropout)
        self.ff = FeedForward(d_model, dropout)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout3 = nn.Dropout(dropout)
    
    def forward(self, tgt, memory, tgt_mask=None, memory_mask=None):
        tgt_attn_output = self.mla1(tgt, tgt, tgt, tgt_mask)
        tgt = tgt + self.dropout1(tgt_attn_output)
        tgt = self.norm1(tgt)

        attn_output = self.mla2(memory, memory, tgt, memory_mask)
        tgt = tgt + self.dropout2(attn_output)
        tgt = self.norm2(tgt)

        ff_output = self.ff(tgt)
        tgt = tgt + self.dropout3(ff_output)
        tgt = self.norm3(tgt)
        return tgt

In [479]:
db = DecoderBlock(d_model, n_head, dropout)

In [480]:
db(embedding, embedding).size()

torch.Size([128, 32, 512])

In [481]:
class TransformerEncoder(nn.Module):
    def __init__(self, n_layer, d_model, n_head, dropout=0.1):
        super(TransformerEncoder, self).__init__()
        self.ebs = nn.ModuleList([EncoderBlock(d_model, n_head, dropout) for _ in range(n_layer)])
        self.norm = nn.LayerNorm(d_model)
        
    def forward(self, src, src_mask=None):
        for eb in self.ebs:
            src = eb(src, src_mask)
        src = self.norm(src)
        return src

In [482]:
tenc = TransformerEncoder(n_enc_layer, d_model, n_head, dropout)

In [483]:
tenc(embedding).size()

torch.Size([128, 32, 512])

In [484]:
class TransformerDecoder(nn.Module):
    def __init__(self, n_layer, d_model, n_head, dropout=0.1):
        super(TransformerDecoder, self).__init__()
        self.dbs = nn.ModuleList([DecoderBlock(d_model, n_head, dropout) for _ in range(n_layer)])
        self.norm = nn.LayerNorm(d_model)
        
    def forward(self, tgt, memory, tgt_mask=None, memory_mask=None):
        for db in self.dbs:
            tgt = db(tgt, memory, tgt_mask, memory_mask)
        tgt = self.norm(tgt)
        return tgt

In [485]:
tdec = TransformerDecoder(n_dec_layer, d_model, n_head, dropout)

In [486]:
tdec(embedding, embedding).size()

torch.Size([128, 32, 512])

In [487]:
class Transformer(nn.Module):
    def __init__(self, d_model, n_head, enc_vocab_size, dec_vocab_size, max_len, n_enc_layer, n_dec_layer, dropout=0.1):
        super(Transformer, self).__init__()
        self.d_model = d_model
        self.enc_vocab_size = enc_vocab_size
        self.dec_vocab_size = dec_vocab_size
        
        self.embedding_enc = nn.Embedding(enc_vocab_size, d_model)
        self.embedding_dec = nn.Embedding(dec_vocab_size, d_model)
        
        self.pe_enc= PositionalEmbedding(d_model, max_len)
        self.pe_dec= PositionalEmbedding(d_model, max_len)
        
        self.encoder = TransformerEncoder(n_enc_layer, d_model, n_head, dropout)
        self.decoder = TransformerDecoder(n_dec_layer, d_model, n_head, dropout)
        
        self.linear = nn.Linear(d_model, dec_vocab_size)

    def forward(self, src, tgt, src_mask=None, tgt_mask=None, memory_mask=None):
        src = self.embedding_enc(src)
        src = self.pe_enc(src)
        memory = self.encoder(src, src_mask)

        tgt = self.embedding_dec(tgt)
        tgt = self.pe_dec(tgt)
        tgt = self.decoder(tgt, memory, tgt_mask, memory_mask)
        
        output = self.linear(tgt)
        return output

In [488]:
model = Transformer(d_model, n_head, de_vocab_size, en_vocab_size, max_len, n_enc_layer, n_dec_layer, dropout)

In [489]:
src = torch.randint(0, de_vocab_size, (batch_size, 15))
tgt = torch.randint(0, en_vocab_size, (batch_size, 20))

In [490]:
model(src, tgt).size()

torch.Size([128, 20, 31950])

In [295]:
input_ids = en_tokenizer.encode('I love iPod', padding='max_length', max_length=max_len)

In [296]:
input_ids

[101,
 178,
 3940,
 178,
 6612,
 1204,
 102,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [491]:
def create_padding_mask(seq, padding_id):
    mask = (seq == padding_id).unsqueeze(1).unsqueeze(2)
    return mask

In [492]:
padding_mask = create_padding_mask(torch.tensor(input_ids).unsqueeze(0), 0)
print(padding_mask.shape)

torch.Size([1, 1, 1, 32])


In [493]:
padding_mask

tensor([[[[False, False, False, False, False, False, False,  True,  True,  True,
            True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
            True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
            True,  True]]]])

In [494]:
def create_look_ahead_mask(size):
    mask = torch.triu(torch.ones((1, size, size)), diagonal=1)
    mask = mask == 1
    return mask

In [495]:
look_ahead_mask = create_look_ahead_mask(len(input_ids))
print(look_ahead_mask.shape)

torch.Size([1, 32, 32])


In [302]:
look_ahead_mask

tensor([[[False,  True,  True,  ...,  True,  True,  True],
         [False, False,  True,  ...,  True,  True,  True],
         [False, False, False,  ...,  True,  True,  True],
         ...,
         [False, False, False,  ..., False,  True,  True],
         [False, False, False,  ..., False, False,  True],
         [False, False, False,  ..., False, False, False]]])

In [303]:
src_input_ids = torch.zeros(batch_size, max_len).long()
src_input_ids[:src.size(0), :src.size(1)] = src
tgt_input_ids = torch.zeros(batch_size, max_len).long()
tgt_input_ids[:tgt.size(0), :tgt.size(1)] = tgt

In [304]:
src_mask = create_padding_mask(src_input_ids, 0)
tgt_mask = create_padding_mask(tgt_input_ids, 0)
look_ahead_mask = create_look_ahead_mask(tgt_input_ids.size(1))
tgt_mask = tgt_mask | look_ahead_mask

In [305]:
print(src_mask.shape, tgt_mask.shape, memory_mask.shape)

torch.Size([8, 1, 1, 32]) torch.Size([8, 1, 32, 32]) torch.Size([1, 1000, 1000])


In [308]:
output = model(src_input_ids, tgt_input_ids, src_mask=src_mask, tgt_mask=tgt_mask, memory_mask=src_mask)

In [309]:
output.shape

torch.Size([8, 32, 31951])

# Training

In [320]:
from torch import optim
from torch.utils.data import DataLoader

In [496]:
pad_id = de_tokenizer.get_vocab()['<pad>']

In [497]:
trainloader = DataLoader(de_en_ds['train'], batch_size=batch_size, shuffle=True)

In [498]:
next(iter(trainloader))

{'translation': {'de': ['Ich möchte Sie bitten, dass Sie diesem Vorschlag folgen.',
   'Der Bericht erkennt diese Notwendigkeit an und schlägt einige wichtige Maßnahmen vor, die wir als positiv ansehen, beispielsweise die Schaffung von Marktbedingungen, die günstig für lokale Produktion, Verkauf und Verbrauch sind, und die Förderung von Modellen für kurze Lieferketten frei von gentechnisch veränderten Organismen (GVO).',
   'Dieser Dateikopf wird von Adblock Plus 0.7.1 und höher unterstützt.',
   'die Eigenschaft des Geldes im direkten Zahlungsverkehr mit Münzen und Scheinen.',
   'Diese Website setzt einen sogenannten Cookie (siehe unten) auf Ihrem Rechner, dies ist erforderlich damit unser Webserver Sie von anderen Besuchern unterscheiden kann, um zum Beispiel Ihren Warenkorbinhalt von dem anderer Besucher trennen zu können.',
   'Die generellen Entscheidungen des Lehrstuhls werden in einer Versammlung der Problematisierung, Ausarbeitung, Organisation und Gestaltung getroffen.',
   '

In [499]:
validloader = DataLoader(de_en_ds['validation'], batch_size=batch_size, shuffle=True)

In [500]:
testloader = DataLoader(de_en_ds['test'], batch_size=batch_size, shuffle=True)

In [501]:
lr = 5e-5

In [502]:
model = nn.DataParallel(model, device_ids=device_ids)
model = model.to('cuda')

In [503]:
optimizer = optim.AdamW(model.parameters(), lr=lr)

In [504]:
criterion = nn.CrossEntropyLoss(ignore_index=pad_id)

In [520]:
def train_epoch(model, data_loader, optimizer, criterion, device='cuda'):
    model.train()
    total_loss = 0

    for batch in data_loader:
        src = batch['translation']['de']
        tgt = batch['translation']['en']

        src_encodings = de_tokenizer(src, padding=True, truncation=True, max_length=max_len, return_tensors="pt").input_ids
        tgt_encodings = en_tokenizer(tgt, padding=True, truncation=True, max_length=max_len, return_tensors="pt").input_ids

        src_input = src_encodings.to(device)
        tgt_input = tgt_encodings[:, :-1].to(device)
        tgt_output = tgt_encodings[:, 1:].to(device)

        src_mask = create_padding_mask(src_input, pad_id).to(device)
        tgt_mask = create_padding_mask(tgt_input, pad_id).to(device)
        look_ahead_mask = create_look_ahead_mask(tgt_input.size(1)).to(device)
        tgt_mask = tgt_mask | look_ahead_mask

        optimizer.zero_grad()
        pred = model(src_input, tgt_input, src_mask=src_mask, tgt_mask=tgt_mask, memory_mask=src_mask)
        pred = pred.view(-1, pred.size(-1))
        tgt_output = tgt_output.view(-1)

        loss = criterion(pred, tgt_output)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    return total_loss / len(data_loader)

In [521]:
def evaluate(model, data_loader, criterion, device='cuda'):
    model.eval()
    total_loss = 0

    with torch.no_grad():
        for batch in data_loader:
            src = batch['translation']['de']
            tgt = batch['translation']['en']

            src_encodings = de_tokenizer(src, padding=True, truncation=True, max_length=max_len, return_tensors="pt").input_ids
            tgt_encodings = en_tokenizer(tgt, padding=True, truncation=True, max_length=max_len, return_tensors="pt").input_ids

            src_input = src_encodings.to(device)
            tgt_input = tgt_encodings[:, :-1].to(device)
            tgt_output = tgt_encodings[:, 1:].to(device)

            src_mask = create_padding_mask(src_input, pad_id).to(device)
            tgt_mask = create_padding_mask(tgt_input, pad_id).to(device)
            look_ahead_mask = create_look_ahead_mask(tgt_input.size(1)).to(device)
            tgt_mask = tgt_mask | look_ahead_mask

            pred = model(src_input, tgt_input, src_mask=src_mask, tgt_mask=tgt_mask, memory_mask=src_mask)
            pred = pred.view(-1, pred.size(-1))
            tgt_output = tgt_output.view(-1)

            loss = criterion(pred, tgt_output)

            total_loss += loss.item()
    return total_loss / len(data_loader)

In [522]:
def train_model(model, train_loader, val_loader, optimizer, criterion, epochs, device='cuda'):
    with trange(epochs, desc="Training", leave=False) as pbar:
        for epoch in pbar:
            train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
            val_loss = evaluate(model, val_loader, criterion, device)
            pbar.set_postfix({'Train Loss': f"{train_loss:.4f}", "Valid Loss": f"{val_loss:.4f}"})

In [523]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 

In [524]:
train_model(model, trainloader, validloader, optimizer, criterion, 10, device=device)

Training:   0%|          | 0/10 [00:00<?, ?it/s]

# Evaluate the Transformer

In [525]:
test_batch = next(iter(testloader))

In [528]:
len(test_batch['translation']['de'])

128

In [595]:
def batch_generate(model, src_texts, src_tokenizer, tgt_tokenizer, max_len, device='cuda'):
    batch_size = len(src_texts)
    
    # Tokenize and convert to tensor
    src_tokens = src_tokenizer(src_texts, return_tensors='pt', padding=True, truncation=True, max_length=max_len, add_special_tokens=False).input_ids.to(device)

    # Initialize the target sequence with the start-of-sequence token
    tgt_start_token = tgt_tokenizer.convert_tokens_to_ids(tgt_tokenizer.cls_token)  # or <sos>
    tgt_tokens = torch.full((batch_size, 1), tgt_start_token, dtype=torch.long, device=device)

    # Initialize the memory from the encoder
    with torch.no_grad():
        src_tensor = model.module.embedding_enc(src_tokens)
        src_tensor = model.module.pe_enc(src_tensor)
        memory = model.module.encoder(src_tensor, src_mask=None)

    # Iteratively decode one token at a time
    for _ in range(max_len):
        # Generate the next token
        with torch.no_grad():
            tgt_tensor = model.module.embedding_dec(tgt_tokens)
            tgt_tensor = model.module.pe_dec(tgt_tensor)
            tgt_tensor = model.module.decoder(tgt_tensor, memory, tgt_mask=None, memory_mask=None)
            output = model.module.linear(tgt_tensor)
            
        # Take the last token in the sequence
        next_tokens = output[:, -1, :].argmax(dim=-1)

        # Append the generated token to the target sequence
        tgt_tokens = torch.cat((tgt_tokens, next_tokens.unsqueeze(1)), dim=-1)

        # Stop if we reach the end-of-sequence token
        if torch.all(next_tokens == tgt_tokenizer.eos_token_id):
            break

    # Convert the tensor back to tokens and then to a string
    translated_texts = tgt_tokenizer.batch_decode(tgt_tokens, skip_special_tokens=True)
    
    return translated_texts

In [596]:
pred_texts = batch_generate(model, test_batch['translation']['de'], de_tokenizer, en_tokenizer, max_len, device='cuda')

In [597]:
pred_texts

['roulston, ske robert roulston, wife of robert roulston, wife of robert',
 "the concentration of mercury in fish, rhine and danube's permanent and wide - area - - - - - - - - - - -",
 'concerns about china ’ s long - term growth in china have fallen by about half the economy, and in the long term, in the longer',
 'formerly they were one of the best teams in baseball, today the tigers on the last place in american central american league',
 'according to mr lowen, diagnosis " is " four weeks forced ", " the diagnosis ". ", ", ", ", "',
 'the city of the region of the region of the region of the region of the region of the region of the region of the region of the region of',
 'most likely, such a relatively low pr - task was delegated to a helper who was the lorax did not 20 times the',
 'what there is : do you need something - a little - thai - - - - - - - - - - - - - - - -',
 'food wholesaler metcash has launched a food safety campaign to launch a ground - based campaign to market t

In [598]:
test_batch['translation']['en']

['Robert Roulston, former chairman of RVL, David Moodie, current chairman of RVL, and Bernard Saundry, current CEO of RVL, were aware of that at the time and are aware of that today.',
 'The concentration of mercury in fish, for example in the Elbe, Rhine and Danube, is "extensively exceeded on a long-term basis," the "Spiegel" newspaper quoted from the paper in March.',
 'Concerns over prolonged stalled Chinese growth have slashed iron ore prices by roughly a half, as coal, copper and other commodities have fallen by 20 to 40 percent.',
 'Once one of the best teams in baseball, the Tigers are now in last place in the American League Central.',
 'According to the Munich Lions, the diagnosis means "probably a four week compulsory break".',
 'Contruction area planned in the town',
 'Most likely, such a relatively lowly PR task would have been outsourced to an aide, seemingly one who has never read The Lorax 20 times in a row to a child.',
 'What it stocks: Need something - anything - Tha

In [601]:
def test_bleu(dataloader, model, src_tokenizer, tgt_tokenizer, max_len, device='cuda'):
    all_pred_texts = []
    all_ref_texts = []
    
    for test_batch in tqdm(dataloader, total=len(dataloader), desc="Test Bleu Score"):
        src_texts = test_batch['translation']['de']
        ref_texts = test_batch['translation']['en']
        
        # Translate the current minibatch
        pred_texts = batch_generate(model, src_texts, src_tokenizer, tgt_tokenizer, max_len, device=device)
        
        # Store predictions and references
        all_pred_texts.extend(pred_texts)
        all_ref_texts.extend(ref_texts)
        
    bleu = sacrebleu.corpus_bleu(all_pred_texts, [all_ref_texts])
    return bleu

In [602]:
with torch.no_grad():
    bleu = test_bleu(testloader, model, de_tokenizer, en_tokenizer, max_len, device='cuda')
print(f"BLEU score: {bleu.score}")

Test Bleu Score:   0%|          | 0/24 [00:00<?, ?it/s]

BLEU score: 8.16466431175122
