In [1]:
!pip install sentencepiece --quiet
!pip install sacrebleu --quiet
!pip install torchdata --quiet


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.0/104.0 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.7/2.7 MB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# !pip uninstall -y torch torchvision torchdata torchaudio


[0m

In [None]:
# import torch
# print(torch.__version__)
# print(torch.cuda.is_available())


In [None]:
# !pip install torch==1.13.1+cu117 torchtext==0.14.1 torchdata -f https://download.pytorch.org/whl/torch_stable.html


In [17]:
train_path= '/content/drive/MyDrive/Projects/Train'
valid_path= '/content/drive/MyDrive/Projects/Valid'

In [5]:
import math
from dataclasses import dataclass
import numpy as np
import sacrebleu
import sentencepiece as spm
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from tqdm import tqdm

seed = 42
torch.manual_seed(seed)
np.random.seed(seed)

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(DEVICE)

cuda


In [6]:
SRC = "de"
TRG = "en"
BOS, EOS, PAD = 1, 2, 3
batch_size = 128
max_seq_len = 50

en_vocab_size = 8200
de_vocab_size = 10000
vocab_sizes = {"en": en_vocab_size, "de": de_vocab_size}

In [26]:
import sentencepiece as spm

class SentencePieceProcessor:
    def __init__(self, src_language, trg_language, de_vocab_size=10000, en_vocab_size=8200):
        self.SRC = src_language
        self.TRG = trg_language
        self.de_vocab_size = de_vocab_size
        self.en_vocab_size = en_vocab_size
        self.vocab_sizes = {"en": self.en_vocab_size, "de": self.de_vocab_size}
        self.tokenizers = {}
        self.detokenizers = {}

    def prepare_data(self, train_src_path, train_trg_path, valid_src_path, valid_trg_path):

        # Create cleaned text files for training
        with open(train_src_path, "r") as train_src, open(train_trg_path, "r") as train_trg:
            with open("Multi30k_train_de_text.txt", "w") as out_de, open("Multi30k_train_en_text.txt", "w") as out_en:
                for de_line, en_line in zip(train_src, train_trg):
                    out_de.write(de_line.strip() + '\n')
                    out_en.write(en_line.strip() + '\n')

        # Create cleaned text files for validation
        with open(valid_src_path, "r") as valid_src, open(valid_trg_path, "r") as valid_trg:
            with open("Multi30k_valid_de_text.txt", "w") as out_de, open("Multi30k_valid_en_text.txt", "w") as out_en:
                for de_line, en_line in zip(valid_src, valid_trg):
                    out_de.write(de_line.strip() + '\n')
                    out_en.write(en_line.strip() + '\n')

    def train_sentencepiece(self):
        """
        Trains SentencePiece models for both source (de) and target (en) languages using the prepared training text.
        """
        # Train SentencePiece for German (de)
        spm.SentencePieceTrainer.train(f'--input=Multi30k_train_de_text.txt --model_prefix=Multi30k_de --user_defined_symbols=<pad> --vocab_size={self.de_vocab_size}')

        # Train SentencePiece for English (en)
        spm.SentencePieceTrainer.train(f'--input=Multi30k_train_en_text.txt --model_prefix=Multi30k_en --user_defined_symbols=<pad> --vocab_size={self.en_vocab_size}')

        # Load trained SentencePiece models
        de_sp = spm.SentencePieceProcessor()
        de_sp.load('Multi30k_de.model')
        en_sp = spm.SentencePieceProcessor()
        en_sp.load('Multi30k_en.model')

        # Set tokenizers and detokenizers
        self.tokenizers = {"de": de_sp, "en": en_sp}
        self.detokenizers = {"de": de_sp.decode_ids, "en": en_sp.decode_ids}

    def get_special_symbols_indexes(self):
        """
        Returns a dictionary of special symbol indexes (e.g., UNK, BOS, EOS, PAD).
        """
        return {"UNK": 0, "BOS": 1, "EOS": 2, "PAD": 3}


In [27]:
# Using the class without a main function
# Initialize the SentencePieceProcessor

train_src_path='/content/drive/MyDrive/Projects/Train/train.de'
train_trg_path='/content/drive/MyDrive/Projects/Train/train.en'
valid_src_path='/content/drive/MyDrive/Projects/Valid/val.de'
valid_trg_path='/content/drive/MyDrive/Projects/Valid/val.en'

src_language = "de"
trg_language = "en"

sp_processor = SentencePieceProcessor(src_language, trg_language)

# Prepare data for training and validation
sp_processor.prepare_data(train_src_path, train_trg_path, valid_src_path, valid_trg_path)

# Train SentencePiece tokenizers
sp_processor.train_sentencepiece()

# Get special symbols (e.g., UNK, BOS, EOS, PAD)
special_symbols = sp_processor.get_special_symbols_indexes()

# Example: Tokenize a sentence using the trained tokenizer
sentence = "Das ist ein Beispiel."
tokenized_sentence = sp_processor.tokenizers["de"].encode_as_ids(sentence)
print("Tokenized sentence:", tokenized_sentence)


Tokenized sentence: [251, 57, 13, 669, 1359, 4]


In [28]:
# Prints tokenized examples of the first few sentence pairs.
with open("Multi30k_en_text.txt", "r") as f_en, open("Multi30k_de_text.txt", "r") as f_de:
  print("Tokenized sentence pairs:")
  for _ in range(5):
      en_line = f_en.readline().strip()
      de_line = f_de.readline().strip()

      # Tokenize each line using SentencePiece tokenizer
      en_pieces = sp_processor.tokenizers["en"].encode_as_pieces(en_line)
      en_tokens = sp_processor.tokenizers["en"].encode_as_ids(en_line)

      de_pieces = sp_processor.tokenizers["de"].encode_as_pieces(de_line)
      de_tokens = sp_processor.tokenizers["de"].encode_as_ids(de_line)
      print(f"English: {en_line}")
      print(f"Tokenized English: {en_pieces}")
      print(f"Token to id's English: {en_tokens}")
      print(f"German: {de_line}")
      print(f"Tokenized German: {de_pieces}")
      print(f"Token to id's German: {de_tokens}")

      print("\n" + "-"*50 + "\n")

Tokenized sentence pairs:
English: Two young, White males are outside near many bushes.
Tokenized English: ['▁Two', '▁young', ',', '▁White', '▁males', '▁are', '▁outside', '▁near', '▁many', '▁bushes', '.']
Token to id's English: [22, 28, 18, 1317, 936, 20, 62, 89, 404, 1519, 5]
German: Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.
Tokenized German: ['▁Zwei', '▁junge', '▁weiße', '▁Männer', '▁sind', '▁im', '▁Freien', '▁in', '▁der', '▁Nähe', '▁viele', 'r', '▁Büsche', '.']
Token to id's German: [25, 93, 137, 38, 99, 27, 107, 8, 18, 126, 280, 33, 4184, 4]

--------------------------------------------------

English: Several men in hard hats are operating a giant pulley system.
Tokenized English: ['▁Se', 'veral', '▁men', '▁in', '▁hard', '▁hats', '▁are', '▁operating', '▁a', '▁g', 'iant', '▁pulley', '▁system', '.']
Token to id's English: [309, 197, 39, 7, 356, 339, 20, 1361, 4, 394, 1597, 4134, 3127, 5]
German: Mehrere Männer mit Schutzhelmen bedienen ein Antriebsradsystem.


In [29]:
import os
import torch

class DataPreparation:
    def __init__(self, train_src_path, train_trg_path, valid_src_path, valid_trg_path, tokenizers, special_symbols, max_seq_len=50):
        self.SRC = "de"  # German source language
        self.TRG = "en"  # English target language
        self.train_src_path = train_src_path
        self.train_trg_path = train_trg_path
        self.valid_src_path = valid_src_path
        self.valid_trg_path = valid_trg_path
        self.tokenizers = tokenizers  # Tokenizers for "de" and "en"
        self.special_symbols = special_symbols  # Special symbols indexes (e.g., BOS, EOS, PAD)
        self.max_seq_len = max_seq_len  # Max sequence length for tokenization

        # Load and prepare datasets
        self.train_set = self._prepare_set(self.train_src_path, self.train_trg_path)
        self.valid_set = self._prepare_set(self.valid_src_path, self.valid_trg_path)

    def _prepare_set(self, src_file_path, trg_file_path):
        """Reads sentences from source and target files and pairs them."""
        if not os.path.exists(src_file_path) or not os.path.exists(trg_file_path):
            raise FileNotFoundError("Source or target file not found.")

        # Read source and target files
        with open(src_file_path, 'r', encoding='utf-8') as src_file, \
             open(trg_file_path, 'r', encoding='utf-8') as trg_file:
            src_lines = src_file.readlines()
            trg_lines = trg_file.readlines()

        # Ensure both files have the same number of lines
        if len(src_lines) != len(trg_lines):
            raise ValueError("Source and target files have different line counts.")

        # Pair sentences and remove trailing newlines
        data_set = [(src.strip(), trg.strip()) for src, trg in zip(src_lines, trg_lines) if src.strip() and trg.strip()]
        return data_set

    def tokenize_dataset(self, dataset):
        """
        Tokenizes the dataset using the SentencePiece tokenizers.
        Each sequence is wrapped with BOS and EOS tokens and truncated to max_seq_len.
        """
        BOS = self.special_symbols["BOS"]
        EOS = self.special_symbols["EOS"]

        tokenized_dataset = []
        for src_text, trg_text in dataset:
            src_tokens = [BOS] + self.tokenizers["de"].encode_as_ids(src_text)[:self.max_seq_len-2] + [EOS]
            trg_tokens = [BOS] + self.tokenizers["en"].encode_as_ids(trg_text)[:self.max_seq_len-2] + [EOS]
            tokenized_dataset.append((torch.tensor(src_tokens), torch.tensor(trg_tokens)))

        return tokenized_dataset

    def print_data_info(self):
        """Prints dataset information and a few examples."""
        print("Number of training examples:", len(self.train_set))
        print("Number of validation examples:", len(self.valid_set))

        print("\nTraining Examples:")
        for i, (src, trg) in enumerate(self.train_set[:5]):
            print(f"Example {i+1}:")
            print(f"  SRC: {src}")
            print(f"  TRG: {trg}")

        print("\nValidation Examples:")
        for i, (src, trg) in enumerate(self.valid_set[:5]):
            print(f"Example {i+1}:")
            print(f"  SRC: {src}")
            print(f"  TRG: {trg}")


In [30]:
# Tokenizers and special symbols (Assume you have trained SentencePiece models)
tokenizers = {
    "de": sp_processor.tokenizers["de"],  # SentencePiece tokenizer for German
    "en": sp_processor.tokenizers["en"]   # SentencePiece tokenizer for English
}
special_symbols = sp_processor.get_special_symbols_indexes()  # e.g., {"BOS": 1, "EOS": 2, "PAD": 3}

# Initialize DataPreparation
data_preparation = DataPreparation(
    train_src_path=train_src_path,
    train_trg_path=train_trg_path,
    valid_src_path=valid_src_path,
    valid_trg_path=valid_trg_path,
    tokenizers=tokenizers,
    special_symbols=special_symbols,
    max_seq_len=50  # Maximum sequence length
)

# Print dataset information
data_preparation.print_data_info()

# Tokenize datasets
tokenized_train_set = data_preparation.tokenize_dataset(data_preparation.train_set)
tokenized_valid_set = data_preparation.tokenize_dataset(data_preparation.valid_set)

# Example: Print the first tokenized train and valid example
print("First tokenized training example:")
print("  SRC:", tokenized_train_set[0][0])
print("  TRG:", tokenized_train_set[0][1])

print("\nFirst tokenized validation example:")
print("  SRC:", tokenized_valid_set[0][0])
print("  TRG:", tokenized_valid_set[0][1])


Number of training examples: 29000
Number of validation examples: 1014

Training Examples:
Example 1:
  SRC: Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.
  TRG: Two young, White males are outside near many bushes.
Example 2:
  SRC: Mehrere Männer mit Schutzhelmen bedienen ein Antriebsradsystem.
  TRG: Several men in hard hats are operating a giant pulley system.
Example 3:
  SRC: Ein kleines Mädchen klettert in ein Spielhaus aus Holz.
  TRG: A little girl climbing into a wooden playhouse.
Example 4:
  SRC: Ein Mann in einem blauen Hemd steht auf einer Leiter und putzt ein Fenster.
  TRG: A man in a blue shirt is standing on a ladder cleaning a window.
Example 5:
  SRC: Zwei Männer stehen am Herd und bereiten Essen zu.
  TRG: Two men are at the stove preparing food.

Validation Examples:
Example 1:
  SRC: Eine Gruppe von Männern lädt Baumwolle auf einen Lastwagen
  TRG: A group of men are loading cotton onto a truck
Example 2:
  SRC: Ein Mann schläft in einem grünen

In [31]:
class TranslationDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

def pad_sequence(batch, PAD):
    src_seqs = [src for src, trg in batch]
    trg_seqs = [trg for src, trg in batch]
    src_padded = torch.nn.utils.rnn.pad_sequence(src_seqs, batch_first=True, padding_value=PAD)
    trg_padded = torch.nn.utils.rnn.pad_sequence(trg_seqs, batch_first=True, padding_value=PAD)
    return src_padded, trg_padded

class Dataloaders:
    def __init__(self, train_tokenized, valid_tokenized, batch_size, PAD):
        self.train_dataset = TranslationDataset(train_tokenized)
        self.valid_dataset = TranslationDataset(valid_tokenized)

        self.train_loader = torch.utils.data.DataLoader(self.train_dataset, batch_size=batch_size,
                                                        shuffle=True, collate_fn=lambda x: pad_sequence(x, PAD))

        self.valid_loader = torch.utils.data.DataLoader(self.valid_dataset, batch_size=batch_size,
                                                        shuffle=True, collate_fn=lambda x: pad_sequence(x, PAD))

In [32]:
# Create dataloaders
data_loaders = Dataloaders(tokenized_train_set, tokenized_valid_set, batch_size, PAD)

In [None]:
layer=nn.Linear(50,50)
x=torch.rand(50)
x.size()

In [33]:
class MultiHeadedAttention(nn.Module):
    def __init__(self, h, d_embed, dropout=0.0):
        super(MultiHeadedAttention, self).__init__()
        #super().__init__()  python 3.x
        assert d_embed % h == 0 # check the h number
        self.d_k = d_embed//h
        self.d_embed = d_embed    # 512
        self.h = h  # 8
        self.WQ = nn.Linear(d_embed, d_embed)
        self.WK = nn.Linear(d_embed, d_embed)
        self.WV = nn.Linear(d_embed, d_embed)
        self.linear = nn.Linear(d_embed, d_embed)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x_query, x_key, x_value, mask=None):
        nbatch = x_query.size(0) # get batch size
        query = self.WQ(x_query).view(nbatch, -1, self.h, self.d_k).transpose(1,2)
        key   = self.WK(x_key).view(nbatch, -1, self.h, self.d_k).transpose(1,2)
        value = self.WV(x_value).view(nbatch, -1, self.h, self.d_k).transpose(1,2)
        # attention scores has dimensions: nbatch * h * seq_len * seq_len
        scores = torch.matmul(query, key.transpose(-2, -1))/math.sqrt(self.d_k)
        # Mask out padding tokens
        if mask is not None:
            scores = scores.masked_fill(mask, float('-inf'))
        # p_atten dimensions: nbatch * h * seq_len * seq_len
        p_atten = torch.nn.functional.softmax(scores, dim=-1) # attention filter
        p_atten = self.dropout(p_atten)
        # x dimensions: nbatch * h * seq_len * d_k
        x = torch.matmul(p_atten, value)  # filtered values
        # x now has dimensions:nbatch * seq_len * d_embed
        x = x.transpose(1, 2).contiguous().view(nbatch, -1, self.d_embed)
        return self.linear(x) # final linear layer

In [34]:
class ResidualConnection(nn.Module):
    def __init__(self, dim, dropout):
        super().__init__()
        self.drop = nn.Dropout(dropout)
        self.norm = nn.LayerNorm(dim)  # (x-M)/std

    def forward(self, x, sublayer):
        return x + self.drop(sublayer(self.norm(x)))

In [35]:
class Encoder(nn.Module):
    '''Encoder = token embedding + positional embedding -> a stack of N EncoderBlock -> layer norm'''
    def __init__(self, config):
        super().__init__()
        self.d_embed = config.d_embed  # 512
        self.tok_embed = nn.Embedding(config.encoder_vocab_size, config.d_embed) # Vocab Dictionary size , Embed size
        self.pos_embed = nn.Parameter(torch.zeros(1, config.max_seq_len, config.d_embed))
        self.encoder_blocks = nn.ModuleList([EncoderBlock(config) for _ in range(config.N_encoder)])
        self.dropout = nn.Dropout(config.dropout)
        self.norm = nn.LayerNorm(config.d_embed)

    def forward(self, input, mask=None):
        x = self.tok_embed(input) # Vectors
        x_pos = self.pos_embed[:, :x.size(1), :]  # Vectors'
        x = self.dropout(x + x_pos) # update vectors with position information
        for layer in self.encoder_blocks:
            x = layer(x, mask) # (50,512)
        return self.norm(x)


class EncoderBlock(nn.Module):
    '''EncoderBlock: self-attention -> position-wise fully connected feed-forward layer'''
    def __init__(self, config):
        super(EncoderBlock, self).__init__()
        self.atten = MultiHeadedAttention(config.h, config.d_embed, config.dropout)
        self.feed_forward = nn.Sequential(
            nn.Linear(config.d_embed, config.d_ff),
            nn.ReLU(),
            nn.Dropout(config.dropout),
            nn.Linear(config.d_ff, config.d_embed)
        )
        self.residual1 = ResidualConnection(config.d_embed, config.dropout)
        self.residual2 = ResidualConnection(config.d_embed, config.dropout)

    def forward(self, x, mask=None):
        # self-attention
        x = self.residual1(x, lambda x: self.atten(x, x, x, mask=mask))
        # position-wise fully connected feed-forward layer
        return self.residual2(x, self.feed_forward)


In [36]:
class Decoder(nn.Module):
    '''Decoder = token embedding + positional embedding -> a stack of N DecoderBlock -> fully-connected layer'''
    def __init__(self, config):
        super().__init__()
        self.d_embed = config.d_embed
        self.tok_embed = nn.Embedding(config.decoder_vocab_size, config.d_embed)
        self.pos_embed = nn.Parameter(torch.zeros(1, config.max_seq_len, config.d_embed))
        self.dropout = nn.Dropout(config.dropout)
        self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config.N_decoder)])
        self.norm = nn.LayerNorm(config.d_embed)
        self.linear = nn.Linear(config.d_embed, config.decoder_vocab_size)


    def future_mask(self, seq_len):
        '''mask out tokens at future positions'''
        mask = (torch.triu(torch.ones(seq_len, seq_len, requires_grad=False), diagonal=1)!=0).to(DEVICE)
        return mask.view(1, 1, seq_len, seq_len)

    def forward(self, memory, src_mask, trg, trg_pad_mask):
        seq_len = trg.size(1)
        trg_mask = torch.logical_or(trg_pad_mask, self.future_mask(seq_len))
        x = self.tok_embed(trg) + self.pos_embed[:, :trg.size(1), :]
        x = self.dropout(x)
        for layer in self.decoder_blocks:
            x = layer(memory, src_mask, x, trg_mask)
        x = self.norm(x)
        logits = self.linear(x)
        return logits


class DecoderBlock(nn.Module):
    ''' DecoderBlock: self-attention -> position-wise feed-forward (fully connected) layer'''
    def __init__(self, config):
        super().__init__()
        self.atten1 = MultiHeadedAttention(config.h, config.d_embed)
        self.atten2 = MultiHeadedAttention(config.h, config.d_embed)
        self.feed_forward = nn.Sequential(
            nn.Linear(config.d_embed, config.d_ff),
            nn.ReLU(),
            nn.Dropout(config.dropout),
            nn.Linear(config.d_ff, config.d_embed)
        )
        self.residuals = nn.ModuleList([ResidualConnection(config.d_embed, config.dropout)
                                       for _ in range(3)])

    def forward(self, memory, src_mask, decoder_layer_input, trg_mask):
        x = memory  # K , V
        y = decoder_layer_input # target /y "he"
        y = self.residuals[0](y, lambda y: self.atten1(y, y, y, mask=trg_mask)) #masked multi head attention
        # keys and values are from the encoder output
        y = self.residuals[1](y, lambda y: self.atten2(y, x, x, mask=src_mask))
        return self.residuals[2](y, self.feed_forward)


In [37]:
class Transformer(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, src_mask, trg, trg_pad_mask):
        return self.decoder(self.encoder(src, src_mask), src_mask, trg, trg_pad_mask)

In [38]:
# Model configuration and creation
@dataclass
class ModelConfig:
    encoder_vocab_size: int
    decoder_vocab_size: int
    d_embed: int
    d_ff: int
    h: int
    N_encoder: int
    N_decoder: int
    max_seq_len: int
    dropout: float

def make_model(config):
    model = Transformer(Encoder(config), Decoder(config)).to(DEVICE)
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model

def make_batch_input(x, y):
        src = x.to(DEVICE)
        trg_in = y[:, :-1].to(DEVICE)
        trg_out = y[:, 1:].contiguous().view(-1).to(DEVICE)
        src_pad_mask = (src == PAD).view(src.size(0), 1, 1, src.size(-1))
        trg_pad_mask = (trg_in == PAD).view(trg_in.size(0), 1, 1, trg_in.size(-1))
        return src, trg_in, trg_out, src_pad_mask, trg_pad_mask

# Placeholder for train and evaluate functions
def train_epoch(model, dataloaders):
    model.train()
    grad_norm_clip = 1.0
    losses = []
    pbar = tqdm(enumerate(dataloaders.train_loader), total=len(dataloaders.train_loader))
    for idx, (x, y) in pbar:
        optimizer.zero_grad()
        src, trg_in, trg_out, src_pad_mask, trg_pad_mask = make_batch_input(x, y)
        pred = model(src, src_pad_mask, trg_in, trg_pad_mask).to(DEVICE)
        pred = pred.view(-1, pred.size(-1))
        loss = loss_fn(pred, trg_out).to(DEVICE)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_norm_clip)
        optimizer.step()
        scheduler.step()
        losses.append(loss.item())
        if idx > 0 and idx % 50 == 0:
            pbar.set_description(f'train loss={loss.item():.3f}, lr={scheduler.get_last_lr()[0]:.5f}')
    return np.mean(losses)

def train(model, dataloaders, epochs):
    global early_stop_count
    best_valid_loss = float('inf')
    train_size = len(dataloaders.train_loader) * batch_size
    for ep in range(epochs):
        train_loss = train_epoch(model, dataloaders)
        valid_loss = validate(model, dataloaders.valid_loader)

        print(f'ep: {ep}: train_loss={train_loss:.5f}, valid_loss={valid_loss:.5f}')
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
        else:
            if scheduler.last_epoch > 2 * warmup_steps:
                early_stop_count -= 1
                if early_stop_count <= 0:
                    return train_loss, valid_loss
    return train_loss, valid_loss

def validate(model, dataloder):
    model.eval()
    losses = []
    with torch.no_grad():
        for i, (x, y) in enumerate(dataloder):
            src, trg_in, trg_out, src_pad_mask, trg_pad_mask = make_batch_input(x, y)
            pred = model(src, src_pad_mask, trg_in, trg_pad_mask).to(DEVICE)
            pred = pred.view(-1, pred.size(-1))
            losses.append(loss_fn(pred, trg_out).item())
    return np.mean(losses)

def translate(model, x, detokenizers):
    'translate source sentences into the target language, without looking at the answer'
    with torch.no_grad():
        dB = x.size(0)
        y = torch.tensor([[BOS]*dB]).view(dB, 1).to(DEVICE)
        x_pad_mask = (x == PAD).view(x.size(0), 1, 1, x.size(-1)).to(DEVICE)
        memory = model.encoder(x, x_pad_mask)
        for i in range(max_seq_len):
            y_pad_mask = (y == PAD).view(y.size(0), 1, 1, y.size(-1)).to(DEVICE)
            logits = model.decoder(memory, x_pad_mask, y, y_pad_mask)
            last_output = logits.argmax(-1)[:, -1]
            last_output = last_output.view(dB, 1)
            y = torch.cat((y, last_output), 1).to(DEVICE)
    return y

def remove_pad(sent):
    '''truncate the sentence if BOS is in it,
     otherwise simply remove the padding tokens at the end'''
    if sent.count(EOS)>0:
        sent = sent[0:sent.index(EOS)+1]
    while sent and sent[-1] == PAD:
        sent = sent[:-1]
    return sent

def decode_sentence(detokenizer, sentence_ids):
    'convert a tokenized sentence (a list of numbers) to a literal string'
    if not isinstance(sentence_ids, list):
        sentence_ids = sentence_ids.tolist()
    sentence_ids = remove_pad(sentence_ids)
    return detokenizer(sentence_ids).replace("<bos>", "")\
           .replace("<eos>", "").strip().replace(" .", ".")

def evaluate(model, dataloader, detokenizers, num_batch=None):
    'evaluate the model, and compute the BLEU score'
    model.eval()
    refs, cans, bleus = [], [], []
    with torch.no_grad():
        for idx, (x, y) in enumerate(dataloader):
            src, trg_in, trg_out, src_pad_mask, trg_pad_mask = make_batch_input(x, y)
            translation = translate(model, src, detokenizers)
            trg_out = trg_out.view(x.size(0), -1)
            refs = refs + [decode_sentence(detokenizers[TRG], trg_out[i]) for i in range(len(src))]
            cans = cans + [decode_sentence(detokenizers[TRG], translation[i]) for i in range(len(src))]
            if num_batch and idx>=num_batch:
                break
        print(min([len(x) for x in refs]))
        bleus.append(sacrebleu.corpus_bleu(cans, [refs]).score)
        # print some examples
        for i in range(3):
            print(f'src:  {decode_sentence(detokenizers[SRC], src[i])}')
            print(f'trg:  {decode_sentence(detokenizers[TRG], trg_out[i])}')
            print(f'pred: {decode_sentence(detokenizers[TRG], translation[i])}')
        return np.mean(bleus)

In [42]:
detokenizers = {
    SRC: sp_processor.tokenizers[SRC].decode_ids,  # German detokenizer
    TRG: sp_processor.tokenizers[TRG].decode_ids  # English detokenizer
}

config = ModelConfig(
    encoder_vocab_size=vocab_sizes[SRC],
    decoder_vocab_size=vocab_sizes[TRG],
    d_embed=512,
    d_ff=512,
    h=8,
    N_encoder=6,
    N_decoder=6,
    max_seq_len=max_seq_len,
    dropout=0.1
)

train_size = len(data_loaders.train_loader) * batch_size

# Create model
model = make_model(config)
model_size = sum([p.numel() for p in model.parameters()])
print(f'model_size: {model_size}, train_set_size: {train_size}')

# Learning rate scheduler setup
warmup_steps = 3 * len(data_loaders.train_loader)
lr_fn = lambda step: config.d_embed ** -0.5 * min([(step + 1) ** -0.5, (step + 1) * warmup_steps ** -1.5])
optimizer = torch.optim.Adam(model.parameters(), lr=0.5, betas=(0.9, 0.98), eps=1e-9)
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lr_fn)
loss_fn = nn.CrossEntropyLoss(ignore_index=PAD)

# Training parameters
early_stop_count = 10

# Train the model
train_loss, valid_loss = train(model, data_loaders, epochs=50)

print("train set examples:")
train_bleu = evaluate(model, data_loaders.train_loader, detokenizers, 20)
print("validation set examples:")
valid_bleu = evaluate(model, data_loaders.valid_loader, detokenizers)

model_size: 38823944, train_set_size: 29056


train loss=3.796, lr=0.00025: 100%|██████████| 227/227 [01:04<00:00,  3.50it/s]


ep: 0: train_loss=5.51753, valid_loss=3.73809


train loss=2.746, lr=0.00053: 100%|██████████| 227/227 [01:04<00:00,  3.53it/s]


ep: 1: train_loss=3.15520, valid_loss=2.63824


train loss=2.201, lr=0.00082: 100%|██████████| 227/227 [01:04<00:00,  3.53it/s]


ep: 2: train_loss=2.28024, valid_loss=2.17410


train loss=1.812, lr=0.00074: 100%|██████████| 227/227 [01:04<00:00,  3.53it/s]


ep: 3: train_loss=1.81368, valid_loss=1.94048


train loss=1.450, lr=0.00066: 100%|██████████| 227/227 [01:04<00:00,  3.52it/s]


ep: 4: train_loss=1.47925, valid_loss=1.85235


train loss=1.311, lr=0.00060: 100%|██████████| 227/227 [01:04<00:00,  3.50it/s]


ep: 5: train_loss=1.24379, valid_loss=1.80811


train loss=1.132, lr=0.00056: 100%|██████████| 227/227 [01:04<00:00,  3.52it/s]


ep: 6: train_loss=1.06370, valid_loss=1.81389


train loss=0.970, lr=0.00052: 100%|██████████| 227/227 [01:04<00:00,  3.54it/s]


ep: 7: train_loss=0.91957, valid_loss=1.82270


train loss=0.806, lr=0.00049: 100%|██████████| 227/227 [01:03<00:00,  3.55it/s]


ep: 8: train_loss=0.80130, valid_loss=1.85435


train loss=0.757, lr=0.00047: 100%|██████████| 227/227 [01:04<00:00,  3.54it/s]


ep: 9: train_loss=0.69876, valid_loss=1.90574


train loss=0.631, lr=0.00044: 100%|██████████| 227/227 [01:04<00:00,  3.54it/s]


ep: 10: train_loss=0.61593, valid_loss=1.94202


train loss=0.584, lr=0.00043: 100%|██████████| 227/227 [01:03<00:00,  3.55it/s]


ep: 11: train_loss=0.54530, valid_loss=1.98453


train loss=0.559, lr=0.00041: 100%|██████████| 227/227 [01:04<00:00,  3.53it/s]


ep: 12: train_loss=0.48279, valid_loss=2.04879


train loss=0.493, lr=0.00039: 100%|██████████| 227/227 [01:04<00:00,  3.52it/s]


ep: 13: train_loss=0.43000, valid_loss=2.09650


train loss=0.460, lr=0.00038: 100%|██████████| 227/227 [01:04<00:00,  3.54it/s]


ep: 14: train_loss=0.38626, valid_loss=2.15867


train loss=0.376, lr=0.00037: 100%|██████████| 227/227 [01:04<00:00,  3.52it/s]


ep: 15: train_loss=0.34530, valid_loss=2.19668
train set examples:
19
src:  Ein Mann, der einen grünen Jeep fährt, fährt über große Steine.
trg:  A man driving a green jeep is crossing over large rocks.
pred: A man driving a green jeep is driving over large rocks.
src:  Eine Frau in einem blauen Pullover hält ein großes Stück braunes Papier.
trg:  Woman wearing a blue sweater while holding a large piece of brown paper material.
pred: A woman in a blue sweater holds a large piece of brown paper.
src:  Ein Paar steht da und blickt auf das Meer.
trg:  A couple stand and looks at the ocean.
pred: A couple stands and looks at the ocean.
validation set examples:
20
src:  Ein schwarzweißer Hund schwimmt im klaren Wasser.
trg:  A black and white dog swimming in clear water.
pred: A black and white dog is swimming in the clear water.
src:  Eine Gruppe junger asiatischer Männer läuft bei einem Marathon.
trg:  A group of young Asian men walking in a marathon.
pred: A group of young Asian men are 

In [43]:
def translate_this_sentence(text: str, model, tokenizers, detokenizers):
    'translate the source sentence in string formate into target language'
    input = torch.tensor([[BOS] + tokenizers[SRC](text) + [EOS]]).to(DEVICE)
    output = translate(model, input, detokenizers)
    return decode_sentence(detokenizers[TRG], output[0])

In [47]:
def translate_this_sentence(sentence, model, tokenizers, detokenizers):
    """
    Translates a given sentence using the trained model.

    Args:
    - sentence (str): Input sentence in the source language.
    - model (nn.Module): Trained Transformer model.
    - tokenizers (dict): Tokenizers for source and target languages.
    - detokenizers (dict): Detokenizers for source and target languages.

    Returns:
    - str: Translated sentence in the target language.
    """
    # Tokenize the input sentence
    src_tokens = tokenizers["de"].encode_as_ids(sentence)  # Tokenize German sentence
    src_tensor = torch.tensor([src_tokens], dtype=torch.long).to(DEVICE)

    # Translate the sentence
    translated_tensor = translate(model, src_tensor, detokenizers)  # Get translated token IDs

    # Decode the translated tokens into a sentence
    translated_sentence = detokenizers["en"](translated_tensor[0].tolist())  # Use correct detokenizer
    return translated_sentence


In [48]:
translated_sentence = translate_this_sentence(
    "Eine Gruppe von Menschen steht vor einem Iglu.",  # German input
    model,  # Your trained Transformer model
    tokenizers,  # Tokenizers dictionary
    detokenizers  # Detokenizers dictionary
)
print(translated_sentence)



Group of people standing in front of an igloo.......... air..


In [49]:
torch.save(model.state_dict(), 'transformer_model.pth')