# Transformer_PyTorch_From_Scratch

### Test GPU compatibility

In [17]:
import torch
print(torch.__version__)
print("CUDA available:", torch.cuda.is_available())
print("Device:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "None")

2.5.1+cu121
CUDA available: True
Device: NVIDIA GeForce RTX 3050 Ti Laptop GPU


In [18]:
# pytorch Main module - for tensor operations and deep learning - Essential for building transformer from scratch
# because all operations (attention, matrix multiplication, layers) are built using PyTorch tensors
import torch

# This imports PyTorch’s neural network module. It contains building blocks such as: nn.Linear, nn.Embedding, nn.LayerNorm
# nn.Dropout, nn.Modul, Transformers heavily rely on these layers
import torch.nn as nn

# Imports optimizers like: optim.Adam, optim.SGD, optim.AdamW (often used for Transformers).
# Required for updating model weights during training using backpropagation.
import torch.optim as optim

# Used for handling compressed files (.gz)
import gzip

# Provides functions to work with time. Used to:
# measure training speed, compute time per epoch, log ETA/output intervals
# Example: start = time.time(), elapsed = time.time() - start.
import time

# Gives access to mathematical functions such as: math.sqrt, math.log, math.pi
import math

# import spacy library. It is for text-processing library. Often used for: tokenization, sentence splitting, cleaning text
import spacy

# Helps build custom datasets and efficiently load batches
from torch.utils.data import Dataset, DataLoader

# importing pad_seq for maintain the seq len as it is crucial for NLP
from torch.nn.utils.rnn import pad_sequence

# visualize the progress bar
from tqdm import tqdm

#import warnings
import warnings
warnings.filterwarnings("ignore")

In [19]:
# Load the language models
# 1. de - German language
# 2. en - Eng Language
# sm - Small maodel
# loading the lang. model for english and german language and both are small model
# This i am using to tokenize the src and trg sequence
# My task is MT from german to english so from spacy i am going to download the sm for these langiage so i can apply some #functions to tokenize the data

spacy_de=spacy.load('de_core_news_sm')
spacy_en=spacy.load('en_core_web_sm')

In [20]:
# By default, PyTorch uses non-deterministic algorithms to optimize performance.
# sometimes this can lead to different results on different runs with the same input.
# To ensure reproducibility, we can set the following configurations:

SEED=123
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [21]:
# Task --> MachineTranslation
# src_file --> German sentences
# trg_file --> English sentences
# load the dataset
# here i am using dataset which is in zip format as it needs huge data, Transformer can't be trained of small dataset.

# load the dataset
# Create a custom dataset class which inherits from PyTorch's Dataset
# initialize the dataset with source and target files and optional transformations
class Multi30kDataset(Dataset):
    def __init__(self,src_file, trg_file, src_transform=None, trg_transform=None):
        self.src_data=self.load_data(src_file) #load source data
        self.trg_data=self.load_data(trg_file) #load target data
        self.src_transform=src_transform       #optional transformation for source data -- Tokenization and lowercase
        self.trg_transform=trg_transform       #optional transformation for source data -- Tokenization and lowercase

    # function to load data from a gzip file
    def load_data(self, file_path):
        with gzip.open(file_path, 'rt', encoding = 'utf-8') as f: # open the gzip file at read text mode with encoding utf-8
            data = f.readlines() # read all lines from the file
        return data # return the list of lines

    # return the length of the dataset
    #This tells PyTorch how many samples are in your dataset.
    def __len__(self):
        return len(self.src_data)

    # Retrieves the specific data idx-th German and English sentence.
    def __getitem__(self, idx):
        src_sentence = self.src_data[idx].strip() # get the source sentence at index idx and strip whitespace
        trg_sentence = self.trg_data[idx].strip() # get the target sentence at index idx and strip whitespace

        # apply transformations if provided
        if self.src_transform:
            src_sentence = self.src_transform(src_sentence)
        if self.trg_transform:
            trg_sentence = self.trg_transform(trg_sentence)

        return {"src": src_sentence, "trg": trg_sentence}

In [22]:
# tokenization and lowercase functions for german language sequence/senstence
def tokenize_de(text):
    return [token.text.lower() for token in spacy_de.tokenizer(text)]

In [23]:
# tokenization and lowercase functions for english language sequence
def tokenize_en(text):
    return [token.text.lower() for token in spacy_en.tokenizer(text)]

In [24]:
# It defines the file paths for your German–English parallel dataset
train_de_path="train.de.gz"
train_en_path="train.en.gz"
val_de_path="val.de.gz"
val_en_path="val.en.gz"
test_de_path="test_2016_flickr.de.gz"
test_en_path="test_2016_flickr.en.gz"

In [25]:
# Creating 3 custom data object (Train, validate and test)
# each data object reads the corresponding .gz German + English files, tokenize them using corresponding tokenizer
# and store stores paired token lists ({"src": [...], "trg": [...]})
train_data = Multi30kDataset(train_de_path, train_en_path, src_transform=tokenize_de, trg_transform=tokenize_en)
val_data = Multi30kDataset(val_de_path, val_en_path, src_transform=tokenize_de, trg_transform=tokenize_en)
test_data = Multi30kDataset(test_de_path, test_en_path, src_transform=tokenize_de, trg_transform=tokenize_en)


# Define special tokens. These 4 special tokens are critical for Transformer-based NLP tasks
PAD_TOKEN = '<pad>'
SOS_TOKEN = '<sos>'
EOS_TOKEN = '<eos>'
UNK_TOKEN = '<unk>'

In [26]:
# building a simple vocabulary from tokenized sentences.
# special tokens come first
def create_vocab(tokenized_sentences,special_tokens):
    vocab = {token: idx for idx, token in enumerate(special_tokens)}
    for sentence in tokenized_sentences:
        for token in sentence:
            if token not in vocab:
                vocab[token] = len(vocab)
    return vocab

In [27]:
# It iterates through all german sentences, removes the extra whitespace and the apply german tokenization
# returned list of token of german sentences
train_de_tokenized = [tokenize_de(sentence.strip()) for sentence in train_data.src_data]

# It iterates through all englis sentences, removes the extra whitespace and the apply english tokenization
# returned list of token of English sentences
train_en_tokenized = [tokenize_en(sentence.strip()) for sentence in train_data.trg_data]

In [28]:
# List of tokens for first german senstence
print(train_de_tokenized[0])

['zwei', 'junge', 'weiße', 'männer', 'sind', 'im', 'freien', 'in', 'der', 'nähe', 'vieler', 'büsche', '.']


In [29]:
# List of tokens for first englisg senstence
print(train_en_tokenized[0])

['two', 'young', ',', 'white', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.']


In [30]:
# Create Two seperate vocabulary with special tokens for German and English Language
SRC_VOCAB = create_vocab(train_de_tokenized, [PAD_TOKEN, SOS_TOKEN, EOS_TOKEN, UNK_TOKEN])
TRG_VOCAB = create_vocab(train_en_tokenized, [PAD_TOKEN, SOS_TOKEN, EOS_TOKEN, UNK_TOKEN])

In [31]:
SRC_VOCAB

{'<pad>': 0,
 '<sos>': 1,
 '<eos>': 2,
 '<unk>': 3,
 'zwei': 4,
 'junge': 5,
 'weiße': 6,
 'männer': 7,
 'sind': 8,
 'im': 9,
 'freien': 10,
 'in': 11,
 'der': 12,
 'nähe': 13,
 'vieler': 14,
 'büsche': 15,
 '.': 16,
 'mehrere': 17,
 'mit': 18,
 'schutzhelmen': 19,
 'bedienen': 20,
 'ein': 21,
 'antriebsradsystem': 22,
 'kleines': 23,
 'mädchen': 24,
 'klettert': 25,
 'spielhaus': 26,
 'aus': 27,
 'holz': 28,
 'mann': 29,
 'einem': 30,
 'blauen': 31,
 'hemd': 32,
 'steht': 33,
 'auf': 34,
 'einer': 35,
 'leiter': 36,
 'und': 37,
 'putzt': 38,
 'fenster': 39,
 'stehen': 40,
 'am': 41,
 'herd': 42,
 'bereiten': 43,
 'essen': 44,
 'zu': 45,
 'grün': 46,
 'hält': 47,
 'eine': 48,
 'gitarre': 49,
 ',': 50,
 'während': 51,
 'andere': 52,
 'sein': 53,
 'ansieht': 54,
 'lächelt': 55,
 'einen': 56,
 'ausgestopften': 57,
 'löwen': 58,
 'an': 59,
 'schickes': 60,
 'spricht': 61,
 'dem': 62,
 'handy': 63,
 'sie': 64,
 'langsam': 65,
 'die': 66,
 'straße': 67,
 'entlangschwebt': 68,
 'frau': 69,
 '

In [32]:
TRG_VOCAB

{'<pad>': 0,
 '<sos>': 1,
 '<eos>': 2,
 '<unk>': 3,
 'two': 4,
 'young': 5,
 ',': 6,
 'white': 7,
 'males': 8,
 'are': 9,
 'outside': 10,
 'near': 11,
 'many': 12,
 'bushes': 13,
 '.': 14,
 'several': 15,
 'men': 16,
 'in': 17,
 'hard': 18,
 'hats': 19,
 'operating': 20,
 'a': 21,
 'giant': 22,
 'pulley': 23,
 'system': 24,
 'little': 25,
 'girl': 26,
 'climbing': 27,
 'into': 28,
 'wooden': 29,
 'playhouse': 30,
 'man': 31,
 'blue': 32,
 'shirt': 33,
 'is': 34,
 'standing': 35,
 'on': 36,
 'ladder': 37,
 'cleaning': 38,
 'window': 39,
 'at': 40,
 'the': 41,
 'stove': 42,
 'preparing': 43,
 'food': 44,
 'green': 45,
 'holds': 46,
 'guitar': 47,
 'while': 48,
 'other': 49,
 'observes': 50,
 'his': 51,
 'smiling': 52,
 'stuffed': 53,
 'lion': 54,
 'trendy': 55,
 'talking': 56,
 'her': 57,
 'cellphone': 58,
 'gliding': 59,
 'slowly': 60,
 'down': 61,
 'street': 62,
 'woman': 63,
 'with': 64,
 'large': 65,
 'purse': 66,
 'walking': 67,
 'by': 68,
 'gate': 69,
 'boys': 70,
 'dancing': 71,
 

In [33]:
# Transformers work parallelly. Transformers do not understand the order of words (unlike RNNs).
# So to let transformers know about the position of each word, we manually add information about word positions using a mathematical formula.

# creating class "PositionalEncoding" that inharits the PyTorch base NN
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=1000): # create contructor that accepts the embedding dimension (d_model) and max seq len, here it is by default 1000
        super().__init__() # initialize the parent class
        pe = torch.zeros(max_len, d_model) #Create empty matrix with size(max_len, d_model), this will store the positional encoding:[1000,512]
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)  # Create the position tensor and add one dim at 1 shape will be [max_len, 1]
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) # divide term x^y = e^(ylog(x))
        pe[:, 0::2] = torch.sin(position * div_term) # PE for even term
        pe[:, 1::2] = torch.cos(position * div_term) # PE for odd term
        pe = pe.unsqueeze(0).transpose(0, 1) # Add one more dim at 0 and change 0->1 [max_len, d_model] -> [1, max_len, d_model]--> [mx_len, 1, d_model]
        ## register_buffer is used to save the tensor as part of the model's state_dict, but not as a model parameter.
        # so it will not be the part of training
        self.register_buffer('pe', pe)

    def forward(self, x):
        return x + self.pe[:x.size(0), :]

In [34]:
# Multihead Attention which inharits PyTorch's base NN
class MultiHeadAttention(nn.Module):

    # initialize the constructor of multi-head attention module that has attributes d_model, and num_heads
    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

        self.d_model = d_model # embedding dimension, in original paper 512
        self.num_heads = num_heads # number of attention heads, in original paper 8
        self.d_k = d_model // num_heads # Dimension of each head , as per original paper 512/8=64

        self.W_q = nn.Linear(d_model, d_model) # linear layer for query - Q matrix of size (d_model,d_model) - (512,512)
        self.W_k = nn.Linear(d_model, d_model) # linear layer for query - K matrix of size (d_model,d_model) - (512,512)
        self.W_v = nn.Linear(d_model, d_model) # linear layer for query - V matrix of size (d_model,d_model) - (512,512)
        self.W_o = nn.Linear(d_model, d_model) # linear layer for query - V matrix of size (d_model,d_model) - (512,512)

    # Calculates the self attention
    # Instead of creating 8 small layers of size 64, I create one big layer of size 512 (d_model).
    # I will logically split the output of this layer later in the forward method.
    # This is computationally more efficient.
    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)

        # apply mask if provided
        if mask is not None:
            attn_scores = attn_scores.masked_fill(mask == 0, -1e9) # mask out the positions with -inf
        attn_probs = torch.softmax(attn_scores, dim=-1) # softmax to get attention probabilities
        output = torch.matmul(attn_probs, V) # weighted sum of values
        return output # output of attention mechanism

    def forward(self, Q, K, V, mask=None):
        batch_size = Q.size(0) # get batch size

        # linear transformation and reshape for multi-head attention
        # W_q is applied to the entire d_model dimension, then reshaped to (batch_size, seq_len, num_heads, d_k)
        # Here, -1 infers the sequence length dimension automatically
        # Here, batch_size = Q.size(0)
        # here, d_k = d_model / num_heads
        Q = self.W_q(Q).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        K = self.W_k(K).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        V = self.W_v(V).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)


        # Note: [batch_size, seq_len, d_model] --> [batch_size, seq_len, num_heads, head_dim]
        # and [batch_size, seq_len, num_heads, head_dim]--> [batch_size, num_heads, seq_len, head_dim]

        # Inputs: Q, K, V are tensors reshaped for multi-head processing.
        # [batch_size, seq_len, num_heads, head_dim] -->  [batch_size, num_heads, seq_len, head_dim]
        # Their shape is usually [Batch, Seq_Len, Head, Head_Dim]
        output = self.scaled_dot_product_attention(Q, K, V, mask)# apply scaled dot-product attention
        output = output.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model) # reshape back to original shape
        return self.W_o(output) # final linear transformation

In [35]:
# Position-wise Feed Forward Network
class PositionwiseFeedforward(nn.Module):
    def __init__(self, d_model, d_ff):      # intialize the constructor with attribut d_model and no of nodes in hidden layer
        super().__init__()                  # call parent class
        self.fc1 = nn.Linear(d_model, d_ff) # first linear layer
        self.fc2 = nn.Linear(d_ff, d_model) # second linear layer
        self.relu = nn.ReLU()               # activation function - ReLU


    def forward(self, x):
        return self.fc2(self.relu(self.fc1(x)))

In [36]:
# Encoding Section
class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionwiseFeedforward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        attn_output = self.self_attn(x, x, x, mask)   # Self attention calculation with MHA
        x = self.norm1(x + self.dropout(attn_output)) # Add and Norm
        ff_output = self.feed_forward(x)              # Feed forward Neural Network
        x = self.norm2(x + self.dropout(ff_output))   # Add and Norm
        return x

In [37]:
# Decoder Section

class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionwiseFeedforward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask, trg_mask):
        attn_output = self.self_attn(x, x, x, trg_mask)
        x = self.norm1(x + self.dropout(attn_output))
        attn_output = self.cross_attn(x, enc_output, enc_output, src_mask)
        x = self.norm2(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm3(x + self.dropout(ff_output))
        return x

In [38]:
# Transformer Class
class Transformer(nn.Module):
    def __init__(self, src_vocab_size, trg_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout):
        super().__init__()
        self.encoder_embedding = nn.Embedding(src_vocab_size, d_model) # source embedding for src language - german
        self.decoder_embedding = nn.Embedding(trg_vocab_size, d_model) # trg embedding for trg language - english
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length) #creating PE object

        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])

        self.fc_out = nn.Linear(d_model, trg_vocab_size)
        self.dropout = nn.Dropout(dropout)

        self.scale = torch.sqrt(torch.FloatTensor([d_model]))

    def generate_mask(self, src, trg):
        src_mask = (src != SRC_VOCAB[PAD_TOKEN]).unsqueeze(1).unsqueeze(2)
        trg_mask = (trg != TRG_VOCAB[PAD_TOKEN]).unsqueeze(1).unsqueeze(3)
        seq_length = trg.shape[1]
        nopeak_mask = (1 - torch.triu(torch.ones(1, seq_length, seq_length), diagonal=1)).bool()
        trg_mask = trg_mask & nopeak_mask
        return src_mask, trg_mask

    def forward(self, src, trg):
        src_mask, trg_mask = self.generate_mask(src, trg)

        src_embedded = self.dropout(self.positional_encoding(self.encoder_embedding(src) * self.scale))
        trg_embedded = self.dropout(self.positional_encoding(self.decoder_embedding(trg) * self.scale))

        enc_output = src_embedded
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)

        dec_output = trg_embedded
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, trg_mask)

        output = self.fc_out(dec_output)
        return output

In [39]:
# Model hyperparameters from Paper "Attention is All you need"

SRC_VOCAB_SIZE = len(SRC_VOCAB)
TRG_VOCAB_SIZE = len(TRG_VOCAB)
D_MODEL = 512
NUM_HEADS = 8
NUM_LAYERS = 6
D_FF = 2048
MAX_SEQ_LENGTH = 100
DROPOUT = 0.1

In [40]:
# Initialize the model
model = Transformer(SRC_VOCAB_SIZE, TRG_VOCAB_SIZE, D_MODEL, NUM_HEADS, NUM_LAYERS, D_FF, MAX_SEQ_LENGTH, DROPOUT)

In [41]:
print(f"The model has {sum(p.numel() for p in model.parameters() if p.requires_grad):,} trainable parameters")

The model has 63,738,949 trainable parameters


In [42]:
# Define optimizer and loss
optimizer = optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)
PAD_IDX = SRC_VOCAB[PAD_TOKEN]
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

In [43]:
# Collate function for DataLoader
def collate_fn(batch):

    src_batch, trg_batch = [], []

    for sample in batch:
        src_batch.append(torch.tensor([SRC_VOCAB.get(token, SRC_VOCAB[UNK_TOKEN]) for token in [SOS_TOKEN] + sample['src'] + [EOS_TOKEN]]))
        trg_batch.append(torch.tensor([TRG_VOCAB.get(token, TRG_VOCAB[UNK_TOKEN]) for token in [SOS_TOKEN] + sample['trg'] + [EOS_TOKEN]]))

    src_batch = pad_sequence(src_batch, padding_value=SRC_VOCAB[PAD_TOKEN])
    trg_batch = pad_sequence(trg_batch, padding_value=TRG_VOCAB[PAD_TOKEN])

    return src_batch.transpose(0, 1), trg_batch.transpose(0, 1)

def train(model, iterator, optimizer, criterion, clip):
    model.train() # Sets the model to the tainning mode
    epoch_loss = 0 # initialize the running loss
    print(len(iterator))
    for src, trg in tqdm(iterator, desc="Training", leave=False): #Training Loop Over the Data Loader, for each iter a batch of src,trg and tqdm give progress bar
        optimizer.zero_grad() #Reset gradients

        output = model(src, trg[:, :-1]) #during training we use teacher forcing means inpur: <sos> i like coffee; outpu: <sos> i like coffee <eos>. means i want transformer to predict <eos>

        output_dim = output.shape[-1]

        output = output.contiguous().view(-1, output_dim)
        trg = trg[:, 1:].contiguous().view(-1)

        loss = criterion(output, trg)

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(iterator)

def evaluate(model, iterator, criterion):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for i, batch in enumerate(iterator):
            src, trg = batch

            output = model(src, trg[:, :-1])

            output_dim = output.shape[-1]

            output = output.contiguous().view(-1, output_dim)
            trg = trg[:, 1:].contiguous().view(-1)

            loss = criterion(output, trg)

            epoch_loss += loss.item()

    return epoch_loss / len(iterator)

def translate_sentence(sentence, src_vocab, trg_vocab, model, device, max_len=50):
    model.eval()

    tokens = [SOS_TOKEN] + tokenize_de(sentence) + [EOS_TOKEN]

    src_indexes = [src_vocab.get(token, src_vocab[UNK_TOKEN]) for token in tokens]

    src_tensor = torch.LongTensor(src_indexes).unsqueeze(0).to(device)

    src_mask = model.generate_mask(src_tensor, src_tensor)

    with torch.no_grad():
        enc_src = model.encoder_embedding(src_tensor)
        for enc_layer in model.encoder_layers:
            enc_src = enc_layer(enc_src, src_mask[0])

    trg_indexes = [trg_vocab[SOS_TOKEN]]

    for i in range(max_len):
        trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(0).to(device)

        trg_mask = model.generate_mask(src_tensor, trg_tensor)

        with torch.no_grad():
            output = model.decoder_embedding(trg_tensor)
            for dec_layer in model.decoder_layers:
                output = dec_layer(output, enc_src, src_mask[0], trg_mask[1])
            output = model.fc_out(output)

        pred_token = output.argmax(2)[:,-1].item()

        trg_indexes.append(pred_token)

        if pred_token == trg_vocab[EOS_TOKEN]:
            break

    trg_tokens = [list(trg_vocab.keys())[list(trg_vocab.values()).index(i)] for i in trg_indexes]

    return trg_tokens[1:-1]


In [44]:
# Training loop
N_EPOCHS = 10
CLIP = 1.0
BATCH_SIZE = 32

train_dataloader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_dataloader = DataLoader(val_data, batch_size=BATCH_SIZE, collate_fn=collate_fn)

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()

    train_loss = train(model, train_dataloader, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, val_dataloader, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = divmod(end_time - start_time, 60)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'transformer-translation-model.pt')

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

# Load the best model for evaluation
model.load_state_dict(torch.load('transformer-translation-model.pt'))

907


                                                           

Epoch: 01 | Time: 23.0m 10.955479383468628s
	Train Loss: 3.767 | Train PPL:  43.238
	 Val. Loss: 3.019 |  Val. PPL:  20.481
907


                                                           

Epoch: 02 | Time: 21.0m 53.172733306884766s
	Train Loss: 2.786 | Train PPL:  16.215
	 Val. Loss: 2.622 |  Val. PPL:  13.759
907


                                                           

Epoch: 03 | Time: 20.0m 51.00803804397583s
	Train Loss: 2.396 | Train PPL:  10.975
	 Val. Loss: 2.395 |  Val. PPL:  10.972
907


                                                           

Epoch: 04 | Time: 20.0m 52.148905992507935s
	Train Loss: 2.123 | Train PPL:   8.355
	 Val. Loss: 2.251 |  Val. PPL:   9.499
907


                                                           

Epoch: 05 | Time: 20.0m 50.884140729904175s
	Train Loss: 1.899 | Train PPL:   6.678
	 Val. Loss: 2.204 |  Val. PPL:   9.064
907


                                                           

Epoch: 06 | Time: 20.0m 45.63326358795166s
	Train Loss: 1.708 | Train PPL:   5.519
	 Val. Loss: 2.124 |  Val. PPL:   8.364
907


                                                           

Epoch: 07 | Time: 20.0m 47.67880916595459s
	Train Loss: 1.533 | Train PPL:   4.632
	 Val. Loss: 2.119 |  Val. PPL:   8.322
907


                                                           

Epoch: 08 | Time: 20.0m 52.00290060043335s
	Train Loss: 1.373 | Train PPL:   3.948
	 Val. Loss: 2.106 |  Val. PPL:   8.216
907


                                                           

Epoch: 09 | Time: 20.0m 51.1942675113678s
	Train Loss: 1.220 | Train PPL:   3.388
	 Val. Loss: 2.146 |  Val. PPL:   8.550
907


                                                           

Epoch: 10 | Time: 20.0m 51.257466316223145s
	Train Loss: 1.080 | Train PPL:   2.945
	 Val. Loss: 2.169 |  Val. PPL:   8.753


<All keys matched successfully>

In [45]:
# Example translations
for example_idx in range(3):  # Change the range to translate more examples
    src = test_data[example_idx]['src']
    trg = test_data[example_idx]['trg']

    print(f'Source: {" ".join(src)}')
    print(f'Target: {" ".join(trg)}')

    translation = translate_sentence(" ".join(src), SRC_VOCAB, TRG_VOCAB, model, torch.device('cpu' if torch.cuda.is_available() else 'cpu'))
    print(f'Predicted: {" ".join(translation)}')
    print()

Source: ein mann mit einem orangefarbenen hut , der etwas anstarrt .
Target: a man in an orange hat starring at something .
Predicted: a man in an orange hat taking something with something .

Source: ein boston terrier läuft über saftig-grünes gras vor einem weißen zaun .
Target: a boston terrier is running on lush green grass in front of a white fence .
Predicted: a lone rock climber runs over a white fence in front of a white fence .

Source: ein mädchen in einem karateanzug bricht ein brett mit einem tritt .
Target: a girl in karate uniform breaking a stick with a front kick .
Predicted: a girl in a karate uniform kicks a board in a board .



In [46]:
# Save the model
torch.save(model.state_dict(), 'final_transformer_translation_model.pt')
print("Model saved as 'final_transformer_translation_model.pt'")

Model saved as 'final_transformer_translation_model.pt'
