<a href="https://colab.research.google.com/github/enxo7899/INM706-MachineTranslation/blob/main/INM706_Seq2Seq_Machine_Translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install sacremoses
!pip install wandb

Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/897.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m890.9/897.5 kB[0m [31m32.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m23.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sacremoses
Successfully installed sacremoses-0.1.1
Collecting wandb
  Downloading wandb-0.17.0-py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.7/6.7 MB[0m [31m55.0 MB/s[0m eta [36m0:00:00[0m
Collecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting gitpython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.43-py3-none-a

In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import MarianTokenizer, MarianMTModel
import torch.nn as nn
import torch.optim as optim
import random
import math
import time
import wandb
import os

# Set the notebook name
os.environ["WANDB_NOTEBOOK_NAME"] = "INM706-Seq2Seq_Machine_Translation.ipynb"

# Login with the API KEY
wandb.login(key="9ce954fd827fd8d839648cb3708ff788ad51bafa")

# Initialize wandb run
wandb.init(project='Translator', name='English-Albanian')

# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

# Load the dataset
with open('GlobalVoices.en-sq.en', 'r', encoding='utf-8') as f:
    en_sentences = f.readlines()
with open('GlobalVoices.en-sq.sq', 'r', encoding='utf-8') as f:
    sq_sentences = f.readlines()

# Verify dataset loaded correctly
print(f"English sentences sample: {en_sentences[:5]}")
print(f"Albanian sentences sample: {sq_sentences[:5]}")
print(f"Total number of sentence pairs: {len(en_sentences)}")

# Use MarianTokenizer for tokenization
tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-sq')

class TranslationDataset(Dataset):
    def __init__(self, src_sentences, trg_sentences, tokenizer, max_length=128):
        self.src_sentences = src_sentences
        self.trg_sentences = trg_sentences
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.src_sentences)

    def __getitem__(self, idx):
        src = self.src_sentences[idx]
        trg = self.trg_sentences[idx]

        src_enc = self.tokenizer.encode_plus(
            src,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        trg_enc = self.tokenizer.encode_plus(
            trg,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'src': src_enc['input_ids'].squeeze(),
            'src_mask': src_enc['attention_mask'].squeeze(),
            'trg': trg_enc['input_ids'].squeeze(),
            'trg_mask': trg_enc['attention_mask'].squeeze()
        }

# Create the dataset objects
dataset = TranslationDataset(en_sentences, sq_sentences, tokenizer)

# Split the dataset into train and validation sets (90% train, 10% validation)
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

# Create DataLoader objects
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

print("Data preprocessing complete.")

# Define the Seq2Seq model components
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hidden_dim, n_layers, dropout=dropout, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, hidden_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        embedded = self.dropout(self.embedding(src))
        outputs, (hidden, cell) = self.rnn(embedded)
        hidden = torch.tanh(self.fc(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)))
        hidden = hidden.unsqueeze(0).repeat(2, 1, 1)
        cell = cell[-2:].contiguous()
        return outputs, hidden, cell

class Attention(nn.Module):
    def __init__(self, hidden_dim):
        super().__init__()
        self.attn = nn.Linear(hidden_dim * 3, hidden_dim)
        self.v = nn.Parameter(torch.rand(hidden_dim))

    def forward(self, hidden, encoder_outputs):
        src_len = encoder_outputs.shape[0]
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))
        attention = torch.sum(self.v * energy, dim=2)
        return torch.softmax(attention, dim=1)

class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hidden_dim, n_layers, dropout, attention):
        super().__init__()
        self.output_dim = output_dim
        self.attention = attention
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.LSTM((hidden_dim * 2) + emb_dim, hidden_dim, n_layers, dropout=dropout)
        self.fc_out = nn.Linear((hidden_dim * 2) + hidden_dim + emb_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell, encoder_outputs):
        input = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(input))
        a = self.attention(hidden[-1], encoder_outputs)
        a = a.unsqueeze(1)
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        weighted = torch.bmm(a, encoder_outputs)
        weighted = weighted.permute(1, 0, 2)
        rnn_input = torch.cat((embedded, weighted), dim=2)
        output, (hidden, cell) = self.rnn(rnn_input, (hidden, cell))
        embedded = embedded.squeeze(0)
        output = output.squeeze(0)
        weighted = weighted.squeeze(0)
        prediction = self.fc_out(torch.cat((output, weighted, embedded), dim=1))
        return prediction, hidden, cell

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        trg_len = trg.shape[0]
        batch_size = trg.shape[1]
        trg_vocab_size = self.decoder.output_dim
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
        encoder_outputs, hidden, cell = self.encoder(src)
        input = trg[0,:]
        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(input, hidden, cell, encoder_outputs)
            outputs[t] = output
            top1 = output.argmax(1)
            input = trg[t] if random.random() < teacher_forcing_ratio else top1
        return outputs

# Model hyperparameters
INPUT_DIM = tokenizer.vocab_size
OUTPUT_DIM = tokenizer.vocab_size
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

# Initialize wandb configuration
wandb.config.update({
    "learning_rate": 1e-3,
    "epochs": 10,
    "batch_size": 32,
    "encoder_embedding_dim": ENC_EMB_DIM,
    "decoder_embedding_dim": DEC_EMB_DIM,
    "hidden_dim": HID_DIM,
    "num_layers": N_LAYERS,
    "encoder_dropout": ENC_DROPOUT,
    "decoder_dropout": DEC_DROPOUT
})

# Initialize encoder, attention, decoder, and seq2seq model
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT).to(device)
attn = Attention(HID_DIM).to(device)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT, attn).to(device)
model = Seq2Seq(enc, dec, device).to(device)

# Loss and optimizer
optimizer = optim.Adam(model.parameters(), lr=wandb.config.learning_rate)
TRG_PAD_IDX = tokenizer.pad_token_id
criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX)

def train(model, iterator, optimizer, criterion, clip):
    model.train()
    epoch_loss = 0
    epoch_acc = 0
    for i, batch in enumerate(iterator):
        src = batch['src'].T.to(device)
        trg = batch['trg'].T.to(device)
        optimizer.zero_grad()
        output = model(src, trg)
        output_dim = output.shape[-1]
        output = output[1:].view(-1, output_dim)
        trg = trg[1:].reshape(-1)
        loss = criterion(output, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()

        # Calculate accuracy
        preds = output.argmax(1)
        non_pad_elements = (trg != TRG_PAD_IDX).nonzero().squeeze()
        correct = preds[non_pad_elements].eq(trg[non_pad_elements]).sum().item()
        acc = correct / len(non_pad_elements)
        epoch_acc += acc

        # Log metrics to wandb
        wandb.log({"batch_loss": loss.item(), "batch_accuracy": acc})

        # Print some batches
        if i % 10 == 0:
            print(f'Batch {i} | Loss: {loss.item():.3f} | Accuracy: {acc:.3f}')

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, iterator, criterion):
    model.eval()
    epoch_loss = 0
    epoch_acc = 0
    with torch.no_grad():
        for i, batch in enumerate(iterator):
            src = batch['src'].T.to(device)
            trg = batch['trg'].T.to(device)
            output = model(src, trg, 0)
            output_dim = output.shape[-1]
            output = output[1:].view(-1, output_dim)
            trg = trg[1:].reshape(-1)
            loss = criterion(output, trg)
            epoch_loss += loss.item()

            # Calculate accuracy
            preds = output.argmax(1)
            non_pad_elements = (trg != TRG_PAD_IDX).nonzero().squeeze()
            correct = preds[non_pad_elements].eq(trg[non_pad_elements]).sum().item()
            acc = correct / len(non_pad_elements)
            epoch_acc += acc

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

N_EPOCHS = wandb.config.epochs
CLIP = 1

for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss, train_acc = train(model, train_loader, optimizer, criterion, CLIP)
    valid_loss, valid_acc = evaluate(model, val_loader, criterion)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f} | Train Acc: {train_acc:.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f} |  Val. Acc: {valid_acc:.3f}')

    # Log epoch metrics to wandb
    wandb.log({"train_loss": train_loss, "train_accuracy": train_acc,
               "valid_loss": valid_loss, "valid_accuracy": valid_acc,
               "epoch": epoch + 1, "epoch_time_mins": epoch_mins, "epoch_time_secs": epoch_secs})


[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33menxo7899[0m ([33mem-city[0m). Use [1m`wandb login --relogin`[0m to force relogin


Using device: cuda
English sentences sample: ['South Korea: North Korean Dictator, Kim Jong Il Is Dead · Global Voices\n', 'Kim Jong Il, the North Korean dictator who ruled the hermit kingdom for the past three decades, has died at the age of 69.\n', 'According to North Korean state television\'s official report on Monday, Kim passed away from "mental and physical strain" during a train ride on December 17, 2011.\n', 'The South Korean Twittersphere erupted with various responses.\n', "Although the death of one of the world's most notorious dictators is something people might welcome, most South Koreans have expressed concern about the instability his sudden death might bring to Korean peninsula.\n"]
Albanian sentences sample: ['Kore: Vdes diktatori koreano-verior, Kim Jong Il\n', 'Kim Jong Il, diktatori koreano-verior, i cili sundoi me mbretërinë e izoluar gjatë tre dekadave të kaluara, vdiq në moshën 69 vjeçare.\n', 'Sipas lajmit zyrtar të emituar ditën e hënë në televizionin shtetëro

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/805k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/822k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

Data preprocessing complete.
Batch 0 | Loss: 11.030 | Accuracy: 0.000
Batch 10 | Loss: 6.200 | Accuracy: 0.077
Batch 20 | Loss: 6.034 | Accuracy: 0.084
Batch 30 | Loss: 5.932 | Accuracy: 0.082
Batch 40 | Loss: 6.256 | Accuracy: 0.063
Batch 50 | Loss: 5.770 | Accuracy: 0.095
Batch 60 | Loss: 5.713 | Accuracy: 0.093
Batch 70 | Loss: 5.755 | Accuracy: 0.098
Batch 80 | Loss: 5.541 | Accuracy: 0.114
Batch 90 | Loss: 5.758 | Accuracy: 0.102
Batch 100 | Loss: 5.529 | Accuracy: 0.112
Batch 110 | Loss: 5.575 | Accuracy: 0.113
Batch 120 | Loss: 5.692 | Accuracy: 0.090
Batch 130 | Loss: 5.780 | Accuracy: 0.101
Batch 140 | Loss: 5.778 | Accuracy: 0.083
Batch 150 | Loss: 5.531 | Accuracy: 0.121
Batch 160 | Loss: 5.422 | Accuracy: 0.111
Epoch: 01 | Time: 3m 23s
	Train Loss: 5.896 | Train PPL: 363.687 | Train Acc: 0.091
	 Val. Loss: 5.980 |  Val. PPL: 395.319 |  Val. Acc: 0.069
Batch 0 | Loss: 5.687 | Accuracy: 0.101
Batch 10 | Loss: 5.707 | Accuracy: 0.102
Batch 20 | Loss: 5.445 | Accuracy: 0.120
Ba

In [5]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import MarianTokenizer, MarianMTModel
import torch.nn as nn
import torch.optim as optim
import random
import math
import time
import wandb
import os
from torchtext.data.metrics import bleu_score

# Set the notebook name
os.environ["WANDB_NOTEBOOK_NAME"] = "INM706-Seq2Seq_Machine_Translation.ipynb"

# Login with the API KEY
wandb.login(key="9ce954fd827fd8d839648cb3708ff788ad51bafa")

# Initialize wandb run
wandb.init(project='Translator', name='English-Albanian')

# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

# Load the dataset
with open('GlobalVoices.en-sq.en', 'r', encoding='utf-8') as f:
    en_sentences = f.readlines()
with open('GlobalVoices.en-sq.sq', 'r', encoding='utf-8') as f:
    sq_sentences = f.readlines()

# Verify dataset loaded correctly
print(f"English sentences sample: {en_sentences[:5]}")
print(f"Albanian sentences sample: {sq_sentences[:5]}")
print(f"Total number of sentence pairs: {len(en_sentences)}")

# Use MarianTokenizer for tokenization
tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-sq')

class TranslationDataset(Dataset):
    def __init__(self, src_sentences, trg_sentences, tokenizer, max_length=128):
        self.src_sentences = src_sentences
        self.trg_sentences = trg_sentences
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.src_sentences)

    def __getitem__(self, idx):
        src = self.src_sentences[idx]
        trg = self.trg_sentences[idx]

        src_enc = self.tokenizer.encode_plus(
            src,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        trg_enc = self.tokenizer.encode_plus(
            trg,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'src': src_enc['input_ids'].squeeze(),
            'src_mask': src_enc['attention_mask'].squeeze(),
            'trg': trg_enc['input_ids'].squeeze(),
            'trg_mask': trg_enc['attention_mask'].squeeze()
        }

# Create the dataset objects
dataset = TranslationDataset(en_sentences, sq_sentences, tokenizer)

# Split the dataset into train and validation sets (90% train, 10% validation)
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

# Create DataLoader objects
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

print("Data preprocessing complete.")

class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hidden_dim, n_layers, dropout=dropout, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, hidden_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        embedded = self.dropout(self.embedding(src))
        outputs, (hidden, cell) = self.rnn(embedded)

        hidden = torch.tanh(self.fc(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)))

        hidden = hidden.unsqueeze(0).repeat(2, 1, 1)
        cell = cell[-2:].contiguous()

        return outputs, hidden, cell

class Attention(nn.Module):
    def __init__(self, hidden_dim):
        super().__init__()
        self.attn = nn.Linear(hidden_dim * 3, hidden_dim)
        self.v = nn.Parameter(torch.rand(hidden_dim))

    def forward(self, hidden, encoder_outputs):
        src_len = encoder_outputs.shape[0]
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))
        attention = torch.sum(self.v * energy, dim=2)
        return torch.softmax(attention, dim=1)

class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hidden_dim, n_layers, dropout, attention):
        super().__init__()
        self.output_dim = output_dim
        self.attention = attention
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.LSTM((hidden_dim * 2) + emb_dim, hidden_dim, n_layers, dropout=dropout)
        self.fc_out = nn.Linear((hidden_dim * 2) + hidden_dim + emb_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell, encoder_outputs):
        input = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(input))
        a = self.attention(hidden[-1], encoder_outputs)
        a = a.unsqueeze(1)
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        weighted = torch.bmm(a, encoder_outputs)
        weighted = weighted.permute(1, 0, 2)
        rnn_input = torch.cat((embedded, weighted), dim=2)
        output, (hidden, cell) = self.rnn(rnn_input, (hidden, cell))
        embedded = embedded.squeeze(0)
        output = output.squeeze(0)
        weighted = weighted.squeeze(0)
        prediction = self.fc_out(torch.cat((output, weighted, embedded), dim=1))

        return prediction, hidden, cell

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        trg_len = trg.shape[0]
        batch_size = trg.shape[1]
        trg_vocab_size = self.decoder.output_dim
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
        encoder_outputs, hidden, cell = self.encoder(src)

        input = trg[0, :]

        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(input, hidden, cell, encoder_outputs)
            outputs[t] = output
            top1 = output.argmax(1)
            input = trg[t] if random.random() < teacher_forcing_ratio else top1

        return outputs

# Model hyperparameters
INPUT_DIM = tokenizer.vocab_size
OUTPUT_DIM = tokenizer.vocab_size
ENC_EMB_DIM = 512
DEC_EMB_DIM = 512
HID_DIM = 1024
N_LAYERS = 2
ENC_DROPOUT = 0.3
DEC_DROPOUT = 0.3

wandb.config.update({
    "learning_rate": 1e-3,
    "epochs": 30,
    "batch_size": 64,
    "encoder_embedding_dim": ENC_EMB_DIM,
    "decoder_embedding_dim": DEC_EMB_DIM,
    "hidden_dim": HID_DIM,
    "num_layers": N_LAYERS,
    "encoder_dropout": ENC_DROPOUT,
    "decoder_dropout": DEC_DROPOUT
})

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT).to(device)
attn = Attention(HID_DIM).to(device)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT, attn).to(device)
model = Seq2Seq(enc, dec, device).to(device)

optimizer = optim.Adam(model.parameters(), lr=wandb.config.learning_rate)
TRG_PAD_IDX = tokenizer.pad_token_id
criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX)

from torch.cuda.amp import GradScaler, autocast

scaler = GradScaler()

def train(model, iterator, optimizer, criterion, clip, accum_steps=2):
    model.train()
    epoch_loss = 0
    epoch_acc = 0
    optimizer.zero_grad()

    for i, batch in enumerate(iterator):
        src = batch['src'].T.to(device)
        trg = batch['trg'].T.to(device)

        with autocast():
            output = model(src, trg)
            output_dim = output.shape[-1]
            output = output[1:].view(-1, output_dim)
            trg = trg[1:].reshape(-1)
            loss = criterion(output, trg) / accum_steps

        scaler.scale(loss).backward()

        if (i + 1) % accum_steps == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

        epoch_loss += loss.item() * accum_steps

        preds = output.argmax(1)
        non_pad_elements = (trg != TRG_PAD_IDX).nonzero().squeeze()
        correct = preds[non_pad_elements].eq(trg[non_pad_elements]).sum().item()
        acc = correct / len(non_pad_elements)
        epoch_acc += acc

        wandb.log({"batch_loss": loss.item() * accum_steps, "batch_accuracy": acc})

        if i % 10 == 0:
            print(f'Batch {i} | Loss: {loss.item() * accum_steps:.3f} | Accuracy: {acc:.3f}')

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, iterator, criterion):
    model.eval()
    epoch_loss = 0
    epoch_acc = 0
    all_preds = []
    all_trgs = []
    with torch.no_grad():
        for i, batch in enumerate(iterator):
            src = batch['src'].T.to(device)
            trg = batch['trg'].T.to(device)
            output = model(src, trg, 0)
            output_dim = output.shape[-1]
            output = output[1:].view(-1, output_dim)
            trg = trg[1:].reshape(-1)
            loss = criterion(output, trg)
            epoch_loss += loss.item()

            preds = output.argmax(1)
            non_pad_elements = (trg != TRG_PAD_IDX).nonzero().squeeze()
            correct = preds[non_pad_elements].eq(trg[non_pad_elements]).sum().item()
            acc = correct / len(non_pad_elements)
            epoch_acc += acc

            all_preds.append(preds.cpu().numpy())
            all_trgs.append(trg.cpu().numpy())

    all_preds = [list(map(str, sent)) for sent in all_preds]
    all_trgs = [list(map(str, sent)) for sent in all_trgs]
    bleu = bleu_score(all_preds, [[trg] for trg in all_trgs])

    return epoch_loss / len(iterator), epoch_acc / len(iterator), bleu

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

if not os.path.exists('checkpoints'):
    os.makedirs('checkpoints')

N_EPOCHS = wandb.config.epochs
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss, train_acc = train(model, train_loader, optimizer, criterion, CLIP)
    valid_loss, valid_acc, bleu = evaluate(model, val_loader, criterion)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), f'checkpoints/seq2seq_model_epoch{epoch+1}.pt')

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f} | Train Acc: {train_acc:.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f} |  Val. Acc: {valid_acc:.3f} |  Val. BLEU: {bleu:.3f}')

    wandb.log({"train_loss": train_loss, "train_accuracy": train_acc,
               "valid_loss": valid_loss, "valid_accuracy": valid_acc, "valid_bleu": bleu,
               "epoch": epoch + 1, "epoch_time_mins": epoch_mins, "epoch_time_secs": epoch_secs})

wandb.finish()


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Using device: cuda
English sentences sample: ['South Korea: North Korean Dictator, Kim Jong Il Is Dead · Global Voices\n', 'Kim Jong Il, the North Korean dictator who ruled the hermit kingdom for the past three decades, has died at the age of 69.\n', 'According to North Korean state television\'s official report on Monday, Kim passed away from "mental and physical strain" during a train ride on December 17, 2011.\n', 'The South Korean Twittersphere erupted with various responses.\n', "Although the death of one of the world's most notorious dictators is something people might welcome, most South Koreans have expressed concern about the instability his sudden death might bring to Korean peninsula.\n"]
Albanian sentences sample: ['Kore: Vdes diktatori koreano-verior, Kim Jong Il\n', 'Kim Jong Il, diktatori koreano-verior, i cili sundoi me mbretërinë e izoluar gjatë tre dekadave të kaluara, vdiq në moshën 69 vjeçare.\n', 'Sipas lajmit zyrtar të emituar ditën e hënë në televizionin shtetëro

VBox(children=(Label(value='0.002 MB of 0.012 MB uploaded\r'), FloatProgress(value=0.17145983119724914, max=1.…

0,1
batch_accuracy,▁▂▃▄▄▄▃▃▅▅▅▃▄▃▃▅▅▆▇▆▆▅▆▆▅▇▆▆▇▆▆▆▇▅▆▆█▇▇▇
batch_loss,█▆▅▅▅▅▆▅▄▃▄▅▄▄▄▄▄▃▃▃▃▃▃▃▄▃▂▃▂▂▂▃▂▃▂▂▁▁▂▂
epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
epoch_time_mins,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch_time_secs,▃▃▃▃▆███▃▃▆▆▃▃▁▁▁▃▁▃▁▃▁▃▃▃▃▃▃▃
train_accuracy,▁▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇█████
train_loss,█▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁
valid_accuracy,▃▁▅▃▄▂▂▄▄▆▅▇▆▆▆▅█▄▇▅▇▄▅▆▇▇▇▆▇▆
valid_bleu,▁▂▁▂▃▄▅▃▃▆▄▅▅▅▆▅▄█▆▆▆▆▆▇▇█▇█▇█
valid_loss,▅▅▄▄▄█▆▃▄▃▃▂▂▂▂▂▁▂▁▂▁▂▂▂▁▂▁▂▂▂

0,1
batch_accuracy,0.22593
batch_loss,4.54598
epoch,30.0
epoch_time_mins,5.0
epoch_time_secs,7.0
train_accuracy,0.2142
train_loss,4.31009
valid_accuracy,0.07438
valid_bleu,0.00717
valid_loss,5.9134


Inference

In [None]:
def translate_sentence(sentence, tokenizer, model, device, max_len=50):
    model.eval()

    # Tokenize the sentence
    tokens = tokenizer.encode(sentence, return_tensors='pt', max_length=max_len, truncation=True, padding='max_length').to(device)

    # Perform inference
    with torch.no_grad():
        encoder_outputs, hidden, cell = model.encoder(tokens.T)

    # Prepare the input and output tensors
    trg_indexes = [tokenizer.pad_token_id]
    trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(1).to(device)

    for i in range(max_len):
        with torch.no_grad():
            output, hidden, cell = model.decoder(trg_tensor[-1], hidden, cell, encoder_outputs)

        pred_token = output.argmax(1).item()
        trg_indexes.append(pred_token)

        if pred_token == tokenizer.pad_token_id:
            break

        trg_tensor = torch.cat((trg_tensor, torch.LongTensor([pred_token]).unsqueeze(1).to(device)), dim=0)

    trg_tokens = tokenizer.decode(trg_indexes, skip_special_tokens=True)
    return trg_tokens

# Example usage:
src_sentence = "How are you?"
translated_sentence = translate_sentence(src_sentence, tokenizer, model, device)
print(f"Translated Sentence: {translated_sentence}")


Translated Sentence: ter: Mbëëëëëëëëëëëëëëëëëëëëëëëëëëëëëëëëëëëëëëëëëëëëëë
