In [1]:
import torch
import spacy
from torchtext.data.metrics import bleu_score
from torchtext.data import Field, BucketIterator
import torch.nn as nn
from torchtext.data import TabularDataset
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from datasets import load_dataset
import pandas as pd
import wandb
import time

In [2]:
en_nlp = spacy.load("en_core_web_sm")
ro_nlp = spacy.load("ro_core_news_sm")

In [3]:
def tokenize_en(text):
    return [tok.text for tok in en_nlp(text)]

def tokenize_ro(text):
    return [tok.text for tok in ro_nlp(text)]

source_language = Field(tokenize_en, init_token='<sos>', eos_token='<eos>', lower=True)
target_language = Field(tokenize_ro, init_token='<sos>', eos_token='<eos>', lower=True)

fields = [('english', source_language), ('romanian', target_language)]

In [4]:
# dataset =load_dataset("tatoeba", lang1="en", lang2="ro")
# data = [(item['translation']['en'], item['translation']['ro']) for item in dataset['train']]
# train_data, temp_data = train_test_split(data, test_size=0.2, random_state=42)
# val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

train_data, val_data, test_data = [], [], []
train_data_path = "data/train_data.csv"
val_data_path = "data/val_data.csv"
test_data_path = "data/test_data.csv"


train_data = TabularDataset(
    path=train_data_path, format='csv',
    fields=fields, skip_header=True 
)

val_data = TabularDataset(
    path=val_data_path, format='csv',
    fields=fields, skip_header=True
)

test_data = TabularDataset(
    path=test_data_path, format='csv',
    fields=fields, skip_header=True
)

print(f"Train: {len(train_data)}, Val: {len(val_data)}, Test: {len(test_data)}")
print("Sample data:")
print("Train sample:", train_data.examples[0].english, train_data.examples[0].romanian)
print("Validation sample:", val_data.examples[0].english, val_data.examples[0].romanian)
print("Test sample:", test_data.examples[0].english, test_data.examples[0].romanian)


Train: 12145, Val: 1518, Test: 1519
Sample data:
Train sample: ['he', 'came', 'at', 'about', 'two', "o'clock."] ['el', 'a', 'venit', 'aproximativ', 'pe', 'la', 'două.']
Validation sample: ['no', "i'm", 'not;', 'you', 'are!'] ['nu,', 'nu', 'sunt;', 'tu', 'ești!']
Test sample: ['i', 'think', 'i', 'like', 'eating', 'white', 'rice', 'better', 'than', 'brown', 'rice.'] ['cred', 'că-mi', 'place', 'mai', 'mult', 'să', 'mănânc', 'orez', 'alb', 'decât', 'orez', 'brun.']


In [5]:
source_language.build_vocab(train_data, max_size=10000, min_freq=2)
target_language.build_vocab(train_data, max_size=10000, min_freq=2)

print(f"Source vocabulary size: {len(source_language.vocab)}")
print(f"Target vocabulary size: {len(target_language.vocab)}")

# show some vocabulary
print("Source vocabulary sample:", list(source_language.vocab.stoi.items())[:10])
print("Target vocabulary sample:", list(target_language.vocab.stoi.items())[:10])

Source vocabulary size: 4752
Target vocabulary size: 5450
Source vocabulary sample: [('<unk>', 0), ('<pad>', 1), ('<sos>', 2), ('<eos>', 3), ('the', 4), ('i', 5), ('to', 6), ('a', 7), ('is', 8), ('you', 9)]
Target vocabulary sample: [('<unk>', 0), ('<pad>', 1), ('<sos>', 2), ('<eos>', 3), ('de', 4), ('să', 5), ('nu', 6), ('a', 7), ('este', 8), ('tom', 9)]


In [None]:
class Encoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, num_layers, dropout_rate):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.dropout = nn.Dropout(dropout_rate)
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.lstm = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=dropout_rate)

    def forward(self, x):
        #shape (x) = (seq_len, batch_size)
        embedding =  self.dropout(self.embedding(x))
        #shape (embedding) = (seq_len, batch_size, embedding_size)

        _, (hidden, cell) = self.lstm(embedding)
         
        return hidden, cell

class Decoder(nn.Module):
    def __init__(self, output_size, embedding_size, hidden_size, num_layers, dropout_rate):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.dropout = nn.Dropout(dropout_rate)
        self.embedding = nn.Embedding(output_size, embedding_size)
        self.lstm = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=dropout_rate)
        self.fc = nn.Linear(hidden_size, output_size)
        
    def forward(self, x, hidden, cell):
        #shape (x) = (N)
        x= x.unsqueeze(0) # Add sequence dimension (1, N)

        embedding = self.dropout(self.embedding(x))
        #shape (embedding) = (1, N, embedding_size)
        outputs, (hidden, cell) = self.lstm(embedding, (hidden, cell))
        #shape (outputs) = (1, N, hidden_size)
        predictions = self.fc(outputs)
        #shape (predictions) = (1, N, output_size)
        predictions = predictions.squeeze(0)
        #shape (predictions) = (N, output_size)

        return predictions, hidden, cell

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teacher_force_ratio = 0.5):
        batch_size = source.shape[1]
        # shape (source) = (seq_len, batch_size)
        target_len = target.shape[0]
        target_vocab_size = len(target_language.vocab)

        outputs = torch.zeros(target_len, batch_size, target_vocab_size).to(device)
        # hidden, cell = self.encoder(source)
        (hidden, cell), encoder_outputs = self.encoder(source)

        # first token
        x = target[0]

        for t in range(1, target_len):
            # output, hidden, cell = self.decoder(x, hidden, cell)
            output, hidden, cell = self.decoder(x, hidden, cell, encoder_outputs)
            # shape (output) = (batch_size, target_vocab_size)
            outputs[t] = output

            # output shape (output) = (batch_size, target_vocab_size)
            best_guess = output.argmax(1)
            x = target[t] if torch.rand(1) < teacher_force_ratio else best_guess
        
        return outputs
    
class CnnEncoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, encoder_num_layers, dropout_rate, decoder_num_layers, max_seq_len=100, kernel_size=3):
        super(CnnEncoder, self).__init__()
        self.hidden_size = hidden_size
        self.encoder_num_layers = encoder_num_layers
        self.decoder_num_layers = decoder_num_layers

        self.dropout = nn.Dropout(dropout_rate)
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.pos_embedding = nn.Embedding(max_seq_len, embedding_size)

        self.conv_layers = nn.ModuleList([
            nn.Conv1d(in_channels=embedding_size if i == 0 else hidden_size,
                       out_channels=hidden_size, kernel_size=kernel_size, padding=kernel_size//2)
            for i in range(encoder_num_layers)
        ])

        self.fc_hidden = nn.Linear(hidden_size, decoder_num_layers * hidden_size)
        self.fc_cell = nn.Linear(hidden_size, decoder_num_layers * hidden_size)
    def forward(self, x):
        # shape (x) = (seq_len, batch_size)
        embeddings = self.dropout(self.embedding(x))
        # shape (embeddings) = (seq_len, batch_size, embedding_size)

        positions = torch.arange(0, x.size(0), device=x.device).unsqueeze(1).repeat(1, x.size(1))
        pos_embeddings = self.dropout(self.pos_embedding(positions))
        # shape (pos_embeddings) = (seq_len, batch_size, embedding_size)

        embeddings = embeddings + pos_embeddings
        # shape (embeddings) = (seq_len, batch_size, embedding_size)

        embeddings = embeddings.permute(1, 2, 0)
        # shape (embeddings) = (batch_size, embedding_size, seq_len)

        conv_input = embeddings
        for i, conv in enumerate(self.conv_layers):
            conv_output = torch.relu(conv(conv_input))
            # shape (conv_output) = (batch_size, hidden_size, seq_len)

            #residual connection
            if i > 0:
                conv_output = self.dropout(conv_output) + conv_input
            else:
                conv_output = self.dropout(conv_output)
            conv_input = conv_output
            # shape (conv_output) = (batch_size, hidden_size, seq_len)
        
        '''This part is commented out because it was the original implementation, without attention.'''
        # # get rid of the sequence dimension
        # conv_output = conv_output.mean(dim=2)
        # # shape (conv_output) = (batch_size, hidden_size)

        # hidden = self.fc_hidden(conv_output)
        # cell = self.fc_cell(conv_output)

        # # reshape for lstm
        # hidden = hidden.view(self.decoder_num_layers, -1, self.hidden_size)
        # cell = cell.view(self.decoder_num_layers, -1, self.hidden_size)

        conv_output_perm = conv_output.permute(2, 0, 1)
        # shape (conv_output_perm) = (seq_len, batch_size, hidden_size)
        conv_output_mean = conv_output_perm.mean(dim=0)
        # shape (conv_output_mean) = (batch_size, hidden_size)

        hidden = self.fc_hidden(conv_output_mean)
        cell = self.fc_cell(conv_output_mean)

        # reshape for lstm
        hidden = hidden.view(self.decoder_num_layers, -1, self.hidden_size)
        cell = cell.view(self.decoder_num_layers, -1, self.hidden_size)
    
        return (hidden, cell), conv_output_perm

class Attention(nn.Module): # Luong concatenation attention 
    def __init__(self, encoder_hidden_size, decoder_hidden_size):
        super().__init__()
        self.attn = nn.Linear(encoder_hidden_size + decoder_hidden_size, decoder_hidden_size)
        self.v = nn.Linear(decoder_hidden_size, 1, bias=False)

    def forward(self, hidden, encoder_outputs):
        # hidden: [num_layers, batch_size, decoder_hidden_size]
        # encoder_outputs: [src_len, batch_size, encoder_hidden_size]

        batch_size = encoder_outputs.shape[1]
        src_len = encoder_outputs.shape[0]

        # hidden_last_layer shape: [1, batch_size, decoder_hidden_size]
        hidden_last_layer = hidden[-1, :, :].unsqueeze(0) 

        # repeat hidden_last_layer for each step in the source sequence
        # hidden_repeated shape: [src_len, batch_size, decoder_hidden_size]
        hidden_repeated = hidden_last_layer.repeat(src_len, 1, 1)

        # hidden_repeated + encoder_outputs
        # combined shape: [src_len, batch_size, encoder_hidden_size + decoder_hidden_size]
        combined = torch.cat((hidden_repeated, encoder_outputs), dim=2)

        # calculate energy
        # energy shape: [src_len, batch_size, decoder_hidden_size]
        energy = torch.tanh(self.attn(combined))

        # project energy vector to a single score
        # attention shape: [src_len, batch_size, 1]
        attention = self.v(energy)
        
        # attention shape: [src_len, batch_size]
        attention = attention.squeeze(2)

        # attention_weights shape: [src_len, batch_size]
        attention_weights = F.softmax(attention, dim=0)

        return attention_weights

class AttentionDecoder(nn.Module):
    def __init__(self, output_size, embedding_size, hidden_size, num_layers, dropout_rate, encoder_hidden_size):
        super(AttentionDecoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.encoder_hidden_size = encoder_hidden_size 

        self.dropout = nn.Dropout(dropout_rate)
        self.embedding = nn.Embedding(output_size, embedding_size)
        self.lstm = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=dropout_rate)
    
        self.attention = Attention(encoder_hidden_size, hidden_size)
        self.fc = nn.Linear(hidden_size + encoder_hidden_size, output_size)

    def forward(self, x, hidden, cell, encoder_outputs):
        # x: (N) - tokenul curent (batch de tokenuri)
        # hidden: (num_layers, N, hidden_size)
        # cell: (num_layers, N, hidden_size)
        # encoder_outputs: (src_len, N, encoder_hidden_size) 

        x = x.unsqueeze(0) # shape (1, N)

        embedding = self.dropout(self.embedding(x))
        # embedding shape: (1, N, embedding_size)

        outputs, (hidden, cell) = self.lstm(embedding, (hidden, cell))
        attention_weights = self.attention(hidden, encoder_outputs)

        # apply attention_weights to encoder_outputs to get context vector
        # encoder_outputs shape: (src_len, N, encoder_hidden_size)
        # attention_weights shape: (src_len, N)
        # context_vector shape: (N, encoder_hidden_size)
        context_vector = torch.sum(attention_weights.unsqueeze(2) * encoder_outputs, dim=0)

        # combine the LSTM output and context vector
        # outputs.squeeze(0) shape: (N, hidden_size)
        # context_vector shape: (N, encoder_hidden_size)
        combined_input = torch.cat((outputs.squeeze(0), context_vector), dim=1)
        # combined_input shape: (N, hidden_size + encoder_hidden_size)

        predictions = self.fc(combined_input)
        # predictions shape: (N, output_size)

        return predictions, hidden, cell

In [7]:
num_epochs = 10
lr = 0.001
batch_size = 64

load_model = False
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
input_size_encoder = len(source_language.vocab)
input_size_decoder = len(target_language.vocab)
output_size  = len(target_language.vocab)
embedding_size = 256
hidden_size = 1024
num_layers = 2
enc_dropout_rate = 0.5
dec_dropout_rate = 0.5
cnn_encoder_layers = 5
max_seq_len = 100


train_iterator, val_iterator, test_iterator = BucketIterator.splits(
    (train_data, val_data, test_data),
    batch_size=batch_size,
    sort_within_batch=True,
    sort_key=lambda x: len(x.english),
    device=device
)

# encoder = Encoder(input_size_encoder, embedding_size, hidden_size, num_layers, enc_dropout_rate).to(device)
# decoder = Decoder(output_size, embedding_size, hidden_size, num_layers, dec_dropout_rate).to(device)
cnn_encoder = CnnEncoder(input_size_encoder, embedding_size, hidden_size, cnn_encoder_layers, enc_dropout_rate, num_layers, max_seq_len).to(device)
attn_decoder = AttentionDecoder(output_size, embedding_size, hidden_size, num_layers, dec_dropout_rate, hidden_size).to(device)
model = Seq2Seq(cnn_encoder, attn_decoder).to(device)




pad_idx = target_language.vocab.stoi['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)



In [8]:
def train_model(model, train_iterator, val_iterator, optimizer, criterion, source_language, target_language, num_epochs):
    wandb.init(project="nmt_training")

    for epoch in range(num_epochs):
        model.train()
        epoch_loss = 0

        for i, batch in enumerate(train_iterator):
            source = batch.english
            target = batch.romanian
            source, target = source.to(device), target.to(device)

            output = model(source, target)
             # output shape (target_len, batch_size, output_size)
            # target shape (target_len, batch_size)

            #ignore the first token in target (sos)
            output = output[1:].reshape(-1, output.shape[2])
            target = target[1:].reshape(-1)

            optimizer.zero_grad()
            loss = criterion(output, target)
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            epoch_loss += loss.item()

        valid_loss, valid_bleu = evaluate(model, val_iterator, criterion, source_language, target_language)
        avg_train_loss = epoch_loss / len(train_iterator)

        print(f'Epoch: {epoch+1:02} | Train Loss: {avg_train_loss:.3f} | Val Loss: {valid_loss:.3f} | BLEU: {valid_bleu:.2f}')

        wandb.log({
            "train_loss": avg_train_loss,
            "val_loss": valid_loss,
            "val_bleu": valid_bleu,
            "epoch": epoch + 1
        })

def evaluate(model, iterator, criterion, source_language, target_language):
    model.eval()
    epoch_loss = 0
    all_predictions = []
    all_targets = []

    with torch.no_grad():
        for batch in iterator:
            source = batch.english.to(device)
            target = batch.romanian.to(device)

            output = model(source, target, teacher_force_ratio=0.0)
            output_dim = output.shape[-1]

            # output: (target_len, batch_size, output_dim)
            # target: (target_len, batch_size)
            output_tokens = output.argmax(2)  # shape: (target_len, batch_size)

            for i in range(target.shape[1]): 
                pred_tokens = output_tokens[1:, i]  # skip <sos>
                trg_tokens = target[1:, i]

                # Cut at <eos> 
                pred_sentence = []
                for tok in pred_tokens:
                    word = target_language.vocab.itos[tok.item()]
                    if word == '<eos>':
                        break
                    pred_sentence.append(word)

                trg_sentence = []
                for tok in trg_tokens:
                    word = target_language.vocab.itos[tok.item()]
                    if word == '<eos>':
                        break
                    trg_sentence.append(word)

                all_predictions.append(pred_sentence)
                all_targets.append([trg_sentence])  # wrapped in list for BLEU

            output_flat = output[1:].view(-1, output_dim)
            target_flat = target[1:].view(-1)
            loss = criterion(output_flat, target_flat)
            epoch_loss += loss.item()

    bleu = bleu_score(all_predictions, all_targets) * 100
    return epoch_loss / len(iterator), bleu


    

In [15]:
train_model(model, train_iterator, val_iterator, optimizer, criterion, source_language, target_language, num_epochs=30)

0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_loss,█▆▅▄▃▃▂▂▁▁
val_bleu,▁▃▅▆▆▆▆▆▆█
val_loss,█▄▂▂▁▂▁▁▂▁

0,1
epoch,10.0
train_loss,2.60353
val_bleu,8.50428
val_loss,4.34718


Epoch: 01 | Train Loss: 2.451 | Val Loss: 4.362 | BLEU: 8.81
Epoch: 02 | Train Loss: 2.310 | Val Loss: 4.441 | BLEU: 8.92
Epoch: 03 | Train Loss: 2.194 | Val Loss: 4.465 | BLEU: 8.44
Epoch: 04 | Train Loss: 2.081 | Val Loss: 4.474 | BLEU: 9.16
Epoch: 05 | Train Loss: 1.996 | Val Loss: 4.489 | BLEU: 9.74
Epoch: 06 | Train Loss: 1.900 | Val Loss: 4.540 | BLEU: 8.95
Epoch: 07 | Train Loss: 1.806 | Val Loss: 4.625 | BLEU: 9.33
Epoch: 08 | Train Loss: 1.743 | Val Loss: 4.624 | BLEU: 9.54
Epoch: 09 | Train Loss: 1.672 | Val Loss: 4.617 | BLEU: 10.14
Epoch: 10 | Train Loss: 1.639 | Val Loss: 4.600 | BLEU: 10.21
Epoch: 11 | Train Loss: 1.562 | Val Loss: 4.674 | BLEU: 10.03
Epoch: 12 | Train Loss: 1.572 | Val Loss: 4.584 | BLEU: 10.32
Epoch: 13 | Train Loss: 1.498 | Val Loss: 4.651 | BLEU: 10.33
Epoch: 14 | Train Loss: 1.452 | Val Loss: 4.691 | BLEU: 10.28
Epoch: 15 | Train Loss: 1.402 | Val Loss: 4.721 | BLEU: 11.14
Epoch: 16 | Train Loss: 1.385 | Val Loss: 4.725 | BLEU: 11.29
Epoch: 17 | Trai

In [None]:
def translate_sentence(sentence, model, source_field, target_field, device, max_len=50):
    """
    Translates a single English sentence to Romanian using the trained Seq2Seq model with attention.

    Args:
        sentence (str): The input English sentence to translate.
        model (nn.Module): The trained Seq2Seq model.
        source_field (torchtext.legacy.data.Field): The Field object for the source language.
        target_field (torchtext.legacy.data.Field): The Field object for the target language.
        device (torch.device): The device (CPU or GPU) the model is on.
        max_len (int): The maximum length of the translated sentence to prevent infinite loops.

    Returns:
        list: A list of translated tokens (words).
    """
    model.eval() # Set the model to evaluation mode

    # Tokenize the input sentence
    if source_field.lower:
        tokens = [token.lower() for token in source_field.tokenize(sentence)]
    else:
        tokens = source_field.tokenize(sentence)

    tokens = [source_field.init_token] + tokens + [source_field.eos_token]
    src_indexes = [source_field.vocab.stoi.get(token, source_field.vocab.stoi[source_field.unk_token]) for token in tokens]
    
    # Convert to PyTorch tensor and add batch dimension (batch_size=1)
    # src_tensor shape: (seq_len, 1)
    src_tensor = torch.LongTensor(src_indexes).unsqueeze(1).to(device)
    
    with torch.no_grad():
        # MODIFIED: Encoder now returns (hidden, cell) AND encoder_outputs
        (hidden, cell), encoder_outputs = model.encoder(src_tensor)
    
    # Prepare the first input to the decoder: <sos> token
    # trg_indexes will store the predicted token IDs
    trg_indexes = [target_field.vocab.stoi[target_field.init_token]]
    
    # Decoding loop
    for _ in range(max_len): # Use max_len parameter to prevent infinite loops
        # trg_tensor shape: (1) - current input token for the decoder
        trg_tensor = torch.LongTensor([trg_indexes[-1]]).to(device)
        
        # MODIFIED: Pass encoder_outputs to the decoder
        output, hidden, cell = model.decoder(trg_tensor, hidden, cell, encoder_outputs)
        
        # Get the predicted next word (token with highest probability)
        # output shape: (1, target_vocab_size) -> argmax(1) gives the index
        pred_token = output.argmax(1).item()
        trg_indexes.append(pred_token)
        
        # If the model predicts an <eos> token, stop decoding
        if pred_token == target_field.vocab.stoi[target_field.eos_token]:
            break
    
    # Convert predicted IDs back to words
    trg_tokens = [target_field.vocab.itos[i] for i in trg_indexes]
    
    # Exclude <sos> token at the beginning and <eos> token at the end if present
    if trg_tokens and trg_tokens[0] == target_field.init_token:
        trg_tokens = trg_tokens[1:]
    if trg_tokens and trg_tokens[-1] == target_field.eos_token:
        trg_tokens = trg_tokens[:-1]

    return trg_tokens

translated_sentence = translate_sentence(
    "he is very strong",
    model,
    source_language,
    target_language,
    device
)
print("Translated sentence:", " ".join(translated_sentence))


Translated sentence: el este foarte tristă.
