In [None]:
import torch
from torchtext.data import Field
import spacy
import nltk
from nltk.translate.bleu_score import sentence_bleu
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from model.rescnn_bilstm import ResCnnBiLstm

In [3]:
from datasets import load_dataset

dataset = load_dataset("wmt16", "ro-en")
def map_language_data(data, lang):
    items = []
    for item in data:
        items.append(item[lang])
    return items

train_data = {'en': map_language_data(dataset['train']['translation'], 'en'), 'ro': map_language_data(dataset['train']['translation'], 'ro')}
val_data = {'en': map_language_data(dataset['validation']['translation'], 'en'), 'ro': map_language_data(dataset['validation']['translation'], 'ro')}
test_data = {'en': map_language_data(dataset['test']['translation'], 'en'), 'ro': map_language_data(dataset['test']['translation'], 'ro')}

# train_loader = torch.utils.data.DataLoader(train_data, batch_size=32, shuffle=True)
# val_loader = torch.utils.data.DataLoader(val_data, batch_size=32, shuffle=False)
# test_loader = torch.utils.data.DataLoader(test_data, batch_size=32, shuffle=False)

class TranslationDataset(torch.utils.data.Dataset):
    def __init__(self, src_sentences, tgt_sentences, src_field, tgt_field):
        self.src_sentences = src_sentences
        self.tgt_sentences = tgt_sentences
        self.src_field = src_field
        self.tgt_field = tgt_field

    def __len__(self):
        return len(self.src_sentences)

    def __getitem__(self, idx):
        return self.src_sentences[idx], self.tgt_sentences[idx]
def collate_fn(batch):
    src_sentences, tgt_sentences = zip(*batch)
    src_tensor = SRC.process(src_sentences)
    tgt_tensor = TGT.process(tgt_sentences)
    return src_tensor, tgt_tensor



In [4]:
from torchtext.data.utils import get_tokenizer

SRC = Field(tokenize=get_tokenizer("spacy", language="en_core_web_sm"), init_token="<sos>", eos_token="<eos>", lower=True, batch_first=True)
TGT = Field(tokenize=get_tokenizer("spacy", language="ro_core_news_sm"), init_token="<sos>", eos_token="<eos>", lower=True, batch_first=True)

# Build vocab after defining the fields
SRC.build_vocab(train_data['en'], min_freq=2)
TGT.build_vocab(train_data['ro'], min_freq=2)

train_dataset = TranslationDataset(train_data['en'], train_data['ro'], SRC, TGT)
val_dataset = TranslationDataset(val_data['en'], val_data['ro'], SRC, TGT)
test_dataset = TranslationDataset(test_data['en'], test_data['ro'], SRC, TGT)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

print(f"Source vocab size: {len(SRC.vocab)}")
print(f"Target vocab size: {len(TGT.vocab)}")

Source vocab size: 249
Target vocab size: 266


In [5]:
def compute_bleu(reference_corpus, predicted_corpus):
    """
    Computes average BLEU score over a batch of sentences.
    Args:
        reference_corpus: List of tokenized reference sentences.
        predicted_corpus: List of tokenized predicted sentences.
    Returns:
        BLEU score (0 to 100)
    """
    bleu_scores = []
    for ref, pred in zip(reference_corpus, predicted_corpus):
        score = sentence_bleu([ref], pred, weights=(0.5, 0.5))  
        bleu_scores.append(score)
    return sum(bleu_scores) / len(bleu_scores) * 100


In [6]:
def evaluate(model, val_loader, criterion, tgt_vocab, device):
    model.eval()
    total_loss = 0
    predicted_sentences = []
    reference_sentences = []

    tgt_itos = {i: tok for tok, i in tgt_vocab.stoi.items()}

    with torch.no_grad():
        for src_batch, tgt_batch in val_loader:
            src_batch = src_batch.to(device)
            tgt_batch = tgt_batch.to(device)

            output = model(src_batch, tgt_batch[:, :-1])
            output_flat = output.reshape(-1, output.size(-1))
            target = tgt_batch[:, 1:].reshape(-1)

            loss = criterion(output_flat, target)
            total_loss += loss.item()

            preds = output.argmax(-1)  # (batch, tgt_len)
            for pred, ref in zip(preds, tgt_batch):
                pred_tokens = [tgt_itos[idx.item()] for idx in pred if idx.item() not in {tgt_vocab["<pad>"], tgt_vocab["<eos>"]}]
                ref_tokens = [tgt_itos[idx.item()] for idx in ref if idx.item() not in {tgt_vocab["<pad>"], tgt_vocab["<eos>"]}]
                predicted_sentences.append(pred_tokens)
                reference_sentences.append(ref_tokens)

    avg_loss = total_loss / len(val_loader)
    bleu = compute_bleu(reference_sentences, predicted_sentences)
    return avg_loss, bleu


In [7]:
import wandb
from torch.nn.utils import clip_grad_norm_

PAD_IDX = TGT.vocab.stoi["<pad>"]

def train_model(model, train_loader, val_loader, optimizer, src_field, tgt_field,
                device, num_epochs=10, clip=1.0):

    wandb.init(project="cnn2rnn-translation", config={"epochs": num_epochs})
    criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

    model = model.to(device)
    criterion = criterion.to(device)

    for epoch in range(1, num_epochs + 1):
        model.train()
        total_loss = 0
        print(f"Epoch {epoch}/{num_epochs}")
        print("-" * 20)

        for i, (src_batch, tgt_batch) in enumerate(train_loader):
            src_batch = src_batch.to(device)
            tgt_batch = tgt_batch.to(device)

            optimizer.zero_grad()

            print(f"Batch {i+1}/{len(train_loader)}")
            print("-" * 20)
            
            output = model(src_batch, tgt_batch[:, :-1])  # teacher forcing

            print(f"Output shape: {output.shape}")
            print(f"Target shape: {tgt_batch[:, 1:].shape}")

            output_contiguous = output.contiguous().to(device)
            output_reshaped = output_contiguous.reshape(-1, output.size(-1))

            target_contiguous = tgt_batch[:, 1:].contiguous().clone()
            target_reshaped = target_contiguous.reshape(-1)

            loss = criterion(output_reshaped, target_reshaped)
            loss.backward()
            print (f"Loss: {loss.item()}")
            clip_grad_norm_(model.parameters(), clip)
            optimizer.step()

            total_loss += loss.item()

        avg_train_loss = total_loss / len(train_loader)

        val_loss, bleu = evaluate(model, val_loader, criterion, tgt_field, device)

        wandb.log({
            "epoch": epoch,
            "train_loss": avg_train_loss,
            "val_loss": val_loss,
            "BLEU": bleu
        })

        print(f"Epoch {epoch}: Train Loss = {avg_train_loss:.4f}, Val Loss = {val_loss:.4f}, BLEU = {bleu:.2f}")

In [8]:
model = ResCnnBiLstm(src_vocab_size=len(SRC.vocab),
                        tgt_vocab_size=len(TGT.vocab))

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")



In [9]:
# train_model(model, train_loader, val_loader, optimizer, SRC, TGT, device="cpu", num_epochs=10)
train_model(model, train_loader, val_loader, optimizer, SRC, TGT, device="mps" if torch.backends.mps.is_available() else "cpu", num_epochs=10)



[34m[1mwandb[0m: Currently logged in as: [33mdenismoldovan[0m ([33mdenismoldovan-babes-bolyai-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch 1/10
--------------------
Batch 1/19073
--------------------
Output shape: torch.Size([32, 332, 266])
Target shape: torch.Size([32, 332])


RuntimeError: view size is not compatible with input tensor's size and stride (at least one dimension spans across two contiguous subspaces). Use .reshape(...) instead.