## Imports

In [1]:
import re
from transformers import MarianTokenizer
from transformers import MarianMTModel, MarianTokenizer

  from .autonotebook import tqdm as notebook_tqdm


## Data loading

In [2]:
italian_file_path = 'europarl-v7.it-en.it'
english_file_path = 'europarl-v7.it-en.en'

def load_sentences(file_path):
    with open(file_path, encoding='utf-8') as file:
        sentences = file.read().split('\n')
    return sentences

italian_sentences = load_sentences(italian_file_path)
english_sentences = load_sentences(english_file_path)

# Test data loading
print(f"Total Italian sentences: {len(italian_sentences)}")
print(f"Total English sentences: {len(english_sentences)}")

# Print the first 5 sentences in both languages to check
for i in range(5):
    print(f"Italian sentence {i+1}: {italian_sentences[i]}")
    print(f"English sentence {i+1}: {english_sentences[i]}\n")

Total Italian sentences: 1909116
Total English sentences: 1909116
Italian sentence 1: Ripresa della sessione
English sentence 1: Resumption of the session

Italian sentence 2: Dichiaro ripresa la sessione del Parlamento europeo, interrotta venerdì 17 dicembre e rinnovo a tutti i miei migliori auguri nella speranza che abbiate trascorso delle buone vacanze.
English sentence 2: I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.

Italian sentence 3: Come avrete avuto modo di constatare il grande "baco del millennio" non si è materializzato. Invece, i cittadini di alcuni nostri paesi sono stati colpiti da catastrofi naturali di proporzioni davvero terribili.
English sentence 3: Although, as you will have seen, the dreaded 'millennium bug' failed to materialise, still the people in a number of countries suffered a series of natural disa

## Data preprocessing

In [3]:
def clean_text_for_translation(text):
    # Simplified cleaning process, since Marian handles some normalization internally
    text = re.sub(r"<br\s*/?>", " ", text)  # Remove HTML breaks
    text = re.sub(r"\s+", " ", text)  # Normalize whitespace
    return text.strip()

# Clean the sentences
italian_cleaned_for_translation = [clean_text_for_translation(text) for text in italian_sentences]
english_cleaned_for_translation = [clean_text_for_translation(text) for text in english_sentences]


## Marian Implementation

In [4]:
src_text = italian_cleaned_for_translation[:5]  # Translate the first 5 cleaned sentences

model_name = "Helsinki-NLP/opus-mt-it-en"

# Initialize the tokenizer and model
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Tokenize the texts and prepare input tensors
tokenized_text = tokenizer(src_text, return_tensors="pt", padding=True, truncation=True)

# Generate translation output tensors
translated_tensors = model.generate(**tokenized_text)

# Decode the tensors to get the translated texts
translated_texts = [tokenizer.decode(t, skip_special_tokens=True) for t in translated_tensors]

for original, translation in zip(src_text, translated_texts):
    print(f"Original: {original}")
    print(f"Translated: {translation}\n")


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Original: Ripresa della sessione
Translated: Resumption of the session

Original: Dichiaro ripresa la sessione del Parlamento europeo, interrotta venerdì 17 dicembre e rinnovo a tutti i miei migliori auguri nella speranza che abbiate trascorso delle buone vacanze.
Translated: I declare resumed the session of the European Parliament adjourned on Friday 17 December and I renew to all my best wishes in the hope that you have had a good holiday.

Original: Come avrete avuto modo di constatare il grande "baco del millennio" non si è materializzato. Invece, i cittadini di alcuni nostri paesi sono stati colpiti da catastrofi naturali di proporzioni davvero terribili.
Translated: As you will have seen, the great "baco of the millennium" has not materialized; instead, the citizens of some of our countries have been affected by natural disasters of truly terrible proportions.

Original: Avete chiesto che si tenesse una discussione su tale tema nei prossimi giorni, nel corso della presente tornat

## Set up DataLoaders

In [17]:
from torch.utils.data import Dataset, DataLoader

class TranslationDataset(Dataset):
    def __init__(self, tokenizer, model_name, src_texts, tgt_texts):
        self.tokenizer = tokenizer
        self.src_texts = src_texts
        self.tgt_texts = tgt_texts
        self.model_name = model_name

    def __len__(self):
        return len(self.src_texts)

    def __getitem__(self, idx):
        src_text = self.src_texts[idx]
        tgt_text = self.tgt_texts[idx]

        src_tokenized = self.tokenizer(src_text, padding='max_length', truncation=True, max_length=64, return_tensors="pt")
        tgt_tokenized = self.tokenizer(tgt_text, padding='max_length', truncation=True, max_length=64, return_tensors="pt")

        return {
            "input_ids": src_tokenized.input_ids.flatten(),
            "attention_mask": src_tokenized.attention_mask.flatten(),
            "labels": tgt_tokenized.input_ids.flatten(),
        }

def create_data_loader(tokenizer, model_name, src_texts, tgt_texts, batch_size=16):
    dataset = TranslationDataset(tokenizer, model_name, src_texts, tgt_texts)
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    return data_loader

# Split data into train, validation, and test sets
from sklearn.model_selection import train_test_split

italian_train, italian_valtest, english_train, english_valtest = train_test_split(italian_cleaned_for_translation, english_cleaned_for_translation, test_size=0.2, random_state=42)
italian_val, italian_test, english_val, english_test = train_test_split(italian_valtest, english_valtest, test_size=0.5, random_state=42)

# Create data loaders for training, validation, and testing
train_loader = create_data_loader(tokenizer, model_name, italian_train, english_train)
val_loader = create_data_loader(tokenizer, model_name, italian_val, english_val)
test_loader = create_data_loader(tokenizer, model_name, italian_test, english_test)


In [18]:
import torch

from transformers import AdamW
from tqdm import tqdm

# Set up the optimizer and device
optimizer = AdamW(model.parameters(), lr=5e-4)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Set up the loss function
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

# Set up the number of epochs
num_epochs = 3

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    # Iterate over batches
    for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs}"):
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward pass
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Print average loss for this epoch
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {avg_loss:.4f}")

    # Validation loop
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Validation"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            val_loss += loss.item()

    avg_val_loss = val_loss / len(val_loader)
    print(f"Validation Loss: {avg_val_loss:.4f}")

# Test loop
model.eval()
test_loss = 0
with torch.no_grad():
    for batch in tqdm(test_loader, desc=f"Testing"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        test_loss += loss.item()

avg_test_loss = test_loss / len(test_loader)
print(f"Test Loss: {avg_test_loss:.4f}")


Epoch 1/3:   0%|          | 5/95456 [00:26<142:05:40,  5.36s/it]


KeyboardInterrupt: 