## Imports

In [7]:
import re
from transformers import MarianTokenizer
from transformers import MarianMTModel, MarianTokenizer

## Data loading

In [2]:
italian_file_path = 'europarl-v7.it-en.it'
english_file_path = 'europarl-v7.it-en.en'

def load_sentences(file_path):
    with open(file_path, encoding='utf-8') as file:
        sentences = file.read().split('\n')
    return sentences

italian_sentences = load_sentences(italian_file_path)
english_sentences = load_sentences(english_file_path)

# Test data loading
print(f"Total Italian sentences: {len(italian_sentences)}")
print(f"Total English sentences: {len(english_sentences)}")

# Print the first 5 sentences in both languages to check
for i in range(5):
    print(f"Italian sentence {i+1}: {italian_sentences[i]}")
    print(f"English sentence {i+1}: {english_sentences[i]}\n")

Total Italian sentences: 1909116
Total English sentences: 1909116
Italian sentence 1: Ripresa della sessione
English sentence 1: Resumption of the session

Italian sentence 2: Dichiaro ripresa la sessione del Parlamento europeo, interrotta venerdì 17 dicembre e rinnovo a tutti i miei migliori auguri nella speranza che abbiate trascorso delle buone vacanze.
English sentence 2: I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.

Italian sentence 3: Come avrete avuto modo di constatare il grande "baco del millennio" non si è materializzato. Invece, i cittadini di alcuni nostri paesi sono stati colpiti da catastrofi naturali di proporzioni davvero terribili.
English sentence 3: Although, as you will have seen, the dreaded 'millennium bug' failed to materialise, still the people in a number of countries suffered a series of natural disa

## Data preprocessing

In [6]:
def clean_text_for_translation(text):
    # Simplified cleaning process, since Marian handles some normalization internally
    text = re.sub(r"<br\s*/?>", " ", text)  # Remove HTML breaks
    text = re.sub(r"\s+", " ", text)  # Normalize whitespace
    return text.strip()

# Clean the sentences
italian_cleaned_for_translation = [clean_text_for_translation(text) for text in italian_sentences]
english_cleaned_for_translation = [clean_text_for_translation(text) for text in english_sentences]

# You don't typically need to manually add special tokens or handle padding for translation models,
# as the Marian tokenizer and model handle these internally. The main task is to ensure the input text is clean.

# Note: Depending on your use case, you may not need to clean the English sentences if you are only translating from Italian to English.


## Marian Implementation

In [8]:
# Assuming italian_cleaned_for_translation is your cleaned Italian sentences
src_text = italian_cleaned_for_translation[:5]  # Example: Translate the first 5 cleaned sentences

model_name = "Helsinki-NLP/opus-mt-it-en"

# Initialize the tokenizer and model
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Tokenize the texts and prepare input tensors
tokenized_text = tokenizer(src_text, return_tensors="pt", padding=True, truncation=True)

# Generate translation output tensors
translated_tensors = model.generate(**tokenized_text)

# Decode the tensors to get the translated texts
translated_texts = [tokenizer.decode(t, skip_special_tokens=True) for t in translated_tensors]

for original, translation in zip(src_text, translated_texts):
    print(f"Original: {original}")
    print(f"Translated: {translation}\n")


Original: Ripresa della sessione
Translated: Resumption of the session

Original: Dichiaro ripresa la sessione del Parlamento europeo, interrotta venerdì 17 dicembre e rinnovo a tutti i miei migliori auguri nella speranza che abbiate trascorso delle buone vacanze.
Translated: I declare resumed the session of the European Parliament adjourned on Friday 17 December and I renew to all my best wishes in the hope that you have had a good holiday.

Original: Come avrete avuto modo di constatare il grande "baco del millennio" non si è materializzato. Invece, i cittadini di alcuni nostri paesi sono stati colpiti da catastrofi naturali di proporzioni davvero terribili.
Translated: As you will have seen, the great "baco of the millennium" has not materialized; instead, the citizens of some of our countries have been affected by natural disasters of truly terrible proportions.

Original: Avete chiesto che si tenesse una discussione su tale tema nei prossimi giorni, nel corso della presente tornat