In [None]:
# pip install transformers sentencepiece torch pandas sacremoses


In [1]:
import os
import csv
import json
import pandas as pd
import torch
from transformers import MarianMTModel, MarianTokenizer

Load the CSV file
- The CSV should be named "dataset.csv"
- It must contain a column named "Sentence" (the name is case-sensitive) holding the Italian sentences.

In [3]:
def parse(file_path='dataset.csv'):
    """Read a CSV file and return its rows as a list of dictionaries.

    Args:
        file_path: Path of the CSV file to read. Defaults to 'dataset.csv'.

    Returns:
        A list with one dict per data row, keyed by the names found in the
        file's header row. An empty list is returned when the file has no
        data rows.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    # 'utf-8-sig' decodes plain UTF-8 exactly like 'utf-8' but also strips a
    # leading byte-order mark. Without it, a BOM-prefixed file would expose
    # its first header as '\ufeffSentence' instead of 'Sentence'.
    with open(file_path, newline='', encoding='utf-8-sig') as csvfile:
        return list(csv.DictReader(csvfile))

# Read every row of the default CSV, then keep only the text stored under
# the "Sentence" column of each row.
data = parse()
sentences = [record["Sentence"] for record in data]

Load models & tokenizers

In [4]:
# Pretrained checkpoint names handed to from_pretrained, one per direction.
model_name_it2en = "Helsinki-NLP/opus-mt-it-en"
model_name_en2it = "Helsinki-NLP/opus-mt-en-it"

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Italian -> English tokenizer and model.
tokenizer_it2en = MarianTokenizer.from_pretrained(model_name_it2en)
model_it2en = MarianMTModel.from_pretrained(model_name_it2en)
model_it2en.to(device)

# English -> Italian tokenizer and model.
tokenizer_en2it = MarianTokenizer.from_pretrained(model_name_en2it)
model_en2it = MarianMTModel.from_pretrained(model_name_en2it)

# Deliberately left as the cell's final bare expression, so the notebook
# still displays the module summary after moving the model to the device.
model_en2it.to(device)

MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(80035, 512, padding_idx=80034)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(80035, 512, padding_idx=80034)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0-5): 6 x MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation_fn): SiLU()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_layer_norm): LayerNorm((512,), eps=1e-05

Translate archaic Italian to English

In [5]:
# Tokenize all Italian sentences at once as a padded, truncation-enabled
# batch of PyTorch tensors.
encoded_it = tokenizer_it2en(sentences, return_tensors="pt", padding=True, truncation=True)

# Move both model inputs to the device the model lives on.
input_ids_it, attention_mask_it = (
    encoded_it[field].to(device) for field in ("input_ids", "attention_mask")
)

# Gradients are not needed for generation, so tracking is switched off.
with torch.no_grad():
    generated_en = model_it2en.generate(
        input_ids=input_ids_it, attention_mask=attention_mask_it,
        max_length=512, num_beams=8, length_penalty=1.2, early_stopping=True,
    )

# Turn the generated token ids back into plain strings, without special tokens.
english_translations = tokenizer_it2en.batch_decode(generated_en, skip_special_tokens=True)
# data["english"] = english_translations

Translate English back to Italian

In [6]:
# Tokenize all English translations at once as a padded, truncation-enabled
# batch of PyTorch tensors.
encoded_en = tokenizer_en2it(english_translations, return_tensors="pt", padding=True, truncation=True)

# Move both model inputs to the device the model lives on.
input_ids_en, attention_mask_en = (
    encoded_en[field].to(device) for field in ("input_ids", "attention_mask")
)

# Gradients are not needed for generation, so tracking is switched off.
with torch.no_grad():
    generated_it_back = model_en2it.generate(
        input_ids=input_ids_en, attention_mask=attention_mask_en,
        max_length=512, num_beams=8, length_penalty=1.2, early_stopping=True,
    )

# Turn the generated token ids back into plain strings, without special tokens.
italian_translations = tokenizer_en2it.batch_decode(generated_it_back, skip_special_tokens=True)
# data["italian"] = italian_translations

Save to JSONL

In [7]:
output_path = "CulturalIA-hw2_transl-opusmt.jsonl"

# JSON Lines output: one object per line, shaped
# {"Sentence": "<back-translated Italian sentence>"}.
# ensure_ascii=False keeps non-ASCII characters readable instead of
# escaping them.
with open(output_path, "w", encoding="utf-8") as jsonl_file:
    for translated in italian_translations:
        record = {"Sentence": translated}
        jsonl_file.write(json.dumps(record, ensure_ascii=False) + "\n")

print(f"Saved {len(italian_translations)} sentences to '{output_path}'")

Saved 97 sentences to 'CulturalIA-hw2_transl-opusmt.jsonl'


Optionally, also save the translations in CSV format, which makes manual annotation of translation scores easier.

In [8]:
output_csv_path = "CSV-CulturalIA-hw2_transl-opusmt.csv"

# Single-column CSV: a "Sentence" header followed by one row per
# back-translated sentence.
with open(output_csv_path, mode='w', encoding='utf-8', newline='') as csv_file:
    csv_out = csv.writer(csv_file)
    csv_out.writerow(['Sentence'])
    csv_out.writerows([translated] for translated in italian_translations)

print(f"Saved {len(italian_translations)} sentences to '{output_csv_path}'")

Saved 97 sentences to 'CSV-CulturalIA-hw2_transl-opusmt.csv'
