In [None]:
# Clone the project's main branch; a later cell reads dataset/dataset_cleaned.csv
# from this checkout (that path assumes the working directory is /kaggle/working).
!git clone --branch main https://github.com/giankev/Ancient-to-Modern-Italian-Automatic-Translation.git

In [None]:
import pandas as pd

# Cleaned corpus shipped inside the cloned repository.
dataset_csv_path = "/kaggle/working/Ancient-to-Modern-Italian-Automatic-Translation/dataset/dataset_cleaned.csv"
df = pd.read_csv(dataset_csv_path)

In [None]:
# Preview the first rows to check that the CSV loaded with the expected columns.
df.head()

In [None]:
# Summarise column dtypes and non-null counts.
df.info()

In [None]:
# Drop the metadata columns; whatever columns remain are kept in `sentence`.
metadata_columns = ["Author", "Date", "Region"]
sentence = df.drop(columns=metadata_columns)

In [None]:
# Preview the frame after the metadata columns were dropped.
sentence.head()

In [None]:
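# Interactive login alternative, left disabled because the next cell logs in
# programmatically with a token read from Kaggle Secrets:
#!huggingface-cli login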

In [None]:
from kaggle_secrets import UserSecretsClient
import huggingface_hub
import os

# Label of the Kaggle secret that stores the Hugging Face access token.
# Kept in one place so the lookup and the troubleshooting hint cannot drift apart.
HF_SECRET_LABEL = "gemma"

try:
    # Read the token from Kaggle Secrets instead of hard-coding it in the notebook.
    user_secrets = UserSecretsClient()
    hf_login_token = user_secrets.get_secret(HF_SECRET_LABEL)

    print("Attempting programmatic login to Hugging Face Hub...")
    # add_to_git_credential=False: do not store the token in the git credential helper.
    huggingface_hub.login(token=hf_login_token, add_to_git_credential=False)
    print("Programmatic login successful or token set for this session.")

except Exception as e:
    # Best-effort login: report the problem and let the notebook continue, so that
    # already-authenticated sessions still work. Downloading a gated checkpoint
    # later will fail if no valid token is available.
    print(f"Error during programmatic login: {e}")
    print(f"Please ensure your '{HF_SECRET_LABEL}' secret is correctly set in Kaggle.")

In [None]:
import torch

# Use the first CUDA device when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    print("CUDA (GPU) not available, using CPU.")

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Single source of truth for the checkpoint so tokenizer and model always match.
MODEL_ID = "google/gemma-2-2b-it"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Load the weights in half precision with the SDPA attention implementation,
# then move the model to the device selected earlier in the notebook.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,
    attn_implementation="sdpa",
)
model = model.to(device)

In [None]:
from tqdm import tqdm

In [None]:
def prompt_formatting(input_text: str, context_learning: bool, examples) -> str:
    """Build a Gemma chat-format prompt asking for an ancient-to-modern Italian translation.

    Args:
        input_text: Sentence to translate. Must be a non-blank string.
        context_learning: When True, build a few-shot prompt that lists ``examples``
            before the sentence; when False, build a zero-shot prompt.
        examples: Iterable of ``(old, modern)`` string pairs used as few-shot
            demonstrations. Only read when ``context_learning`` is True; an empty
            iterable (including ``""``) yields a prompt without demonstrations.

    Returns:
        The prompt text: one user turn followed by an opened model turn.

    Raises:
        ValueError: If ``input_text`` is None, not a string (e.g. a NaN read from a
            CSV), or contains only whitespace.
    """
    # Reject anything that cannot be meaningfully translated before building the prompt.
    if not isinstance(input_text, str) or not input_text.strip():
        raise ValueError("Not valid input text")

    if context_learning:
        # Adjacent string literals are concatenated verbatim, so every sentence needs
        # its own trailing separator; without them the instructions run together.
        prompt = (
            "<start_of_turn>user\n"
            "Traduci le seguenti frasi da italiano antico a italiano moderno. "
            "Rispondi solo con la frase tradotta, senza commenti o formattazione aggiuntiva.\n"
            "Usa uno schema come questo:\n"
        )

        # One "Antico/Moderno" pair per demonstration.
        for old, modern in examples:
            prompt += f"Antico: \"{old}\"\nModerno: \"{modern}\"\n"

        # The sentence to translate follows the same schema, with "Moderno:" left
        # open for the model to complete.
        prompt += f"Antico: \"{input_text}\"\nModerno:<end_of_turn>\n<start_of_turn>model\n"

    else:
        prompt = (
            "<start_of_turn>user\n"
            "Traduci solo questa frase da italiano antico a italiano moderno. "
            "Rispondi solo con la frase tradotta, senza commenti o formattazione aggiuntiva:\n"
            f"\"{input_text}\"\n"
            "<end_of_turn>\n<start_of_turn>model\n"
        )

    return prompt

In [None]:
def output_translations(df, output_csv_path, context_learning = False, examples = ""):
    """Translate every sentence in ``df`` with the loaded model and save the results as CSV.

    Relies on the notebook-level ``tokenizer`` and ``model`` objects and on
    ``prompt_formatting`` to build each prompt.

    Args:
        df: DataFrame with a ``"Sentence"`` column holding the texts to translate.
        output_csv_path: Destination CSV path. Its parent directory is created when
            the path has one.
        context_learning: Forwarded to ``prompt_formatting``; True selects the
            few-shot prompt.
        examples: Forwarded to ``prompt_formatting``; the ``(old, modern)`` pairs
            used as few-shot demonstrations.

    Returns:
        None. The CSV has two columns, ``old_text`` and ``translation``.

    Raises:
        ValueError: Propagated from ``prompt_formatting`` for an invalid sentence.

    Note:
        Generation samples with ``temperature=0.2``, so results vary between runs
        unless the random number generators are seeded beforehand.
    """
    # Stop on the tokenizer's EOS and also on the chat end-of-turn marker used by the
    # prompt format, so text produced after the model closes its turn is never kept.
    stop_token_ids = [tokenizer.eos_token_id]
    end_of_turn_id = tokenizer.convert_tokens_to_ids("<end_of_turn>")
    if (
        end_of_turn_id is not None
        and end_of_turn_id != tokenizer.unk_token_id
        and end_of_turn_id not in stop_token_ids
    ):
        stop_token_ids.append(end_of_turn_id)

    translations = []
    progress = tqdm(df["Sentence"], total=df.shape[0], desc="Translations", unit="phrase")

    # Count rows positionally: DataFrame index labels are not guaranteed to be
    # consecutive integers (filtered, shuffled or string-indexed frames).
    for processed, sentence in enumerate(progress, start=1):
        input_prompt = prompt_formatting(sentence, context_learning, examples)

        # Put the inputs wherever the model actually lives instead of assuming CUDA.
        model_inputs = tokenizer(input_prompt, return_tensors="pt").to(model.device)

        with torch.no_grad():
            outputs = model.generate(
                **model_inputs,
                max_new_tokens=128,
                cache_implementation="static",
                temperature=0.2,
                do_sample=True,
                eos_token_id=stop_token_ids,
            )

        # generate() returns prompt + continuation; keep only the newly generated tokens.
        prompt_token_length = model_inputs["input_ids"].shape[1]
        generated_tokens = outputs[0][prompt_token_length:]
        translation = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

        translations.append({
            'old_text': sentence,
            'translation': translation
        })

        if processed % 10 == 0:
            print(f"\nProcessed {processed} phrase.")

    # Explicit columns keep the CSV header stable even when `df` has no rows.
    df_output = pd.DataFrame(translations, columns=['old_text', 'translation'])

    # os.makedirs("") raises, so only create the directory when the path has one.
    output_dir = os.path.dirname(output_csv_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    df_output.to_csv(output_csv_path, index=False, encoding='utf-8')

    print(f"\nTranslation complete. Results saved in: {output_csv_path}")

In [None]:
# Few-shot run: each sentence is translated with three ancient/modern demonstration
# pairs placed in the prompt, and the results go to a dedicated CSV.
context_learning = True
output_csv_path = "/kaggle/working/Gemma2b-it-translations_context_learning.csv"

# (ancient Italian, modern Italian) demonstration pairs.
examples = [
    (
        "Oime! ch’i’ veggio per alcun sentiero",
        "Ahimè! che io vedo per qualche sentiero",
    ),
    (
        "Costui pareva aver viso di savio",
        "Costui sembrava avere l'aspetto di un uomo saggio",
    ),
    (
        "Che tu ne se' la cagion di mia doglia",
        "Che tu sei la causa del mio dolore",
    ),
]

output_translations(
    df,
    output_csv_path,
    context_learning=context_learning,
    examples=examples,
)