In [None]:
rm -rf /kaggle/working/*

In [None]:
!git clone --branch main https://github.com/giankev/Ancient-to-Modern-Italian-Automatic-Translation.git

In [None]:
import torch

# Use the first CUDA device when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    print("CUDA (GPU) not available, using CPU.")

In [None]:
# Report how many CUDA devices are visible and print the name of each one.
# Iterating over the actual device count (instead of hard-coding indices 0 and 1)
# keeps this cell from raising on CPU-only or single-GPU sessions.
gpu_count = torch.cuda.device_count()
print(gpu_count)
for gpu_index in range(gpu_count):
    print(torch.cuda.get_device_name(gpu_index))

In [None]:
import pandas as pd

# Gold reference set and the Gemma-2B-it (context learning) translations, both read
# from the repository cloned into /kaggle/working.
dataset_gold = pd.read_csv(
    filepath_or_buffer="/kaggle/working/Ancient-to-Modern-Italian-Automatic-Translation/dataset/dataset_gold.csv",
)
gemma2b_it_cl_translation = pd.read_csv(
    filepath_or_buffer="/kaggle/working/Ancient-to-Modern-Italian-Automatic-Translation/Gemma2b-it-translation/CulturalI-hw2_transl-Gemma2b-it_context_learning.csv",
)

In [None]:
# Put the model output and the gold data side by side (column-wise concat, aligned on
# the row index), give the two columns of interest the names the evaluation code
# reads, and drop every other column.
_column_names = {
    "translation": "response",
    "Translation": "reference_answer",
}
df_concat = (
    pd.concat([gemma2b_it_cl_translation, dataset_gold], axis=1)
    .rename(columns=_column_names)
    [["response", "reference_answer"]]
)

In [None]:
# System prompt for absolute (single-response) grading.
# Built from adjacent string literals so that it is one clean sentence: a triple-quoted
# string wrapped across indented source lines would embed the newlines and the
# indentation spaces in the text that is sent to the judge model.
ABS_SYSTEM_PROMPT = (
    "You are a fair judge assistant tasked with providing clear, "
    "objective feedback based on specific criteria, ensuring each assessment "
    "reflects the absolute standards set for performance."
)

# User-turn template. Placeholders: {instruction}, {response}, {reference_answer}, {rubric}.
# The template itself contains the literal "[RESULT]" marker (item 3), so any parsing of
# the judge's verdict must look at the generated text only, not at the echoed prompt.
# NOTE(review): the wording (including its grammar) looks like the published Prometheus
# absolute-grading template - confirm against the model card before "correcting" it.
ABSOLUTE_PROMPT = """###Task Description:
An instruction (might include an Input inside it), a response to evaluate, a reference answer that gets a score of 5, and a score rubric representing a evaluation criteria are given.
1. Write a detailed feedback that assess the quality of the response strictly based on the given score rubric, not evaluating in general.
2. After writing a feedback, write a score that is an integer between 1 and 5. You should refer to the score rubric.
3. The output format should look as follows: "Feedback: (write a feedback for criteria) [RESULT] (an integer number between 1 and 5)"
4. Please do not generate any other opening, closing, and explanations.

###The instruction to evaluate:
{instruction}

###Response to evaluate:
{response}

###Reference Answer:
{reference_answer}

###Score Rubrics:
{rubric}

###Feedback: """

def build_prometheus_prompt(response_given: str, reference_answer: str) -> str:
    """Build the full judge prompt for one translation.

    Args:
        response_given: The candidate translation to be graded.
        reference_answer: The reference translation, presented to the judge as the
            answer that deserves a score of 5.

    Returns:
        ``ABS_SYSTEM_PROMPT``, a blank line, and ``ABSOLUTE_PROMPT`` filled with a fixed
        instruction, the two arguments and the 1-5 scoring rubric.
    """
    instruction = "Evaluate the translation from Old Italian to Modern Italian."
    # One rubric entry per line, joined explicitly: writing the rubric as an indented
    # triple-quoted string would leak the source indentation into the prompt.
    rubric = "\n".join([
        "Semantic fidelity and grammatical correctness:",
        "Score 1: Completely unacceptable translation: irrelevant, incomprehensible, or nonsensical.",
        "Score 2: Severe semantic errors, substantial omissions or additions. Defective syntax.",
        "Score 3: Mediocre translation with minor semantic mistakes or typos.",
        "Score 4: Good translation, faithful and understandable with slight stylistic imperfections.",
        "Score 5: Perfect translation: accurate, fluent, coherent, and semantically correct.",
    ])

    return ABS_SYSTEM_PROMPT + "\n\n" + ABSOLUTE_PROMPT.format(
        instruction=instruction,
        response=response_given,
        reference_answer=reference_answer,
        rubric=rubric,
    )

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Judge model and its tokenizer are loaded from the same Hub checkpoint.
# device_map="auto" leaves the placement of the weights to the library.
tokenizer = AutoTokenizer.from_pretrained("prometheus-eval/prometheus-7b-v2.0")
model = AutoModelForCausalLM.from_pretrained(
    "prometheus-eval/prometheus-7b-v2.0",
    device_map="auto",
)

In [None]:
from tqdm import tqdm
import os

In [None]:
def _extract_prometheus_score(generated_text: str) -> str:
    """Return the 1-5 verdict that follows the last ``[RESULT]`` marker, or ``""`` if absent."""
    import re  # local import so this cell does not rely on another cell importing `re`

    _, marker, verdict = generated_text.rpartition("[RESULT]")
    if not marker:
        return ""
    match = re.search(r"\b[1-5]\b", verdict)
    return match.group(0) if match else ""


def evaluations(df, output_csv_path):
    """Grade every row of ``df`` with the judge model and write the scores to a CSV.

    Relies on the notebook-level ``model``, ``tokenizer`` and ``build_prometheus_prompt``.

    Args:
        df: DataFrame with a ``response`` column (candidate translation) and a
            ``reference_answer`` column (reference translation).
        output_csv_path: Destination CSV path. Its parent directory is created when the
            path has one.

    Returns:
        None. The CSV has the columns ``response``, ``reference_answer`` and ``score``;
        ``score`` is the digit 1-5 as text, or an empty string when the judge produced
        no parsable ``[RESULT]`` verdict.
    """
    columns = ["response", "reference_answer", "score"]

    # Loop-invariant: fall back to EOS for padding when the tokenizer defines no pad token.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    results = []
    rows = tqdm(df.iterrows(), total=df.shape[0], desc="Evaluations", unit="phrase")
    # Count processed rows positionally; the DataFrame index labels need not be 0..n-1
    # integers.
    for processed, (_, item) in enumerate(rows, start=1):
        response = item["response"]
        reference_answer = item["reference_answer"]
        prompt_text = build_prometheus_prompt(response, reference_answer)

        messages = [
            {"role": "user", "content": prompt_text},
        ]

        # return_dict=True yields input_ids together with the attention mask; both are
        # moved to the model's device before generation.
        inputs = tokenizer.apply_chat_template(
            messages,
            return_tensors="pt",
            return_dict=True,
        ).to(model.device)
        prompt_length = inputs["input_ids"].shape[1]

        with torch.no_grad():
            # Greedy decoding: do_sample=False on its own is sufficient; `temperature`
            # only applies to sampling and is therefore not passed.
            generated_ids = model.generate(
                **inputs,
                max_new_tokens=1000,
                do_sample=False,
                pad_token_id=pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )

        # Decode only the newly generated tokens. The prompt itself contains the text
        # "[RESULT]", so parsing the full sequence could pick up prompt text as a score.
        completion = tokenizer.decode(
            generated_ids[0, prompt_length:],
            skip_special_tokens=True,
        )
        score = _extract_prometheus_score(completion)

        if processed % 10 == 0:
            print(f"\nProcessed {processed} phrase.")

        results.append({
            'response': response,
            'reference_answer': reference_answer,
            'score': score,
        })

    # Explicit columns keep the CSV header intact even when `df` has no rows.
    df_output = pd.DataFrame(results, columns=columns)

    # os.makedirs("") raises, so only create the directory when the path has one.
    output_dir = os.path.dirname(output_csv_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    df_output.to_csv(output_csv_path, index=False, encoding='utf-8')

    print(f"\nEvaluation complete. Results saved in: {output_csv_path}")

In [None]:
# Grade every response/reference pair in df_concat and store the scores as a CSV.
output_csv_path = "/kaggle/working/Gemma2b-it-evaluations_prometheus.csv"
evaluations(df=df_concat, output_csv_path=output_csv_path)

In [None]:
import pandas as pd

# Re-read the evaluation CSV and export it as JSON Lines (one record per line),
# keeping non-ASCII characters unescaped.
df = pd.read_csv(filepath_or_buffer="/kaggle/working/Gemma2b-it-evaluations_prometheus.csv")
df.to_json(
    path_or_buf="/kaggle/working/Gemma2b-it-evaluations_prometheus.jsonl",
    orient="records",
    lines=True,
    force_ascii=False,
)