In [None]:
system = """
You are an expert translation evaluator for English ↔ Filipino translations. Assess the quality of the given translation from English to Filipino based only on the provided inputs.

Evaluate according to these six criteria (1 point each):
1. Accuracy - Correct meaning, intent, and details.
2. Fluency - Grammatically correct, natural, and idiomatic Filipino.
3. Coherence - Logical flow and structure matching the source.
4. Cultural Appropriateness - Respects Filipino norms, idioms, and sensitivities.
5. Guideline Adherence - Follows any domain-specific terminology and style.
6. Completeness - Translates all elements without omission/addition.

Scoring:
- Total points: 0-6  
- Normalize to 1-5 scale:  
  • 5 = Excellent (5-6 points)  
  • 3-4 = Good (3-4 points)  
  • 1-2 = Poor (0-2 points)  

Output format (follow exactly):
Score: <number from 1-5>  
Label: <"excellent", "good", or "poor">  
Reasoning:  
Accuracy: <your comment>  
Fluency: <your comment>  
Coherence: <your comment>  
Cultural Appropriateness: <your comment>  
Guideline Adherence: <your comment>  
Completeness: <your comment>  

Be concise but clear in reasoning. Base your evaluation only on the provided inputs.

---

### Example 1
Source (English): The meeting will start at 9 a.m. sharp.  
Translation (Filipino): Ang pulong ay magsisimula nang eksakto alas-nwebe ng umaga.  

Score: 5  
Label: excellent  
Reasoning:  
Accuracy: Fully conveys the meaning and time detail.  
Fluency: Grammatically correct and natural phrasing.  
Coherence: Matches structure and intent of the source.  
Cultural Appropriateness: No issues; standard formal Filipino.  
Guideline Adherence: Appropriate for formal context.  
Completeness: All information is included without additions.  

---

### Example 2
Source (English): She gave him a cold look before leaving.  
Translation (Filipino): Tiningnan niya siya nang malamig bago umalis.  

Score: 3  
Label: good  
Reasoning:  
Accuracy: Literal translation captures meaning but slightly awkward.  
Fluency: Understandable, but “nang malamig” sounds unnatural; “matamang tingin” or “tingin na walang emosyon” would be smoother.  
Coherence: Sequence is clear and logical.  
Cultural Appropriateness: Acceptable, though more idiomatic options exist.  
Guideline Adherence: Fits general domain but not stylistically refined.  
Completeness: All information is present.  
"""



prompt_template = """
Source (English): {english_text}
Translation (Filipino): {filipino_text}
"""


In [None]:
import pandas as pd
import os
filename = "data.csv"
data = pd.read_csv(filename, encoding='utf-8')

data = data.drop(columns=['Contributor'], errors='ignore')
print(f"Loaded {len(data)} rows from {filename}")

Loaded 57 rows from data.csv


In [7]:
data.head()

Unnamed: 0,Source Text (English),Target Text (Filipino),Score,Rater 1 Explanation,Rater 2 Explanation
0,The children laughed and played under the afte...,Ang mga bata ay nagtawanan at naglaro sa ilali...,4,"Accurate, fluent, and natural translation. Cap...",Just slight error due to the literal translati...
1,She took a break to gather her thoughts.,Nagpahinga siya para mag-isip-isip.,4,The translation is accurate. It was able to ca...,The translation would have been better if the ...
2,The algorithm efficiently identifies patterns ...,Mabisang kinikilala ng algoritmo ang mga patte...,3,"The translation of ""identifies"" as ""kinikilala...",The translation would have been better if the ...
3,Data normalization helps improve model perform...,Tumutulong sa pagpabuti ng model ang normalisa...,5,The translated text is natural and captures th...,The translation didn't literally translated th...
4,alam mo ma'am masaya naman topics natin sa phi...,"You know, ma'am, we have a lot of fun philosop...",4,"flawed translation is close, but failed to tra...",


In [None]:
def construct_prompt(row, prompt_template):
    english_text = row['Source Text (English)']
    filipino_text = row['Target Text (Filipino)']
    score = row.get('Score', None)  

    prompt = prompt_template.format(english_text=english_text, filipino_text=filipino_text)
    
    return prompt, filipino_text, score

sample_prompt, translated_text, score = construct_prompt(data.iloc[1], prompt_template)

print("Sample Prompt:")
print(sample_prompt) 
print(score)

Sample Prompt:

Source (English): She took a break to gather her thoughts.
Translation (Filipino): Nagpahinga siya para mag-isip-isip.

4


In [None]:
from google import genai
from google.genai import types
import time
import os
import re

os.environ["GEMINI_API_KEY"] = ""

def query_gemini(prompt):
    client = genai.Client(api_key=os.environ.get("GEMINI_API_KEY"))
    model = "gemini-2.0-flash"
    
    contents = [
        types.Content(
            role="user",
            parts=[types.Part.from_text(text=prompt)],
        )
    ]
    
    generate_content_config = types.GenerateContentConfig(
        temperature=0.0,
        system_instruction=system,
        response_mime_type="text/plain",
    )
    
    response = client.models.generate_content(
        model=model,
        contents=contents,
        config=generate_content_config,
    )
    
    return response.text.strip() if response and response.text else "unknown"

results = []
# count = 48
# max = len(data)
for i, row in data.iterrows():
    prompt, translated_text, score = construct_prompt(row, prompt_template)

    #print(f"Querying Gemini for row {i}...")
    response = query_gemini(prompt)

    score_match = re.search(r"Score:\s*(\d+)", response)
    if score_match:
        score_match = int(score_match.group(1))
    else:
        score_match = None
    correct_label = "excellent" if score_match == 5 else "good" if score_match >= 3 else "poor"

    accuracy = re.search(r"Accuracy:\s*(.*)", response)
    fluency = re.search(r"Fluency:\s*(.*)", response)
    coherence = re.search(r"Coherence:\s*(.*)", response)
    cultural_appropriateness = re.search(r"Cultural Appropriateness:\s*(.*)", response)
    guideline_adherence = re.search(r"Guideline Adherence:\s*(.*)", response)
    completeness = re.search(r"Completeness:\s*(.*)", response)

    accuracy = accuracy.group(1).strip() if accuracy else ""
    fluency = fluency.group(1).strip() if fluency else ""
    coherence = coherence.group(1).strip() if coherence else ""
    cultural_appropriateness = cultural_appropriateness.group(1).strip() if cultural_appropriateness else ""
    guideline_adherence = guideline_adherence.group(1).strip() if guideline_adherence else ""
    completeness = completeness.group(1).strip() if completeness else ""

    comment1 = data.iloc[i]['Rater 1 Explanation']
    comment2 = data.iloc[i]['Rater 2 Explanation']
    

    results.append({
        "prompt": prompt,
        "response": response,
        "original_score": int(score) if score is not None else None,
        "llm_score": score_match,
        "accuracy": accuracy,
        "fluency": fluency,
        "coherence": coherence,
        "cultural_appropriateness": cultural_appropriateness,
        "guideline_adherence": guideline_adherence,
        "completeness": completeness,
        "correct_label": correct_label,
        "rater_1_comment": comment1 if pd.notna(comment1) else "",
        "rater_2_comment": comment2 if pd.notna(comment2) else "",
    })



    time.sleep(5) 

import json
with open("part1_results_7.json", "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=4)

# 2.0 = results (sys 0), results_1 (sys 1), results_3 (system_3), results_5 (sys 4), results_6 (sys 5), results_7 (sys 6)
# 2.5 = results_2 (sys1), results_4 (sys 4)

In [None]:
edited_results = []
for i, row in enumerate(results):
    prompt, response, score = row['prompt'], row['response'], row['original_score']

    score_match = re.search(r"Score:\s*(\d+)", response)
    if score_match:
        score_match = int(score_match.group(1))
    else:
        score_match = None
    correct_label = "excellent" if score_match == 5 else "good" if score_match >= 3 else "poor"

    accuracy = re.search(r"Accuracy:\s*(.*)", response)
    fluency = re.search(r"Fluency:\s*(.*)", response)
    coherence = re.search(r"Coherence:\s*(.*)", response)
    cultural_appropriateness = re.search(r"Cultural Appropriateness:\s*(.*)", response)
    guideline_adherence = re.search(r"Guideline Adherence:\s*(.*)", response)
    completeness = re.search(r"Completeness:\s*(.*)", response)

    accuracy = accuracy.group(1).strip() if accuracy else ""
    fluency = fluency.group(1).strip() if fluency else ""
    coherence = coherence.group(1).strip() if coherence else ""
    cultural_appropriateness = cultural_appropriateness.group(1).strip() if cultural_appropriateness else ""
    guideline_adherence = guideline_adherence.group(1).strip() if guideline_adherence else ""
    completeness = completeness.group(1).strip() if completeness else ""

    comment1 = data.iloc[i]['Rater 1 Explanation']
    comment2 = data.iloc[i]['Rater 2 Explanation']
    

    edited_results.append({
        "prompt": prompt,
        "response": response,
        "original_score": int(score) if score is not None else None,
        "llm_score": score_match,
        "accuracy": accuracy,
        "fluency": fluency,
        "coherence": coherence,
        "cultural_appropriateness": cultural_appropriateness,
        "guideline_adherence": guideline_adherence,
        "completeness": completeness,
        "correct_label": correct_label,
        "rater_1_comment": comment1 if pd.notna(comment1) else "",
        "rater_2_comment": comment2 if pd.notna(comment2) else "",
    })


In [None]:

human_scores = [row['original_score'] for row in results if row['original_score'] is not None]
llm_scores = [row['llm_score'] for row in results if row['llm_score'] is not None]

from scipy.stats import spearmanr
rho, pval = spearmanr(human_scores, llm_scores)
print(f"Spearman correlation between human and LLM scores: {rho:.4f}, p-value: {pval:.4f}")

agreement = sum(1 for h, l in zip(human_scores, llm_scores) if h == l) / len(human_scores)
print(f"Exact agreement between human and LLM scores: {agreement:.2%}")

Spearman correlation between human and LLM scores: 0.4591, p-value: 0.0003
Exact agreement between human and LLM scores: 31.58%


In [None]:
import json
from scipy.stats import spearmanr, pearsonr
from math import sqrt
from statistics import mean

filenames = [
    "part1_results", "part1_results_1", "part1_results_2", 
    "part1_results_3", "part1_results_4", "part1_results_5", 
    "part1_results_6", "part1_results_7", 
]

for filename in filenames:
    with open(f"{filename}.json", "r", encoding="utf-8") as f:
        results = json.load(f)

    llm_scores = [row['llm_score'] for row in results if row['llm_score'] is not None]
    human_scores = [row['original_score'] for row in results if row['original_score'] is not None]

    rho, pval_s = spearmanr(human_scores, llm_scores)
    print(f"\nSpearman correlation for {filename}: {rho:.4f}, p-value: {pval_s:.4f}")

    r, pval_p = pearsonr(human_scores, llm_scores)
    print(f"Pearson correlation for {filename}: {r:.4f}, p-value: {pval_p:.4f}")

    rmse = sqrt(mean((h - l) ** 2 for h, l in zip(human_scores, llm_scores)))
    print(f"RMSE for {filename}: {rmse:.4f}")

    agreement = sum(1 for h, l in zip(human_scores, llm_scores) if h == l) / len(human_scores)
    print(f"Exact agreement for {filename}: {agreement:.2%}")

    partial_agreement = sum(1 for h, l in zip(human_scores, llm_scores) if abs(h - l) <= 1) / len(human_scores)
    print(f"Partial agreement for {filename}: {partial_agreement:.2%}")

    for score_value in range(1, 6):
        count = sum(1 for h, l in zip(human_scores, llm_scores) if h == score_value and l == score_value)
        total = sum(1 for h in human_scores if h == score_value)
        agreement = count / total if total > 0 else 0
        print(f"Agreement for score {score_value}: {agreement:.2%} ({count}/{total})")



Spearman correlation for part1_results: 0.4616, p-value: 0.0003
Pearson correlation for part1_results: 0.4267, p-value: 0.0009
RMSE for part1_results: 1.4388
Exact agreement for part1_results: 31.58%
Partial agreement for part1_results: 68.42%
Agreement for score 1: 25.00% (1/4)
Agreement for score 2: 0.00% (0/10)
Agreement for score 3: 0.00% (0/15)
Agreement for score 4: 42.86% (6/14)
Agreement for score 5: 78.57% (11/14)

Spearman correlation for part1_results_1: 0.4590, p-value: 0.0003
Pearson correlation for part1_results_1: 0.4563, p-value: 0.0004
RMSE for part1_results_1: 1.4985
Exact agreement for part1_results_1: 29.82%
Partial agreement for part1_results_1: 63.16%
Agreement for score 1: 0.00% (0/4)
Agreement for score 2: 20.00% (2/10)
Agreement for score 3: 6.67% (1/15)
Agreement for score 4: 14.29% (2/14)
Agreement for score 5: 85.71% (12/14)

Spearman correlation for part1_results_2: 0.3392, p-value: 0.0098
Pearson correlation for part1_results_2: 0.3665, p-value: 0.0051
RM

In [None]:
flag = False
for row in results:
    if any(not row.get(k, "").strip() for k in [
        "accuracy", "fluency", "coherence",
        "cultural_appropriateness", "guideline_adherence", "completeness"
    ]):
        print(f"Row {row['row_index']} is missing some criteria.")
        flag = True

if not flag:
    print("All rows have all criteria.")


All rows have all criteria.


In [None]:
import random
import numpy as np
import re
import time
import json
from google import genai
from google.genai import types

import os
os.environ["GEMINI_API_KEY"] = ""

def query_gemini_consistency(prompt):
    client = genai.Client(api_key=os.environ.get("GEMINI_API_KEY"))
    model = "gemini-2.5-flash"

    contents = [
        types.Content(
            role="user",
            parts=[types.Part.from_text(text=prompt)],
        )
    ]

    generate_content_config = types.GenerateContentConfig(
        temperature=0.35, 
        system_instruction=system,
        response_mime_type="text/plain",
    )

    response = client.models.generate_content(
        model=model,
        contents=contents,
        config=generate_content_config,
    )

    return response.text.strip() if response and response.text else "unknown"


sample_size = 15
random.seed(42)
sample_indices = random.sample(range(len(data)), sample_size)

consistency_results = {}

for idx in sample_indices:
    row = data.iloc[idx]
    prompt, translated_text, score = construct_prompt(row, prompt_template)
    scores = []

    for run in range(5):
        print(f"Consistency run {run+1} for row {idx}...")
        response = query_gemini_consistency(prompt)

        score_match = re.search(r"Score:\s*(\d+)", response)
        if score_match:
            scores.append(int(score_match.group(1)))
        else:
            scores.append(None)

        time.sleep(5) 

    consistency_results[idx] = {
        "prompt": prompt,
        "scores": scores
    }

deviations = []
for idx, result in consistency_results.items():
    valid_scores = [s for s in result["scores"] if s is not None]
    if len(valid_scores) > 1:
        mean_score = np.mean(valid_scores)
        std_dev = np.std(valid_scores)
        deviation_percent = (std_dev / mean_score) * 100 if mean_score != 0 else 0
        deviations.append(deviation_percent)

overall_deviation = np.mean(deviations) if deviations else None
print(f"Average deviation across sample: {overall_deviation:.2f}%")

with open("consistency_results_1.json", "w", encoding="utf-8") as f:
    json.dump(consistency_results, f, ensure_ascii=False, indent=4)


Consistency run 1 for row 40...
Consistency run 2 for row 40...
Consistency run 3 for row 40...
Consistency run 4 for row 40...
Consistency run 5 for row 40...
Consistency run 1 for row 7...
Consistency run 2 for row 7...
Consistency run 3 for row 7...
Consistency run 4 for row 7...
Consistency run 5 for row 7...
Consistency run 1 for row 1...
Consistency run 2 for row 1...
Consistency run 3 for row 1...
Consistency run 4 for row 1...
Consistency run 5 for row 1...
Consistency run 1 for row 47...
Consistency run 2 for row 47...
Consistency run 3 for row 47...
Consistency run 4 for row 47...
Consistency run 5 for row 47...
Consistency run 1 for row 17...
Consistency run 2 for row 17...
Consistency run 3 for row 17...
Consistency run 4 for row 17...
Consistency run 5 for row 17...
Consistency run 1 for row 15...
Consistency run 2 for row 15...
Consistency run 3 for row 15...
Consistency run 4 for row 15...
Consistency run 5 for row 15...
Consistency run 1 for row 14...
Consistency run 2 

In [41]:
overall_deviation = np.mean(deviations) if deviations else None
print(f"Average deviation across sample: {overall_deviation:.2f}%")

Average deviation across sample: 1.27%


In [None]:
import json
import numpy as np

with open("consistency_results.json", "r", encoding="utf-8") as f:
    consistency_results = json.load(f)

print(f"{'Index':<8} {'Mean Score':<12} {'Std Dev':<10} {'Deviation %':<12} {'Scores'}")
print("-" * 60)

deviations = []

for idx, result in consistency_results.items():
    scores = [s for s in result["scores"] if s is not None]
    if len(scores) > 1:
        mean_score = np.mean(scores)
        std_dev = np.std(scores)
        deviation_percent = (std_dev / mean_score) * 100 if mean_score != 0 else 0
        deviations.append(deviation_percent)
        print(f"{idx:<8} {mean_score:<12.2f} {std_dev:<10.2f} {deviation_percent:<12.2f} {scores}")

if deviations:
    overall_deviation = np.mean(deviations)
    print("-" * 60)
    print(f"Overall average deviation: {overall_deviation:.2f}%")
else:
    print("No valid scores found.")


Index    Mean Score   Std Dev    Deviation %  Scores
------------------------------------------------------------
40       5.00         0.00       0.00         [5, 5, 5, 5, 5]
7        5.00         0.00       0.00         [5, 5, 5, 5, 5]
1        4.60         0.49       10.65        [5, 4, 5, 5, 4]
47       4.00         0.00       0.00         [4, 4, 4, 4, 4]
17       5.00         0.00       0.00         [5, 5, 5, 5, 5]
15       4.00         0.00       0.00         [4, 4, 4, 4, 4]
14       3.00         0.00       0.00         [3, 3, 3, 3, 3]
8        5.00         0.00       0.00         [5, 5, 5, 5, 5]
53       4.80         0.40       8.33         [5, 5, 5, 4, 5]
6        5.00         0.00       0.00         [5, 5, 5, 5, 5]
43       4.00         0.00       0.00         [4, 4, 4, 4, 4]
34       5.00         0.00       0.00         [5, 5, 5, 5, 5]
5        1.00         0.00       0.00         [1, 1, 1, 1, 1]
37       5.00         0.00       0.00         [5, 5, 5, 5, 5]
27       4.00     

In [11]:
import json
import numpy as np

with open("consistency_results_1.json", "r", encoding="utf-8") as f:
    consistency_results = json.load(f)

print(f"{'Index':<8} {'Mean Score':<12} {'Std Dev':<10} {'Deviation %':<12} {'Scores'}")
print("-" * 60)

deviations = []

for idx, result in consistency_results.items():
    scores = [s for s in result["scores"] if s is not None]
    if len(scores) > 1:
        mean_score = np.mean(scores)
        std_dev = np.std(scores)
        deviation_percent = (std_dev / mean_score) * 100 if mean_score != 0 else 0
        deviations.append(deviation_percent)
        print(f"{idx:<8} {mean_score:<12.2f} {std_dev:<10.2f} {deviation_percent:<12.2f} {scores}")

if deviations:
    overall_deviation = np.mean(deviations)
    print("-" * 60)
    print(f"Overall average deviation: {overall_deviation:.2f}%")
else:
    print("No valid scores found.")


Index    Mean Score   Std Dev    Deviation %  Scores
------------------------------------------------------------
40       5.00         0.00       0.00         [5, 5, 5, 5, 5]
7        5.00         0.00       0.00         [5, 5, 5, 5, 5]
1        5.00         0.00       0.00         [5, 5, 5, 5, 5]
47       5.00         0.00       0.00         [5, 5, 5, 5, 5]
17       5.00         0.00       0.00         [5, 5, 5, 5, 5]
15       2.60         0.80       30.77        [3, 3, 3, 1, 3]
14       3.00         0.00       0.00         [3, 3, 3, 3, 3]
8        5.00         0.00       0.00         [5, 5, 5, 5, 5]
53       3.60         0.80       22.22        [4, 5, 3, 3, 3]
6        3.60         0.80       22.22        [3, 3, 5, 4, 3]
43       3.00         0.00       0.00         [3, 3, 3, 3, 3]
34       3.80         0.40       10.53        [4, 4, 3, 4, 4]
5        1.00         0.00       0.00         [1, 1, 1, 1, 1]
37       5.00         0.00       0.00         [5, 5, 5, 5, 5]
27       5.00     

In [None]:
# stupid thing that doesnt work on windows waste of time
from vllm import LLM, SamplingParams

model_path = "./ByteDance-Seed/Seed-X-PPO-7B"

model = LLM(model=model_path,
            max_num_seqs=512,
            tensor_parallel_size=8,
            enable_prefix_caching=True, 
            gpu_memory_utilization=0.95,

            )


def back_translate(filipino_text: str) -> str:
    messages = [
        "Translate the following Filipino sentence into English:\nAng ganda mo!<en>",
    ]

    # Beam search (We recommend using beam search decoding)
    decoding_params = BeamSearchParams(beam_width=4,
                                    max_tokens=512)
    # Greedy decoding
    decoding_params = SamplingParams(temperature=0,
                                    max_tokens=512,
                                    skip_special_tokens=True)

    results = model.generate(messages, decoding_params)
    responses = [res.outputs[0].text.strip() for res in results]

    print(responses)


INFO 08-10 23:57:48 [__init__.py:235] Automatically detected platform cuda.


ModuleNotFoundError: No module named 'vllm._C'

In [None]:
import os

os.environ.setdefault("GEMINI_API_KEY", "")


from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

tokenizer = AutoTokenizer.from_pretrained(
    "facebook/nllb-200-3.3B",
    src_lang="tgl_Latn",  
    use_auth_token=True,
    cache_dir="D:/_GitRepos/Thesis/huggingface_cache"
)

model = AutoModelForSeq2SeqLM.from_pretrained(
    "facebook/nllb-200-3.3B",
    use_auth_token=True,
    device_map="auto",
    load_in_4bit=True,
    torch_dtype=torch.float16,
    bnb_4bit_compute_dtype=torch.float16,
    cache_dir="D:/_GitRepos/Thesis/huggingface_cache"
)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
import pandas as pd
import json
from typing import List, Dict, Tuple
from google import genai
from google.genai import types
import time
os.environ.setdefault("GEMINI_API_KEY", "")

API_KEY = os.environ.get("GEMINI_API_KEY")
MODEL = "gemini-2.5-flash-lite"   
RATE_LIMIT_SLEEP = 5 
MAX_RETRIES = 3

#from google.cloud import translate_v2 as translate
# def back_translate(filipino_text: str) -> str:
#     result = client.translate(filipino_text, target_language="en", source_language="tl")
#     return result["translatedText"]

# client = translate.Client()

import torch

# def back_translate(filipino_text: str) -> str:
#     inputs = tokenizer(filipino_text, return_tensors="pt")
#     if torch.cuda.is_available():
#         inputs = {k: v.to("cuda") for k, v in inputs.items()}

#     # Specify forced_bos_token_id for English output ("eng_Latn")
#     forced_bos_token_id = tokenizer.convert_tokens_to_ids("eng_Latn")

#     outputs = model.generate(
#         **inputs,
#         forced_bos_token_id=forced_bos_token_id,
#         max_length=128,
#         do_sample=False,  # greedy decoding
#     )

#     translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
#     return translated_text

import requests
# pip install libretranslate
# libretranslate
def back_translate(text: str) -> str:
    source_lang = "en"
    target_lang = "tl"
    url = "http://127.0.0.1:5000/translate" #my localhost
    data = {
        "q": text,
        "source": source_lang,
        "target": target_lang,
        "format": "text"
    }
    response = requests.post(url, data=data)
    if response.status_code == 200:
        translated = response.json().get("translatedText", "")
        return translated
    else:
        print(f"Error: {response.status_code} {response.text}")
        return ""



def call_gemini(system_instruction: str, user_prompt: str, temperature: float = 0.0) -> str:
    client = genai.Client(api_key=API_KEY)
    contents = [
        types.Content(
            role="user",
            parts=[types.Part.from_text(text=user_prompt)],
        )
    ]
    config = types.GenerateContentConfig(
        temperature=temperature,
        system_instruction=system_instruction,
        response_mime_type="text/plain"
    )

    for attempt in range(MAX_RETRIES):
        try:
            response = client.models.generate_content(
                model=MODEL,
                contents=contents,
                config=config,
            )
            text = response.text or ""
            return text.strip()
        except Exception as e:
            print(f"[call_gemini] Attempt {attempt+1} failed: {e}")
            time.sleep(RATE_LIMIT_SLEEP * (attempt + 1))
    raise RuntimeError("call_gemini failed after retries")


def evaluate_pair(english: str, filipino: str, human_score: int, explanation1: str, explanation2: str) -> Dict:
    import re

    paraphrase_prompt = f"""
    Paraphrase the following sentences concisely. The sentences are separate, do not think about them together. Only provide one paraphrase for each:

    English sentence: {english}

    Filipino sentence (translate and paraphrase into English): {filipino}

    Respond with the paraphrases in this format:

    English paraphrase: [your paraphrase here]
    Filipino paraphrase (in English): [your paraphrase here]
    """

    paraphrase_response = call_gemini(
        system_instruction="You are a helpful paraphraser. You are intelligent. Simply do as instructed.",
        user_prompt=paraphrase_prompt
    )

    pattern = re.compile(
        r"English paraphrase:\s*(?P<eng>.+?)\s*Filipino paraphrase \(in English\):\s*(?P<fil>.+)", 
        re.DOTALL
    )

    match = pattern.search(paraphrase_response)
    if match:
        paraphrase_src = match.group("eng").strip()
        paraphrase_tr = match.group("fil").strip()
    else:
        paraphrase_src = ""
        paraphrase_tr = ""

    back_translated = back_translate(filipino).strip()

    time.sleep(5)  
    scoring_prompt = f"""
        Given the English source, Filipino translation, and various paraphrases, score the Filipino translation from 1 to 5 on these criteria:

        Accuracy, Fluency, Coherence, Cultural Appropriateness, Completeness.

        English source: {english}
        Filipino translation: {filipino}
        Back-translation: {back_translated}

        Respond clearly with each criterion’s score and justification in this format:
        Accuracy: 4.5 |  [your explanation here]
        Fluency: 4.0 |  [your explanation here]
        Coherence: 4.2 |  [your explanation here]
        Cultural Appropriateness: 5.0 |  [your explanation here]
        Completeness: 4.8 |  [your explanation here]
        """
    scoring_response = call_gemini(
        system_instruction="You are a helpful assistant. You are an expert translator and evaluator.",
        user_prompt=scoring_prompt
    )

    time.sleep(5)  
    scores_and_justifications = {}
    for line in scoring_response.splitlines():
        if ": " in line and "|" in line:
            key_part, rest = line.split(":", 1)
            score_part, justification_part = rest.split("|", 1)
            key = key_part.strip()
            try:
                score = float(score_part.strip())
            except ValueError:
                score = None
            justification = justification_part.strip()
            scores_and_justifications[key] = {
                "score": score,
                "justification": justification
            }

    summary_prompt = f"""
        Based on these scores and justifications:

        {scoring_response}

        Provide an integer score from 1-5 of the Filipino translation, and a brief summary of the overall translation quality.
        Score:
        Summary:
        """

    summary = call_gemini(
        system_instruction="You are a helpful assistant.",
        user_prompt=summary_prompt
    )

    time.sleep(5) 

    return {
        "english": english,
        "filipino": filipino,
        "paraphrase_src": paraphrase_src,
        "paraphrase_tr": paraphrase_tr,
        "back_translated": back_translated,
        "scores_and_justifications": scores_and_justifications,
        "summary": summary,
        "human_score": human_score,
        "explanation1": explanation1,
        "explanation2": explanation2
    }


results = []
traces = []
def run_agentic_on_dataframe(df: pd.DataFrame,
                             text_col_src: str,
                             text_col_tr: str,
                             out_json_path: str,
                             max_rows: int = None) -> Tuple[List[Dict], List[Dict]]:
    count = 0
    min = 55

    rows = df.head(max_rows) if max_rows else df
    for idx, row in rows.iterrows():
        # if count < min:
        #     count += 1
        #     continue
        print("Running evaluation for row:", idx)
        english = str(row[text_col_src]).strip()
        filipino = str(row[text_col_tr]).strip()
        human_score = int(row.get("Score", None))
        explanation1 = str(row.get("Rater 1 Explanation", "")).strip()
        explanation2 = str(row.get("Rater 2 Explanation", "")).strip()

        eval_result = evaluate_pair(english, filipino, human_score, explanation1, explanation2)
        results.append(eval_result)

        traces.append({
            "row_index": idx,
            "english": english,
            "filipino": filipino,
            "paraphrase_src": eval_result["paraphrase_src"],
            "paraphrase_tr": eval_result["paraphrase_tr"],
            "back_translated": eval_result["back_translated"]
        })

    with open(out_json_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

    return results, traces

if __name__ == "__main__":
    data = pd.read_csv("data1.csv")

    out_path = "agentic_results_13_libretl.json"
    res, traces = run_agentic_on_dataframe(
        data,
        text_col_src="English",
        text_col_tr="Filipino",
        out_json_path=out_path,
        max_rows= None
    )
    print(f"Saved {len(res)} evaluation results to {out_path}")


Running evaluation for row: 0
Running evaluation for row: 1
Running evaluation for row: 2
Running evaluation for row: 3
Running evaluation for row: 4
Running evaluation for row: 5
Running evaluation for row: 6
Running evaluation for row: 7
Running evaluation for row: 8
Running evaluation for row: 9
Running evaluation for row: 10
Running evaluation for row: 11
Running evaluation for row: 12
Running evaluation for row: 13
Running evaluation for row: 14
Running evaluation for row: 15
Running evaluation for row: 16
Running evaluation for row: 17
Running evaluation for row: 18
Running evaluation for row: 19
Running evaluation for row: 20
Running evaluation for row: 21
Running evaluation for row: 22
Running evaluation for row: 23
Running evaluation for row: 24
Running evaluation for row: 25
Running evaluation for row: 26
Running evaluation for row: 27
Running evaluation for row: 28
Running evaluation for row: 29
Running evaluation for row: 30
Running evaluation for row: 31
Running evaluation

In [None]:
import json
from scipy.stats import spearmanr, pearsonr
from math import sqrt
from statistics import mean
import re

filename = "agentic_results_13_libretl.json"

with open(filename, "r", encoding="utf-8") as f:
    agentic_results_3 = json.load(f)

human_scores = []
llm_scores = []

for row in agentic_results_3:
    if row.get("human_score") is None:
        continue 
    match = re.search(r"Score:\s*(\d+)", row.get("summary", ""))
    if match:
        llm_score = int(match.group(1))
        llm_scores.append(llm_score)
        human_scores.append(row["human_score"])

assert len(human_scores) == len(llm_scores), "Mismatch in score counts!"

rho, pval_s = spearmanr(human_scores, llm_scores)
print(f"\nSpearman correlation for {filename}: {rho:.4f}, p-value: {pval_s:.4f}")

r, pval_p = pearsonr(human_scores, llm_scores)
print(f"Pearson correlation for {filename}: {r:.4f}, p-value: {pval_p:.4f}")

rmse = sqrt(mean((h - l) ** 2 for h, l in zip(human_scores, llm_scores)))
print(f"RMSE for {filename}: {rmse:.4f}")

agreement = sum(1 for h, l in zip(human_scores, llm_scores) if h == l) / len(human_scores)
print(f"Exact agreement for {filename}: {agreement:.2%}")

partial_agreement = sum(1 for h, l in zip(human_scores, llm_scores) if abs(h - l) <= 1) / len(human_scores)
print(f"Partial agreement for {filename}: {partial_agreement:.2%}")

for score_value in range(1, 6):
    count = sum(1 for h, l in zip(human_scores, llm_scores) if h == score_value and l == score_value)
    total = sum(1 for h in human_scores if h == score_value)
    agreement = count / total if total > 0 else 0
    print(f"Agreement for score {score_value}: {agreement:.2%} ({count}/{total})")



Spearman correlation for agentic_results_13_libretl.json: 0.4156, p-value: 0.0013
Pearson correlation for agentic_results_13_libretl.json: 0.4005, p-value: 0.0020
RMSE for agentic_results_13_libretl.json: 1.2283
Exact agreement for agentic_results_13_libretl.json: 29.82%
Partial agreement for agentic_results_13_libretl.json: 78.95%
Agreement for score 1: 25.00% (1/4)
Agreement for score 2: 0.00% (0/10)
Agreement for score 3: 0.00% (0/15)
Agreement for score 4: 100.00% (14/14)
Agreement for score 5: 14.29% (2/14)
