In [2]:
import os, json, requests, time
import pandas as pd
from pathlib import Path
from tqdm import tqdm
from datasets import Dataset

from ragas import evaluate
from ragas.metrics import answer_similarity 
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings.base import embedding_factory
from langchain_google_genai import ChatGoogleGenerativeAI


In [3]:
import os
from getpass import getpass

# SECURITY: never paste an API key into a notebook cell. Anything stored in the
# notebook source or its outputs is shared with everyone who can read the file,
# so a key that was ever hardcoded here must be treated as leaked and rotated.
#
# Preferred: export GOOGLE_API_KEY before starting Jupyter. As a fallback, ask
# for it with a hidden prompt so the secret is never written to the notebook.
if not os.environ.get("GOOGLE_API_KEY"):
    entered_key = getpass("Enter GOOGLE_API_KEY: ").strip()
    if not entered_key:
        # Fail here rather than later with an opaque authentication error.
        raise RuntimeError("GOOGLE_API_KEY is required but was not provided")
    os.environ["GOOGLE_API_KEY"] = entered_key

In [4]:
# Run configuration, published through the process environment so that later
# cells can read it with os.environ[...] / os.environ.get(...).
os.environ.update(
    {
        "DATA_PATH": "/workspace/QAs_Hukumonline_Test.json",  # uploaded path
        "VLLM_BASE": "http://127.0.0.1:8000",
        "VLLM_MODEL": "google/gemma-3-4b-it",
        "GEMINI_MODEL": "gemini-2.5-flash",
    }
)

# The API key is deliberately not set in this cell; it must already be present.
assert "GOOGLE_API_KEY" in os.environ, "Set GOOGLE_API_KEY in env first!"
print("Config ready")


Config ready


In [5]:
# Parse the JSON test set found at the configured DATA_PATH.
dataset_file = Path(os.environ["DATA_PATH"])
raw = json.loads(dataset_file.read_text(encoding="utf-8"))

# Use the column names that the scoring cell later asserts on.
column_aliases = {"instruction": "question", "response": "ground_truth"}
df = pd.DataFrame(raw).rename(columns=column_aliases)
print(f"Rows: {len(df)}")
df.head(3)


Rows: 96


Unnamed: 0,question,ground_truth
0,Apa bunyi Pasal 187 KUHP? Benarkah Pasal 187 K...,Pasal 187KUHPlama yang saat artikel ini diterb...
1,Orang gila apakah termasuk subjek hukum? Apaka...,Ilmu hukum pidana mengenal adanya alasan pengh...
2,Suami saya tersangkut kasus pencurian dengan p...,Tahanan adalah tersangka atau terdakwa yang di...


Generating answers (Gemma 3 4B): 100%|██████████| 96/96 [25:35<00:00, 15.99s/it]

Saved /workspace/gemma_generations.csv
Total time: 1535.44 seconds
Average per question: 15.99 seconds





Unnamed: 0,question,ground_truth,answer
0,Apa bunyi Pasal 187 KUHP? Benarkah Pasal 187 K...,Pasal 187KUHPlama yang saat artikel ini diterb...,Pasal 187 Kitab Undang-Undang Hukum Pidana (KU...
1,Orang gila apakah termasuk subjek hukum? Apaka...,Ilmu hukum pidana mengenal adanya alasan pengh...,Pertanyaan yang sangat menarik dan kompleks me...
2,Suami saya tersangkut kasus pencurian dengan p...,Tahanan adalah tersangka atau terdakwa yang di...,Situasi yang Anda alami sangat berat dan membu...


In [9]:
# Reload the generated answers from disk and make sure every column needed
# for scoring is present before any model is constructed.
csv_path = "/workspace/gemma_generations.csv"
df = pd.read_csv(csv_path)
required_columns = {"question", "answer", "ground_truth"}
assert required_columns.issubset(df.columns), f"Missing columns: {df.columns.tolist()}"

# Gemini chat model, wrapped so it can be handed to ragas as `llm`.
gemini_model_name = os.environ.get("GEMINI_MODEL", "gemini-2.5-flash")
judge_llm = ChatGoogleGenerativeAI(
    model=gemini_model_name,
    google_api_key=os.environ["GOOGLE_API_KEY"],
)
llm = LangchainLLMWrapper(judge_llm)

# Sentence-embedding model handed to ragas as `embeddings`.
# NOTE(review): the sample questions shown earlier look Indonesian; confirm
# that all-MiniLM-L6-v2 is an appropriate embedding model for that language.
emb = embedding_factory(
    provider="huggingface",
    model="sentence-transformers/all-MiniLM-L6-v2",
)

# Only these three columns are passed on to the evaluation.
evaluation_columns = ["question", "answer", "ground_truth"]
ragas_ds = Dataset.from_pandas(df[evaluation_columns])


# Ask ragas for the executor instead of its aggregated result object, so the
# individual per-row scores can be collected below.
executor = evaluate(
    ragas_ds,
    metrics=[answer_similarity],
    llm=llm,
    embeddings=emb,
    show_progress=True,
    return_executor=True,
)

# `results` is expected to be a method that runs the jobs and returns the
# scores, but a plain attribute (or a bare result object) is tolerated too.
# The method is called directly: if evaluation fails, the real exception must
# surface here instead of being swallowed and replaced by a confusing
# "got <class 'method'>" type error further down.
# NOTE: `raw` rebinds the name used earlier for the loaded JSON dataset.
if hasattr(executor, "results"):
    results_attr = executor.results
    raw = results_attr() if callable(results_attr) else results_attr
else:
    raw = executor

if not isinstance(raw, list):
    raise RuntimeError(f"Expected a list of floats but got {type(raw)}")

# With a single metric there must be exactly one score per dataset row;
# anything else means the scores can no longer be matched to the rows.
if len(raw) != len(ragas_ds):
    raise RuntimeError(
        f"Expected {len(ragas_ds)} scores (one per row) but got {len(raw)}"
    )

# Put the collected scores into a single-column frame and preview it.
per_row_df = pd.DataFrame(raw, columns=["answer_similarity_score"])
print("Per-row dataframe shape:", per_row_df.shape)
print(per_row_df.head(3))

# One-row summary: the mean of each score column, suffixed with "_mean".
overall_df = per_row_df.mean().to_frame().T.add_suffix("_mean")

# Persist both tables; the paths are named once and reused for the report.
overall_csv = "/workspace/gemma_ragas_overall.csv"
per_row_csv = "/workspace/gemma_ragas_per_row.csv"
overall_df.to_csv(overall_csv, index=False)
per_row_df.to_csv(per_row_csv, index=False)

print("Saved:")
for saved_path in (overall_csv, per_row_csv):
    print(f"  - {saved_path}")

overall_df


  llm = LangchainLLMWrapper(judge_llm)
Evaluating: 100%|██████████| 96/96 [00:03<00:00, 27.49it/s]

Per-row dataframe shape: (96, 1)
   answer_similarity_score
0                 0.773469
1                 0.651248
2                 0.710802
Saved:
  - /workspace/gemma_ragas_overall.csv
  - /workspace/gemma_ragas_per_row.csv





Unnamed: 0,answer_similarity_score_mean
0,0.740955
