In [10]:
import os, json, requests, time
import pandas as pd
from pathlib import Path
from tqdm import tqdm
from datasets import Dataset

from ragas import evaluate
from ragas.metrics import answer_similarity   # <-- new name
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings.base import embedding_factory
from langchain_google_genai import ChatGoogleGenerativeAI


In [2]:
import os
from getpass import getpass

# Never paste the API key into the notebook: notebooks get shared and
# versioned, so a literal key in a cell is a leaked key.
# Prefer a GOOGLE_API_KEY exported in the shell that launched Jupyter;
# otherwise ask for it interactively without echoing it to the output.
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass("GOOGLE_API_KEY: ")

In [3]:
# Runtime settings are published as environment variables so that the later
# cells (data loading, generation, evaluation) all read them from one place.
os.environ.update(
    {
        "DATA_PATH": "/workspace/QAs_Hukumonline_Test.json",  # uploaded Q&A file
        "VLLM_BASE": "http://127.0.0.1:8002",  # base URL of the generation server
        "VLLM_MODEL": "Qwen/Qwen2.5-3B-Instruct",  # model that produces the answers
        "GEMINI_MODEL": "gemini-2.5-flash",  # model name handed to the Gemini client
    }
)

# The API key is deliberately not part of the settings above; it has to be
# present in the environment before this cell runs.
assert "GOOGLE_API_KEY" in os.environ, "Set GOOGLE_API_KEY in env first!"
print("Config ready")


Config ready


In [4]:
# Load the evaluation set: a JSON file of records, read into a DataFrame.
with open(os.environ["DATA_PATH"], "r", encoding="utf-8") as f:
    raw = json.load(f)

# Map the source field names onto the column names the later cells use.
df = pd.DataFrame(raw).rename(columns={"instruction": "question", "response": "ground_truth"})

# `rename` silently ignores columns that do not exist, so a file with a
# different schema would only blow up much later (after the slow generation
# step). Fail here instead, while the problem is cheap to fix.
missing_columns = {"question", "ground_truth"} - set(df.columns)
if missing_columns:
    raise KeyError(
        f"{os.environ['DATA_PATH']} is missing expected fields: {sorted(missing_columns)}"
    )

print(f"Rows: {len(df)}")
df.head(3)


Rows: 96


Unnamed: 0,question,ground_truth
0,Apa bunyi Pasal 187 KUHP? Benarkah Pasal 187 K...,Pasal 187KUHPlama yang saat artikel ini diterb...
1,Orang gila apakah termasuk subjek hukum? Apaka...,Ilmu hukum pidana mengenal adanya alasan pengh...
2,Suami saya tersangkut kasus pencurian dengan p...,Tahanan adalah tersangka atau terdakwa yang di...


In [11]:
# Generate one answer per question by calling the chat-completions endpoint
# of the server configured in VLLM_BASE, then persist the results to CSV.
VLLM_URL = os.environ["VLLM_BASE"]
MODEL = os.environ["VLLM_MODEL"]

answers = []

# perf_counter is monotonic, so the measured interval cannot be distorted by
# wall-clock adjustments during the (multi-minute) run.
start = time.perf_counter()

# A single Session reuses the underlying HTTP connection across all requests
# instead of opening a new one per question.
with requests.Session() as session:
    for q in tqdm(df["question"].tolist(), desc="Generating answers (Qwen 2.5 3B)"):
        payload = {"model": MODEL, "messages": [{"role": "user", "content": q}], "temperature": 0.2}
        r = session.post(f"{VLLM_URL}/v1/chat/completions", json=payload, timeout=300)
        # Abort on the first HTTP error rather than recording a bogus answer.
        r.raise_for_status()
        answers.append(r.json()["choices"][0]["message"]["content"])

end = time.perf_counter()
elapsed = end - start

# Answers are appended in question order, so they line up with df's rows.
df["answer"] = answers
df.to_csv("/workspace/qwen_generations.csv", index=False)
print("Saved /workspace/qwen_generations.csv")

print(f"Total time: {elapsed:.2f} seconds")
# max(..., 1) keeps an empty dataset from raising ZeroDivisionError.
print(f"Average per question: {elapsed / max(len(df), 1):.2f} seconds")

df.head(3)


Generating answers (Qwen 2.5 3B): 100%|██████████| 96/96 [15:35<00:00,  9.75s/it] 

Saved /workspace/qwen_generations.csv
Total time: 935.72 seconds
Average per question: 9.75 seconds





Unnamed: 0,question,ground_truth,answer
0,Apa bunyi Pasal 187 KUHP? Benarkah Pasal 187 K...,Pasal 187KUHPlama yang saat artikel ini diterb...,"Maaf, saya tidak memiliki akses langsung ke ko..."
1,Orang gila apakah termasuk subjek hukum? Apaka...,Ilmu hukum pidana mengenal adanya alasan pengh...,Pasal 44 dalam Kitab Undang-Undang Hukum Pidan...
2,Suami saya tersangkut kasus pencurian dengan p...,Tahanan adalah tersangka atau terdakwa yang di...,Situasi yang Anda alami tentu sangat menyakitk...


In [12]:
# Score the generated answers against the reference answers with the RAGAS
# `answer_similarity` metric and save per-row and aggregate results.

# Re-read the generations from disk so this cell can run without repeating
# the generation step.
csv_path = "/workspace/qwen_generations.csv"
df = pd.read_csv(csv_path)
assert {"question","answer","ground_truth"}.issubset(df.columns), f"Missing columns: {df.columns.tolist()}"

# Gemini chat model wrapped for RAGAS.
# NOTE(review): answer_similarity looks embedding-based (the recorded run
# finished 96 rows in ~3 s), so this LLM may never be called - confirm.
judge_llm = ChatGoogleGenerativeAI(
    model=os.environ.get("GEMINI_MODEL", "gemini-2.5-flash"),
    google_api_key=os.environ["GOOGLE_API_KEY"],
)
llm = LangchainLLMWrapper(judge_llm)
emb = embedding_factory(provider="huggingface",
                        model="sentence-transformers/all-MiniLM-L6-v2")

ragas_ds = Dataset.from_pandas(df[["question","answer","ground_truth"]])


# NOTE(review): with return_executor=True the call is expected to hand back
# an executor object rather than a finished result; the code below accepts
# either an object exposing `results` or the raw scores themselves.
executor = evaluate(
    ragas_ds,
    metrics=[answer_similarity],
    llm=llm,
    embeddings=emb,
    show_progress=True,
    return_executor=True,
)

# Extract the scores. `results` may be a method or a plain attribute; decide
# by inspecting it instead of calling it inside a catch-all `except`, which
# would hide a genuine evaluation failure behind a misleading type error.
raw = None
if hasattr(executor, "results"):
    results_attr = executor.results
    raw = results_attr() if callable(results_attr) else results_attr
else:
    raw = executor

if not isinstance(raw, list):
    raise RuntimeError(f"Expected a list of floats but got {type(raw)}")

# One score per evaluated row, in dataset order.
per_row_df = pd.DataFrame(raw, columns=["answer_similarity_score"])
print("Per-row dataframe shape:", per_row_df.shape)
print(per_row_df.head(3))

# Single-row frame holding the mean of every score column, suffixed "_mean".
overall_df = per_row_df.mean().to_frame().T
overall_df.columns = [c + "_mean" for c in overall_df.columns]

overall_df.to_csv("/workspace/qwen_ragas_overall.csv", index=False)
per_row_df.to_csv("/workspace/qwen_ragas_per_row.csv", index=False)

print("Saved:")
print("  - /workspace/qwen_ragas_overall.csv")
print("  - /workspace/qwen_ragas_per_row.csv")

overall_df


  llm = LangchainLLMWrapper(judge_llm)
Evaluating: 100%|██████████| 96/96 [00:03<00:00, 29.19it/s]

Per-row dataframe shape: (96, 1)
   answer_similarity_score
0                 0.639990
1                 0.734987
2                 0.775046
Saved:
  - /workspace/qwen_ragas_overall.csv
  - /workspace/qwen_ragas_per_row.csv





Unnamed: 0,answer_similarity_score_mean
0,0.736049
