In [None]:
# %%
import os, json, requests, time
import pandas as pd
from pathlib import Path
from tqdm import tqdm
from datasets import Dataset

from ragas import evaluate
from ragas.metrics import answer_similarity
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings.base import embedding_factory
from langchain_google_genai import ChatGoogleGenerativeAI


In [None]:
# %%
# ✅ Keep the key in the environment -- never hardcode it in the notebook.
# In a terminal / notebook, before running:
#   export GOOGLE_API_KEY="....."
# Fail fast here so a missing (or empty) key is reported before any work starts.
assert os.getenv("GOOGLE_API_KEY"), (
    "Set GOOGLE_API_KEY in env (jangan hardcode di notebook)."
)
print("GOOGLE_API_KEY exists ✅")


In [None]:
# %%
# ---- Qwen run configuration ----
# Every setting is published through os.environ, which is where the later
# cells of this notebook read it back from.
os.environ.update(
    {
        "DATA_PATH": "/workspace/QAs_Hukumonline_Test.json",
        "VLLM_BASE": "http://127.0.0.1:8002",
        # Base model served by vLLM (Qwen)
        "VLLM_MODEL": "Qwen/Qwen2.5-3B-Instruct",
        # Name of the LoRA module registered in vLLM (e.g. "hukum_qwen");
        # it must match the name on the vLLM server exactly.
        "LORA_NAME": "hukum_qwen",
        # Gemini judge model
        "GEMINI_MODEL": "gemini-2.5-flash",
    }
)

print("Config ready")
vllm_base, vllm_model, lora_name = (
    os.environ[key] for key in ("VLLM_BASE", "VLLM_MODEL", "LORA_NAME")
)
print("VLLM:", vllm_base, vllm_model, "LoRA:", lora_name)
print("GEMINI:", os.environ["GEMINI_MODEL"])


In [None]:
# %%
# Load the test set from the JSON file named by DATA_PATH and rename its
# "instruction"/"response" fields to "question"/"ground_truth".
data_file = Path(os.environ["DATA_PATH"])
raw = json.loads(data_file.read_text(encoding="utf-8"))

column_map = {"instruction": "question", "response": "ground_truth"}
df = pd.DataFrame(raw).rename(columns=column_map)
print(f"Rows: {len(df)}")
df.head(3)


In [None]:
# %%
# Generate answers using vLLM (Qwen 2.5 3B + LoRA)
#
# One chat-completion request is sent per question; the answers are stored in
# df["answer"] and the whole frame is written to CSV for the evaluation cells.
VLLM_URL = os.environ["VLLM_BASE"]
MODEL    = os.environ["VLLM_MODEL"]
LORA     = os.environ["LORA_NAME"]

# vLLM's OpenAI-compatible server picks a LoRA adapter through the request's
# "model" field (the adapter name registered on the server). An "extra_body"
# key is an argument of the OpenAI Python client, not a field of the HTTP
# body, so sending it with `requests` never applied the adapter and the run
# silently used the base model. Fall back to the base model only when no
# adapter name is configured.
SERVED_MODEL = LORA or MODEL

answers = []
start = time.time()

for q in tqdm(df["question"].tolist(), desc="Generating answers (Qwen2.5-3B + LoRA)"):
    payload = {
        "model": SERVED_MODEL,
        "messages": [
            # Optional system prompt to keep the answers consistent
            {"role":"system","content":"Anda adalah asisten hukum yang akurat, padat, dan mengutip dasar hukum bila relevan."},
            {"role":"user","content": q}
        ],
        "temperature": 0.2,
        "max_tokens": 1280,
    }

    r = requests.post(f"{VLLM_URL}/v1/chat/completions", json=payload, timeout=300)
    # Surface HTTP errors immediately (e.g. an adapter name the server does not
    # know) instead of recording a broken answer.
    r.raise_for_status()
    answers.append(r.json()["choices"][0]["message"]["content"])

end = time.time()
elapsed = end - start

df["answer"] = answers

out_csv = "/workspace/qwen_generations_lora.csv"
df.to_csv(out_csv, index=False)
print("Saved", out_csv)

print(f"Total time: {elapsed:.2f} seconds")
# max(1, ...) avoids a division by zero when the test set is empty.
print(f"Average per question: {elapsed / max(1,len(df)):.2f} seconds")

df.head(3)


In [None]:
# %%
# RAGAS answer_similarity evaluation (Gemini as judge + HF embedding)
#
# Reads the generations CSV, scores every row with the answer_similarity
# metric, then writes the per-row scores and their mean to two CSV files.
csv_path = "/workspace/qwen_generations_lora.csv"
df = pd.read_csv(csv_path)
assert {"question","answer","ground_truth"}.issubset(df.columns), f"Missing columns: {df.columns.tolist()}"

# pandas reads empty CSV fields back as NaN, so an empty model answer would
# reach the dataset as a null instead of a string. Normalise the text columns
# to plain strings first (the text-statistics cell applies the same fillna).
text_cols = ["question", "answer", "ground_truth"]
df[text_cols] = df[text_cols].fillna("").astype(str)

judge_llm = ChatGoogleGenerativeAI(
    model=os.environ["GEMINI_MODEL"],
    google_api_key=os.environ["GOOGLE_API_KEY"],
)
llm = LangchainLLMWrapper(judge_llm)

emb = embedding_factory(
    provider="huggingface",
    model="sentence-transformers/all-MiniLM-L6-v2"
)

ragas_ds = Dataset.from_pandas(df[text_cols])

executor = evaluate(
    ragas_ds,
    metrics=[answer_similarity],
    llm=llm,
    embeddings=emb,
    show_progress=True,
    return_executor=True,
)

# Depending on the ragas version, `results` may be a method, a plain
# attribute, or absent (the returned object already being the scores) --
# NOTE(review): confirm against the installed ragas release.
# Only call it when it is callable, and let a genuine failure inside
# results() propagate: catching it and falling back to the bound method used
# to hide the real error behind "Expected a list ... got method".
results_attr = getattr(executor, "results", None)
if results_attr is None:
    raw = executor
elif callable(results_attr):
    raw = results_attr()
else:
    raw = results_attr

if not isinstance(raw, list):
    raise RuntimeError(f"Expected a list of floats but got {type(raw)}")

per_row_df = pd.DataFrame(raw, columns=["answer_similarity_score"])
print("Per-row dataframe shape:", per_row_df.shape)
print(per_row_df.head(3))

# One-row frame holding the column mean, with "_mean" appended to the name.
overall_df = per_row_df.mean().to_frame().T
overall_df.columns = [c + "_mean" for c in overall_df.columns]

overall_df.to_csv("/workspace/qwen_ragas_overall_lora.csv", index=False)
per_row_df.to_csv("/workspace/qwen_ragas_per_row_lora.csv", index=False)

print("Saved:")
print("  - /workspace/qwen_ragas_overall_lora.csv")
print("  - /workspace/qwen_ragas_per_row_lora.csv")

overall_df


In [None]:
# %%
# Text stats summary (avg words, TTR, empties, duplicates) for Qwen LoRA
import re
from collections import Counter

# Display name of each model -> CSV file holding its generated answers.
paths = {"Qwen2.5-3B-Instruct-LoRA": "/workspace/qwen_generations_lora.csv"}

def text_stats(df):
    """Summarise the text found in ``df["answer"]``.

    Missing answers are treated as empty strings. A token is a run of word
    characters, optionally followed by an apostrophe and more word characters.

    Returns a dict with:
      - "avg_words": mean number of tokens per answer
      - "avg_type_token_ratio": mean of (distinct lower-cased tokens / tokens)
        per answer, counting 0 for an answer without tokens
      - "answer_empties_%": percentage of answers that are blank after strip()
      - "exact_duplicate_answers_%": 100 * (1 - unique answers / answers),
        or 0 when there are no rows
      - "n_rows": number of answers
    """
    token_re = re.compile(r"\w+(?:'\w+)?")
    texts = df["answer"].fillna("").astype(str)
    total = len(texts)

    def count_words(text):
        # Counted on the text as written (not lower-cased).
        return len(token_re.findall(text))

    def type_token_ratio(text):
        tokens = token_re.findall(text.lower())
        if not tokens:
            return 0
        return len(set(tokens)) / len(tokens)

    mean_words = texts.apply(count_words).mean()
    mean_ttr = texts.apply(type_token_ratio).mean()
    blank_pct = 100 * (texts.str.strip() == "").mean()

    if total:
        dup_pct = 100 * (1 - texts.nunique() / total)
    else:
        dup_pct = 0

    return {
        "avg_words": mean_words,
        "avg_type_token_ratio": mean_ttr,
        "answer_empties_%": blank_pct,
        "exact_duplicate_answers_%": dup_pct,
        "n_rows": total
    }

# Compute the text statistics for every generations file listed in `paths`
# and collect them into one summary table (one row per model).
summary_columns = [
    "model", "n_rows", "avg_words", "avg_type_token_ratio", "answer_empties_%", "exact_duplicate_answers_%"
]

rows = []
for name, path in paths.items():
    try:
        dfm = pd.read_csv(path)
        if "answer" not in dfm.columns:
            raise ValueError(f"'answer' column not found in {path}")
        metrics = text_stats(dfm)
        metrics["model"] = name
        rows.append(metrics)
    except Exception as e:
        # Best effort: report the unreadable file and continue with the rest.
        print(f"[skip] {name}: {e}")

# Passing `columns=` selects and orders the fields, and still yields an empty
# table with the right headers when every file was skipped. Indexing an empty
# frame with the column list raised a KeyError in that case.
summary = pd.DataFrame(rows, columns=summary_columns)
if not rows:
    print("[warn] no generations file could be read; the summary is empty")

print(summary.round(3))

summary.to_csv("/workspace/model_text_metrics_summary_qwen_lora.csv", index=False)
print("\nSaved /workspace/model_text_metrics_summary_qwen_lora.csv")
