In [None]:
import os, pandas as pd
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import answer_similarity
from ragas.embeddings import HuggingFaceEmbeddings

# LangChain OpenAI client pointed to your local vLLM
from langchain_openai import ChatOpenAI
try:
    from ragas.llms.langchain import LangchainLLMWrapper  # newer path
except Exception:
    from ragas.llms import LangchainLLMWrapper

In [None]:
# Endpoint configuration for the OpenAI-compatible server.
# Values already present in the environment win; the literals below are only fallbacks.
if "OPENAI_BASE_URL" not in os.environ:
    os.environ["OPENAI_BASE_URL"] = "http://127.0.0.1:8002/v1"
if "OPENAI_API_KEY" not in os.environ:
    # Placeholder value; set a real key only if vLLM was started with --api-key.
    os.environ["OPENAI_API_KEY"] = "not-needed"

# Base URL and model name used when building the chat client.
VLLM_URL = os.environ["OPENAI_BASE_URL"]
MODEL_NAME = os.environ.get("VLLM_MODEL", "google/gemma-3-4b-it")


In [None]:
# Chat client for the judge model, pointed at the endpoint in VLLM_URL.
judge_llm = ChatOpenAI(
    base_url=VLLM_URL,
    model=MODEL_NAME,
    temperature=0.2,
    api_key=os.environ["OPENAI_API_KEY"],
)

# Wrap the LangChain client so it can be passed to ragas as `llm`.
llm = LangchainLLMWrapper(judge_llm)


In [None]:
# Sentence-embedding model passed to the evaluation as `embeddings`.
emb = HuggingFaceEmbeddings(model="sentence-transformers/all-MiniLM-L6-v2")

# ── Load generations CSV ────────────────────────────────────────────────────────
# The file must provide the columns: question, answer, ground_truth.
csv_path = "/workspace/gemma_rag_generations.csv"  # <- or "/workspace/gemma_generations.csv"
df = pd.read_csv(csv_path)
assert all(col in df.columns for col in ("question", "answer", "ground_truth")), (
    f"Missing columns. Have: {df.columns.tolist()}"
)

# Build the RAGAS dataset from just the three evaluated columns.
ragas_ds = Dataset.from_pandas(df.loc[:, ["question", "answer", "ground_truth"]])

# ── Evaluate (answer_similarity only) ───────────────────────────────────────────
# return_executor=True is expected to hand back an executor object whose results
# are collected explicitly below (see the ragas `evaluate` documentation).
executor = evaluate(
    ragas_ds,
    metrics=[answer_similarity],
    llm=llm,                # not strictly required for this metric, but safe
    embeddings=emb,
    show_progress=True,
    return_executor=True,
    # Tip: if your vLLM struggles with concurrency, add: batch_size=2
)

# Collect results (list of floats).
# Depending on the ragas version, `results` may be a method or a plain
# attribute/property; if it is absent, the returned object itself is used.
if hasattr(executor, "results"):
    results_attr = executor.results
    # Decide via callable() instead of catching TypeError around the call:
    # an `except TypeError` would also swallow TypeErrors raised *inside*
    # results() and replace them with a misleading "got <class 'method'>" error.
    raw = results_attr() if callable(results_attr) else results_attr
else:
    raw = executor

if not isinstance(raw, list):
    raise RuntimeError(f"Expected a list of floats, got {type(raw)}")

# A single metric is evaluated, so exactly one score per dataset row is expected.
# Later steps put `raw` into a one-column frame and align it positionally with
# the input rows, so fail loudly here if that assumption does not hold.
if len(raw) != len(ragas_ds):
    raise RuntimeError(
        f"Expected {len(ragas_ds)} scores (one per dataset row), got {len(raw)}"
    )

# Per-row scores as a one-column frame, plus a single-row frame holding the
# column means with a "_mean" suffix on each column name.
per_row_df = pd.DataFrame(raw, columns=["answer_similarity_score"])
overall_df = per_row_df.mean().to_frame().T.add_suffix("_mean")

# Write both tables to CSV (index column omitted).
overall_out = "/workspace/gemma_ragas_overall.csv"
perrow_out = "/workspace/gemma_ragas_per_row.csv"
overall_df.to_csv(overall_out, index=False)
per_row_df.to_csv(perrow_out, index=False)

# Report the output locations, then preview the overall row and the first 3 per-row scores.
print("Saved:", f" - {overall_out}", f" - {perrow_out}", "Preview:", sep="\n")
display(overall_df, per_row_df.head(3))


In [None]:
# This cell relies on `raw` and `csv_path` from the evaluation cell above and on
# the notebook's top-level `pd` import.

# 1) Per-row similarity scores (from `raw`)
per_row_df = pd.DataFrame(raw, columns=["answer_similarity_score"])

# 2) Load the timings from the generations CSV (output of the inference loop).
# Reuse `csv_path` so the timings always come from the same file that was
# evaluated; a separately hard-coded path goes stale as soon as `csv_path` is
# switched to another generations file, and the row-count check below cannot
# detect that when both files happen to have the same length.
gen_path = csv_path
gen_df = pd.read_csv(gen_path)

# Choose which time column to use.
# Preferred: time_sec. If it is missing, fall back to time_ms.
if "time_sec" in gen_df.columns:
    time_col_name = "time_sec"
elif "time_ms" in gen_df.columns:
    time_col_name = "time_ms"
else:
    raise KeyError(f"Kolom time tidak ditemukan di {gen_path}. Kolom yang ada: {list(gen_df.columns)}")
time_series = gen_df[time_col_name]

# 3) Make sure the row counts match before joining by position
if len(per_row_df) != len(time_series):
    raise ValueError(
        f"Row mismatch: similarity rows={len(per_row_df)} vs time rows={len(time_series)}. "
        f"Pastikan raw dan {gen_path} berasal dari dataset yang sama & urutannya sama."
    )

# 4) Attach the time column to per_row_df (positional; .values ignores the index)
per_row_df[time_col_name] = time_series.values

# 5) Overall means over the numeric columns (can now include the mean time as well),
#    as a single-row frame whose column names get a "_mean" suffix.
overall_df = per_row_df.mean(numeric_only=True).to_frame().T.add_suffix("_mean")

# 6) Save both tables to CSV (index column omitted)
overall_out = "/workspace/gemma_ragas_overall.csv"
perrow_out  = "/workspace/gemma_ragas_per_row.csv"
overall_df.to_csv(overall_out, index=False)
per_row_df.to_csv(perrow_out, index=False)

# Report the output locations, then preview the overall row and the first 3 per-row entries.
print("Saved:", f" - {overall_out}", f" - {perrow_out}", "Preview:", sep="\n")
display(overall_df, per_row_df.head(3))
