In [None]:
import pandas as pd
from advanced_rag import answer_query, llm
from basic_rag import answer_query as basic_answer_query
# from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer, util

In [None]:

def is_correct(pred: str, gold: str, semantic_model, threshold: float = 0.65) -> bool:
    """Decide whether a predicted answer matches the gold answer semantically.

    Both texts are embedded with ``semantic_model.encode(..., convert_to_tensor=True)``
    and compared using ``util.cos_sim``.

    Args:
        pred: Predicted answer text.
        gold: Reference answer text.
        semantic_model: Object exposing ``encode(text, convert_to_tensor=True)``.
        threshold: Minimum cosine similarity that counts as a match.

    Returns:
        True when the similarity is greater than or equal to ``threshold``.
    """
    # Embed prediction first, then reference (same call order as a plain
    # two-statement version).
    pred_vec, gold_vec = (
        semantic_model.encode(text, convert_to_tensor=True)
        for text in (pred, gold)
    )
    similarity = util.cos_sim(pred_vec, gold_vec).item()
    return threshold <= similarity

def compute_accuracy(results_df, group_col):
    """Return the mean of the ``"Correct"`` column for each value of ``group_col``.

    Args:
        results_df: DataFrame containing ``group_col`` and a ``"Correct"`` column.
        group_col: Name of the column to group by.

    Returns:
        DataFrame with two columns: ``group_col`` and ``"Accuracy"``.
    """
    per_group_mean = results_df.groupby(group_col)["Correct"].mean()
    # Name the Series before flattening so the value column comes out as "Accuracy".
    accuracy_table = per_group_mean.rename("Accuracy").reset_index()
    return accuracy_table


def _prediction_text(pred) -> str:
    """Return a plain-text form of whatever an answering callable produced."""
    if isinstance(pred, str):
        return pred
    # Some callables return a message-style object carrying its text on a
    # ``content`` attribute (presumably the case for ``llm.invoke`` when the
    # LLM is a chat model -- TODO confirm against the llm in advanced_rag).
    content = getattr(pred, "content", None)
    if isinstance(content, str):
        return content
    return "" if pred is None else str(pred)


def evaluate_rag(rag_query_fn, questions_df, semantic_model):
    """Run every question through ``rag_query_fn`` and score the answers.

    Args:
        rag_query_fn: Callable taking the question and returning the answer.
            Non-string results are converted to text before scoring so they
            can be embedded and serialised.
        questions_df: DataFrame with the columns ``"Question"``, ``"Answer"``,
            ``"Category"``, ``"IsTrick"`` and ``"Difficulty"``.
        semantic_model: Embedding model forwarded to ``is_correct``.

    Returns:
        DataFrame with one row per question and the columns ``"Question"``,
        ``"Gold Answer"``, ``"Predicted Answer"``, ``"Category"``,
        ``"Difficulty"``, ``"IsTrick"`` and ``"Correct"``. The columns are
        present even when ``questions_df`` has no rows.

    Raises:
        KeyError: If a required column is missing from ``questions_df``.
    """
    result_columns = [
        "Question",
        "Gold Answer",
        "Predicted Answer",
        "Category",
        "Difficulty",
        "IsTrick",
        "Correct",
    ]
    results = []
    for _, row in questions_df.iterrows():
        # Read every field up front so a missing column fails before any
        # (potentially slow) query is issued.
        q, gold, category, trick, difficulty = (
            row["Question"],
            row["Answer"],
            row["Category"],
            row["IsTrick"],
            row["Difficulty"],
        )
        pred = _prediction_text(rag_query_fn(q))
        correct = is_correct(pred, gold, semantic_model)
        results.append({
            "Question": q,
            "Gold Answer": gold,
            "Predicted Answer": pred,
            "Category": category,
            "Difficulty": difficulty,
            "IsTrick": trick,
            "Correct": correct,
        })

    # Explicit columns keep the schema stable for an empty question set.
    return pd.DataFrame(results, columns=result_columns)

def compute_metrics(results_df):
    """Print overall accuracy plus accuracy by category and by difficulty.

    Args:
        results_df: DataFrame with ``"Correct"``, ``"Category"`` and
            ``"Difficulty"`` columns.

    Returns:
        None. The report is written to standard output.
    """
    # Compute the grouped tables before printing anything.
    by_category = compute_accuracy(results_df, "Category")
    by_difficulty = compute_accuracy(results_df, "Difficulty")
    overall = results_df["Correct"].mean()

    report_sections = (
        ("\n=== Overall Accuracy ===", overall),
        ("\n=== Accuracy by Category ===", by_category),
        ("\n=== Accuracy by Difficulty ===", by_difficulty),
    )
    for header, value in report_sections:
        print(header)
        print(value)

In [None]:

# Evaluation inputs: the question set and the embedding model used for scoring.
eval_questions_path = "evaluation/1984_test_questions.json"
model = SentenceTransformer('all-MiniLM-L6-v2')

# The question set is identical for every system, so read it once instead of
# re-parsing the JSON file on each loop iteration.
df = pd.read_json(eval_questions_path)

# System name -> answering callable; the name is used in the results filename.
rag_systems = {
    "basic_rag": basic_answer_query,
    "advanced_rag": answer_query,
    "llm": llm.invoke,
}

for rag_name, rag in rag_systems.items():
    results_df = evaluate_rag(rag_query_fn=rag, questions_df=df, semantic_model=model)

    # One JSON file of per-question records for each evaluated system.
    results_df.to_json(f"evaluation/{rag_name}_eval_results.json", orient="records", indent=2)