In [5]:
import os
import json
import re
import pandas as pd

import sys

# Add project root to Python path
project_root = os.path.abspath(os.path.join(os.path.dirname("__file__"), ".."))
if project_root not in sys.path:
    sys.path.append(project_root)

from app.rag_app import get_answer

In [6]:
# Adjust path for QA dataset
qa_path = os.path.abspath(os.path.join(os.path.dirname("__file__"), "..", "data", "qa", "her2_qa_dataset_v2.json"))

In [7]:
# Load QA dataset
with open(qa_path, "r") as f:
    qa_data = json.load(f)


In [8]:
# Helper functions for metrics
def normalize(text):
    return re.sub(r"\W+", " ", text.lower()).strip()

def exact_match(pred, ref):
    return int(normalize(pred) == normalize(ref))

def f1_score(pred, ref):
    pred_tokens = normalize(pred).split()
    ref_tokens = normalize(ref).split()
    common = set(pred_tokens) & set(ref_tokens)
    if not common:
        return 0.0
    precision = len(common) / len(pred_tokens)
    recall = len(common) / len(ref_tokens)
    return 2 * precision * recall / (precision + recall)

In [12]:
# Evaluate
records = []

for i, item in enumerate(qa_data):
    question = item["question"]
    reference = item["answer"]
    try:
        # Evaluation version skips chat history to avoid extra formatting
        prediction = get_answer(question, history=[])
        # Truncate to answer only if any formatting is accidentally included
        if "User:" in prediction:
            prediction = prediction.split("User:")[0].strip()
    except Exception as e:
        prediction = f"ERROR: {e}"

    records.append({
        "#": i + 1,
        "Question": question,
        "Reference Answer": reference,
        "Predicted Answer": prediction,
        "EM": exact_match(prediction, reference),
        "F1": f1_score(prediction, reference)
    })



In [13]:
# Create and display DataFrame
df = pd.DataFrame(records)
avg_em = df["EM"].mean()
avg_f1 = df["F1"].mean()

print(f"\n✅ Average Exact Match: {avg_em:.2f}")
print(f"✅ Average F1 Score: {avg_f1:.2f}")
df[["#", "EM", "F1"]]



✅ Average Exact Match: 0.00
✅ Average F1 Score: 0.13


Unnamed: 0,#,EM,F1
0,1,0,0.255319
1,2,0,0.066667
2,3,0,0.12987
3,4,0,0.173913
4,5,0,0.035088
5,6,0,0.051282
6,7,0,0.0
7,8,0,0.230769
8,9,0,0.105263
9,10,0,0.222222


In [14]:
# Save results
df.to_csv("../data/qa/her2_eval_predictions.csv", index=False)