
# 📊 Evaluation of PathSlide2Report

This notebook evaluates the quality of AI-generated pathology slide summaries.

We compare:
- **Baseline summaries** (no retrieval)
- **RAG+Metadata summaries** (with FAISS + enriched TCGA context)
- **Ground truth diagnoses** from TCGA metadata

Evaluation methods include:
- BLEU / ROUGE (classic NLP metrics)
- Embedding-based similarity to TCGA diagnoses
- GPT-as-a-Judge scoring (1–5 scale)


## 1. Data Loading

In [None]:

import numpy as np
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt
import os

# Patch metadata: try the enriched file first, then the basic one.
# Each candidate carries the notice printed when that file is the one used.
for meta_path, notice in (
    (
        "../data/patches_metadata_enriched.csv",
        "✅ Using enriched metadata (with diagnosis).",
    ),
    (
        "../data/patches_metadata.csv",
        "⚠️ Using basic metadata (no diagnosis). Run tcga_metadata_fetcher.py for enrichment.",
    ),
):
    if os.path.exists(meta_path):
        print(notice)
        df_meta = pd.read_csv(meta_path)
        break
else:
    # Neither candidate exists on disk.
    raise FileNotFoundError("❌ No patch metadata found. Run tcga_preprocess.py first.")

# Streamlit run log: one CSV row per logged run.
run_log_path = "../logs/run_log.csv"
if os.path.exists(run_log_path):
    logs = pd.read_csv(run_log_path)
else:
    raise FileNotFoundError("❌ No run logs found. Please generate reports via the Streamlit app first.")

print("Loaded", len(logs), "runs from Streamlit logs")


## 2. Baseline Metrics (BLEU / ROUGE)

In [None]:

from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer

# Score the most recent logged run against a fixed, hand-written reference.
last = logs.iloc[-1]
reference_summary = (
    "Liver tissue slide, stained with H&E at 40x magnification. "
    "Features suggest normal histological architecture."
)
baseline_summary, rag_summary = last["baseline_summary"], last["rag_summary"]

def evaluate_summary(reference, candidate):
    """Score a candidate summary against a reference with BLEU and ROUGE.

    Args:
        reference: Reference text. Non-string values (e.g. NaN from a CSV
            cell that was empty) are treated as an empty string.
        candidate: Generated text to score; same non-string handling.

    Returns:
        A ``(bleu, scores)`` tuple where ``bleu`` is the sentence-level BLEU
        value and ``scores`` is the dict returned by ``RougeScorer.score``
        for the ``rouge1`` and ``rougeL`` metrics.
    """
    # Local import keeps this cell self-contained; nltk is already a
    # dependency of this notebook.
    from nltk.translate.bleu_score import SmoothingFunction

    # Missing summaries come back from pandas as float NaN; calling .split()
    # on them would raise AttributeError.
    reference = reference if isinstance(reference, str) else ""
    candidate = candidate if isinstance(candidate, str) else ""

    # Without smoothing, any n-gram order with zero matches drives BLEU to
    # ~0 (and triggers NLTK warnings), which is the common case for short
    # summaries that share no 4-gram with the reference.
    bleu = sentence_bleu(
        [reference.split()],
        candidate.split(),
        smoothing_function=SmoothingFunction().method1,
    )
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
    scores = scorer.score(reference, candidate)
    return bleu, scores

# Evaluate both variants against the same reference text.
bleu_base, rouge_base = evaluate_summary(reference_summary, baseline_summary)
bleu_rag, rouge_rag = evaluate_summary(reference_summary, rag_summary)

# One table row per model; ROUGE columns report the F-measure.
pd.DataFrame(
    [
        {
            "Model": label,
            "BLEU": bleu,
            "ROUGE-1": rouge['rouge1'].fmeasure,
            "ROUGE-L": rouge['rougeL'].fmeasure,
        }
        for label, bleu, rouge in (
            ("Baseline", bleu_base, rouge_base),
            ("RAG+Metadata", bleu_rag, rouge_rag),
        )
    ]
)


## 3. Embedding-Based Similarity to TCGA Diagnoses

In [None]:

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Compare each logged summary with a ground-truth diagnosis via the cosine
# similarity of their sentence embeddings.
# NOTE(review): runs in `logs` are paired with rows of `df_meta` purely by
# position; confirm that both tables really share the same row order.
if "diagnosis" in df_meta.columns:
    model = SentenceTransformer("all-MiniLM-L6-v2")

    # Only rows present in BOTH tables can be paired. Previously a metadata
    # table shorter than the log made the column assignment below fail with
    # a length-mismatch ValueError.
    n_pairs = min(len(logs), len(df_meta))
    paired_logs = logs.iloc[:n_pairs]

    gt_texts = df_meta["diagnosis"].astype(str).tolist()
    baseline_emb = model.encode(paired_logs["baseline_summary"].astype(str).tolist())
    rag_emb = model.encode(paired_logs["rag_summary"].astype(str).tolist())
    gt_emb = model.encode(gt_texts[:n_pairs])

    baseline_scores = [cosine_similarity([b], [g])[0][0] for b, g in zip(baseline_emb, gt_emb)]
    rag_scores = [cosine_similarity([r], [g])[0][0] for r, g in zip(rag_emb, gt_emb)]

    # Index-aligned assignment: runs without a paired diagnosis get NaN
    # instead of breaking the assignment.
    logs["baseline_score"] = pd.Series(baseline_scores, index=paired_logs.index, dtype=float)
    logs["rag_score"] = pd.Series(rag_scores, index=paired_logs.index, dtype=float)

    plt.figure(figsize=(6,4))
    plt.bar(["Baseline", "RAG+Metadata"], [np.mean(baseline_scores), np.mean(rag_scores)], color=["red","green"])
    plt.title("Average Similarity to Ground Truth Diagnosis")
    plt.ylabel("Cosine Similarity")
    plt.show()
else:
    print("⚠️ No diagnosis available in metadata, cannot compute similarity.")


## 4. GPT-as-a-Judge Evaluation

In [None]:

import openai

# Read the API key from the environment (None when the variable is unset).
openai.api_key = os.environ.get("OPENAI_API_KEY")

def gpt_score(diagnosis, summary):
    """Ask the chat model to grade a summary against a diagnosis (1-5).

    Args:
        diagnosis: Ground-truth diagnosis text inserted into the prompt.
        summary: Candidate summary text inserted into the prompt.

    Returns:
        The integer score in the range 1-5, or ``None`` when the reply is
        empty or contains no standalone digit in that range.

    Errors raised by the API call itself are not caught here and propagate
    to the caller.
    """
    import re  # local import: only needed for parsing the reply

    prompt = f'''
    You are evaluating pathology report summaries.

    Ground truth diagnosis: {diagnosis}
    Candidate summary: {summary}

    On a scale of 1 to 5:
    - 1 = completely wrong or misleading
    - 3 = partially correct but incomplete
    - 5 = fully correct and well aligned with the diagnosis

    Return ONLY the number (1–5).
    '''
    response = openai.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role":"user","content":prompt}],
        max_tokens=3,
        temperature=0.0
    )
    # Catch only the failures a malformed reply can cause; a bare `except`
    # would also swallow KeyboardInterrupt and SystemExit.
    try:
        content = response.choices[0].message.content
    except (IndexError, AttributeError):
        return None
    if not isinstance(content, str):
        return None

    # Accept replies such as "4" or "4." but reject out-of-scale numbers
    # like "10" or "0" by requiring a digit 1-5 with no adjacent digits.
    match = re.search(r"(?<!\d)[1-5](?!\d)", content)
    return int(match.group()) if match else None

# Grade the first five logged runs with the GPT judge.
# NOTE(review): runs are paired with metadata rows by position; confirm that
# `logs` and `df_meta` share the same row order.
gpt_results = []
# The column check does not depend on the row, so do it once up front.
if "diagnosis" in df_meta.columns:
    # zip() pairs rows positionally and stops at the shorter input, so a
    # metadata table with fewer rows than the sample no longer raises
    # IndexError, and a non-default `logs` index cannot mispair rows.
    for (_, row), diagnosis in zip(logs.head(5).iterrows(), df_meta["diagnosis"]):
        # Missing diagnoses are read from CSV as float NaN; skip them.
        if not isinstance(diagnosis, str):
            continue
        base_score = gpt_score(diagnosis, row["baseline_summary"])
        rag_score = gpt_score(diagnosis, row["rag_summary"])
        gpt_results.append({
            "slide_id": row.get("metadata", "unknown"),
            "diagnosis": diagnosis,
            "baseline_summary": row["baseline_summary"],
            "rag_summary": row["rag_summary"],
            "baseline_gpt_score": base_score,
            "rag_gpt_score": rag_score
        })

gpt_eval_df = pd.DataFrame(gpt_results)
gpt_eval_df


## 5. GPT Evaluation Results

In [None]:

# Report and plot the mean judge score per model, if any were collected.
if gpt_eval_df.empty:
    print("⚠️ No GPT scores computed.")
else:
    avg_base = gpt_eval_df["baseline_gpt_score"].mean()
    avg_rag = gpt_eval_df["rag_gpt_score"].mean()

    print("Average GPT Score (Baseline):", avg_base)
    print("Average GPT Score (RAG+Metadata):", avg_rag)

    # Bar label -> bar height, in display order.
    mean_by_model = {"Baseline": avg_base, "RAG+Metadata": avg_rag}

    plt.figure(figsize=(6, 4))
    plt.bar(
        list(mean_by_model.keys()),
        list(mean_by_model.values()),
        color=["red", "green"],
    )
    plt.title("Average GPT Evaluation Scores")
    plt.ylabel("Score (1 = poor, 5 = excellent)")
    plt.ylim(0, 5)
    plt.show()


## 6. Final Results Summary

In [None]:

# Print the headline numbers and a conclusion that reflects them.
print("### 📌 Results Summary")

# One boolean per computed metric: did RAG+Metadata beat the baseline?
rag_wins = []

if "baseline_score" in logs.columns:
    emb_base = np.mean(logs['baseline_score'])
    emb_rag = np.mean(logs['rag_score'])
    print(f"- Embedding-based similarity: Baseline={emb_base:.3f}, RAG={emb_rag:.3f}")
    rag_wins.append(bool(emb_rag > emb_base))

# The averages exist only if the GPT results cell ran with a non-empty table.
# The previous check looked for a global named 'baseline_gpt_score', which is
# a DataFrame column rather than a variable, so this line never printed.
if "avg_base" in globals() and "avg_rag" in globals():
    print(f"- GPT-as-a-Judge: Baseline={avg_base:.2f}, RAG={avg_rag:.2f}")
    rag_wins.append(bool(avg_rag > avg_base))

# Only claim an improvement when every computed metric supports it.
if rag_wins and all(rag_wins):
    print("\n✅ Overall, RAG+Metadata consistently improves alignment with TCGA ground truth diagnoses.")
elif rag_wins:
    print("\n⚠️ RAG+Metadata did not outperform the baseline on every computed metric.")
else:
    print("\n⚠️ No metrics were computed, so no conclusion can be drawn.")
