In [0]:
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
from PIL import Image
from rouge_score import rouge_scorer
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity

from src.embedder import ClipEmbedder
from src.captioner import BlipCaptioner
from src.vectorstore import FaissStore
from src.rag_inference import build_prompt

In [0]:
# Locations of the bundled sample assets, relative to the notebook directory.
sample_metadata_csv = "../sample_data/metadata.csv"
sample_slide_png = "../sample_data/sample_slide.png"

# Read the sample metadata table and open the slide, forcing 3-channel RGB.
df = pd.read_csv(sample_metadata_csv)
opened_slide = Image.open(sample_slide_png)
img = opened_slide.convert("RGB")

# Instantiate models (both with their default constructor arguments)
embedder = ClipEmbedder()
captioner = BlipCaptioner()

# Generate embedding + caption for the single sample slide
emb = embedder.embed_image(img)
caption = captioner.caption(img)

# Quick sanity output: the generated caption and the embedding's shape.
print("Caption:", caption)
print("Embedding shape:", emb.shape)

In [0]:
# Fake "other slides" (normally you'd load multiple slides).
# A dedicated, seeded generator keeps the synthetic neighbours -- and therefore
# the printed matrix and any later plot built from them -- identical on every
# Restart & Run All.
fake_slide_rng = np.random.default_rng(42)
emb2 = emb + fake_slide_rng.normal(0, 0.01, emb.shape)  # very similar: small jitter around emb
emb3 = fake_slide_rng.normal(0, 1, emb.shape)           # random slide: unrelated noise

# Stack the three vectors row-wise and compare every pair of rows.
similarity_matrix = cosine_similarity(np.vstack([emb, emb2, emb3]))
print("Cosine similarity matrix:\n", similarity_matrix)


In [0]:
# Fake "reference summaries": a hand-written reference and a model-style candidate.
reference_clinical = "Liver tissue slide, stained with H&E, magnification 40x. Features suggest normal structure."
generated_clinical = "H&E stained liver tissue at 40x magnification, features appear normal."

# BLEU
# These sentences are short and share no higher-order n-grams, so the default
# (unsmoothed, up-to-4-gram) BLEU collapses to ~0 and NLTK emits a warning.
# Smoothing keeps the score informative for sentence-level comparison.
bleu = sentence_bleu(
    [reference_clinical.split()],
    generated_clinical.split(),
    smoothing_function=SmoothingFunction().method1,
)
print("BLEU Score:", bleu)

# ROUGE (unigram overlap and longest common subsequence, with stemming)
scorer = rouge_scorer.RougeScorer(['rouge1','rougeL'], use_stemmer=True)
scores = scorer.score(reference_clinical, generated_clinical)
print("ROUGE:", scores)

In [0]:
# One row per slide, in the same order as `labels`.
embeddings = np.vstack([emb, emb2, emb3])
labels = ["Slide1", "Slide2 (similar)", "Slide3 (different)"]

# scikit-learn requires perplexity < n_samples. The default (30) is rejected for
# this three-point demo, so cap it just below the number of rows.
tsne_perplexity = min(30.0, embeddings.shape[0] - 1)
tsne = TSNE(n_components=2, perplexity=tsne_perplexity, random_state=42)
reduced = tsne.fit_transform(embeddings)

# One scatter call per slide so each point gets its own legend entry.
fig, ax = plt.subplots(figsize=(6, 6))
for i, label in enumerate(labels):
    ax.scatter(reduced[i, 0], reduced[i, 1], label=label)
ax.legend()
ax.set_title("t-SNE of Slide Embeddings")
plt.show()

In [0]:
# -----------------------------------------------------
# 6. Load Streamlit Logs for Evaluation
# -----------------------------------------------------
def evaluate_summary(reference, candidate):
    """Score a candidate summary against a reference with BLEU and ROUGE.

    Args:
        reference: Reference summary text.
        candidate: Generated summary text. Missing values (None / NaN, e.g. an
            empty CSV cell) are scored as an empty string instead of crashing.

    Returns:
        Tuple ``(bleu, rouge)``. ``bleu`` is the smoothed sentence-level BLEU
        score; ``rouge`` is the dict returned by ``RougeScorer.score`` with the
        keys ``'rouge1'`` and ``'rougeL'``.
    """
    reference_text = "" if pd.isna(reference) else str(reference)
    candidate_text = "" if pd.isna(candidate) else str(candidate)

    # Smoothing avoids a hard zero when short texts share no higher-order n-grams.
    bleu = sentence_bleu(
        [reference_text.split()],
        candidate_text.split(),
        smoothing_function=SmoothingFunction().method1,
    )
    rouge = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True).score(
        reference_text, candidate_text
    )
    return bleu, rouge


log_df = pd.read_csv("../logs/run_log.csv")
print("Loaded", len(log_df), "runs")
if log_df.empty:
    # Fail with an actionable message instead of an IndexError from iloc[-1].
    raise ValueError("../logs/run_log.csv contains no runs to evaluate.")

# Use the last run for demonstration
last = log_df.iloc[-1]

reference_summary = "Liver tissue slide, stained with H&E at 40x magnification. Features suggest normal histological architecture."
baseline_summary = last["baseline_summary"]
rag_summary = last["rag_summary"]

# Evaluate both
bleu_base, rouge_base = evaluate_summary(reference_summary, baseline_summary)
bleu_rag, rouge_rag = evaluate_summary(reference_summary, rag_summary)

# Side-by-side comparison table (rendered as the cell output).
pd.DataFrame([
    {"Model": "Baseline", "BLEU": bleu_base, "ROUGE-1": rouge_base['rouge1'].fmeasure, "ROUGE-L": rouge_base['rougeL'].fmeasure},
    {"Model": "RAG+Metadata", "BLEU": bleu_rag, "ROUGE-1": rouge_rag['rouge1'].fmeasure, "ROUGE-L": rouge_rag['rougeL'].fmeasure}
])


In [0]:
# -----------------------------------------------------
# 7. Load TCGA Enriched Metadata
# -----------------------------------------------------
import os

# Candidate files in priority order, each paired with the notice printed when it is used.
metadata_candidates = (
    ("../data/patches_metadata_enriched.csv",
     "✅ Using enriched metadata (with diagnosis)."),
    ("../data/patches_metadata.csv",
     "⚠️ Using basic metadata (no diagnosis). Run tcga_metadata_fetcher.py for enrichment."),
)

# Take the first candidate that exists; the `else` runs only if none did.
for candidate_path, notice in metadata_candidates:
    if os.path.exists(candidate_path):
        print(notice)
        df_meta = pd.read_csv(candidate_path)
        break
else:
    raise FileNotFoundError("❌ No patch metadata found. Run tcga_preprocess.py first.")


In [0]:
# -----------------------------------------------------
# 8. Join Streamlit logs with metadata
# -----------------------------------------------------
# Matches `slide_id` followed by ':' or '=' and an optionally quoted value, e.g.
# "slide_id: X", "slide_id=X" or "{'slide_id': 'X', ...}".
# NOTE(review): the exact format written by the Streamlit app is not visible
# here -- confirm against the logger.
SLIDE_ID_PATTERN = re.compile(r"""slide_id["']?\s*[:=]\s*["']?([^\s,"'{}]+)""")


def extract_slide_id(metadata_text):
    """Pull the slide identifier out of one logged metadata string.

    Args:
        metadata_text: The `metadata` cell of a log row, already cast to str.

    Returns:
        The value that follows the `slide_id` key; the stripped first
        comma-separated field when the key is present but no value could be
        parsed; or None when the text does not mention `slide_id` at all.
    """
    if "slide_id" not in metadata_text:
        return None
    found = SLIDE_ID_PATTERN.search(metadata_text)
    if found:
        return found.group(1)
    return metadata_text.split(",")[0].strip()


logs = pd.read_csv("../logs/run_log.csv")

if "slide_id" in df_meta.columns:
    eval_df = logs.copy()
    eval_df["slide_id"] = eval_df["metadata"].astype(str).apply(extract_slide_id)

    # Keep one metadata row per slide so the left join cannot multiply log rows
    # (presumably the patch-level CSV holds several rows per slide -- TODO confirm).
    slide_meta = df_meta.drop_duplicates(subset="slide_id")
    eval_df = eval_df.merge(slide_meta, on="slide_id", how="left")
else:
    eval_df = logs.copy()
    print("⚠️ No slide_id found in metadata CSV — only logs will be evaluated.")

print("Merged evaluation dataset shape:", eval_df.shape)


In [0]:
# -----------------------------------------------------
# 9. Embedding-based similarity vs ground truth diagnosis
# -----------------------------------------------------
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Check the frame that is actually indexed below (eval_df, not df_meta): the
# join cell may have skipped the merge, leaving eval_df without a diagnosis.
if "diagnosis" not in eval_df.columns:
    print("⚠️ No diagnosis column available — cannot compute similarity.")
elif not eval_df["diagnosis"].notna().any():
    print("⚠️ Diagnosis column is empty for every logged run — cannot compute similarity.")
else:
    model = SentenceTransformer("all-MiniLM-L6-v2")

    # Score only rows with a real diagnosis; otherwise astype(str) would turn
    # missing values into the literal text "nan" and skew the means.
    has_diagnosis = eval_df["diagnosis"].notna()
    scored_rows = eval_df.loc[has_diagnosis]

    gt_texts = scored_rows["diagnosis"].astype(str).tolist()
    baseline_emb = model.encode(scored_rows["baseline_summary"].astype(str).tolist())
    rag_emb = model.encode(scored_rows["rag_summary"].astype(str).tolist())
    gt_emb = model.encode(gt_texts)

    # Pairwise (row i vs row i) cosine similarity between summary and diagnosis.
    baseline_scores = [float(cosine_similarity([b], [g])[0][0]) for b, g in zip(baseline_emb, gt_emb)]
    rag_scores = [float(cosine_similarity([r], [g])[0][0]) for r, g in zip(rag_emb, gt_emb)]

    # Rows without a diagnosis keep NaN so downstream means ignore them.
    eval_df["baseline_score"] = np.nan
    eval_df["rag_score"] = np.nan
    eval_df.loc[has_diagnosis, "baseline_score"] = baseline_scores
    eval_df.loc[has_diagnosis, "rag_score"] = rag_scores

    print("Mean Baseline similarity:", np.mean(baseline_scores))
    print("Mean RAG similarity:", np.mean(rag_scores))


In [0]:
# -----------------------------------------------------
# 10. Visualization: Baseline vs RAG
# -----------------------------------------------------
# Draw only when the similarity cell has added its score columns to eval_df.
if "baseline_score" in eval_df.columns:
    import matplotlib.pyplot as plt

    bar_names = ["Baseline", "RAG+Metadata"]
    bar_heights = [np.mean(eval_df[score_col]) for score_col in ("baseline_score", "rag_score")]

    fig, ax = plt.subplots(figsize=(6, 4))
    ax.bar(bar_names, bar_heights, color=["red", "green"])
    ax.set_title("Average Similarity to Ground Truth Diagnosis")
    ax.set_ylabel("Cosine Similarity")
    plt.show()


In [0]:
# -----------------------------------------------------
# 11. Qualitative examples
# -----------------------------------------------------
# Marker for columns that are read with row[...] (no fallback value).
REQUIRED_FIELD = object()

# (printed heading, column name, fallback) in output order.
example_layout = (
    ("Slide:", "slide_id", "unknown"),
    ("Ground truth diagnosis:", "diagnosis", "N/A"),
    ("Baseline summary:", "baseline_summary", REQUIRED_FIELD),
    ("RAG+Metadata summary:", "rag_summary", REQUIRED_FIELD),
    ("Baseline similarity:", "baseline_score", "N/A"),
    ("RAG similarity:", "rag_score", "N/A"),
)

if "diagnosis" in eval_df.columns:
    preview_rows = eval_df.head(3)
    for row_label, row in preview_rows.iterrows():
        # Look each value up just before printing it, one line per field.
        for heading, column, fallback in example_layout:
            if fallback is REQUIRED_FIELD:
                print(heading, row[column])
            else:
                print(heading, row.get(column, fallback))
        print("---")


In [0]:
# -----------------------------------------------------
# 12. GPT-based Evaluation (LLM-as-a-Judge)
# -----------------------------------------------------
import openai

GPT_JUDGE_MODEL = "gpt-4o-mini"
GPT_EVAL_SAMPLE_SIZE = 5  # keep the paid API calls to a handful of rows

# A standalone digit 1-5: not part of a longer number ("10") or a decimal ("4.5").
GPT_SCORE_PATTERN = re.compile(r"(?<![\d.])[1-5](?!\d|\.\d)")


def parse_gpt_score(reply):
    """Extract the 1-5 score from the judge's raw reply.

    Args:
        reply: Text returned by the model, or None.

    Returns:
        The first standalone digit in the range 1-5 as an int (so "4", " 4\n"
        and "4." all give 4), or None when the reply is empty or holds no such
        digit.
    """
    if not reply:
        return None
    found = GPT_SCORE_PATTERN.search(reply)
    return int(found.group()) if found else None


def gpt_score(diagnosis, summary):
    """
    Ask GPT to score how well the summary aligns with the ground truth diagnosis.
    Returns a score from 1 (poor alignment) to 5 (excellent alignment), or None
    when the reply cannot be read as such a score. Errors raised by the API
    call itself are not caught here.
    """
    prompt = f"""
        You are evaluating pathology report summaries.
        
        Ground truth diagnosis: {diagnosis}
        Candidate summary: {summary}

        On a scale of 1 to 5:
        - 1 = completely wrong or misleading
        - 3 = partially correct but incomplete
        - 5 = fully correct and well aligned with the diagnosis

        Return ONLY the number (1–5).
        """
    response = openai.chat.completions.create(
        model=GPT_JUDGE_MODEL,
        messages=[{"role":"user","content":prompt}],
        max_tokens=3,
        temperature=0.0
    )
    # Catch only a malformed response, not everything (a bare `except:` would
    # also swallow KeyboardInterrupt).
    try:
        reply = response.choices[0].message.content
    except (IndexError, AttributeError):
        return None
    return parse_gpt_score(reply)


# Always defined, so the aggregation cell can run (and report "no scores")
# even when the API key is missing.
gpt_eval_df = pd.DataFrame()

openai.api_key = os.getenv("OPENAI_API_KEY")
if not openai.api_key:
    print("⚠️ No OpenAI API key found. Set OPENAI_API_KEY to run this block.")
else:
    # Apply GPT evaluation for first few samples
    gpt_results = []
    for i, row in eval_df.head(GPT_EVAL_SAMPLE_SIZE).iterrows():
        # Skip rows without a textual diagnosis (missing column or NaN).
        if "diagnosis" in row and isinstance(row["diagnosis"], str):
            base_score = gpt_score(row["diagnosis"], row["baseline_summary"])
            rag_score = gpt_score(row["diagnosis"], row["rag_summary"])
            gpt_results.append({
                "slide_id": row.get("slide_id", "unknown"),
                "diagnosis": row["diagnosis"],
                "baseline_summary": row["baseline_summary"],
                "rag_summary": row["rag_summary"],
                "baseline_gpt_score": base_score,
                "rag_gpt_score": rag_score
            })

    gpt_eval_df = pd.DataFrame(gpt_results)
    display(gpt_eval_df)


In [0]:
# -----------------------------------------------------
# 13. Aggregate GPT Scores + Visualization
# -----------------------------------------------------
def mean_gpt_scores(scores_df):
    """Average the two GPT score columns, ignoring unparsed (missing) scores.

    Args:
        scores_df: DataFrame expected to hold `baseline_gpt_score` and
            `rag_gpt_score` columns.

    Returns:
        Tuple ``(baseline_mean, rag_mean)`` of floats, or None when either
        column is absent or has no usable numeric score.
    """
    score_columns = ["baseline_gpt_score", "rag_gpt_score"]
    if any(column not in scores_df.columns for column in score_columns):
        return None
    # A column of only None is object dtype; coerce so .mean() is well defined.
    numeric_scores = scores_df[score_columns].apply(pd.to_numeric, errors="coerce")
    if not numeric_scores.notna().any().all():
        return None
    return (
        float(numeric_scores["baseline_gpt_score"].mean()),
        float(numeric_scores["rag_gpt_score"].mean()),
    )


# The evaluator cell may not have created gpt_eval_df (e.g. it was skipped).
gpt_score_means = mean_gpt_scores(gpt_eval_df) if "gpt_eval_df" in globals() else None

if gpt_score_means is not None:
    avg_base, avg_rag = gpt_score_means

    print("Average GPT Score (Baseline):", avg_base)
    print("Average GPT Score (RAG+Metadata):", avg_rag)

    # Bar chart comparison
    plt.figure(figsize=(6,4))
    plt.bar(["Baseline", "RAG+Metadata"], [avg_base, avg_rag], color=["red","green"])
    plt.title("Average GPT Evaluation Scores")
    plt.ylabel("Score (1 = poor, 5 = excellent)")
    plt.ylim(0, 5)
    plt.show()
else:
    print("⚠️ No GPT scores computed yet. Run the GPT evaluator block first.")
