In [0]:
import numpy as np
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import TSNE
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
from rouge_score import rouge_scorer

from src.embedder import ClipEmbedder
from src.captioner import BlipCaptioner
from src.vectorstore import FaissStore
from src.rag_inference import build_prompt

In [0]:
# Sample inputs: the slide metadata table and a single slide image.
df = pd.read_csv("../sample_data/metadata.csv")
slide_file = Image.open("../sample_data/sample_slide.png")
img = slide_file.convert("RGB")  # normalise the image to 3-channel RGB

# Project wrappers used for the embedding and the caption
embedder = ClipEmbedder()
captioner = BlipCaptioner()

# One embedding and one caption for the sample slide
emb = embedder.embed_image(img)
caption = captioner.caption(img)

# Quick sanity check of both outputs
print("Caption:", caption)
print("Embedding shape:", emb.shape)

In [0]:
# Fake "other slides" (normally you'd load multiple slides).
# The noise comes from a dedicated, seeded generator so that re-running the
# notebook reproduces exactly the same synthetic embeddings and similarity matrix.
rng = np.random.default_rng(42)
emb2 = emb + rng.normal(0, 0.01, emb.shape)  # small perturbation -> very similar
emb3 = rng.normal(0, 1, emb.shape)           # independent draw -> random slide

# Stack the three embeddings and compute all pairwise cosine similarities.
# NOTE(review): assumes `emb` is a NumPy-compatible array -- confirm against ClipEmbedder.embed_image.
similarity_matrix = cosine_similarity(np.vstack([emb, emb2, emb3]))
print("Cosine similarity matrix:\n", similarity_matrix)


In [0]:
# Fake "reference summaries"
reference_clinical = "Liver tissue slide, stained with H&E, magnification 40x. Features suggest normal structure."
generated_clinical = "H&E stained liver tissue at 40x magnification, features appear normal."

# BLEU
# Sentence-level BLEU on one short sentence usually has no matching higher-order
# n-grams; without smoothing the score collapses to ~0 and NLTK emits a warning.
# A smoothing function keeps the score informative for short texts.
bleu = sentence_bleu(
    [reference_clinical.split()],
    generated_clinical.split(),
    smoothing_function=SmoothingFunction().method1,
)
print("BLEU Score:", bleu)

# ROUGE (unigram overlap and longest common subsequence, with stemming)
scorer = rouge_scorer.RougeScorer(['rouge1','rougeL'], use_stemmer=True)
scores = scorer.score(reference_clinical, generated_clinical)
print("ROUGE:", scores)

In [0]:
# One row per slide, in the same order as `labels`.
embeddings = np.vstack([emb, emb2, emb3])
labels = ["Slide1", "Slide2 (similar)", "Slide3 (different)"]

# scikit-learn requires perplexity < n_samples. The default (30) is rejected
# for this 3-slide demo, so clamp it to the number of available samples.
perplexity = min(30, len(embeddings) - 1)
tsne = TSNE(n_components=2, perplexity=perplexity, random_state=42)
reduced = tsne.fit_transform(embeddings)

# One scatter call per slide so each point gets its own legend entry.
fig, ax = plt.subplots(figsize=(6, 6))
for (x, y), label in zip(reduced, labels):
    ax.scatter(x, y, label=label)
ax.legend()
ax.set_title("t-SNE of Slide Embeddings")
ax.set_xlabel("t-SNE 1")
ax.set_ylabel("t-SNE 2")
plt.show()

In [0]:
# -----------------------------------------------------
# 6. Load Streamlit Logs for Evaluation
# -----------------------------------------------------
def evaluate_summary(reference, candidate):
    """Score a generated summary against a reference text.

    Args:
        reference: Reference summary (plain string).
        candidate: Generated summary to evaluate (plain string).

    Returns:
        Tuple ``(bleu, rouge)``: ``bleu`` is the smoothed sentence-level BLEU
        score on whitespace tokens; ``rouge`` is the dict returned by
        ``RougeScorer.score`` with the keys ``'rouge1'`` and ``'rougeL'``.
    """
    bleu_score = sentence_bleu(
        [reference.split()],
        candidate.split(),
        # Short texts rarely share 3-/4-grams; smoothing avoids a degenerate ~0 score.
        smoothing_function=SmoothingFunction().method1,
    )
    rouge = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
    return bleu_score, rouge.score(reference, candidate)


def _as_text(value):
    """Return ``value`` as a string, mapping missing CSV fields (NaN/None) to ''."""
    return "" if pd.isna(value) else str(value)


log_df = pd.read_csv("../logs/run_log.csv")
print("Loaded", len(log_df), "runs")

# Fail with a clear message instead of an IndexError from iloc[-1] on an empty log.
if log_df.empty:
    raise ValueError("../logs/run_log.csv contains no runs to evaluate.")

# Use the last run for demonstration
last = log_df.iloc[-1]

reference_summary = "Liver tissue slide, stained with H&E at 40x magnification. Features suggest normal histological architecture."
# Empty CSV cells are read back as NaN (a float); coerce them so the metrics get strings.
baseline_summary = _as_text(last["baseline_summary"])
rag_summary = _as_text(last["rag_summary"])

# Evaluate both
bleu_base, rouge_base = evaluate_summary(reference_summary, baseline_summary)
bleu_rag, rouge_rag = evaluate_summary(reference_summary, rag_summary)

# Side-by-side comparison table (last expression -> rendered as the cell output)
pd.DataFrame([
    {"Model": "Baseline", "BLEU": bleu_base, "ROUGE-1": rouge_base['rouge1'].fmeasure, "ROUGE-L": rouge_base['rougeL'].fmeasure},
    {"Model": "RAG+Metadata", "BLEU": bleu_rag, "ROUGE-1": rouge_rag['rouge1'].fmeasure, "ROUGE-L": rouge_rag['rougeL'].fmeasure}
])


In [0]:
# -----------------------------------------------------
# 7. Load TCGA Enriched Metadata
# -----------------------------------------------------
import os


def _load_patch_metadata():
    """Read the first patch-metadata CSV that exists, preferring the enriched one.

    Prints which file is being used before reading it, and raises
    FileNotFoundError when neither candidate file is present.
    """
    candidates = (
        ("../data/patches_metadata_enriched.csv",
         "✅ Using enriched metadata (with diagnosis)."),
        ("../data/patches_metadata.csv",
         "⚠️ Using basic metadata (no diagnosis). Run tcga_metadata_fetcher.py for enrichment."),
    )
    for csv_path, notice in candidates:
        if os.path.exists(csv_path):
            print(notice)
            return pd.read_csv(csv_path)
    raise FileNotFoundError("❌ No patch metadata found. Run tcga_preprocess.py first.")


df_meta = _load_patch_metadata()
