# 📊 MedGemma Leukemia Detection - Evaluation V2 (Fixed)

This notebook evaluates the fine-tuned model `good2idnan/medgemma-1.5-4b-it-leukemia-lora`.
**Correction:** We now use the **Preliminary Validation Data** (Folder: `validation_data`) which was NOT used in training.
This ensures a truly unbiased test.

In [None]:
# 1. Install Dependencies
# Lines starting with `!` are run as shell commands by the notebook.
# `-q` reduces pip output; `-U` upgrades packages that are already installed.
# transformers is installed directly from its GitHub repository with no tag or
# commit pinned, so the installed version can differ between runs.
!pip install -q -U git+https://github.com/huggingface/transformers.git
# Remaining libraries: model loading/adapters, dataset download, metrics and plotting.
!pip install -q -U peft bitsandbytes accelerate kagglehub scikit-learn seaborn matplotlib

In [None]:
import os
import kagglehub
from pathlib import Path
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import torch
from transformers import AutoProcessor, AutoModelForImageTextToText
from peft import PeftModel
from google.colab import userdata
from tqdm import tqdm
from PIL import Image
import pandas as pd
import torch.nn.functional as F

# Setup Secrets (Colab)
# HF_TOKEN is given a default first so that later cells can always reference it,
# even when the secrets cannot be read (token=None means "no explicit token").
HF_TOKEN = None
try:
    HF_TOKEN = userdata.get('HF_TOKEN')
    os.environ['KAGGLE_USERNAME'] = userdata.get('KAGGLE_USERNAME')
    os.environ['KAGGLE_KEY'] = userdata.get('KAGGLE_KEY')
except Exception as exc:
    # `except Exception` (not a bare `except`) so KeyboardInterrupt/SystemExit
    # still propagate; the cause is printed instead of being silently dropped.
    print("‚ö†Ô∏è Secrets not found. Ensure HF_TOKEN, KAGGLE_USERNAME, KAGGLE_KEY are set if needed.")
    print(f"   Reason: {type(exc).__name__}: {exc}")

print("‚úÖ Libraries Loaded")

In [None]:
# 2. Download Validation Data (C-NMC)
# Fetches the Kaggle dataset "andrewmvd/leukemia-classification" via kagglehub.
# The value returned by dataset_download is stored in `path` and is used by the
# next cell as the dataset root directory.
print("‚¨áÔ∏è Downloading C-NMC Dataset...")
path = kagglehub.dataset_download("andrewmvd/leukemia-classification")
print(f"üìÇ Dataset Root: {path}")

In [None]:
# 3. Prepare Validation List (Preliminary Test Data)
# The folder "validation_data" (C-NMC_test_prelim_phase_data) contains images.
# The labels are in a CSV file: "C-NMC_test_prelim_phase_data_labels.csv"
#
# Produces two parallel lists used by the later cells:
#   val_images - image paths (as strings) that exist on disk
#   val_labels - the integer label read from the CSV for each of those images

val_images = []
val_labels = []

# 1. Define paths
base_folder = Path(path) / "C-NMC_Leukemia" / "validation_data"

# Locate the inner folder (C-NMC_test_prelim_phase_data) and the CSV.
# A missing item raises an error that names it (instead of a bare IndexError),
# and sorting makes the CSV choice deterministic if more than one is present.
img_folder = next(iter(sorted(base_folder.glob("C-NMC_test_prelim_phase_data"))), None)
if img_folder is None:
    raise FileNotFoundError(
        f"Image folder 'C-NMC_test_prelim_phase_data' not found under {base_folder}"
    )
csv_file = next(iter(sorted(base_folder.glob("*.csv"))), None)
if csv_file is None:
    raise FileNotFoundError(f"No label CSV found under {base_folder}")

print(f"üìÇ Image Folder: {img_folder}")
print(f"üìÑ Label File: {csv_file}")

# 2. Read CSV Labels
df = pd.read_csv(csv_file)
print(f"   CSV loaded: {len(df)} rows. Columns: {df.columns.tolist()}")

# 3. Match Images to Labels
# Use the expected column names when present; otherwise fall back to
# "first column = file name, last column = label".
if 'new_names' in df.columns and 'labels' in df.columns:
    name_col = 'new_names'
    label_col = 'labels'
else:
    name_col = df.columns[0]
    label_col = df.columns[-1]

# Iterate the two columns directly (no per-row Series construction as with
# iterrows) and count rows whose image file is absent so they are not dropped
# without any trace.
missing_count = 0
for fname, raw_label in zip(df[name_col], df[label_col]):
    img_path = img_folder / str(fname)
    if img_path.exists():
        val_images.append(str(img_path))
        val_labels.append(int(raw_label))
    else:
        missing_count += 1

if missing_count:
    print(f"   Skipped {missing_count} CSV rows with no matching image file.")

class_names = {0: "Normal", 1: "Leukemia"}
print(f"\n‚úÖ Validation Set Ready: {len(val_images)} images matched.")

In [None]:
# 4. Load Model (V2)
# Base checkpoint and the LoRA adapter repository that is applied on top of it.
BASE_MODEL_ID = "google/medgemma-1.5-4b-it"
LORA_ADAPTER_ID = "good2idnan/medgemma-1.5-4b-it-leukemia-lora"

# The processor is loaded from the adapter repository, not from the base model.
processor = AutoProcessor.from_pretrained(LORA_ADAPTER_ID, token=HF_TOKEN)
# Base model is loaded in bfloat16; device_map="auto" leaves device placement
# to the library.
base_model = AutoModelForImageTextToText.from_pretrained(
    BASE_MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    token=HF_TOKEN
)
# Wrap the base model with the fine-tuned adapter weights.
model = PeftModel.from_pretrained(base_model, LORA_ADAPTER_ID, token=HF_TOKEN)
# Inference only: switch the module to evaluation mode.
model.eval()
print("‚úÖ Fine-Tuned Model Loaded")

In [None]:
# 5. Scoring Function
def build_predict_prompt(processor, image):
    """Render the one-turn classification prompt for a single image.

    Args:
        processor: Object exposing ``apply_chat_template``; it receives the
            conversation and ``add_generation_prompt=True``.
        image: Value placed under the ``"image"`` key of the user message.

    Returns:
        Whatever ``processor.apply_chat_template`` returns for the conversation.
    """
    # The instruction ends with "Answer:" so the very next generated token is
    # expected to be the class word itself.
    instruction = "\n".join([
        "Classify this blood cell microscopy image.",
        "Answer with exactly ONE word: Normal or Leukemia.",
        "Answer:",
    ])
    image_part = {"type": "image", "image": image}
    text_part = {"type": "text", "text": instruction}
    conversation = [{"role": "user", "content": [image_part, text_part]}]
    return processor.apply_chat_template(conversation, add_generation_prompt=True)

def get_prediction(image_path):
    """Classify one image by comparing next-token logits of the two class words.

    The model is run once on the prompt; the logits at the last position are
    read for the first token of "Normal" and of "Leukemia", and the larger one
    wins (a tie resolves to "Normal", the first candidate).

    Args:
        image_path: Path of the image file to classify.

    Returns:
        tuple: ``(label, confidence)`` where ``label`` is 0 for "Normal" and
        1 for "Leukemia", and ``confidence`` is the softmax over ONLY the two
        candidate logits (not a probability over the full vocabulary).

    Raises:
        ValueError: If both class words start with the same token, in which
            case the first-token comparison cannot distinguish them.
    """
    # Close the underlying file promptly; convert() yields an in-memory image
    # that stays valid after the file handle is released.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")

    # Pass the image itself (not a one-element list) into the chat message.
    prompt = build_predict_prompt(processor, image)
    inputs = processor(text=[prompt], images=[image], return_tensors="pt").to(model.device)

    candidates = ["Normal", "Leukemia"]
    # Only the FIRST token of each class word is scored.
    candidate_tokens = [
        processor.tokenizer.encode(cand, add_special_tokens=False)[0] for cand in candidates
    ]
    if len(set(candidate_tokens)) != len(candidates):
        raise ValueError(
            f"Class words {candidates} share the same first token; "
            "first-token scoring cannot separate them."
        )

    with torch.no_grad():
        outputs = model(**inputs)
        # Logits for the token that would follow the prompt (batch 0, last position).
        logits = outputs.logits[0, -1, :]

    scores = [logits[token].item() for token in candidate_tokens]
    best_idx = scores.index(max(scores))
    confidence = torch.softmax(torch.tensor(scores), dim=0)[best_idx].item()

    # Candidate order matches the label encoding: index 0 = Normal, 1 = Leukemia.
    return best_idx, confidence

# Test on one
# Smoke test: run the scoring function on the first validation image and print
# the predicted class name with its confidence formatted as a percentage.
# Requires val_images to be non-empty (indexing [0] fails otherwise).
p, c = get_prediction(val_images[0])
print(f"Test Prediction: {class_names[p]} ({c:.2%})")

In [None]:
# 6. Run Evaluation Loop
# Cap on how many validation images are scored, for quick feedback.
# Set to None to evaluate the whole validation list.
MAX_EVAL_IMAGES = 300

predictions = []
true_labels = []
# Paths of the images that were actually scored, in the same order as
# `predictions` / `true_labels` (images that raise are not included).
evaluated_images = []
failed_count = 0

eval_images = val_images[:MAX_EVAL_IMAGES]
eval_labels = val_labels[:MAX_EVAL_IMAGES]

# Report the number of images that will really be processed, not the size of
# the full validation list.
print(f"üöÄ Running evaluation on {len(eval_images)} of {len(val_images)} images...")

for img_path, label in tqdm(zip(eval_images, eval_labels), total=len(eval_images)):
    try:
        pred_idx, conf = get_prediction(img_path)
    except Exception as e:
        # Best effort: one bad image must not abort the whole run.
        failed_count += 1
        print(f"Error processing {img_path}: {e}")
    else:
        predictions.append(pred_idx)
        true_labels.append(label)
        evaluated_images.append(img_path)

if failed_count:
    print(f"   {failed_count} images failed and were excluded from the results.")

print("‚úÖ Evaluation Complete")

In [None]:
# 7. Generate Report
if not predictions:
    raise ValueError("No predictions available - run the evaluation loop before generating the report.")

# Fix the label set explicitly so the report and the confusion matrix always
# cover both classes, even if only one class occurs in the evaluated subset
# (otherwise target_names / tick labels would not match the detected classes).
label_ids = [0, 1]
label_names = [class_names[i] for i in label_ids]

report = classification_report(
    true_labels,
    predictions,
    labels=label_ids,
    target_names=label_names,
    output_dict=True,
)
df_report = pd.DataFrame(report).transpose()
print("\nüìä Classification Report:")
print(df_report)

# Confusion Matrix (rows = actual class, columns = predicted class)
cm = confusion_matrix(true_labels, predictions, labels=label_ids)
plt.figure(figsize=(6,5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=label_names, yticklabels=label_names)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Confusion Matrix - Validation Data')
plt.show()

# Per-image results. If the notebook namespace holds an `evaluated_images` list
# with one path per prediction, use it; otherwise fall back to the leading
# slice of val_images, which is only aligned when no image was skipped.
image_column = globals().get("evaluated_images")
if image_column is None or len(image_column) != len(predictions):
    image_column = val_images[:len(predictions)]

df_results = pd.DataFrame({
    "Image": image_column,
    "True Label": [class_names[t] for t in true_labels],
    "Predicted": [class_names[p] for p in predictions]
})
df_results.to_csv("evaluation_results_v2_FIXED.csv", index=False)
print("üíæ Saved to evaluation_results_v2_FIXED.csv")