# 🫁 Qwen2.5 Chest X-ray Abnormality Grounding

Locate abnormality areas in chest X-rays using Qwen2.5 and evaluate with mAP.

## 1. Setup & Imports

In [1]:
import os, json, base64
from openai import OpenAI
from dotenv import load_dotenv
from tqdm import tqdm
import matplotlib.pyplot as plt
import sys
%matplotlib inline

def encode_image_to_data_uri(path: str) -> str:
    """Read an image file and return it as a base64 ``data:`` URI.

    The MIME type is guessed from the file extension. Anything that is not
    recognised as an image falls back to ``image/png``, which is what this
    function emitted unconditionally before.

    Args:
        path: Path of the image file to embed.

    Returns:
        A string of the form ``data:<mime>;base64,<payload>``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    import mimetypes  # local import so the notebook's import cell stays untouched

    mime, _ = mimetypes.guess_type(path)
    if mime is None or not mime.startswith("image/"):
        mime = "image/png"
    with open(path, "rb") as f:
        b64 = base64.b64encode(f.read()).decode("utf-8")
    return f"data:{mime};base64,{b64}"

# --- Dataset and output locations ---
DATASET_DIR = "VLM-Seminar25-Dataset/chest_xrays"
IMAGES_DIR = os.path.join(DATASET_DIR, "images")
ANNOT_PATH = os.path.join(DATASET_DIR, "annotations_len_50.json")
RESULTS_DIR = "../results/chest_xrays/grounding"

# Create the output folder up front so later cells can write into it.
os.makedirs(RESULTS_DIR, exist_ok=True)

# The annotation file is a JSON object keyed by image id.
with open(ANNOT_PATH, "r") as annot_file:
    annotations = json.load(annot_file)

image_ids = [*annotations.keys()]
total_count = len(image_ids)
unique_count = len(set(image_ids))  # sanity check for duplicate ids
print(f"Number of images in the dataset: {total_count}")
print(f"Number of images in the dataset: {unique_count}")

# --- API client: the key is read from the environment after loading user.env ---
load_dotenv(dotenv_path="../config/user.env")
api_key = os.getenv("NEBIUS_API_KEY")
client = OpenAI(api_key=api_key, base_url="https://api.studio.nebius.com/v1/")

Number of images in the dataset: 50
Number of images in the dataset: 50


In [2]:
# Set to True to query the model again (section 2) and overwrite the saved
# results file (section 3); with False both of those guarded cells do nothing.
do_new_inference = False

## 2. Model Inference
Inference is only run for unhealthy X-rays, i.e. images that have annotated bounding boxes.

In [3]:
# Query the model once per (image, disease) pair and collect the raw replies.
# Each entry is {"id": <image id>, "disease": <label>, "prediction": <reply text>}.
grounding_results = []
if do_new_inference:
    for img_id in tqdm(image_ids):
        print()  # emit a newline (presumably so output does not run into the tqdm bar)
        ann = annotations[img_id]
        boxes = ann.get("bbox_2d")
        # Healthy images and images without boxes have nothing to localise.
        if ann["status"] == "healthy" or not boxes:
            continue
        # The disease label is the 5th element of each box, so every box needs
        # at least five entries. The previous check compared the list itself
        # with 5, which raises TypeError.
        if any(len(box) < 5 for box in boxes):
            print(f"Skipping {img_id} due to insufficient bbox_2d data.")
            continue
        # Sorted so the request order is reproducible between runs.
        diseases_in_image = sorted({box[4] for box in boxes}, key=str)
        # The encoded image is identical for every disease; build it once.
        img_path = os.path.join(IMAGES_DIR, img_id + ".png")
        data_uri = encode_image_to_data_uri(img_path)
        for disease in diseases_in_image:
            prompt = f"Please locate {disease} and output bounding boxes as floats [x1, y1, x2, y2]. Output nothing else."
            completion = client.chat.completions.create(
                model="Qwen/Qwen2.5-VL-72B-Instruct",
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": prompt},
                            {"type": "image_url", "image_url": {"url": data_uri}},
                        ],
                    }
                ],
            )
            # The reply content can be None; treat that as an empty reply.
            pred = (completion.choices[0].message.content or "").strip()
            print(pred)
            grounding_results.append({"id": img_id, "disease": disease, "prediction": pred})

Interesting prompting note: without the extra instruction "If the disease is not present, output '[]'." the model outputs coordinates; when that instruction is added to the prompt, it often does not.

## 3. Save Model Predictions

In [None]:
# Persist freshly generated predictions; nothing happens unless inference was rerun.
if do_new_inference:
    results_path = os.path.join(RESULTS_DIR, "qwen2.5_grounding_results_per_disease.json")
    with open(results_path, "w") as out_file:
        json.dump(grounding_results, out_file, indent=2)
    print("Saved grounding results.")

Saved grounding results.


**Or load previously saved results**

In [4]:
# Read the stored predictions back; this replaces whatever grounding_results held.
results_path = os.path.join(RESULTS_DIR, "qwen2.5_grounding_results_per_disease.json")
with open(results_path, "r") as in_file:
    grounding_results = json.load(in_file)
print(f"Number of grounding results: {len(grounding_results)}")

Number of grounding results: 64


## 4. Evaluation & Metrics

In [42]:
# Make the "eval_scripts" folder importable. The path is relative, so it is
# resolved against the current working directory.
sys.path.append("eval_scripts")
# NOTE(review): compute_map_supervision and draw_boxes are project helpers whose
# signatures are not visible here — check calculate_map.py before changing call sites.
from calculate_map import compute_map_supervision, draw_boxes



### 4.1 Prepare Ground Truth and Predictions per Disease

We will organize the ground truth and predicted bounding boxes per disease for evaluation.

In [None]:
# Prepare ground truth and predictions per disease and per image.
#
# Module-level results built here:
#   disease_list / disease2id     sorted disease names and their class ids
#   true_boxes_per_img_disease    {(img_id, disease): [box, ...]}
#   pred_boxes_per_img_disease    {(img_id, disease): [box, ...]}
#   true_boxes_per_img            {img_id: [box, ...]}
#   pred_boxes_per_img            {img_id: [box, ...]}


def _is_number(value):
    """Return True for real numbers; bool is excluded on purpose."""
    return isinstance(value, (int, float)) and not isinstance(value, bool)


def _is_box(value):
    """Return True if *value* is a sequence starting with four numeric coordinates."""
    return (
        isinstance(value, (list, tuple))
        and len(value) >= 4
        and all(_is_number(v) for v in value[:4])
    )


def _parse_predicted_boxes(raw):
    """Turn one raw model reply into a list of boxes.

    Accepts a single flat box ``[x1, y1, x2, y2]``, a list of such boxes, or
    objects that carry the box under a ``"bbox_2d"`` key. A Markdown code
    fence around the JSON is tolerated. Replies that cannot be parsed and
    entries that are not boxes are dropped.

    Returns:
        A (possibly empty) list of boxes, each a list of numbers.
    """
    if not isinstance(raw, str):
        return []
    text = raw.strip()
    if text.startswith("```"):
        # Remove the fence and an optional "json" language tag.
        text = text.strip("`").strip()
        if text.lower().startswith("json"):
            text = text[4:]
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        return []
    if isinstance(parsed, dict) or _is_box(parsed):
        parsed = [parsed]  # a single object / flat box becomes a one-element list
    elif not isinstance(parsed, list):
        return []
    boxes = []
    for entry in parsed:
        if isinstance(entry, dict):
            entry = entry.get("bbox_2d")
        if _is_box(entry):
            boxes.append(list(entry))
    return boxes


disease_set = set()
true_boxes_per_img_disease = {}
pred_boxes_per_img_disease = {}
true_boxes_per_img = {}
pred_boxes_per_img = {}

# Collect all diseases named at image level.
for img_id, ann in annotations.items():
    disease_set.update(ann.get("global_disease") or [])

# Ground truth per image/disease.
for img_id, ann in annotations.items():
    if ann.get("global_disease") and ann.get("boxes"):
        labelled_boxes = list(zip(ann["global_disease"], ann["boxes"]))
    else:
        # Fallback: the inference cell reads boxes from "bbox_2d", where the
        # 5th element of each box is the disease label.
        # NOTE(review): confirm against the annotation schema which field is
        # the authoritative source of ground-truth boxes.
        labelled_boxes = [
            (box[4], list(box[:4]))
            for box in ann.get("bbox_2d") or []
            if len(box) >= 5
        ]
    for disease, box in labelled_boxes:
        disease_set.add(disease)  # no-op for labels already seen above
        true_boxes_per_img_disease.setdefault((img_id, disease), []).append(box)
        true_boxes_per_img.setdefault(img_id, []).append(box)

disease_list = sorted(disease_set)
disease2id = {d: i for i, d in enumerate(disease_list)}

# Predictions per image/disease.
for res in grounding_results:
    img_id = res["id"]
    disease = res["disease"]
    pred_boxes = _parse_predicted_boxes(res.get("prediction"))
    pred_boxes_per_img_disease.setdefault((img_id, disease), []).extend(pred_boxes)
    pred_boxes_per_img.setdefault(img_id, []).extend(pred_boxes)

print(f"Diseases: {disease_list}")
if true_boxes_per_img_disease:
    first_entry = next(iter(true_boxes_per_img_disease.items()))
    print(f"Example true boxes for first disease: {first_entry}")
else:
    # Previously this case crashed with IndexError on an empty list.
    print("No ground-truth boxes found in the annotations.")

### 4.2 Compute mAP per Disease

For each disease, we compute the mean Average Precision (mAP) across all images, considering only the bounding boxes and predictions for that disease. This helps to understand model performance for each abnormality type.

In [None]:
# Per-disease mAP: pool every ground-truth and predicted box of one disease
# over all images, score the pooled lists, and store the score by disease name.
# NOTE(review): boxes from different images end up in the same lists — confirm
# that compute_map_supervision is meant to be called on pooled boxes.
disease_map_results = {}
for disease in disease_list:
    class_id = disease2id[disease]
    keys = [(img_id, disease) for img_id in image_ids]
    all_true_boxes = [
        box for key in keys for box in true_boxes_per_img_disease.get(key, [])
    ]
    all_pred_boxes = [
        box for key in keys for box in pred_boxes_per_img_disease.get(key, [])
    ]
    # Every box in this pass belongs to the same class.
    all_true_classes = [class_id] * len(all_true_boxes)
    all_pred_classes = [class_id] * len(all_pred_boxes)
    result = compute_map_supervision(all_pred_boxes, all_pred_classes, all_true_boxes, all_true_classes)
    disease_map_results[disease] = result.map
    print(f"mAP for {disease}: {result.map:.4f}")

# Write the {disease: mAP} mapping next to the other results.
per_disease_path = os.path.join(RESULTS_DIR, "per_disease_map.json")
with open(per_disease_path, "w") as out_file:
    json.dump(disease_map_results, out_file, indent=2)

### 4.3 Compute Overall mAP

Here, we compute the overall mean Average Precision (mAP) across all diseases and all images, providing a single summary metric for the model's localization performance.

In [None]:
# Overall mAP: one pooled evaluation over every (image, disease) combination,
# with each box tagged by the class id of its disease.
all_true_boxes, all_true_classes = [], []
all_pred_boxes, all_pred_classes = [], []
image_disease_pairs = [(img_id, disease) for img_id in image_ids for disease in disease_list]
for pair in image_disease_pairs:
    class_id = disease2id[pair[1]]
    gt_boxes = true_boxes_per_img_disease.get(pair, [])
    pred_boxes = pred_boxes_per_img_disease.get(pair, [])
    all_true_boxes += gt_boxes
    all_true_classes += [class_id] * len(gt_boxes)
    all_pred_boxes += pred_boxes
    all_pred_classes += [class_id] * len(pred_boxes)

overall_result = compute_map_supervision(all_pred_boxes, all_pred_classes, all_true_boxes, all_true_classes)
print(f"Overall mAP: {overall_result.map:.4f}")

# Store the single summary number as JSON.
overall_path = os.path.join(RESULTS_DIR, "overall_map.json")
with open(overall_path, "w") as out_file:
    json.dump({"overall_map": overall_result.map}, out_file, indent=2)

### 4.4 Compute mAP per Image

For each image, we compute the mAP considering all ground truth and predicted boxes (across all diseases) in that image. This helps to identify images where the model performs well or struggles.

In [None]:
# Per-image mAP: score each image on all of its boxes, ignoring the disease
# label (every box gets class 0), and draw the boxes of any image that has some.
per_image_map = {}
for img_id in image_ids:
    gt_boxes = true_boxes_per_img.get(img_id, [])
    pred_boxes = pred_boxes_per_img.get(img_id, [])
    # Class-agnostic evaluation; use disease2id instead to distinguish diseases.
    result = compute_map_supervision(
        pred_boxes,
        [0] * len(pred_boxes),
        gt_boxes,
        [0] * len(gt_boxes),
    )
    per_image_map[img_id] = result.map
    print(f"Image {img_id}: mAP = {result.map:.4f}")
    # Draw every image that has at least one ground-truth or predicted box.
    if gt_boxes or pred_boxes:
        print(f"Plotting boxes for image {img_id}")
        draw_boxes(pred_boxes, gt_boxes)

# Write the {image id: mAP} mapping to disk.
per_image_path = os.path.join(RESULTS_DIR, "per_image_map.json")
with open(per_image_path, "w") as out_file:
    json.dump(per_image_map, out_file, indent=2)

### 4.5 Further mAP-based Analyses and Plots

To further analyze model performance, consider:
- Plotting mAP vs. disease frequency (how common each disease is in the dataset)
- Plotting mAP vs. bounding box size (are small/large abnormalities harder to detect?)
- Precision-Recall (PR) curves per disease
- Confusion matrix for disease classification (if applicable)
- Visualizing best/worst performing images or diseases
- Analyzing mAP by image quality or other metadata

These analyses can help identify strengths and weaknesses of the model and guide further improvements.