# 🧠 Qwen2.5 Brain MRI Description Generation

Generate medical descriptions for each MRI slice using Qwen2.5 and evaluate with BLEU.

## 1. Setup & Imports

In [None]:
import os, json, base64
from openai import OpenAI
from dotenv import load_dotenv
from tqdm import tqdm
import matplotlib.pyplot as plt
%matplotlib inline

def encode_image_to_data_uri(path: str) -> str:
    with open(path, "rb") as f:
        b64 = base64.b64encode(f.read()).decode('utf-8')
    return f"data:image/png;base64,{b64}"

DATASET_DIR = "VLM-Seminar25-Dataset/nova_brain"
IMAGES_DIR = os.path.join(DATASET_DIR, "images")
ANNOT_PATH = os.path.join(DATASET_DIR, "annotations.json")
RESULTS_DIR = "../results/nova_brain/descriptions"
os.makedirs(RESULTS_DIR, exist_ok=True)

with open(ANNOT_PATH, "r") as f:
    annotations = json.load(f)
case_ids = list(annotations.keys())

load_dotenv(dotenv_path="../config/user.env")
api_key = os.environ.get("NEBIUS_API_KEY")
client = OpenAI(base_url="https://api.studio.nebius.com/v1/", api_key=api_key)

In [3]:
do_new_inference = True 

## 2. Model Inference

In [4]:
description_results = []
if do_new_inference:
    one_shot = True  # NOTE Set to False if you want to use zero-shot prompting

    for case_id in tqdm(case_ids):
        case = annotations[case_id]
        for img_name, img_info in case.get("image_findings", {}).items():
            img_path = os.path.join(IMAGES_DIR, img_name)
            data_uri = encode_image_to_data_uri(img_path)
            prompt = "Please describe the given medical image as if you are a Doctor trying to analyze. Goal is a very concise and accurate description of the image content, focusing on abnormalities or significant findings. The description should be in English and suitable for a medical report."
            prompt_one_shot = "Please describe the given medical image as if you are a Doctor trying to analyze. Goal is a very concise and accurate description of the image content, focusing on abnormalities or significant findings. The description should be in English and suitable for a medical report. E.g. 'Axial T2 - weighted image showed high signal of the pons.'"
            if one_shot:
                prompt = prompt_one_shot
            
            completion = client.chat.completions.create(
                model="Qwen/Qwen2.5-VL-72B-Instruct",
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": prompt},
                            {"type": "image_url", "image_url": {"url": data_uri}},
                        ],
                    }
                ],
            )
            pred = completion.choices[0].message.content.strip()
            description_results.append({
                "case_id": case_id,
                "image": img_name,
                "prediction": pred,
                "ground_truth": img_info.get("caption", ""),
                "one_shot": one_shot,
            })

100%|██████████| 25/25 [07:54<00:00, 18.99s/it]


## 3. Save Model Predictions

In [9]:
if not do_new_inference:
    # set one_shot based on the existing results 
    one_shot = False
    print(one_shot)
else:
    print(one_shot)

True


In [11]:
name = "qwen2_5_description_results_one_shot.json" if one_shot else "qwen2_5_description_results_zero_shot.json"
with open(os.path.join(RESULTS_DIR, name), "w") as f:
    json.dump(description_results, f, indent=2)
print("Saved description results.")

Saved description results.


Load results

In [12]:
with open(os.path.join(RESULTS_DIR, name), "r") as f:
    description_results = json.load(f)
print(f"Number of descriptions: {len(description_results)}")

Number of descriptions: 92


In [13]:
description_results

[{'case_id': 'case0061',
  'image': 'case0061_001.png',
  'prediction': 'The provided axial MRI image demonstrates a large, well-defined hypointense mass occupying the central portion of the brain, likely within the ventricular system. This mass appears to have a heterogeneous texture with areas of possible calcification or hemorrhage. Surrounding the mass, there is evidence of perilesional edema, indicated by the adjacent hyperintense signal in the cerebral parenchyma. There is also notable midline shift, suggesting mass effect and potential compression of surrounding structures. Further evaluation with contrast-enhanced imaging and additional sequences may be necessary to characterize the lesion further and determine its etiology.',
  'ground_truth': 'Axial contrast-enhanced T1 3D sequence shows an isointense mass and slight enhancement.',
  'one_shot': True},
 {'case_id': 'case0061',
  'image': 'case0061_002.png',
  'prediction': 'The provided axial MRI image appears to be a T1-weig

## 4. Evaluation & Metrics

### Using BLEU

In [19]:
# Evaluation: Compute BLEU for each image and save as JSON
import sys
sys.path.append("eval_scripts")
from evaluate_bleu import evaluate_bleu
import pandas as pd

gt = [x["ground_truth"] for x in description_results]
pred = [x["prediction"] for x in description_results]

bleu_results = evaluate_bleu(gt, pred)

# Save as JSON
bleu_json_path = os.path.join(RESULTS_DIR, "bleu_per_image.json")
with open(bleu_json_path, "w") as f:
    json.dump(bleu_results, f, indent=2)
print(f"Saved per-image BLEU results to {bleu_json_path}")

# Optionally, convert to DataFrame for further analysis
bleu_df = pd.DataFrame(bleu_results)
bleu_df.head()

LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - 'C:\\Users\\dgars/nltk_data'
    - 'c:\\Users\\dgars\\miniconda3\\envs\\ml4reg\\nltk_data'
    - 'c:\\Users\\dgars\\miniconda3\\envs\\ml4reg\\share\\nltk_data'
    - 'c:\\Users\\dgars\\miniconda3\\envs\\ml4reg\\lib\\nltk_data'
    - 'C:\\Users\\dgars\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


### Using LLM to evaluate

## 5. Visualize Results: Correct Descriptions (BLEU ≥ 0.5)
Show examples where the BLEU score is high (≥ 0.5).

In [None]:
from PIL import Image

def show_description_examples(examples, title, max_n=4):
    n = min(len(examples), max_n)
    if n == 0:
        print(f"No examples for {title}")
        return
    fig, axes = plt.subplots(1, n, figsize=(6*n, 6))
    if n == 1:
        axes = [axes]
    for i, (img_path, gt, pred, bleu) in enumerate(examples[:n]):
        img = Image.open(img_path).convert("RGB")
        axes[i].imshow(img)
        axes[i].set_title(f"BLEU: {bleu:.2f}", fontsize=14)
        axes[i].axis('off')
        axes[i].text(0, -10, f"GT: {gt}\n\nPred: {pred}", fontsize=10, wrap=True)
    plt.suptitle(title, fontsize=18)
    plt.tight_layout()
    plt.show()

correct = []
for x, bleu in zip(description_results, bleu_scores):
    if bleu >= 0.5:
        img_path = os.path.join(IMAGES_DIR, x["image"])
        correct.append((img_path, x["ground_truth"], x["prediction"], bleu))

show_description_examples(correct, "Correct Descriptions (BLEU ≥ 0.5)")

## 6. Visualize Results: Incorrect Descriptions (BLEU < 0.5)
Show examples where the BLEU score is low (&lt; 0.5).

In [None]:
incorrect = []
for x, bleu in zip(description_results, bleu_scores):
    if bleu < 0.5:
        img_path = os.path.join(IMAGES_DIR, x["image"])
        incorrect.append((img_path, x["ground_truth"], x["prediction"], bleu))

show_description_examples(incorrect, "Incorrect Descriptions (BLEU < 0.5)")