In [3]:
# Test Fine-Tuned BLIP Model and Save Results

In [4]:
# Step 1: Import Required Libraries
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration
from datasets import load_dataset
import matplotlib.pyplot as plt

In [5]:
# Step 2: Load Fine-Tuned Model
# The processor and the captioning weights are both read from the same local directory.
print("Loading fine-tuned BLIP model...")
checkpoint_dir = "./blip-finetuned"
processor = BlipProcessor.from_pretrained(checkpoint_dir)
model = BlipForConditionalGeneration.from_pretrained(checkpoint_dir)

# Run on the GPU when CUDA is available, otherwise fall back to the CPU
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
model.to(device)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Loading fine-tuned BLIP model...


BlipForConditionalGeneration(
  (vision_model): BlipVisionModel(
    (embeddings): BlipVisionEmbeddings(
      (patch_embedding): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    )
    (encoder): BlipEncoder(
      (layers): ModuleList(
        (0-11): 12 x BlipEncoderLayer(
          (self_attn): BlipAttention(
            (dropout): Dropout(p=0.0, inplace=False)
            (qkv): Linear(in_features=768, out_features=2304, bias=True)
            (projection): Linear(in_features=768, out_features=768, bias=True)
          )
          (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): BlipMLP(
            (activation_fn): GELUActivation()
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=True)
          )
          (layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
      )
    )
    (post_layernorm): LayerNorm((768,), eps=1e-0

In [6]:
# Step 3: Load Test Dataset
# Load the ROCOv2-radiology dataset from the Hub, then keep a handle on its "test" split.
print("Loading test dataset...")
dataset_name, split_name = "eltorio/ROCOv2-radiology", "test"
dataset = load_dataset(dataset_name)
test_data = dataset[split_name]

Loading test dataset...


Resolving data files:   0%|          | 0/27 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/27 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/27 [00:00<?, ?it/s]

In [7]:
pip install tqdm

Note: you may need to restart the kernel to use updated packages.


In [8]:
# Step 4: Generate Captions for Test Images Without Displaying Images
import json
import os

from tqdm import tqdm

# Captioning the whole test split takes hours, so the collected captions are
# checkpointed to disk while the loop runs. If the run is interrupted, only the
# captions produced since the last checkpoint can be lost.
CHECKPOINT_PATH = "partial_results.json"
CHECKPOINT_EVERY = 500  # number of captioned images between two checkpoints


def save_checkpoint(records, path=CHECKPOINT_PATH):
    """Write the result records to ``path`` as UTF-8 JSON.

    Args:
        records: list of dicts with the keys "Index", "Generated Caption"
            and "True Caption".
        path: destination file.

    The JSON is first written to a temporary file next to ``path`` and then
    moved over it with ``os.replace``, so a failure during the write does not
    leave a half-written checkpoint behind.
    """
    tmp_path = f"{path}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(records, f, ensure_ascii=False, indent=4)
    os.replace(tmp_path, path)


model.eval()
results = []

print("Generating captions for test images...")
try:
    for idx, example in enumerate(tqdm(test_data)):
        image = example["image"]
        true_caption = example["caption"]

        # Preprocess the image
        inputs = processor(images=image, return_tensors="pt").to(device)

        # Generate caption (no gradients are needed for inference)
        with torch.no_grad():
            output = model.generate(**inputs)
        generated_caption = processor.decode(output[0], skip_special_tokens=True)

        # Save result
        results.append({
            "Index": idx,
            "Generated Caption": generated_caption,
            "True Caption": true_caption
        })

        # Periodic checkpoint so a dead kernel does not cost the whole run
        if len(results) % CHECKPOINT_EVERY == 0:
            save_checkpoint(results)
finally:
    # Flush whatever was collected, whether the loop finished or raised
    # part-way through (e.g. on an out-of-memory error).
    save_checkpoint(results)

Generating captions for test images...


 62%|█████████████████████████████████████████████▋                            | 6125/9927 [5:13:29<3:14:35,  3.07s/it]


RuntimeError: [enforce fail at alloc_cpu.cpp:116] data. DefaultCPUAllocator: not enough memory: you tried to allocate 1772544 bytes.

In [9]:
import json

# Write the captions collected so far to disk
partial_json = json.dumps(results, ensure_ascii=False, indent=4)
with open("partial_results.json", "w", encoding="utf-8") as out_file:
    out_file.write(partial_json)

print("✅ Saved current results into partial_results.json")

✅ Saved current results into partial_results.json


In [14]:
import json
from tqdm import tqdm
import torch

# Assumes `model`, `processor`, `device` and `test_data` already exist in the kernel.

NEW_RESULTS_PATH = 'new_results.json'
SAVE_EVERY = 500  # number of newly captioned images between two saves


def save_new_results(records, path=NEW_RESULTS_PATH):
    """Write ``records`` (a list of result dicts) to ``path`` as UTF-8 JSON."""
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(records, f, ensure_ascii=False, indent=4)


# 🧠 Read how many samples were already processed in the previous run
with open('partial_results.json', 'r', encoding='utf-8') as f:
    previous_results = json.load(f)

start_idx = len(previous_results)

print(f"🚀 Continuing caption generation from index {start_idx}...")

# Only the remaining samples (using the `select` method of the dataset).
# When everything has already been captioned the range is empty; in that case
# `select` is skipped and an empty list is iterated instead.
# NOTE(review): `select` may reject an empty/out-of-range selection — confirm.
remaining_indices = range(start_idx, len(test_data))
if len(remaining_indices) > 0:
    test_data_remaining = test_data.select(remaining_indices)
else:
    test_data_remaining = []

new_results = []

model.eval()
try:
    for idx_offset, example in enumerate(tqdm(test_data_remaining)):
        idx = start_idx + idx_offset  # real index within test_data
        image = example["image"]
        true_caption = example["caption"]

        inputs = processor(images=image, return_tensors="pt").to(device)

        with torch.no_grad():
            output = model.generate(**inputs)

        generated_caption = processor.decode(output[0], skip_special_tokens=True)

        new_results.append({
            "Index": idx,
            "Generated Caption": generated_caption,
            "True Caption": true_caption
        })

        # Periodic save so an interrupted run keeps most of its work
        if len(new_results) % SAVE_EVERY == 0:
            save_new_results(new_results)
finally:
    # ✅ Save the new results, whether the loop finished or raised part-way
    save_new_results(new_results)

print(f"✅ Saved new results ({len(new_results)} items) to new_results.json")

🚀 Continuing caption generation from index 6125...


100%|████████████████████████████████████████████████████████████████████████████| 3802/3802 [2:58:51<00:00,  2.82s/it]

✅ Saved new results (3802 items) to new_results.json





In [15]:
def _read_json(path):
    """Return the parsed content of the UTF-8 JSON file at ``path``."""
    with open(path, 'r', encoding='utf-8') as handle:
        return json.load(handle)


# Read the results of the first (interrupted) run and of the resumed run
partial = _read_json('partial_results.json')
new = _read_json('new_results.json')

# Concatenate them, first run followed by resumed run
final_results = [*partial, *new]

# Write the merged list to a single file
with open('final_results.json', 'w', encoding='utf-8') as merged_file:
    json.dump(final_results, merged_file, ensure_ascii=False, indent=4)

print(f"✅ Combined results saved to final_results.json (Total: {len(final_results)} samples)")

✅ Combined results saved to final_results.json (Total: 9927 samples)


In [31]:
import json

# Read the merged results back from disk
with open("final_results.json", "r", encoding="utf-8") as results_file:
    final_results = json.load(results_file)

# Split every record into the model's caption and the reference caption,
# keeping the two lists aligned by position
generated_captions, true_captions = [], []
for record in final_results:
    generated_captions.append(record["Generated Caption"])
    true_captions.append(record["True Caption"])

print(f"✅ Loaded {len(generated_captions)} generated captions and {len(true_captions)} true captions.")

✅ Loaded 9927 generated captions and 9927 true captions.


In [32]:
import evaluate

# Metric objects from the `evaluate` library
bertscore = evaluate.load("bertscore")
rouge = evaluate.load("rouge")

# BERTScore of the generated captions against the reference captions
print("\n🚀 Calculating BERTScore...")
bert_results = bertscore.compute(
    predictions=generated_captions,
    references=true_captions,
    lang="en",
)

# ROUGE on the same prediction/reference pairs
print("\n🚀 Calculating ROUGE...")
rouge_results = rouge.compute(
    predictions=generated_captions,
    references=true_captions,
)

# 📋 Report: BERTScore F1 is averaged over the per-sample values
bert_f1_scores = bert_results['f1']
mean_bert_f1 = sum(bert_f1_scores) / len(bert_f1_scores)

print("\n📊 Evaluation Results:")
print(f"Average BERTScore (F1): {mean_bert_f1:.4f}")
print(f"ROUGE-1 F1 Score: {rouge_results['rouge1']:.4f}")
print(f"ROUGE-2 F1 Score: {rouge_results['rouge2']:.4f}")
print(f"ROUGE-L F1 Score: {rouge_results['rougeL']:.4f}")

print("\n✅ Evaluation Completed!")


🚀 Calculating BERTScore...


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



🚀 Calculating ROUGE...

📊 Evaluation Results:
Average BERTScore (F1): 0.8383
ROUGE-1 F1 Score: 0.1685
ROUGE-2 F1 Score: 0.0465
ROUGE-L F1 Score: 0.1516

✅ Evaluation Completed!


In [34]:
import numpy as np

# Save evaluation results
# The BERTScore F1 values are averaged here by hand (with NumPy)
bert_f1_scores = np.asarray(bert_results["f1"])
average_bert_f1 = bert_f1_scores.mean()

# Text of the report that is written to disk
results_text = f"""
📊 Evaluation Results:

Average BERTScore (F1): {average_bert_f1:.4f}

ROUGE-1 F1 Score: {rouge_results['rouge1']:.4f}
ROUGE-2 F1 Score: {rouge_results['rouge2']:.4f}
ROUGE-L F1 Score: {rouge_results['rougeL']:.4f}
"""

with open("evaluation_results.txt", "w", encoding="utf-8") as report_file:
    report_file.write(results_text)

print("✅ Results saved to evaluation_results.txt")

✅ Results saved to evaluation_results.txt


In [39]:
import evaluate
from transformers import AutoTokenizer

# Tokenizer of the Bio_ClinicalBERT checkpoint
medbert_checkpoint = "emilyalsentzer/Bio_ClinicalBERT"
tokenizer = AutoTokenizer.from_pretrained(medbert_checkpoint)

# The "bertscore" metric object used for the MedBERTScore computation
medbertscore = evaluate.load("bertscore")

# 🔥 Improved smart_truncate based on token limits
def smart_truncate(text, max_tokens=512):
    """Return ``text`` shortened so that it fits within ``max_tokens`` tokens.

    Token counts are the ones produced by the module-level ``tokenizer`` with
    its default call settings.

    Args:
        text: the caption to check.
        max_tokens: maximum number of tokens allowed.

    Returns:
        ``text`` itself, unchanged, when it already fits. Otherwise the text
        obtained by truncating the encoding to ``max_tokens`` and decoding it
        back with special tokens skipped.
    """
    # Encode once without truncation to find out whether any cut is needed.
    # Decoding an encoding is not guaranteed to reproduce the input exactly,
    # so text that already fits is returned as-is instead of round-tripped.
    full_ids = tokenizer(text, truncation=False)["input_ids"]
    if len(full_ids) <= max_tokens:
        return text

    # Let the tokenizer do the truncation so the limit is applied the same way
    # it counts tokens; plain Python lists are enough here (no tensors needed).
    truncated = tokenizer(text, truncation=True, max_length=max_tokens)
    return tokenizer.decode(truncated["input_ids"], skip_special_tokens=True)

# Run every caption (generated and reference) through smart_truncate
generated_captions_truncated = list(map(smart_truncate, generated_captions))
true_captions_truncated = list(map(smart_truncate, true_captions))

# Score the truncated captions with the Bio_ClinicalBERT model
print("\n🚀 Calculating MedBERTScore...")
medbert_results = medbertscore.compute(
    predictions=generated_captions_truncated,
    references=true_captions_truncated,
    model_type="emilyalsentzer/Bio_ClinicalBERT",
    num_layers=12,
    lang="en",
    batch_size=8,
)


def _mean(values):
    """Arithmetic mean of a non-empty sequence of numbers."""
    return sum(values) / len(values)


# Average each per-sample score list once and reuse the values below
avg_precision = _mean(medbert_results['precision'])
avg_recall = _mean(medbert_results['recall'])
avg_f1 = _mean(medbert_results['f1'])

# Show results
print("\n📊 MedBERTScore Evaluation Results:")
print(f"Average Precision: {avg_precision:.4f}")
print(f"Average Recall: {avg_recall:.4f}")
print(f"Average F1 Score: {avg_f1:.4f}")

# Write the same summary to a text file
with open("medbertscore_results.txt", "w", encoding="utf-8") as summary_file:
    summary_file.write(f"""
📊 MedBERTScore Evaluation Results:

Average Precision: {avg_precision:.4f}
Average Recall: {avg_recall:.4f}
Average F1 Score: {avg_f1:.4f}
""")

print("\n✅ MedBERTScore Results Saved!")


🚀 Calculating MedBERTScore...

📊 MedBERTScore Evaluation Results:
Average Precision: 0.7236
Average Recall: 0.6813
Average F1 Score: 0.7012

✅ MedBERTScore Results Saved!
