In [3]:
# Test Fine-Tuned BLIP Model and Save Results

In [4]:
# Step 1: Import Required Libraries
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration
from datasets import load_dataset
import matplotlib.pyplot as plt

In [5]:
# Step 2: Load Fine-Tuned Model
print("Loading fine-tuned BLIP model...")
processor = BlipProcessor.from_pretrained("./blip-finetuned")
model = BlipForConditionalGeneration.from_pretrained("./blip-finetuned")

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Loading fine-tuned BLIP model...


BlipForConditionalGeneration(
  (vision_model): BlipVisionModel(
    (embeddings): BlipVisionEmbeddings(
      (patch_embedding): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    )
    (encoder): BlipEncoder(
      (layers): ModuleList(
        (0-11): 12 x BlipEncoderLayer(
          (self_attn): BlipAttention(
            (dropout): Dropout(p=0.0, inplace=False)
            (qkv): Linear(in_features=768, out_features=2304, bias=True)
            (projection): Linear(in_features=768, out_features=768, bias=True)
          )
          (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): BlipMLP(
            (activation_fn): GELUActivation()
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=True)
          )
          (layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
      )
    )
    (post_layernorm): LayerNorm((768,), eps=1e-0

In [6]:
# Step 3: Load Test Dataset
print("Loading test dataset...")
dataset = load_dataset("eltorio/ROCOv2-radiology")
test_data = dataset["test"]

Loading test dataset...


Resolving data files:   0%|          | 0/27 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/27 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/27 [00:00<?, ?it/s]

In [7]:
pip install tqdm

Note: you may need to restart the kernel to use updated packages.


In [8]:
# Step 4: Generate Captions for Test Images Without Displaying Images
from tqdm import tqdm
model.eval()
results = []

print("Generating captions for test images...")
for idx, example in enumerate(tqdm(test_data)):
    image = example["image"]
    true_caption = example["caption"]

    # Preprocess the image
    inputs = processor(images=image, return_tensors="pt").to(device)

    # Generate caption
    with torch.no_grad():
        output = model.generate(**inputs)
    generated_caption = processor.decode(output[0], skip_special_tokens=True)

    # Save result
    results.append({
        "Index": idx,
        "Generated Caption": generated_caption,
        "True Caption": true_caption
    })

Generating captions for test images...


 62%|█████████████████████████████████████████████▋                            | 6125/9927 [5:13:29<3:14:35,  3.07s/it]


RuntimeError: [enforce fail at alloc_cpu.cpp:116] data. DefaultCPUAllocator: not enough memory: you tried to allocate 1772544 bytes.

In [9]:
import json

# Save partial results now
with open("partial_results.json", "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=4)

print("✅ Saved current results into partial_results.json")

✅ Saved current results into partial_results.json


In [29]:
import json
from tqdm import tqdm
import torch

# فرض بر این است که:
# - model و processor و device قبلا آماده شدن
# - test_data موجود است

# 🧠 تعداد نمونه‌های قبلی پردازش شده رو بخون
with open('partial_results.json', 'r', encoding='utf-8') as f:
    previous_results = json.load(f)

start_idx = len(previous_results)

print(f"🚀 Continuing caption generation from index {start_idx}...")

# فقط داده‌های باقیمانده
test_data_remaining = test_data[start_idx:]

new_results = []

model.eval()
for idx_offset, example in enumerate(tqdm(test_data_remaining)):
    idx = start_idx + idx_offset  # خودمون idx واقعی حساب می‌کنیم
    image = example["image"]
    true_caption = example["caption"]

    inputs = processor(images=image, return_tensors="pt").to(device)

    with torch.no_grad():
        output = model.generate(**inputs)

    generated_caption = processor.decode(output[0], skip_special_tokens=True)

    new_results.append({
        "Index": idx,
        "Generated Caption": generated_caption,
        "True Caption": true_caption
    })

# ✅ ذخیره نتایج جدید
with open('new_results.json', 'w', encoding='utf-8') as f:
    json.dump(new_results, f, ensure_ascii=False, indent=4)

print(f"✅ Saved new results ({len(new_results)} items) to new_results.json")

🚀 Continuing caption generation from index 6125...


  0%|                                                                                            | 0/4 [00:00<?, ?it/s]


TypeError: string indices must be integers, not 'str'

In [None]:
# 📚 Load both partial and new results
with open('partial_results.json', 'r', encoding='utf-8') as f:
    partial = json.load(f)

with open('new_results.json', 'r', encoding='utf-8') as f:
    new = json.load(f)

# 🔥 Combine them
final_results = partial + new

# 📂 Save final combined results
with open('final_results.json', 'w', encoding='utf-8') as f:
    json.dump(final_results, f, ensure_ascii=False, indent=4)

print(f"✅ Combined results saved to final_results.json (Total: {len(final_results)} samples)")

In [10]:
# Step 5: Save Results to File
output_path = "full_test_results.txt"
with open(output_path, "w", encoding="utf-8") as f:
    for result in results:
        f.write(f"Image {result['Index']}\n")
        f.write(f"Generated Caption: {result['Generated Caption']}\n")
        f.write(f"True Caption: {result['True Caption']}\n")
        f.write("-" * 50 + "\n")

print(f"Testing completed! All results saved to {output_path}")


Testing completed! All results saved to full_test_results.txt


In [28]:
import json

# Load the final results
with open("final_results.json", "r", encoding="utf-8") as f:
    final_results = json.load(f)

# Separate generated and true captions
generated_captions = [item["Generated Caption"] for item in final_results]
true_captions = [item["True Caption"] for item in final_results]

print(f"✅ Loaded {len(generated_captions)} generated captions and {len(true_captions)} true captions.")

UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 3003: character maps to <undefined>

In [27]:
import evaluate
bertscore = evaluate.load("bertscore")
rouge = evaluate.load("rouge")

# Evaluate BERTScore

print("\n🚀 Calculating BERTScore...")
bert_results = bertscore.compute(predictions=generated_captions, references=true_captions, lang="en")

# Evaluate ROUGE
print("\n🚀 Calculating ROUGE...")
rouge_results = rouge.compute(predictions=generated_captions, references=true_captions)

# 📋 Display Results
print("\n📊 Evaluation Results:")
print(f"Average BERTScore (F1): {sum(bert_results['f1'])/len(bert_results['f1']):.4f}")
print(f"ROUGE-1 F1 Score: {rouge_results['rouge1']:.4f}")
print(f"ROUGE-2 F1 Score: {rouge_results['rouge2']:.4f}")
print(f"ROUGE-L F1 Score: {rouge_results['rougeL']:.4f}")

print("\n✅ Evaluation Completed!")


🚀 Calculating BERTScore...


NameError: name 'generated_captions' is not defined

In [18]:
pip install evaluate

Collecting evaluateNote: you may need to restart the kernel to use updated packages.

  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
Installing collected packages: evaluate
Successfully installed evaluate-0.4.3


In [22]:
pip install bert_score

Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
Installing collected packages: bert_score
Successfully installed bert_score-0.3.13
Note: you may need to restart the kernel to use updated packages.


In [23]:
!pip install bert_score



In [25]:
pip install rouge_score absl-py

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting absl-py
  Downloading absl_py-2.2.2-py3-none-any.whl.metadata (2.6 kB)
Downloading absl_py-2.2.2-py3-none-any.whl (135 kB)
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (pyproject.toml): started
  Building wheel for rouge_score (pyproject.toml): finished with status 'done'
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=25026 sha256=14546d46841e90512e7c113c58a9bb987bf6ef2c5d1eaff0a2991746c0fdc96d
  Stored in directory: c:\users\saras\appdata\local\pip\cache\wheels\44\af\da\5ffc433e2786f0b1a9c6f458d5fb8f611d8eb3

In [26]:
!pip install rouge_score absl-py

