In [1]:
pip install evaluate

Note: you may need to restart the kernel to use updated packages.


In [2]:
import torch
from transformers import BlipProcessor, BlipForQuestionAnswering
from PIL import Image
from datasets import load_dataset
import tqdm
import evaluate

processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

dataset = load_dataset("mdwiratathya/SLAKE-vqa-english")
eval_split = dataset["validation"]

bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")

predictions = []
references = []
exact_match = 0

for sample in tqdm.tqdm(eval_split, desc="Evaluating"):
    image = sample["image"]
    question = sample["question"]
    ref_answer = sample["answer"]

    inputs = processor(image, question, return_tensors="pt").to(device)

    out = model.generate(**inputs, max_new_tokens=20)
    pred_answer = processor.decode(out[0], skip_special_tokens=True).strip()

    predictions.append(pred_answer)
    references.append(ref_answer)

    if pred_answer.lower() == ref_answer.lower():
        exact_match += 1

accuracy = exact_match / len(eval_split)
bleu_result = bleu_metric.compute(predictions=predictions, references=references)
rouge_result = rouge_metric.compute(predictions=predictions, references=references)

print("Evaluation Results on PathVQA (Validation):")
print(f"Accuracy (Exact Match): {accuracy:.4f}")
print("BLEU:", bleu_result)
print("ROUGE:", rouge_result)


README.md:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

train-00000-of-00002.parquet:   0%|          | 0.00/31.1M [00:00<?, ?B/s]

train-00001-of-00002.parquet:   0%|          | 0.00/12.2M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/8.34M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/9.59M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/4919 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1053 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1061 [00:00<?, ? examples/s]

Evaluating: 100%|██████████████████████████████████████████████████████████████████| 1053/1053 [00:58<00:00, 17.91it/s]


Evaluation Results on PathVQA (Validation):
Accuracy (Exact Match): 0.2555
BLEU: {'bleu': 0.0, 'precisions': [0.02286198137171888, 0.0, 0.0, 0.0], 'brevity_penalty': 0.7001381814219072, 'length_ratio': 0.737203495630462, 'translation_length': 1181, 'reference_length': 1602}
ROUGE: {'rouge1': 0.2687084520417854, 'rouge2': 0.006647673314339981, 'rougeL': 0.2682177904400127, 'rougeLsum': 0.26858182969294075}
