In [1]:
pip install evaluate

Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install open_clip_torch==2.23.0 transformers==4.35.2 datasets tqdm evaluate

Collecting open_clip_torch==2.23.0
  Downloading open_clip_torch-2.23.0-py3-none-any.whl.metadata (30 kB)
Collecting transformers==4.35.2
  Downloading transformers-4.35.2-py3-none-any.whl.metadata (123 kB)
Collecting ftfy (from open_clip_torch==2.23.0)
  Using cached ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Collecting timm (from open_clip_torch==2.23.0)
  Downloading timm-1.0.15-py3-none-any.whl.metadata (52 kB)
Collecting tokenizers<0.19,>=0.14 (from transformers==4.35.2)
  Downloading tokenizers-0.15.2-cp312-none-win_amd64.whl.metadata (6.8 kB)
Downloading open_clip_torch-2.23.0-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------- ----------- 1.0/1.5 MB 7.1 MB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 3.9 MB/s eta 0:00:00
Downloading transformers-4.35.2-py3-none-any.whl (7.9 MB)
   ---------------------------------------- 0.0/7.9 MB ? eta -:--:--
   ----- -----------------------

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
llava-med 1.5.0 requires backoff, which is not installed.
llava-med 1.5.0 requires fastapi, which is not installed.
llava-med 1.5.0 requires gradio==3.35.2, which is not installed.
llava-med 1.5.0 requires gradio_client==0.2.9, which is not installed.
llava-med 1.5.0 requires markdown2[all], which is not installed.
llava-med 1.5.0 requires openai==1.12.0, which is not installed.
llava-med 1.5.0 requires shortuuid, which is not installed.
llava-med 1.5.0 requires tiktoken, which is not installed.
llava-med 1.5.0 requires uvicorn, which is not installed.
llava-med 1.5.0 requires accelerate==0.21.0, but you have accelerate 1.3.0 which is incompatible.
llava-med 1.5.0 requires bitsandbytes==0.41.0, but you have bitsandbytes 0.45.3 which is incompatible.
llava-med 1.5.0 requires einops==0.6.1, but you have einops 0.8.1

In [1]:
import torch
from open_clip import create_model_from_pretrained, get_tokenizer
from PIL import Image
from datasets import load_dataset
import tqdm
import evaluate


# Single hub identifier used for both the model weights and the tokenizer,
# so the two can never drift apart.
BIOMEDCLIP_HUB_ID = "hf-hub:microsoft/BiomedCLIP-PubMedBERT_256-vit_base_patch16_224"

# Load the pretrained model with its image preprocessing transform, and the
# matching text tokenizer.
model, preprocess = create_model_from_pretrained(BIOMEDCLIP_HUB_ID)
tokenizer = get_tokenizer(BIOMEDCLIP_HUB_ID)

# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Move the model to the chosen device and switch it to evaluation mode.
model = model.to(device).eval()

# Load the SLAKE (English) VQA dataset and evaluate on its validation split.
dataset = load_dataset("mdwiratathya/SLAKE-vqa-english")
eval_split = dataset["validation"]

# Candidate pool for retrieval: every distinct answer string in the split.
# - Sorting gives a stable order across runs; a bare set of strings is ordered
#   by randomized hashes, which made candidate indices (and argmax tie-breaks)
#   differ from run to run.
# - Reading the "answer" column directly avoids iterating over whole rows,
#   which would also load every sample's image just to read its answer.
all_answers = sorted(set(eval_split["answer"]))



# Tokenize every candidate answer once and move the token ids to the model's
# device. (The previously computed pad id / padding mask were never used and
# have been dropped.)
ans_ids = tokenizer(all_answers).to(device)  # [N, L]

# Encode the candidates once, up front, and L2-normalize them so that the dot
# product with a normalized image embedding is a cosine similarity.
with torch.no_grad():
    answer_feats = model.encode_text(ans_ids)
    answer_feats = answer_feats / answer_feats.norm(dim=-1, keepdim=True)


# Text-overlap metrics computed after the loop, plus accumulators for the
# per-sample predictions, the ground-truth answers and the exact-match count.
bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")
predictions, references = [], []
exact_match = 0

# The logit scale does not change between samples, so compute it once.
# It is evaluated under no_grad because it is presumably a learnable tensor
# (TODO confirm) and no gradients are needed during evaluation.
with torch.no_grad():
    scale = model.logit_scale.exp()

# NOTE(review): only the image is matched against the candidate answers; the
# sample's question is never read, so every question about the same image
# receives the same prediction. That likely explains the very low accuracy;
# consider conditioning the text side on the question.
for sample in tqdm.tqdm(eval_split):
    img = sample["image"]
    ref = sample["answer"]

    # Keep the whole forward pass and the similarity computation inside
    # no_grad so that no autograd graph is built per sample.
    with torch.no_grad():
        img_in = preprocess(img).unsqueeze(0).to(device)
        img_feat = model.encode_image(img_in)
        img_feat = img_feat / img_feat.norm(dim=-1, keepdim=True)

        # Scaled cosine similarity between the image and every candidate.
        logits = img_feat @ answer_feats.T * scale
        idx = logits.argmax(-1).item()

    # The best-scoring candidate is the predicted answer.
    pred = all_answers[idx]

    predictions.append(pred)
    references.append(ref)
    # Exact match is case-insensitive.
    if pred.lower() == ref.lower():
        exact_match += 1

# Fraction of samples whose predicted answer matched the reference.
accuracy = exact_match / len(eval_split)

# Both text-overlap metrics are scored on the same prediction/reference pairs.
scored_pairs = dict(predictions=predictions, references=references)
bleu = bleu_metric.compute(**scored_pairs)
rouge = rouge_metric.compute(**scored_pairs)

# Final report.
print("=== BioMedCLIP Zero-Shot Retrieval VQA on SLAKE ===")
print(f"Exact Match Accuracy: {accuracy:.4f}")
for label, scores in (("BLEU:", bleu), ("ROUGE:", rouge)):
    print(label, scores)


  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
100%|█████████████████████████████████████████████████████████████████████████████| 1053/1053 [00:10<00:00, 103.32it/s]


=== BioMedCLIP Zero-Shot Retrieval VQA on SLAKE ===
Exact Match Accuracy: 0.0313
BLEU: {'bleu': 0.030762266226803628, 'precisions': [0.06602336211274759, 0.025109170305676855, 0.02147239263803681, 0.025157232704402517], 'brevity_penalty': 1.0, 'length_ratio': 1.2290886392009988, 'translation_length': 1969, 'reference_length': 1602}
ROUGE: {'rouge1': 0.0654685875483596, 'rouge2': 0.0143265513635884, 'rougeL': 0.06511725585799656, 'rougeLsum': 0.06543237283978023}
