In [13]:
!pip install transformers datasets accelerate bitsandbytes --quiet


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
from datasets import load_dataset
import os
import requests
from PIL import Image
from transformers import BlipForQuestionAnswering, AutoProcessor
import torch
from transformers import LlavaForConditionalGeneration, AutoProcessor
import re
from PIL import Image
import torch

In [14]:
dataset = load_dataset("HuggingFaceM4/VQAv2", split="validation[:100]")

Repo card metadata block was not found. Setting CardData to empty.


In [15]:
def download_coco_image(image_id, save_dir="images/val2014"):
    base_url = "http://images.cocodataset.org/val2014/"
    filename = f"COCO_val2014_{int(image_id):012d}.jpg"
    url = base_url + filename
    os.makedirs(save_dir, exist_ok=True)
    path = os.path.join(save_dir, filename)
    if not os.path.exists(path):
        response = requests.get(url)
        response.raise_for_status()
        with open(path, "wb") as f:
            f.write(response.content)
    return path

# Скачиваем изображения по image_id
image_paths = [download_coco_image(item["image_id"]) for item in dataset]

print("Скачанные изображения:", image_paths[:3])  # Показать первые 3 пути

Скачанные изображения: ['images/val2014/COCO_val2014_000000262148.jpg', 'images/val2014/COCO_val2014_000000262148.jpg', 'images/val2014/COCO_val2014_000000262148.jpg']


In [16]:
processor_blip = AutoProcessor.from_pretrained("Salesforce/blip-vqa-base")
model_blip = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to("cuda")

def run_blip(image_path, question):
    image = Image.open(image_path).convert("RGB")
    inputs = processor_blip(image, question, return_tensors="pt").to("cuda")
    with torch.no_grad():
        output = model_blip.generate(**inputs)
    answer = processor_blip.decode(output[0], skip_special_tokens=True)
    return answer

In [17]:
processor_llava = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")
model_llava = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf", device_map="auto", torch_dtype=torch.float16)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [18]:
def run_llava(image_path, question):
    image = Image.open(image_path).convert("RGB")
    prompt = f"<image>\nQuestion: {question}\nAnswer:"  # ВАЖНО: добавлен <image>
    
    inputs = processor_llava(images=image, text=prompt, return_tensors="pt").to("cuda")
    generate_ids = model_llava.generate(**inputs, max_new_tokens=50)
    
    answer = processor_llava.batch_decode(generate_ids, skip_special_tokens=True)[0].strip()
    return answer

In [19]:
def normalize(text):
    return re.sub(r'\W+', '', text.strip().lower())

def relaxed_match(pred, refs):
    pred_norm = normalize(pred)
    return int(any(normalize(ref) in pred_norm for ref in refs))

correct_blip = 0
correct_llava = 0

for i, item in enumerate(dataset):
    img_path = image_paths[i]
    question = item["question"]
    refs = [ans["answer"] for ans in item["answers"]]

    answer_blip = run_blip(img_path, question)
    answer_llava = run_llava(img_path, question)

    em_blip = relaxed_match(answer_blip, refs)
    em_llava = relaxed_match(answer_llava, refs)

    correct_blip += em_blip
    correct_llava += em_llava

    print(f"Q{i+1}: {question}")
    print(f"GT: {refs}")
    print(f"BLIP: {answer_blip} - {'✔' if em_blip else '✘'}")
    print(f"LLaVA: {answer_llava} - {'✔' if em_llava else '✘'}")
    print("-" * 40)

print(f"\nAccuracy BLIP: {correct_blip}/{len(dataset)}")
print(f"Accuracy LLaVA: {correct_llava}/{len(dataset)}")


Q1: Where is he looking?
GT: ['down', 'down', 'at table', 'skateboard', 'down', 'table', 'down', 'down', 'down', 'down']
BLIP: down - ✔
LLaVA: Question: Where is he looking?
Answer: Down - ✔
----------------------------------------
Q2: What are the people in the background doing?
GT: ['spectating', 'watching', 'watching', 'watching', 'watching', 'watching', 'watching', 'watching', 'watching', 'watching']
BLIP: watching - ✔
LLaVA: Question: What are the people in the background doing?
Answer: They are watching the skateboarder perform tricks. - ✔
----------------------------------------
Q3: What is he on top of?
GT: ['table', 'table', 'table', 'picnic table', 'picnic table', 'picnic table', 'picnic table', 'picnic table', 'skateboard', 'picnic table']
BLIP: picnic table - ✔
LLaVA: Question: What is he on top of?
Answer: Skateboard - ✔
----------------------------------------
Q4: What website copyrighted the picture?
GT: ['foodiebakercom', 'foodiebakercom', 'foodiebaker', 'foodiebakercom