In [1]:
import sys
!{sys.executable} -m pip install transformers accelerate matplotlib

Collecting matplotlib
  Using cached matplotlib-3.10.8-cp310-cp310-win_amd64.whl.metadata (52 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Using cached contourpy-1.3.2-cp310-cp310-win_amd64.whl.metadata (5.5 kB)
Collecting cycler>=0.10 (from matplotlib)
  Using cached cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Using cached fonttools-4.61.1-cp310-cp310-win_amd64.whl.metadata (116 kB)
Collecting kiwisolver>=1.3.1 (from matplotlib)
  Using cached kiwisolver-1.4.9-cp310-cp310-win_amd64.whl.metadata (6.4 kB)
Collecting pyparsing>=3 (from matplotlib)
  Downloading pyparsing-3.3.2-py3-none-any.whl.metadata (5.8 kB)
Using cached matplotlib-3.10.8-cp310-cp310-win_amd64.whl (8.1 MB)
Using cached contourpy-1.3.2-cp310-cp310-win_amd64.whl (221 kB)
Using cached cycler-0.12.1-py3-none-any.whl (8.3 kB)
Using cached fonttools-4.61.1-cp310-cp310-win_amd64.whl (1.6 MB)
Using cached kiwisolver-1.4.9-cp310-cp310-win_amd64.whl (73 kB)
Downloadin

In [None]:
from datasets import load_dataset

# Load the "lmms-lab/POPE" dataset and keep the downloaded files under ~/POPE.
pope_dataset = load_dataset(
    path="lmms-lab/POPE",
    cache_dir="~/POPE",
)

In [None]:
import torch
from transformers import AutoProcessor, AutoModelForImageTextToText

# Hub identifier shared by the processor and the model weights.
model_id = "Qwen/Qwen2.5-VL-7B-Instruct"

processor = AutoProcessor.from_pretrained(model_id)

# Only arguments understood by transformers' `from_pretrained` are passed.
# The previous `fast_inference=True` is not one of them: unknown keyword
# arguments are forwarded to the model constructor, which rejects them.
model = AutoModelForImageTextToText.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,  # load the weights in bfloat16
    device_map="auto"            # let accelerate place the weights
)

# Inference only. The trailing `;` keeps the notebook from printing the
# full module repr returned by `eval()`.
model.eval();

In [None]:
import torch
import re

def infer_one(model, processor, sample, device="cuda"):
    """Ask the model one yes/no question about one image.

    Args:
        model: Generative model exposing ``generate(**inputs, ...)``.
        processor: Processor exposing ``apply_chat_template``, ``__call__``
            and ``decode``.
        sample: Mapping with a ``'question'`` entry (text) and an
            ``'image'`` entry (forwarded to the processor unchanged).
        device: Device the processed inputs are moved to before generation.

    Returns:
        dict: ``{"question": <original question>, "answer": "yes" | "no"}``.
        The answer is ``"yes"`` only when the generated text contains the
        standalone word "yes" (case-insensitive); every other reply,
        including an unparseable one, is reported as ``"no"``.
    """

    question = sample['question']
    image = sample['image']

    # Single-turn chat: the image followed by the constrained question.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {
                    "type": "text",
                    "text": (
                        "Answer the following question using only one word: yes or no.\n"
                        f"Question: {question}"
                    )
                }
            ]
        }
    ]

    # Render the chat into the prompt string (not yet tokenized), ending
    # with the generation prompt so the model continues as the assistant.
    text = processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    inputs = processor(
        images=image,
        text=text,
        return_tensors="pt"
    ).to(device)

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=5,
            do_sample=False
        )

    # `generate` returns prompt + continuation; drop the prompt tokens.
    prompt_length = inputs["input_ids"].shape[1]
    generated_ids = output_ids[:, prompt_length:]

    # Decode the single generated sequence (row 0) so the result is always
    # one string. Decoding the whole 2-D batch and then indexing `[0]`
    # either fails or, when a plain string comes back, keeps only its first
    # character -- which made every answer fall through to "no".
    output_text = processor.decode(
        generated_ids[0],
        skip_special_tokens=True
    ).lower()

    # "yes" wins whenever it appears as a whole word; anything else is "no".
    answer = "yes" if re.search(r"\byes\b", output_text) else "no"

    return {
        "question": question,
        "answer": answer
    }

In [None]:
# Show the first record of the 'test' split to inspect which fields a sample has.
print(pope_dataset['test'][0])

In [None]:
# Smoke test: run the pipeline on the first test sample before the full pass.
response = infer_one(
    model=model,
    processor=processor,
    sample=pope_dataset['test'][0],
)
print("Câu hỏi:", response['question'])
print("Trả lời:", response['answer'])

In [None]:
import json
from tqdm import tqdm
import torch

def infer_pope_to_jsonl(
    model,
    processor,
    dataset,
    output_path,
    device="cuda"
):
    """
    Run ``infer_one`` on every sample of ``dataset`` and save the results
    as JSON Lines.

    Args:
        model: Model forwarded to ``infer_one``; put into eval mode first.
        processor: Processor forwarded to ``infer_one``.
        dataset: Iterable of samples, e.g. the 'test' split of POPE.
        output_path: Destination file, e.g. 'pope_qwenvl_predictions.jsonl'.
            It is opened in write mode, so existing content is replaced.
        device: Device name forwarded to ``infer_one``.

    Returns:
        None. The only result is the file written to ``output_path``.
    """

    model.eval()

    # Gradients are disabled only for the duration of the loop. The previous
    # `torch.set_grad_enabled(False)` switched autograd off for the rest of
    # the process, silently affecting any later code in the notebook.
    with open(output_path, "w", encoding="utf-8") as f, torch.no_grad():
        for sample in tqdm(dataset, desc="Infer POPE"):
            result = infer_one(
                model=model,
                processor=processor,
                sample=sample,
                device=device
            )

            # Write exactly one JSON object per line; ensure_ascii=False
            # keeps non-ASCII characters readable in the output file.
            f.write(json.dumps(result, ensure_ascii=False) + "\n")


In [None]:
from datasets import load_dataset

# Reuse the POPE dataset already loaded into `pope_dataset` (cached under
# ~/POPE). Calling `load_dataset` again without that cache_dir would fetch a
# second copy into the default cache.
dataset = pope_dataset
test_set = dataset["test"]

# Run inference over the whole test split and write one JSON object per
# sample to pope_qwenvl.jsonl.
infer_pope_to_jsonl(
    model=model,
    processor=processor,
    dataset=test_set,
    output_path="pope_qwenvl.jsonl",
    device="cuda"
)

## Benchmark 

In [None]:
# def load_jsonl(path):
#     records = []
#     with open(path, "r", encoding="utf-8") as f:
#         for line in f:
#             line = line.strip()
#             if not line:
#                 continue
#             records.append(json.loads(line))
#     return records

In [None]:
# import json

# ans_file = '/kaggle/input/inference-output/pope_qwenvl.jsonl'
# label_list = list(pope_dataset['test']['answer'])

# #your code here
# answers = []
# with open(ans_file, 'r', encoding='utf-8') as f:
#     for line in f:
#         if line.strip():
#             answers.append(json.loads(line))
            
# for answer in answers:
#     text = answer['answer']

#     # Only keep the first sentence
#     if text.find('.') != -1:
#         text = text.split('.')[0]

#     text = text.replace(',', '')
#     words = text.split(' ')
#     if 'No' in words or 'not' in words or 'no' in words:
#         answer['answer'] = 'no'
#     else:
#         answer['answer'] = 'yes'

# for i in range(len(label_list)):
#     if label_list[i] == 'no':
#         label_list[i] = 0
#     else:
#         label_list[i] = 1

# pred_list = []
# for answer in answers:
#     if answer['answer'] == 'no':
#         pred_list.append(0)
#     else:
#         pred_list.append(1)

# pos = 1
# neg = 0
# yes_ratio = pred_list.count(1) / len(pred_list)

# TP, TN, FP, FN = 0, 0, 0, 0
# for pred, label in zip(pred_list, label_list):
#     if pred == pos and label == pos:
#         TP += 1
#     elif pred == pos and label == neg:
#         FP += 1
#     elif pred == neg and label == neg:
#         TN += 1
#     elif pred == neg and label == pos:
#         FN += 1

# print('TP\tFP\tTN\tFN\t')
# print('{}\t{}\t{}\t{}'.format(TP, FP, TN, FN))

# precision = float(TP) / float(TP + FP)
# recall = float(TP) / float(TP + FN)
# f1 = 2*precision*recall / (precision + recall)
# acc = (TP + TN) / (TP + TN + FP + FN)
# print('Accuracy: {}'.format(acc))
# print('Precision: {}'.format(precision))
# print('Recall: {}'.format(recall))
# print('F1 score: {}'.format(f1))
# print('Yes ratio: {}'.format(yes_ratio))