In [None]:
import sys
# Install the notebook's dependencies with the same interpreter that runs this
# kernel (sys.executable), so the packages land in the kernel's environment.
!{sys.executable} -m pip install transformers accelerate matplotlib

## Load model

In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# Never hard-code access tokens in a notebook: they leak through version
# control and shared copies. Any token that was ever committed in this cell
# must be revoked in the Hugging Face account settings.
# The token is read from the HF_TOKEN environment variable when it is set;
# otherwise the user is prompted for it without echoing the input.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(token=hf_token)

In [None]:
import torch
from transformers import AutoProcessor, AutoModelForImageTextToText

# Checkpoint to load from the Hugging Face Hub.
model_id = "Qwen/Qwen2.5-VL-7B-Instruct"

# NOTE: despite its name, `tokenizer` holds the AutoProcessor for this
# checkpoint; the name is kept because other cells reference it.
tokenizer = AutoProcessor.from_pretrained(model_id)

# Load the weights in bfloat16 and let `device_map="auto"` decide placement,
# then switch to evaluation mode (`.eval()` returns the module itself).
model = AutoModelForImageTextToText.from_pretrained(
    model_id, dtype=torch.bfloat16, device_map="auto"
).eval()

## Prepare dataset

In [None]:
!mkdir -p coco/annotations
!mkdir -p coco/val2014

In [None]:
print("Downloading annotations...")
!wget -c http://images.cocodataset.org/annotations/annotations_trainval2014.zip
!unzip -q annotations_trainval2014.zip -d coco/
!rm annotations_trainval2014.zip

In [None]:
print("Downloading images...")
!wget -c http://images.cocodataset.org/zips/val2014.zip
!unzip -q val2014.zip -d coco/
!rm val2014.zip

print("Done! Dữ liệu đã sẵn sàng tại thư mục /content/coco")

In [None]:
import json
import random

# NOTE: these are absolute Colab paths, while the download cells extract into
# the relative folder `coco/`; the two agree only when the working directory
# is /content.
ann_path = '/content/coco/annotations/instances_val2014.json'
img_dir = '/content/coco/val2014'

# Number of images to caption and the seed that makes the draw reproducible.
# A fixed seed matters for resuming: after a kernel restart the same subset is
# selected again, so a saved results file still lines up with this dataset.
NUM_IMAGES = 500
SAMPLE_SEED = 42

with open(ann_path, 'r', encoding='utf-8') as f:
    coco_data = json.load(f)

# Draw the random subset with a private, seeded generator so the global
# `random` state is left untouched; clamp the size so a smaller annotation
# file does not raise ValueError.
sample_size = min(NUM_IMAGES, len(coco_data['images']))
random_images = random.Random(SAMPLE_SEED).sample(coco_data['images'], sample_size)

# One record per image: id, path on disk, and the captioning prompt.
dataset_for_inference = [
    {
        "image_id": img_info['id'],
        "image_path": f"{img_dir}/{img_info['file_name']}",
        "prompt": "Describe the image."
    }
    for img_info in random_images
]

print(f"Đã chuẩn bị {len(dataset_for_inference)} ảnh ngẫu nhiên để test.")

## Infer

In [None]:
import torch

def infer_one(model, processor, sample, device="cuda"):
    """Generate the model's answer for one image/question pair.

    Args:
        model: Model exposing ``generate(**inputs, ...)``.
        processor: Processor exposing ``apply_chat_template``, ``__call__``
            (with ``images``, ``text`` and ``return_tensors``) and ``decode``.
        sample: Dict with the keys ``"question"`` (str) and ``"image"``
            (the image object handed to the processor).
        device: Device the processed inputs are moved to before generation.

    Returns:
        Dict ``{"question": <question>, "answer": <lower-cased generated text>}``.
        The answer contains only the newly generated text, not the prompt.
    """
    question = sample['question']
    image = sample['image']

    # Single-turn chat message: one image placeholder followed by the question.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": f"Question: {question}"},
            ],
        }
    ]

    text = processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    inputs = processor(
        images=image,
        text=text,
        return_tensors="pt"
    ).to(device)

    # Greedy decoding, no gradients needed for inference.
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=200,
            do_sample=False
        )

    # `generate` returns the prompt tokens followed by the new tokens. Drop the
    # prompt part so the answer is not polluted with the chat template and the
    # question text.
    prompt_len = inputs["input_ids"].shape[1]
    generated_ids = output_ids[0][prompt_len:]

    output_text = processor.decode(
        generated_ids,
        skip_special_tokens=True
    ).lower().strip()

    return {
        "question": question,
        "answer": output_text
    }

In [None]:
import json
import os
from tqdm import tqdm

def infer_all(model, dataset_list, save_path, processor=None, device="cuda", save_every=50):
    """Caption every image in ``dataset_list`` and store the results as JSON.

    Results already present in ``save_path`` are loaded first and their image
    ids are skipped, so an interrupted run can be resumed.

    Args:
        model: Model passed through to ``infer_one``.
        dataset_list: List of dicts, each shaped like
            ``{"image_id": int, "image_path": str, "prompt": str}``.
        save_path: JSON file the results are read from (resume) and written to.
        processor: Processor passed to ``infer_one``. When omitted, the
            notebook-level ``tokenizer`` variable is used.
        device: Device passed to ``infer_one``.
        save_every: Write a checkpoint each time the number of collected
            results is a multiple of this value.

    Returns:
        List of dicts ``{"image_id", "caption", "prompt"}``, including any
        results loaded from an existing ``save_path``.
    """
    # Local import: keeps the function self-contained within the notebook.
    from transformers.image_utils import load_image

    if processor is None:
        # Backward-compatible fallback to the notebook-level processor.
        processor = tokenizer

    all_results = []

    # Resume support: reuse results from a previous (possibly interrupted) run.
    if os.path.exists(save_path):
        with open(save_path, 'r', encoding='utf-8') as f:
            all_results = json.load(f)
        print(f"Resuming from {len(all_results)} images...")

    # Ids that already have a caption and must not be processed again.
    processed_ids = {res['image_id'] for res in all_results}

    for item in tqdm(dataset_list):
        img_id = item['image_id']
        if img_id in processed_ids:
            continue

        img_path = item['image_path']
        prompt = item['prompt']

        try:
            # Load the image file from disk into an image object for the processor.
            image = load_image(img_path)
            output_caption = infer_one(
                model,
                processor,
                {
                    "question": prompt,
                    "image": image,
                },
                device=device
            )['answer']

            # Keep the prompt next to the caption for later cross-checking.
            all_results.append({
                "image_id": img_id,
                "caption": output_caption,
                "prompt": prompt
            })
            # Also guards against duplicate ids inside `dataset_list`.
            processed_ids.add(img_id)

            # Periodic checkpoint so a crash does not lose finished work.
            if len(all_results) % save_every == 0:
                with open(save_path, 'w', encoding='utf-8') as f:
                    json.dump(all_results, f, indent=4)

        except Exception as e:
            # Best effort: report the failing image and move on to the next one.
            print(f"Lỗi tại ảnh {img_id}: {e}")
            continue

    # Final write with everything that was collected.
    with open(save_path, 'w', encoding='utf-8') as f:
        json.dump(all_results, f, indent=4)

    print(f"Đã lưu toàn bộ {len(all_results)} kết quả vào {save_path}")
    return all_results

# --- USAGE ---
# Suppose you have a list of images from MSCOCO:
# my_test_data = [
#    {"image_id": 391895, "image_path": "val2014/COCO_val2014_000000391895.jpg", "prompt": "Please describe this image in detail."},
#    ...
# ]

# infer_all(model, my_test_data, "m3id_results_coco.json")

In [None]:
# JSON file that receives the captions; an existing file is picked up by
# `infer_all` to resume a previous run.
save_file = "m3id_results_coco_val.json"

# Caption the sampled COCO images with the loaded model.
results = infer_all(model, dataset_for_inference, save_file)