In [6]:
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForVision2Seq
from transformers.image_utils import load_image

In [7]:
import random
import numpy as np
import torch

def set_seed(seed: int = 42):
    """
    Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch
    (CPU generator plus all CUDA devices), then puts cuDNN into
    deterministic mode with benchmarking switched off.

    Args:
        seed: Value handed to each generator. Defaults to 42.
    """
    # The same seed goes to each library, in the original order.
    for seed_fn in (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    ):
        seed_fn(seed)

    # Favor reproducibility over speed for cuDNN-backed operations.
    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True


set_seed(42)

In [8]:
import os
# Dataset locations: an environment variable takes precedence, otherwise the
# hard-coded local Windows path is used as the fallback.
aokvqa_dir = os.environ.get(
    'AOKVQA_DIR',
    r"C:\workspace\misc\5980\aokvqa",
)
coco_filtered_dir = os.environ.get(
    'COCO_FILTERED_DIR',
    r"C:\workspace\misc\5980\coco_filtered",
)

In [9]:
from load_aokvqa import load_aokvqa, get_coco_path

# Load the three A-OKVQA splits from `aokvqa_dir` (same order as before: val, train, test).
val_aokvqa_dataset, train_aokvqa_dataset, test_aokvqa_dataset = [
    load_aokvqa(aokvqa_dir, split_name)
    for split_name in ('val', 'train', 'test')
]

In [10]:
# Report how many samples each split contains (one line per split).
print(
    f"Train dataset size: {len(train_aokvqa_dataset)}",
    f"Validation dataset size: {len(val_aokvqa_dataset)}",
    f"Test dataset size: {len(test_aokvqa_dataset)}",
    sep="\n",
)

Train dataset size: 17056
Validation dataset size: 1145
Test dataset size: 6702


In [11]:
# Use the GPU when PyTorch reports CUDA as available; otherwise run on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"Using device: {DEVICE}")

Using device: cpu


Single prediction: query the model with one A-OKVQA sample at a time

In [7]:
model_name = "HuggingFaceTB/SmolVLM-256M-Instruct"
# model_name = "HuggingFaceTB/SmolVLM-256M-Base"

# Load the processor (used for chat templating, input preparation and decoding)
# and the vision-to-sequence model for the selected checkpoint.
processor = AutoProcessor.from_pretrained(model_name)
model = AutoModelForVision2Seq.from_pretrained(
    model_name,
    # `torch_dtype` is deprecated in the installed transformers release
    # (it warns "Use `dtype` instead"), so the new keyword is passed.
    dtype=torch.bfloat16,
    # FlashAttention-2 is only requested on GPU; CPU runs use eager attention.
    _attn_implementation="flash_attention_2" if DEVICE == "cuda" else "eager",
).to(DEVICE)

Fetching 2 files: 100%|██████████| 2/2 [00:00<00:00, 1807.89it/s]
`torch_dtype` is deprecated! Use `dtype` instead!


In [None]:
def get_prediction(data_point, coco_dir, split):
    """
    Run the model on a single A-OKVQA sample and return its decoded output.

    The question is asked in direct-answer form; the multiple-choice options
    are printed for reference but are not part of the prompt. The sample's
    id, question and choices are printed as a debugging aid, followed by the
    correct choice and the first rationale when those annotations exist.

    Args:
        data_point (dict): Sample with at least 'question_id', 'question',
            'choices' and 'image_id'. 'correct_choice_idx' and 'rationales'
            are optional: samples without ground-truth annotations are
            handled instead of raising KeyError.
        coco_dir (str): Directory containing the COCO images.
        split (str): Dataset split name, forwarded to ``get_coco_path``.

    Returns:
        tuple: ``(data_point, text)`` where ``text`` is the first decoded
        sequence returned by ``model.generate``. As decoded here, the text
        contains the rendered prompt followed by the model's reply.
    """
    question = data_point['question']
    choices = data_point['choices']

    # Debug output, e.g.:
    #   22MexNkBPpdZGX6sxbxVBH
    #   What is the man by the bags awaiting?
    #   ['skateboarder', 'train', 'delivery', 'cab']
    print(data_point['question_id'])
    print(question)
    print(choices)

    # Ground-truth fields may be absent; print only what the sample provides.
    correct_choice_idx = data_point.get('correct_choice_idx')
    if correct_choice_idx is not None:
        # Correct: cab
        print(choices[correct_choice_idx])
    rationales = data_point.get('rationales')
    if rationales:
        print(rationales[0])

    image_path = get_coco_path(split, data_point['image_id'], coco_dir)
    # Close the underlying file as soon as the pixels are copied into an RGB image.
    with Image.open(image_path) as image_file:
        image = image_file.convert("RGB")

    direct_question = f"Question: {question}. Give one word answer. Answer:"
    # Single-turn chat: one image placeholder followed by the text question.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": direct_question}
            ]
        },
    ]

    # Render the chat template, then build model inputs on the target device.
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=prompt, images=[image], return_tensors="pt")
    inputs = inputs.to(DEVICE)

    # Generate and decode; only the first (and only) sequence is returned.
    generated_ids = model.generate(**inputs, max_new_tokens=500)
    generated_texts = processor.batch_decode(
        generated_ids,
        skip_special_tokens=True,
    )
    return data_point, generated_texts[0]


In [27]:
# Smoke-test the single-sample path on the first validation example.
data_point, pred = get_prediction(val_aokvqa_dataset[0], coco_filtered_dir, 'val')

22jbM6gDxdaMaunuzgrsBB
What is in the motorcyclist's mouth?
['toothpick', 'food', 'popsicle stick', 'cigarette']
cigarette
He's smoking while riding.


In [31]:
# Inspect the raw decoded text; it includes the rendered prompt ("User: ...")
# followed by the model's reply after "Assistant:".
pred

"User:\n\n\n\n\nQuestion: What is in the motorcyclist's mouth?. Give one word answer. Answer:\nAssistant: Bottle."

In [29]:
# Collect (data_point, decoded_text) pairs for the first two validation samples.
# The loop uses its own variable name so the notebook-level `data_point` / `pred`
# inspected in the single-sample cells are not silently overwritten here.
predictions = []
for sample in val_aokvqa_dataset[:2]:
    # get_prediction already returns the (data_point, decoded_text) tuple.
    predictions.append(get_prediction(sample, coco_filtered_dir, 'val'))
    print("----")

22jbM6gDxdaMaunuzgrsBB
What is in the motorcyclist's mouth?
['toothpick', 'food', 'popsicle stick', 'cigarette']
cigarette
He's smoking while riding.
----
2Aq5RiEn7eyfWjEbpuYT2o
Which number birthday is probably being celebrated?
['one', 'ten', 'nine', 'thirty']
thirty
There is a birthday cake on the table with the number 30 written in icing.
----


In [None]:
image = load_image("https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg")
# image2 = load_image("https://upload.wikimedia.org/wikipedia/commons/4/47/New_york_times_square-terabass.jpg")

# One image per batch entry; this demo feeds the same picture twice.
batch_images = [image] * 2

# One single-turn conversation per image: an image placeholder plus a text request.
batch_messages = [
    [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": "Can you describe this image?"},
            ],
        }
    ]
    for _unused_image in batch_images
]

# Render every conversation into a prompt string that ends with the generation prompt.
batch_prompts = [
    processor.apply_chat_template(conversation, add_generation_prompt=True)
    for conversation in batch_messages
]

# Preprocess the whole batch with padding and move the tensors to the target device.
batch_inputs = processor(
    text=batch_prompts,
    images=batch_images,
    padding=True,
    return_tensors="pt",
).to(DEVICE)

# Generate up to 500 new tokens per entry, then decode without special tokens.
batch_generated_ids = model.generate(max_new_tokens=500, **batch_inputs)
batch_generated_texts = processor.batch_decode(batch_generated_ids, skip_special_tokens=True)

Batch prediction: query the model with several samples per generate call

In [12]:
model_name = "HuggingFaceTB/SmolVLM-256M-Instruct"

# Initialize the processor for the selected checkpoint.
processor = AutoProcessor.from_pretrained(model_name)

# Force left padding for decoder-only generation, and make sure a pad token
# exists (fall back to EOS) so batched inputs can be padded at all.
if hasattr(processor, "tokenizer"):
    processor.tokenizer.padding_side = "left"
    if processor.tokenizer.pad_token is None:
        processor.tokenizer.pad_token = processor.tokenizer.eos_token

model = AutoModelForVision2Seq.from_pretrained(
    model_name,
    # `torch_dtype` is deprecated in the installed transformers release
    # (it warns "Use `dtype` instead"), so the new keyword is passed.
    dtype=torch.bfloat16,
    # FlashAttention-2 is only requested on GPU; CPU runs use eager attention.
    _attn_implementation="flash_attention_2" if DEVICE == "cuda" else "eager",
).to(DEVICE)

Fetching 2 files: 100%|██████████| 2/2 [00:00<?, ?it/s]
`torch_dtype` is deprecated! Use `dtype` instead!


In [16]:
def get_batch_predictions(data_points, coco_dir, split, max_new_tokens=None):
    """
    Generate multiple-choice predictions for a batch of data points.

    Each sample becomes a single-turn chat (image + question with its list
    of choices). All samples are padded into one batch and passed to a
    single ``model.generate`` call. The notebook sets the processor's
    tokenizer to left padding before this function is used.

    Args:
        data_points (iterable): Data point dicts providing 'question',
            'choices' and 'image_id'. Any iterable is accepted; it is
            materialized into a list once.
        coco_dir (str): Directory containing COCO images.
        split (str): Dataset split ('train', 'val', 'test').
        max_new_tokens (int, optional): Upper bound on generated tokens per
            sample. ``None`` (default) keeps the model's own generation
            settings, matching the previous behavior.

    Returns:
        List of (data_point, prediction) tuples, in input order. Each
        prediction is the decoded sequence, which includes the rendered
        prompt followed by the model's reply. An empty input yields ``[]``.
    """
    # Materialize once: the samples are iterated here and again in the final zip.
    data_points = list(data_points)
    if not data_points:
        # Nothing to do; avoid calling the processor/model with empty lists.
        return []

    # Prepare images and chat messages, one entry per sample.
    images = []
    batch_messages = []
    for data_point in data_points:
        question = data_point['question']
        choices = data_point['choices']
        image_path = get_coco_path(split, data_point['image_id'], coco_dir)
        # Close the file handle as soon as the pixels are copied into an RGB image.
        with Image.open(image_path) as image_file:
            images.append(image_file.convert("RGB"))
        question_template = f"Question: {question}. These are the choices: {choices}. Answer:"
        batch_messages.append([
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {"type": "text", "text": question_template}
                ]
            }
        ])

    # Render every conversation into a prompt ending with the generation prompt.
    batch_prompts = [
        processor.apply_chat_template(msg, add_generation_prompt=True)
        for msg in batch_messages
    ]

    # Prepare padded inputs for the batch and move them to the target device.
    batch_inputs = processor(
        text=batch_prompts,
        images=images,
        return_tensors="pt",
        padding=True
    )
    batch_inputs = batch_inputs.to(DEVICE)

    # Only override the generation length when the caller asked for it.
    generate_kwargs = {}
    if max_new_tokens is not None:
        generate_kwargs["max_new_tokens"] = max_new_tokens

    # Generate outputs and decode them without special tokens.
    batch_generated_ids = model.generate(**batch_inputs, **generate_kwargs)
    batch_generated_texts = processor.batch_decode(
        batch_generated_ids,
        skip_special_tokens=True,
    )

    return list(zip(data_points, batch_generated_texts))

In [17]:
batch_size = 2
num_eval_samples = 10  # how many validation samples to consider; adjust as needed
max_batches = 1        # smoke test: stop after this many batches (None = run them all)

# Slice once so that batches can never reach past the first `num_eval_samples`
# items; slicing the full dataset per batch overshoots the limit whenever
# `batch_size` does not divide it evenly.
eval_samples = val_aokvqa_dataset[:num_eval_samples]

predictions = []
for batch_idx, start in enumerate(range(0, len(eval_samples), batch_size)):
    if max_batches is not None and batch_idx >= max_batches:
        break
    batch = eval_samples[start:start + batch_size]
    batch_results = get_batch_predictions(batch, coco_filtered_dir, 'val')
    for item in batch_results:
        print("----")
        predictions.append(item)

----
----


In [20]:
# Sanity check: number of (data_point, prediction) pairs collected by the batch loop.
len(predictions)

2

In [21]:
# Show the collected (data_point, decoded_text) pairs; each decoded text holds
# the rendered prompt followed by the model's reply after "Assistant:".
predictions

[({'split': 'val',
   'image_id': 461751,
   'question_id': '22jbM6gDxdaMaunuzgrsBB',
   'question': "What is in the motorcyclist's mouth?",
   'choices': ['toothpick', 'food', 'popsicle stick', 'cigarette'],
   'correct_choice_idx': 3,
   'direct_answers': ['cigarette',
    'cigarette',
    'cigarette',
    'cigarette',
    'cigarette',
    'cigarette',
    'cigarette',
    'cigarette',
    'cigarette',
    'cigarette'],
   'difficult_direct_answer': False,
   'rationales': ["He's smoking while riding.",
    'The motorcyclist has a lit cigarette in his mouth while he rides on the street.',
    'The man is smoking.']},
  "User:\n\n\n\n\nQuestion: What is in the motorcyclist's mouth?. These are the choices: ['toothpick', 'food', 'popsicle stick', 'cigarette']. Answer:\nAssistant: Popsicle stick."),
 ({'split': 'val',
   'image_id': 377368,
   'question_id': '2Aq5RiEn7eyfWjEbpuYT2o',
   'question': 'Which number birthday is probably being celebrated?',
   'choices': ['one', 'ten', 'nine'