In [2]:
!pip install datasets transformers

Collecting datasets
  Downloading datasets-4.4.1-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-22.0.0-cp311-cp311-win_amd64.whl.metadata (3.3 kB)
Collecting dill<0.4.1,>=0.3.0 (from datasets)
  Using cached dill-0.4.0-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.6.0-cp311-cp311-win_amd64.whl.metadata (13 kB)
Collecting multiprocess<0.70.19 (from datasets)
  Using cached multiprocess-0.70.18-py311-none-any.whl.metadata (7.5 kB)
Collecting aiohttp!=4.0.0a0,!=4.0.0a1 (from fsspec[http]<=2025.10.0,>=2023.1.0->datasets)
  Downloading aiohttp-3.13.2-cp311-cp311-win_amd64.whl.metadata (8.4 kB)
Collecting aiohappyeyeballs>=2.5.0 (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.10.0,>=2023.1.0->datasets)
  Using cached aiohappyeyeballs-2.6.1-py3-none-any.whl.metadata (5.9 kB)
Collecting aiosignal>=1.4.0 (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.10.0,>=2023.1.0->datasets)
  Using cac

In [13]:
import json
from datasets import load_dataset

In [2]:
dataset = load_dataset("commonsense_qa")

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'question_concept', 'choices', 'answerKey'],
        num_rows: 9741
    })
    validation: Dataset({
        features: ['id', 'question', 'question_concept', 'choices', 'answerKey'],
        num_rows: 1221
    })
    test: Dataset({
        features: ['id', 'question', 'question_concept', 'choices', 'answerKey'],
        num_rows: 1140
    })
})

In [4]:
len(dataset['train']), len(dataset['validation']), len(dataset['test'])

(9741, 1221, 1140)

In [5]:
dataset['train'][0]

{'id': '075e483d21c29a511267ef62bedc0461',
 'question': 'The sanctions against the school were a punishing blow, and they seemed to what the efforts the school had made to change?',
 'question_concept': 'punishing',
 'choices': {'label': ['A', 'B', 'C', 'D', 'E'],
  'text': ['ignore', 'enforce', 'authoritarian', 'yell at', 'avoid']},
 'answerKey': 'A'}

In [7]:
dataset['validation'][0]

{'id': '1afa02df02c908a558b4036e80242fac',
 'question': 'A revolving door is convenient for two direction travel, but it also serves as a security measure at a what?',
 'question_concept': 'revolving door',
 'choices': {'label': ['A', 'B', 'C', 'D', 'E'],
  'text': ['bank', 'library', 'department store', 'mall', 'new york']},
 'answerKey': 'A'}

In [12]:
dataset['test'][0]

{'id': '90b30172e645ff91f7171a048582eb8b',
 'question': 'The townhouse was a hard sell for the realtor, it was right next to a high rise what?',
 'question_concept': 'townhouse',
 'choices': {'label': ['A', 'B', 'C', 'D', 'E'],
  'text': ['suburban development',
   'apartment building',
   'bus stop',
   'michigan',
   'suburbs']},
 'answerKey': ''}

In [20]:
with open("data/commonsenseqa_validation.json", "w") as fp:
    json.dump(dataset['validation'].to_list(), fp, indent=4)

smolvlm with commonsenseqa dataset

In [24]:
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForVision2Seq
from transformers.image_utils import load_image

In [25]:
import random
import numpy as np
import torch

def set_seed(seed: int = 42):
    """
    Set random seed for reproducibility across Python, NumPy, and PyTorch.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # For deterministic behavior (slower but fully reproducible)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(42)

In [26]:
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [27]:
# Initialize processor and model
processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-256M-Instruct")
model = AutoModelForVision2Seq.from_pretrained(
    "HuggingFaceTB/SmolVLM-256M-Instruct",
    torch_dtype=torch.bfloat16,
    _attn_implementation="flash_attention_2" if DEVICE == "cuda" else "eager",
).to(DEVICE)

Fetching 2 files: 100%|██████████| 2/2 [00:00<?, ?it/s]
`torch_dtype` is deprecated! Use `dtype` instead!


In [62]:
question = "A revolving door is convenient for two direction travel, but it also serves as a security measure at a what?"
choices = ['bank', 'library', 'department store', 'mall', 'new york']
prompt = """
Question: {question}
Choices: {choices}
Give only the single correct answer. Do not include any reasoning, explanation, or extra text. 
Answer (just one choice):
""".strip()
prompt = prompt.format(question=question, choices=choices).strip()
messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": prompt}
        ]
    },
]

In [61]:
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(text=prompt, return_tensors="pt")
inputs = inputs.to(DEVICE)

# Generate outputs
generated_ids = model.generate(**inputs, max_new_tokens=500)
generated_texts = processor.batch_decode(
    generated_ids,
    skip_special_tokens=True,
)

print(generated_texts[0])

User: Question: A revolving door is convenient for two direction travel, but it also serves as a security measure at a what?
Choices: ['bank', 'library', 'department store', 'mall', 'new york']
Give only the single correct answer. Do not include any reasoning, explanation, or extra text. 
Answer (just one choice):
Assistant: bank.


In [22]:
dataset_path = "data/commonsenseqa_validation.json"

with open(dataset_path, "r") as fp:
    csqa_dataset = json.load(fp)

len(csqa_dataset)

1221

In [23]:
csqa_dataset[0]

{'id': '1afa02df02c908a558b4036e80242fac',
 'question': 'A revolving door is convenient for two direction travel, but it also serves as a security measure at a what?',
 'question_concept': 'revolving door',
 'choices': {'label': ['A', 'B', 'C', 'D', 'E'],
  'text': ['bank', 'library', 'department store', 'mall', 'new york']},
 'answerKey': 'A'}

In [80]:
def map_answer_key_to_index(answer_key):
    """
    Map answer key letters (A, B, C, D, E) to corresponding indices (0, 1, 2, 3, 4).
    """
    return ord(answer_key.upper()) - ord('A')

def get_correct_answer_text(data_point):
    """
    Get the correct answer text from a data point using the answer key.
    """
    answer_key = data_point['answerKey']
    answer_index = map_answer_key_to_index(answer_key)
    return data_point['choices']['text'][answer_index]

# Test with the first data point
test_data_point = csqa_dataset[0]
print(f"Answer key: {test_data_point['answerKey']}")
print(f"Answer index: {map_answer_key_to_index(test_data_point['answerKey'])}")
print(f"Correct answer: {get_correct_answer_text(test_data_point)}")


Answer key: A
Answer index: 0
Correct answer: bank


In [None]:
csqa_dataset[0]['choices']

In [79]:
prompt = """
Question: {question}
Choices: {choices}
Give only the single correct answer. Do not include any reasoning, explanation, or extra text. 
Answer (just one choice):
""".strip()

def get_batch_predictions(data_points):
    """
    Generate predictions for a batch of data points.

    Args:
        data_points (list): List of data point dicts.

    Returns:
        List of (data_point, prediction) tuples.
    """
    # Prepare images and questions
    images = []
    batch_messages = []
    for data_point in data_points:
        question = data_point['question']
        choices = data_point['choices']['text']

        mcq_question = prompt.format(question=question, choices=choices).strip()

        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": mcq_question}
                ]
            }
        ]
        batch_messages.append(messages)

    # Prepare prompts for each message
    batch_prompts = [processor.apply_chat_template(msg, add_generation_prompt=True) for msg in batch_messages]

    # Prepare inputs for the batch
    batch_inputs = processor(
        text=batch_prompts,
        return_tensors="pt",
        padding=True
    )
    batch_inputs = batch_inputs.to(DEVICE)

    # Generate outputs
    batch_generated_ids = model.generate(**batch_inputs)
    batch_generated_texts = processor.batch_decode(
        batch_generated_ids,
        skip_special_tokens=True,
    )

    return list(zip(data_points, batch_generated_texts))

In [75]:
def run_batch_predictions(csqa_dataset, batch_size):
    predictions = []
    for i in range(0, len(csqa_dataset), batch_size):
        batch = csqa_dataset[i:i+batch_size]
        batch_results = get_batch_predictions(batch)
        for item in batch_results:
            predictions.append(item)
        print(f"Batch number: ({(i+1)//batch_size} / {len(csqa_dataset)/batch_size}). Progress: {(i+1) / len(csqa_dataset)*100:.2f}%")
    return predictions

In [76]:
batch_size = 2
# dataset = val_aokvqa_dataset[: 150]
dataset = csqa_dataset[:5]

predictions = run_batch_predictions(dataset, batch_size)

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Batch number: (0 / 2.5). Progress: 20.00%
Batch number: (1 / 2.5). Progress: 60.00%
Batch number: (2 / 2.5). Progress: 100.00%


In [77]:
predictions

[({'id': '1afa02df02c908a558b4036e80242fac',
   'question': 'A revolving door is convenient for two direction travel, but it also serves as a security measure at a what?',
   'question_concept': 'revolving door',
   'choices': {'label': ['A', 'B', 'C', 'D', 'E'],
    'text': ['bank', 'library', 'department store', 'mall', 'new york']},
   'answerKey': 'A'},
  "User: Question: A revolving door is convenient for two direction travel, but it also serves as a security measure at a what?\nChoices: ['bank', 'library', 'department store', 'mall', 'new york']\nGive only the single correct answer. Do not include any reasoning, explanation, or extra text. \nAnswer (just one choice):\nAssistant: bank."),
 ({'id': 'a7ab086045575bb497933726e4e6ad28',
   'question': 'What do people aim to do at work?',
   'question_concept': 'people',
   'choices': {'label': ['A', 'B', 'C', 'D', 'E'],
    'text': ['complete job',
     'learn from each other',
     'kill animals',
     'wear hats',
     'talk to each