In [2]:
from typing import Literal, TypedDict
from PIL.Image import Image

class DatasetEntry(TypedDict):
    """One evaluation example, copied field-for-field from a dataset row.

    The notebook builds these from the same-named columns of each row after
    filtering the dataset by ``task``.
    """

    # Image that accompanies the question; a single PIL image despite the
    # plural field name (the name mirrors the dataset column).
    images: Image
    # Value of the row's "contest_number" column.
    # NOTE(review): presumably identifies the source contest -- confirm.
    contest_number: int
    # Question text; later used as the text part of the model prompt.
    problem: str
    # Expected answer, restricted to one of the letters A-E.
    answer: Literal["A", "B", "C", "D", "E"]
    # Task family of the row; the notebook splits the dataset on this value.
    task: Literal["matching", "ranking"]

class DatasetEntryResult(TypedDict):
    """Record pairing a dataset entry's expected answer with a model's answer.

    NOTE(review): nothing in the visible part of the notebook constructs this
    type yet, so the field notes below are read off the names and types --
    confirm against the code that populates it.
    """

    # Same-named fields as on DatasetEntry.
    contest_number: int
    problem: str
    # Expected answer, restricted to one of the letters A-E.
    correct_answer: Literal["A", "B", "C", "D", "E"]
    # Answer produced by the model; typed as free-form text, not a Literal.
    model_answer: str
    # Task family, "matching" or "ranking".
    task: Literal["matching", "ranking"]
    # Presumably True when model_answer agrees with correct_answer --
    # TODO confirm how the comparison is done (case, whitespace, extra text).
    is_correct: bool

In [3]:
from datasets import load_dataset

# Dataset identifier and the split this notebook evaluates on.
dataset_name, split = "newyccku/caption_dataset_rl_v2", "test"

# Load only the requested split; the cells below filter it by task.
dataset = load_dataset(dataset_name, split=split)

ModuleNotFoundError: No module named 'datasets'

In [3]:
def _entries_for_task(source, task: Literal["matching", "ranking"]):
    """Select the rows of ``source`` for one task and convert them to entries.

    Args:
        source: The loaded dataset; must support ``.filter`` with a per-row
            predicate and iteration over rows that can be indexed by column
            name.
        task: Value the row's ``"task"`` column must equal.

    Returns:
        A ``(subset, entries)`` pair: the filtered dataset, and a list with one
        ``DatasetEntry`` per row of that subset, in iteration order.
    """
    subset = source.filter(lambda row: row["task"] == task)
    entries = [
        DatasetEntry(
            images=row["images"],
            contest_number=row["contest_number"],
            problem=row["problem"],
            answer=row["answer"],
            task=row["task"],
        )
        for row in subset
    ]
    return subset, entries


# Both task families go through the same helper so the two conversions
# cannot drift apart.
ranking_dataset, ranking_entries = _entries_for_task(dataset, "ranking")
matching_dataset, matching_entries = _entries_for_task(dataset, "matching")

In [None]:
# Apply the same format to one of our dataset entries
from pathlib import Path
import tempfile


from transformers import AutoProcessor, AutoModelForImageTextToText
import torch
# The processor and the model weights are loaded from the same checkpoint id,
# so name it once instead of repeating the string.
checkpoint = "zai-org/GLM-4.5V-FP8"
processor = AutoProcessor.from_pretrained(checkpoint)
model = AutoModelForImageTextToText.from_pretrained(checkpoint)

# Pick a sample (use ranking_entries instead to try the other task).
sample = matching_entries[0]
img = sample["images"]  # PIL.Image.Image
question = sample["problem"] + " Only answer with a single letter, such as A or B."

# Write the image to a temporary file and hand the processor its location.
# Everything that reads the file stays inside the `with` block because the
# directory is deleted on exit.
with tempfile.TemporaryDirectory() as td:
    tmp_path = Path(td) / "entry.png"
    img.save(tmp_path)
    # Use the plain filesystem path rather than a file:// URI: a file:// URI
    # is neither an http(s) URL nor an existing path, so the image loader
    # would not resolve it.
    img_path = str(tmp_path)

    msgs = [
        {
            "role": "user",
            "content": [
                {"type": "image", "path": img_path},
                {"type": "text", "text": question},
            ],
        }
    ]

    # Render the chat prompt and tokenize it (image included) in one step,
    # then move the tensors to wherever the model lives.
    inps = processor.apply_chat_template(
        msgs,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)

    outs = model.generate(**inps, max_new_tokens=64)

    # Slice off the prompt tokens so only the newly generated text is printed.
    # NOTE(review): special tokens are not stripped here -- confirm whether the
    # raw markers are wanted before comparing against the answer letter.
    prompt_len = inps["input_ids"].shape[-1]
    print(processor.decode(outs[0][prompt_len:]).strip())

ValueError: The checkpoint you are trying to load has model type `glm4v_moe` but Transformers does not recognize this architecture. This could be because of an issue with the checkpoint, or because your version of Transformers is out of date.

You can update Transformers with the command `pip install --upgrade transformers`. If this does not work, and the checkpoint is very new, then there may not be a release version that supports this model yet. In this case, you can get the most up-to-date code by installing Transformers from source with the command `pip install git+https://github.com/huggingface/transformers.git`