In [7]:
from datasets import load_dataset

# ChartQA dataset on the Hugging Face Hub; only the first two rows of each
# split are requested, which keeps this cell cheap while experimenting.
dataset_id = "HuggingFaceM4/ChartQA"
train_dataset, eval_dataset, test_dataset = load_dataset(
    dataset_id,
    split=["train[:2]", "val[:2]", "test[:2]"],
)

In [1]:
import torch
from transformers import Idefics3ForConditionalGeneration, AutoProcessor

# Checkpoint identifier shared by the model and processor loading cell below.
model_id = "HuggingFaceTB/SmolVLM-Instruct"

In [27]:
# Load the vision-language model on the CPU with bfloat16 weights.
# NOTE: the flash-attention option (_attn_implementation="flash_attention_2")
# is intentionally not passed here.
model = Idefics3ForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="cpu",
)

# The processor comes from the same checkpoint as the model.
processor = AutoProcessor.from_pretrained(model_id)

In [28]:
from pathlib import Path
import pymupdf
from typing import List, Optional

import PIL.PngImagePlugin
from PIL import Image


def convert_pdf_to_images(pdf: Path, dpi: int = 150) -> List[PIL.PngImagePlugin.PngImageFile]:
    """Render every page of a PDF to a PNG file and return the pages as PIL images.

    Each page is saved next to the PDF as ``<pdf stem>_<page index>.png`` and
    then read back with Pillow.

    Args:
        pdf: Path to the PDF file to render.
        dpi: Resolution passed to ``page.get_pixmap`` when rasterising a page.

    Returns:
        One ``PngImageFile`` per page, in page order, with its pixel data
        already loaded into memory.
    """
    images = []
    # The context manager closes the document even if rendering a page fails.
    with pymupdf.open(pdf) as doc:
        for i, page in enumerate(doc):
            png_path = pdf.parent / f"{pdf.stem}_{i}.png"
            page.get_pixmap(dpi=dpi).save(png_path)
            image = Image.open(png_path)
            # Image.open is lazy: load() reads the pixels now so Pillow can
            # release the underlying file handle instead of keeping one open
            # per page.
            image.load()
            images.append(image)

    return images


def construct_messages(query: str, images: List[PIL.PngImagePlugin.PngImageFile], system: Optional[str] = None) -> List[dict]:
    """Build a chat-style message list: the images followed by the text query.

    Args:
        query: Text prompt placed after the images in the user turn.
        images: Images attached to the user turn, one content entry per image.
        system: Optional system prompt; when given it becomes the first message.

    Returns:
        A list of ``{"role": ..., "content": [...]}`` dicts: an optional system
        message followed by the user message. Every ``content`` value is a list
        of typed entries (``{"type": "image", ...}`` / ``{"type": "text", ...}``).
    """
    query_content = [{"type": "image", "image": img} for img in images]
    query_content.append({"type": "text", "text": query})
    messages = [{"role": "user", "content": query_content}]

    if system is not None:
        # Use the same list-of-typed-entries shape as the user turn. A chat
        # template that iterates over content entries would not render a bare
        # string (NOTE(review): confirm against the checkpoint's template).
        system_content = [{"type": "text", "text": system}]
        messages.insert(0, {"role": "system", "content": system_content})

    return messages

In [29]:
# PDF whose pages are rendered and sent to the model.
pdf_path = Path("data/PLOS_1000/pbio.2002780/pbio.2002780.pdf")

# One image per page, then a single user turn holding all images plus the query.
images = convert_pdf_to_images(pdf_path)
messages = construct_messages(
    query="Extract all references from the images and return them in the following JSON format: {'references': [{'title': '...', 'authors': '...', 'year': '...'}]}",
    images=images,
)
# add_generation_prompt=True ends the prompt with the assistant-turn prefix so
# that generate() produces the assistant's answer rather than continuing the
# user turn.
text_input = processor.apply_chat_template(messages, add_generation_prompt=True)

In [30]:
# Batch of one: a single prompt string paired with the list of all page images.
# return_tensors="pt" asks the processor for PyTorch tensors.
model_inputs = processor(text=[text_input], images=[images], return_tensors="pt")

In [None]:
# Set an explicit generation budget: without one, generate() falls back to the
# library's short default length, which would cut off the requested JSON list
# of references. Tune the value to the expected length of the answer.
generated_ids = model.generate(**model_inputs, max_new_tokens=1024)

In [15]:
# Compare a dataset image with the first rendered PDF page (the "_0" PNG saved
# next to the PDF) by printing both PIL objects.
sample = train_dataset[0]
img = sample["image"]
# The last line is a tuple of two print() calls, so besides the two printed
# reprs the cell also displays the tuple of their return values: (None, None).
print(img), print(Image.open("data/PLOS_1000/pbio.2002780/pbio.2002780_0.png"))

<PIL.PngImagePlugin.PngImageFile image mode=RGB size=422x359 at 0x7C69A07625F0>
<PIL.PngImagePlugin.PngImageFile image mode=RGB size=1275x1650 at 0x7C69A05562C0>


(None, None)

In [16]:
# Path to the PDF; the bare last expression displays the directory containing it,
# which is where the per-page PNG files are written.
pdf = Path("data/PLOS_1000/pbio.2002780/pbio.2002780.pdf")
pdf.parent

PosixPath('data/PLOS_1000/pbio.2002780')

In [25]:
import pymupdf

# Render each page of the PDF at 150 dpi and save it next to the PDF as
# "<pdf stem>_<page index>.png". The context manager closes the document once
# all pages have been written (or if rendering fails part-way).
with pymupdf.open(pdf) as doc:
    for i, page in enumerate(doc):
        png_path = pdf.parent / f"{pdf.stem}_{i}.png"
        page.get_pixmap(dpi=150).save(png_path)

In [6]:
from PIL import Image

# Check which concrete PIL class Image.open returns for a rendered page PNG.
type(Image.open("data/PLOS_1000/pbio.2002780/pbio.2002780_0.png"))

PIL.PngImagePlugin.PngImageFile

In [None]:
def format_data(sample):
    """Turn one dataset sample into a system / user / assistant conversation.

    Args:
        sample: Mapping with an ``"image"`` entry, a ``"query"`` string and a
            ``"label"`` sequence; only the first label is used as the answer.

    Returns:
        A list of three message dicts (roles ``system``, ``user``,
        ``assistant``), each with a ``content`` list of typed entries.

    Note:
        The system text is read from the module-level name ``system_message``,
        which must be defined before this function is called.
    """
    system_turn = {
        "role": "system",
        "content": [{"type": "text", "text": system_message}],
    }

    # The user turn carries the image first, then the question about it.
    image_entry = {"type": "image", "image": sample["image"]}
    question_entry = {"type": "text", "text": sample["query"]}
    user_turn = {"role": "user", "content": [image_entry, question_entry]}

    # Only the first label is kept as the reference answer.
    answer_entry = {"type": "text", "text": sample["label"][0]}
    assistant_turn = {"role": "assistant", "content": [answer_entry]}

    return [system_turn, user_turn, assistant_turn]