- Qwen2-VL paper (arXiv:2409.12191): https://arxiv.org/pdf/2409.12191

In [1]:
from PIL import Image
import requests
import torch
from torchvision import io
from typing import Dict
from transformers import (
    Qwen2VLForConditionalGeneration,
    AutoTokenizer,
    AutoProcessor,
    Qwen2VLProcessor,
    Qwen2VLImageProcessor,
)

# Walk through the Qwen2-VL preprocessing pipeline: build a chat prompt with an
# image placeholder, then let the processor turn prompt + image into tensors.
# Only the processor is exercised here; the model load and the generation code
# at the end of the cell are kept commented out and must be enabled together.

# Load the model in half-precision on the available device(s)
# model = Qwen2VLForConditionalGeneration.from_pretrained(
#     "Qwen/Qwen2-VL-7B-Instruct", torch_dtype="auto", device_map="auto"
# )
processor = Qwen2VLProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")

# Image
url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
# Bound the download time and surface HTTP errors here, rather than letting PIL
# fail later on an error page; the context manager releases the connection.
with requests.get(url, stream=True, timeout=30) as response:
    response.raise_for_status()
    image = Image.open(response.raw)
    image.load()  # force decoding while the response is still open

conversation = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
            },
            {"type": "text", "text": "Describe this image."},
        ],
    }
]


# Preprocess the inputs
# tokenize=False makes it explicit that we want the rendered prompt string.
text_prompt = processor.apply_chat_template(
    conversation, tokenize=False, add_generation_prompt=True
)
# Expected output: '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Describe this image.<|im_end|>\n<|im_start|>assistant\n'

inputs = processor(
    text=[text_prompt],
    images=[image],
    padding=True,
    return_tensors="pt",
)
# Fall back to CPU when no GPU is present: this cell only inspects the
# processor outputs, so CUDA is not required to run it.
device = "cuda" if torch.cuda.is_available() else "cpu"
inputs = inputs.to(device)

# # Inference: Generation of the output
# output_ids = model.generate(**inputs, max_new_tokens=128)
# generated_ids = [
#     output_ids[len(input_ids) :]
#     for input_ids, output_ids in zip(inputs.input_ids, output_ids)
# ]
# output_text = processor.batch_decode(
#     generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
# )
# print(output_text)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [8]:
# Show the rendered chat prompt; at this stage the image is a single
# <|image_pad|> placeholder between <|vision_start|> and <|vision_end|>.
print(text_prompt)

<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
<|vision_start|><|image_pad|><|vision_end|>Describe this image.<|im_end|>
<|im_start|>assistant



In [2]:
# List which fields the processor output contains.
inputs.keys()

dict_keys(['input_ids', 'attention_mask', 'pixel_values', 'image_grid_thw'])

In [None]:
# Dump the raw token ids. The long run of one repeated id after the prompt
# header lines up with the repeated <|image_pad|> tokens in the decoded text of
# the following cell, i.e. the single placeholder has been expanded.
inputs["input_ids"].tolist()

[[151644,
  8948,
  198,
  2610,
  525,
  264,
  10950,
  17847,
  13,
  151645,
  198,
  151644,
  872,
  198,
  151652,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655,
  151655

In [None]:
# Decode the first sequence back to text to see the expanded image placeholders.
print(processor.tokenizer.decode(inputs["input_ids"][0]))

<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
<|vision_start|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|

In [3]:
# Shape of the image tensor handed to the vision encoder (2-D in the recorded
# output). NOTE(review): the first dimension presumably counts image patches
# and the second is the flattened patch — confirm against image_grid_thw.
inputs["pixel_values"].shape

torch.Size([14308, 1176])

In [6]:
import torch

# Toy example of Tensor.masked_scatter_(mask, source): positions where `mask`
# is True are overwritten, in row-major order, with consecutive elements taken
# from the start of `source` (not with the elements at the matching positions).
# NOTE(review): the target is named `self` presumably to mirror the method's
# signature in the PyTorch docs; it shadows nothing here but reads oddly.
self = torch.zeros(2, 5, dtype=torch.int64)

# Boolean literals make torch infer dtype=torch.bool directly.
mask = torch.tensor(
    [
        [False, False, False, False, True],
        [True, True, False, True, True],
    ]
)

# Values 0..9 laid out as two rows of five.
source = torch.arange(10).reshape(2, 5)

# In-place update; the returned tensor is shown as the cell output.
self.masked_scatter_(mask, source)

tensor([[0, 0, 0, 0, 0],
        [1, 2, 0, 3, 4]])

In [2]:
from transformers import AutoProcessor
from qwen_vl_utils import process_vision_info

# Same preprocessing walkthrough, this time with the Qwen2.5-VL processor and
# the qwen_vl_utils helper that resolves the image referenced in the messages.
# Note: this rebinds `processor` and `inputs` from the earlier Qwen2-VL cell.
processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")

# The default range for the number of visual tokens per image in the model is 4-16384.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# min_pixels = 256*28*28
# max_pixels = 1280*28*28
# processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels)

# A single user turn: one image (given by URL) followed by a text instruction.
messages = [
    dict(
        role="user",
        content=[
            dict(
                type="image",
                image="https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
            ),
            dict(type="text", text="Describe this image."),
        ],
    )
]

# Preparation for inference
# Render the prompt as a string (not token ids) so it can be passed as text below.
text = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=False,
)
# Collect the image / video inputs described by the messages.
image_inputs, video_inputs = process_vision_info(messages)
# Tokenize the prompt and preprocess the vision inputs in one call.
processor_kwargs = dict(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
inputs = processor(**processor_kwargs)
inputs

{'input_ids': tensor([[151644,   8948,    198,  ..., 151644,  77091,    198]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]]), 'pixel_values': tensor([[ 0.8501,  0.8501,  0.8647,  ...,  1.3922,  1.3922,  1.3922],
        [ 0.9376,  0.9376,  0.9376,  ...,  1.4491,  1.4491,  1.4491],
        [ 0.9084,  0.9376,  0.9376,  ...,  1.4065,  1.4207,  1.4207],
        ...,
        [-0.1280, -0.1280, -0.1426,  ..., -0.2431, -0.2715, -0.3000],
        [-0.3324, -0.3324, -0.3032,  ..., -0.3000, -0.2715, -0.2857],
        [-0.3762, -0.4054, -0.4054,  ..., -0.4279, -0.4422, -0.4564]]), 'image_grid_thw': tensor([[  1,  98, 146]])}