In [1]:
import os

import torch
from torch.utils.data import DataLoader, Subset

from config import Config
from dataset import collate_fn, load_dataset

# Restrict CUDA to devices 4, 5 and 6 for this process.
# NOTE(review): host-specific device ids; presumably this must run before CUDA
# is first initialised to take effect — confirm on the target machine.
os.environ["CUDA_VISIBLE_DEVICES"] = "4,5,6"
# Fix torch's global seed (the DataLoader below shuffles by default).
torch.manual_seed(42)


def get_dataloader(
    name: str,
    sample_id: torch.Tensor,
    targets: dict,
    split="val",
    transform=None,
    shuffle=True,
    batch_size=5,
):
    """Build a ``DataLoader`` over a hand-picked subset of a named dataset.

    Args:
        name: Dataset identifier forwarded to ``load_dataset``.
        sample_id: Tensor of sample indices. It is flattened before use, so
            any shape is accepted.
        targets: Target specification forwarded to ``load_dataset``.
        split: Dataset split forwarded to ``load_dataset``.
        transform: Optional transform forwarded to ``load_dataset``.
        shuffle: Passed to ``DataLoader``; whether iteration order is shuffled.
        batch_size: Passed to ``DataLoader``; number of samples per batch.

    Returns:
        A ``DataLoader`` over the selected samples, batched with ``collate_fn``.
    """
    full_dataset = load_dataset(name, split=split, targets=targets, transform=transform)
    # Keep only the requested samples; Subset expects a flat list of indices.
    selected_indices = sample_id.flatten().tolist()
    return DataLoader(
        Subset(full_dataset, selected_indices),
        batch_size=batch_size,
        shuffle=shuffle,
        collate_fn=collate_fn,
    )


# Build the loader from the project configuration (dataset name, sample ids,
# targets, split and batch size all come from `Config`).
cfg = Config()
dataloader = get_dataloader(
    cfg.dataset_name,
    cfg.sample_id,
    cfg.targets,
    split=cfg.split,
    batch_size=cfg.batch_size,
)

# Example use
# Take the first selected sample's image and prepend a batch axis; the cells
# below reuse `test_image` as the image input for every processor.
test_image = dataloader.dataset[0]["image"].unsqueeze(0)
test_image.shape

torch.Size([1, 3, 299, 299])

# Tokenizer Analysis

In [2]:
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "4,5,6"
from transformers import (
    AutoProcessor,
    Blip2ForConditionalGeneration,
    Blip2Processor,
    InstructBlipForConditionalGeneration,
    InstructBlipProcessor,
    LlavaNextProcessor,
)

# Load one processor per vision-language model family under test.
# The LLaVA-NeXT checkpoint is loaded twice: once through the generic
# AutoProcessor and once through the concrete LlavaNextProcessor class.
llava = AutoProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
llavanext = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
qwen = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
qwen25 = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct")
llama = AutoProcessor.from_pretrained("meta-llama/Llama-3.2-11B-Vision-Instruct")
blip = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
instructblip = InstructBlipProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")

# `model_id` ends up bound to the OpenVLA checkpoint id.
model_id = "openvla/openvla-7b"
# trust_remote_code=True lets the checkpoint's own processor code run.
# NOTE(review): presumably required for OpenVLA — review that code before trusting it.
openvla = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)

processors = [llava, llavanext, qwen, qwen25, llama, blip, instructblip, openvla]


  from .autonotebook import tqdm as notebook_tqdm
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.50, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.50, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [4]:
# Render the same question three times through LLaVA's chat template.
# NOTE(review): "iamge" is presumably a typo for "image". It is part of the
# prompt text (and of the recorded output), so it is left untouched here.
questions = ["Describe the iamge"] * 3
# Candidate target strings; not used in this cell.
targets = ["ERROR!", "WARNING!", "MY TARGET!"]
prompts = []
for q in questions:
    # A single user turn: the question text plus an image placeholder.
    conv = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": q},
                {"type": "image"},
            ],
        },
    ]
    # Recorded output: the template emits "<image>" before the question text.
    conversation = llava.apply_chat_template(conv, add_generation_prompt=False)
    prompts.append(conversation)

prompts

['[INST] <image>\nDescribe the iamge [/INST]',
 '[INST] <image>\nDescribe the iamge [/INST]',
 '[INST] <image>\nDescribe the iamge [/INST]']

In [None]:
# Tokenize the three prompts together with the example image.
# NOTE(review): each prompt contains one <image> placeholder but only a single
# image is passed, and the cell has no recorded output — confirm the processor
# accepts this combination.
inputs = llava(
    images=test_image,
    text=prompts,
    return_tensors="pt",
    padding=True,
    do_rescale=False,  # the image is already rescaled to [0, 1]
)

In [69]:
def print_inputs(processor, add_generation_prompt=False):
    """Render a fixed two-turn chat with ``processor`` and print what it yields.

    The conversation is one user turn (question text plus an image
    placeholder) followed by one assistant turn ("My Answer."). The rendered
    prompt is then processed together with the module-level ``test_image``.

    Args:
        processor: Object exposing ``tokenizer.eos_token``,
            ``apply_chat_template`` and a ``__call__`` taking ``text``/``images``.
        add_generation_prompt: Forwarded to ``apply_chat_template``.

    Returns:
        The processor output. Before returning, its keys, the ``input_ids``
        shape and whether the attention mask is all ones are printed.
    """
    print("****************" * 5)
    print("eos_token:", processor.tokenizer.eos_token)

    user_turn = {
        "role": "user",
        "content": [
            {"type": "text", "text": "Can you tell me what this image is about?"},
            {"type": "image"},
        ],
    }
    assistant_turn = {
        "role": "assistant",
        "content": [{"type": "text", "text": "My Answer."}],
    }
    prompt = processor.apply_chat_template(
        [user_turn, assistant_turn],
        add_generation_prompt=add_generation_prompt,
    )
    # Prompt followed by one empty line.
    print(prompt, end="\n\n")

    inputs = processor(
        text=prompt,
        images=test_image,
        return_tensors="pt",
        padding=True,
        do_rescale=False,
    )
    print(inputs.keys())
    print(inputs["input_ids"].shape)
    # True when no position is masked out (i.e. no padding was added).
    print((inputs["attention_mask"] == 1).all())
    return inputs


In [None]:
# Compare LLaVA's rendering with and without add_generation_prompt and print
# the last ten token ids of each encoding.
# Recorded output: both calls give the same prompt and identical trailing ids.
inputs = print_inputs(llava)
print(inputs["input_ids"][:, -10:])
inputs = print_inputs(llava, True)
print(inputs["input_ids"][:, -10:])


********************************************************************************
eos_token: </s>
[INST] <image>
Can you tell me what this image is about? [/INST] My Answer.</s> 

dict_keys(['input_ids', 'attention_mask', 'pixel_values', 'image_sizes'])
torch.Size([1, 1202])
tensor(True)
tensor([[28804,   733, 28748, 16289, 28793,  1984, 26307, 28723,     2,   259]])
********************************************************************************
eos_token: </s>
[INST] <image>
Can you tell me what this image is about? [/INST] My Answer.</s> 

dict_keys(['input_ids', 'attention_mask', 'pixel_values', 'image_sizes'])
torch.Size([1, 1202])
tensor(True)
tensor([[28804,   733, 28748, 16289, 28793,  1984, 26307, 28723,     2,   259]])


In [None]:
# Same comparison through the LlavaNextProcessor instance.
# Recorded output: 1202 tokens in both runs, matching the `llava` processor.
inputs = print_inputs(llavanext)

inputs = print_inputs(llavanext, True)

********************************************************************************
eos_token: </s>
[INST] <image>
Can you tell me what this image is about? [/INST] My Answer.</s> 

dict_keys(['input_ids', 'attention_mask', 'pixel_values', 'image_sizes'])
torch.Size([1, 1202])
tensor(True)
********************************************************************************
eos_token: </s>
[INST] <image>
Can you tell me what this image is about? [/INST] My Answer.</s> 

dict_keys(['input_ids', 'attention_mask', 'pixel_values', 'image_sizes'])
torch.Size([1, 1202])
tensor(True)


In [147]:
# Qwen2-VL: compare both template modes and show the image_grid_thw shape.
# Recorded output: 157 vs 160 tokens; the generation prompt appends an extra
# "<|im_start|>assistant" line. image_grid_thw is (1, 3) in both runs.
inputs = print_inputs(qwen)
print(inputs["image_grid_thw"].shape)
inputs = print_inputs(qwen, True)
print(inputs["image_grid_thw"].shape)

********************************************************************************
eos_token: <|im_end|>
<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
Can you tell me what this image is about?<|vision_start|><|image_pad|><|vision_end|><|im_end|>
<|im_start|>assistant
My Answer.<|im_end|>


dict_keys(['input_ids', 'attention_mask', 'pixel_values', 'image_grid_thw'])
torch.Size([1, 157])
tensor(True)
torch.Size([1, 3])
********************************************************************************
eos_token: <|im_end|>
<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
Can you tell me what this image is about?<|vision_start|><|image_pad|><|vision_end|><|im_end|>
<|im_start|>assistant
My Answer.<|im_end|>
<|im_start|>assistant


dict_keys(['input_ids', 'attention_mask', 'pixel_values', 'image_grid_thw'])
torch.Size([1, 160])
tensor(True)
torch.Size([1, 3])


In [102]:
# Qwen2.5-VL: same comparison; recorded output again shows 157 vs 160 tokens.
print_inputs(qwen25)
print_inputs(qwen25, True)

********************************************************************************
eos_token: <|im_end|>
<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
Can you tell me what this image is about?<|vision_start|><|image_pad|><|vision_end|><|im_end|>
<|im_start|>assistant
My Answer.<|im_end|>


dict_keys(['input_ids', 'attention_mask', 'pixel_values', 'image_grid_thw'])
torch.Size([1, 157])
tensor(True)
********************************************************************************
eos_token: <|im_end|>
<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
Can you tell me what this image is about?<|vision_start|><|image_pad|><|vision_end|><|im_end|>
<|im_start|>assistant
My Answer.<|im_end|>
<|im_start|>assistant


dict_keys(['input_ids', 'attention_mask', 'pixel_values', 'image_grid_thw'])
torch.Size([1, 160])
tensor(True)


In [155]:
# Llama 3.2 Vision: compare both template modes and show the shape of the
# cross-attention mask.
# Recorded output: 26 vs 30 tokens; the mask's second dimension equals the
# input_ids length in both runs ((1, 26, 1, 4) and (1, 30, 1, 4)).
inputs = print_inputs(llama)
print(inputs["cross_attention_mask"].shape)
inputs = print_inputs(llama, True)
print(inputs["cross_attention_mask"].shape)

********************************************************************************
eos_token: <|eot_id|>
<|begin_of_text|><|start_header_id|>user<|end_header_id|>

Can you tell me what this image is about?<|image|><|eot_id|><|start_header_id|>assistant<|end_header_id|>

My Answer.<|eot_id|>

dict_keys(['input_ids', 'attention_mask', 'pixel_values', 'aspect_ratio_ids', 'aspect_ratio_mask', 'cross_attention_mask'])
torch.Size([1, 26])
tensor(True)
torch.Size([1, 26, 1, 4])
********************************************************************************
eos_token: <|eot_id|>
<|begin_of_text|><|start_header_id|>user<|end_header_id|>

Can you tell me what this image is about?<|image|><|eot_id|><|start_header_id|>assistant<|end_header_id|>

My Answer.<|eot_id|><|start_header_id|>assistant<|end_header_id|>



dict_keys(['input_ids', 'attention_mask', 'pixel_values', 'aspect_ratio_ids', 'aspect_ratio_mask', 'cross_attention_mask'])
torch.Size([1, 30])
tensor(True)
torch.Size([1, 30, 1, 4])


In [105]:
# Recorded output is None: the OpenVLA processor carries no chat template, so
# apply_chat_template presumably cannot be used with it — TODO confirm.
print(openvla.chat_template)


None


# Analyze Target Token

In [90]:
# Compare two Llama encodings of the same user turn:
#   inputs0 - user turn rendered with the generation prompt appended;
#   inputs1 - user turn followed by an assistant turn containing "Target".
processor = llama
conv = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Can you tell me what this?"},
            {"type": "image"},
        ],
    },
]

prompt = processor.apply_chat_template(conv, add_generation_prompt=True)
print(prompt)
inputs0 = processor(images=test_image, text=prompt, return_tensors="pt", padding=True, do_rescale=False)
print(inputs0.keys())
print(inputs0["input_ids"][:, -10:])


# Extend the same conversation in place with the assistant answer.
conv.append(
    {
        "role": "assistant",
        "content": [
            {"type": "text", "text": "Target"},
        ],
    }
)
prompt = processor.apply_chat_template(conv, add_generation_prompt=False)
print(prompt)
inputs1 = processor(images=test_image, text=prompt, return_tensors="pt", padding=True, do_rescale=False)
# Recorded output: the ids end with "... 271, 6531, 128009", i.e. the answer
# adds two ids after the ids shared with inputs0.
print(inputs1["input_ids"][:, -10:])

print("***************")
# Key-by-key equality check. In the recorded run the comparison raised for
# input_ids, attention_mask and cross_attention_mask, so those keys are
# reported as "Error"; the other three keys compare equal.
# NOTE(review): `except Exception` would also hide unrelated failures.
for key in inputs1.keys():
    try:
        print((inputs0[key] == inputs1[key]).all())
    except Exception:
        print("Error", key)
# target = "Target" + processor.tokenizer.eos_token
# inputs = processor.tokenizer.encode(target, add_special_tokens=False)
# print(inputs)

<|begin_of_text|><|start_header_id|>user<|end_header_id|>

Can you tell me what this?<|image|><|eot_id|><|start_header_id|>assistant<|end_header_id|>


dict_keys(['input_ids', 'attention_mask', 'pixel_values', 'aspect_ratio_ids', 'aspect_ratio_mask', 'cross_attention_mask'])
tensor([[   757,   1148,    420,     30, 128256, 128009, 128006,  78191, 128007,
            271]])
<|begin_of_text|><|start_header_id|>user<|end_header_id|>

Can you tell me what this?<|image|><|eot_id|><|start_header_id|>assistant<|end_header_id|>

Target<|eot_id|>
tensor([[   420,     30, 128256, 128009, 128006,  78191, 128007,    271,   6531,
         128009]])
***************
Error input_ids
Error attention_mask
tensor(True)
tensor(True)
tensor(True)
Error cross_attention_mask


In [None]:
# Check that the first 19 positions of the longer cross-attention mask equal
# the prompt-only mask (recorded result: True).
(inputs0["cross_attention_mask"] == inputs1["cross_attention_mask"][:, :19]).all()

tensor(True)

In [None]:
# Mask rows from position 19 onward; the recorded output has two rows, one per
# extra token contributed by the assistant answer.
inputs1["cross_attention_mask"][:, 19:]

tensor([[[[1, 0, 0, 0]],

         [[1, 0, 0, 0]]]])

In [113]:
# Id of OpenVLA's end-of-sequence token (recorded value: 2), read through the
# documented `eos_token_id` tokenizer property.
openvla.tokenizer.eos_token_id

2

In [None]:
def print_inputs(processor, add_generation_prompt=True, target="Target"):
    """Render a single user turn with ``processor`` and print what it yields.

    The conversation holds one user turn (question text plus an image
    placeholder). The rendered prompt is processed together with the
    module-level ``test_image``.

    This definition replaces any earlier ``print_inputs`` in the kernel, so it
    returns the processor output just like that one; callers that index the
    result (``inputs = print_inputs(...)``) keep working.

    Args:
        processor: Object exposing ``tokenizer.eos_token``,
            ``apply_chat_template`` and a ``__call__`` taking ``text``/``images``.
        add_generation_prompt: Forwarded to ``apply_chat_template``.
        target: Currently unused; kept so existing calls that pass it still work.
            NOTE(review): presumably meant to be appended (plus EOS) to the
            prompt — confirm the intended use before wiring it in.

    Returns:
        The processor output. Before returning, its keys, the ``input_ids``
        shape and whether the attention mask is all ones are printed.
    """
    print("****************" * 5)
    eos_token = processor.tokenizer.eos_token
    print("eos_token:", eos_token)
    conv = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Can you tell me what this image is about?"},
                {"type": "image"},
            ],
        },
    ]
    prompt = processor.apply_chat_template(conv, add_generation_prompt=add_generation_prompt)
    print(prompt)
    print()
    inputs = processor(text=prompt, images=test_image, return_tensors="pt", padding=True, do_rescale=False)
    print(inputs.keys())
    print(inputs["input_ids"].shape)
    # True when no position is masked out (i.e. no padding was added).
    print((inputs["attention_mask"] == 1).all())
    return inputs

## Analyze BLIP-2 and InstructBLIP

In [91]:
# Both processors print None: neither carries a chat template, so the prompts
# in the following cells are written as plain strings.
print(blip.chat_template)
print(instructblip.chat_template)

None
None


In [181]:
# Encode a plain-text prompt with the BLIP-2 processor and decode it back to
# see the layout. Recorded output: a run of <image> placeholders and "</s>"
# precede the prompt text.
prompt = "What is unusual about this image? Answer:"
# NOTE(review): the device is hard-coded to "cuda" while the model is loaded
# with device_map="auto" — confirm both end up on the same device.
inputs = blip(images=test_image, text=prompt, return_tensors="pt", do_rescale=False).to("cuda")
print(inputs.keys())
print(blip.batch_decode(inputs["input_ids"]))
# inputs = instructblip(images=test_image, text=prompt, return_tensors="pt", do_rescale=False)
# print(inputs.keys())
# print(instructblip.batch_decode(inputs["input_ids"]))

dict_keys(['input_ids', 'attention_mask', 'pixel_values'])
['<image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image></s>What is unusual about this image? Answer:']


In [None]:
# Load the BLIP-2 (OPT-2.7B) weights at a fixed Hub revision hash;
# device_map="auto" leaves device placement to the library.
model_blip = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-2.7b", device_map="auto", revision="51572668da0eb669e01a189dc22abe6088589a24"
)


In [184]:
# Generate up to 50 new tokens from the BLIP-2 inputs. The recorded warning
# notes that max_length (=51) is also set and that max_new_tokens wins.
output = model_blip.generate(**inputs, max_new_tokens=50)

Both `max_new_tokens` (=50) and `max_length`(=51) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


In [185]:
# Decode the generated ids. Special tokens are kept, so the <image>
# placeholders and "</s>" stay visible in the recorded output.
blip.batch_decode(output)

["<image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image></s>What is unusual about this image? Answer: It's not a picture of a person.\n"]

In [None]:
# Load InstructBLIP (Vicuna-7B) with automatic device placement.
# NOTE(review): no `revision` is pinned here, unlike the BLIP-2 load. The
# recorded output shows a download error for one weight shard, so this cell
# may need to be re-run.
model_instructblip = InstructBlipForConditionalGeneration.from_pretrained(
    "Salesforce/instructblip-vicuna-7b", device_map="auto"
)

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]Error while downloading from https://cdn-lfs.hf-mirror.com/repos/fa/52/fa523532eb768e2126266fb7e4f5eeac3f3069f77eab6fb9ef6ef02b07cc2ed5/46d674bda7114f639c411b6341716c03e0f821407b1c4cd58f28aa1c64b34481?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model-00004-of-00004.safetensors%3B+filename%3D%22model-00004-of-00004.safetensors%22%3B&Expires=1742981724&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc0Mjk4MTcyNH19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5oZi5jby9yZXBvcy9mYS81Mi9mYTUyMzUzMmViNzY4ZTIxMjYyNjZmYjdlNGY1ZWVhYzNmMzA2OWY3N2VhYjZmYjllZjZlZjAyYjA3Y2MyZWQ1LzQ2ZDY3NGJkYTcxMTRmNjM5YzQxMWI2MzQxNzE2YzAzZTBmODIxNDA3YjFjNGNkNThmMjhhYTFjNjRiMzQ0ODE%7EcmVzcG9uc2UtY29udGVudC1kaXNwb3NpdGlvbj0qIn1dfQ__&Signature=J4-BIhABMpasbci0NU3cAXyqivoMQGiZE60nY1AbgCXmFCvKWOHeA%7EFWWgaQR6qKO-KPaIUGhB3TfFSnrwvSlNeGWOd8z-qK6v1dy6XVPZ-uDnpXgNCRbipME-XgOQ%7E7HdUKl-FZxWcDnLuopTiD-E2wvxYwhzN0dkq05kPgxL

In [61]:
# Encode a plain-text prompt with the InstructBLIP processor, show its layout,
# then generate up to 50 new tokens.
prompt = "Question: Describe the image. Answer:"

# Recorded output: besides input_ids/attention_mask/pixel_values the processor
# also returns qformer_input_ids and qformer_attention_mask.
inputs = instructblip(images=test_image, text=prompt, return_tensors="pt", do_rescale=False).to("cuda")
print(inputs.keys())
print(instructblip.batch_decode(inputs["input_ids"]))

# Recorded output: the model continues the prompt with " yes</s><s>".
output = model_instructblip.generate(**inputs, max_new_tokens=50)
print(instructblip.batch_decode(output))



dict_keys(['input_ids', 'attention_mask', 'qformer_input_ids', 'qformer_attention_mask', 'pixel_values'])
['<image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image></s> Question: Describe the image. Answer:']
['<image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image></s> Question: Describe the image. Answer: yes</s><s>']


In [42]:
# Token ids of the EOS string itself, without added special tokens
# (recorded result: [2]).
instructblip.tokenizer.encode(instructblip.tokenizer.eos_token, add_special_tokens=False)

[2]

In [None]:
# Decode the current `inputs["input_ids"]` back to text.
# NOTE(review): the recorded output already ends with "ERROR!</s>", which no
# visible cell appends to `inputs`; presumably a cell that was later edited or
# removed did so, so this output will not reproduce on a fresh run.
instructblip.tokenizer.batch_decode(inputs["input_ids"])

['<image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image></s> Question: Describe the image. Answer: ERROR!</s>']

In [114]:
# Display the raw token ids held in `inputs`.
# NOTE(review): the recorded ids end with the same values as the Llama
# "Target" encoding shown earlier (... 271, 6531, 128009), so `inputs` was
# presumably bound by a different cell when this ran — confirm execution order.
inputs["input_ids"]

tensor([[128000, 128000, 128006,    882, 128007,    271,   6854,    499,   3371,
            757,   1148,    420,     30, 128256, 128009, 128006,  78191, 128007,
            271,   6531, 128009]])

In [63]:
# Build the target sequence "ERROR!" followed by EOS as a tensor of token ids
# with a leading batch axis, placed on the GPU.
# Recorded result: [[14431, 29991, 2]].
target_id = instructblip.tokenizer.encode("ERROR!", add_special_tokens=False)
eos_id = instructblip.tokenizer.encode(instructblip.tokenizer.eos_token, add_special_tokens=False)
# Both operands are lists here, so `+` concatenates them.
target_id = target_id + eos_id
target_id = torch.tensor(target_id).unsqueeze(0).to("cuda")
print(target_id)

tensor([[14431, 29991,     2]], device='cuda:0')


In [121]:
# Tokenize only the question text (no special tokens added) as PyTorch tensors.
input_id = instructblip.tokenizer("Question: Describe the image.", return_tensors="pt", add_special_tokens=False)


In [None]:
# Display the question's token ids.
# NOTE(review): the recorded ids already end with 14431, 29991 (the "ERROR!"
# ids), so the torch.cat cell further down presumably ran before this one.
input_id["input_ids"]

tensor([[  894, 29901, 20355,   915,   278,  1967, 29889, 14431, 29991]])

In [139]:
# Token ids of "ERROR!" without special tokens, kept as a plain Python list.
target_ids = instructblip.tokenizer.encode("ERROR!", add_special_tokens=False)

In [None]:
# Scratch check: concatenate one extra id onto the target id list.
# NOTE(review): the recorded TypeError (list + int) does not match this
# expression, which joins two lists; the output presumably comes from an
# earlier version of the cell. What id 4 stands for is not established here.
target_ids + [4]

TypeError: can only concatenate list (not "int") to list

In [None]:
# Append the target ids to the tokenized question along the sequence axis.
# NOTE(review): this overwrites `input_id["input_ids"]`, so re-running the cell
# appends the target again.
input_id["input_ids"] = torch.cat([input_id["input_ids"], torch.tensor([target_ids])], dim=1)

In [None]:
# Decode the first sequence in `inputs`; the recorded text ends with
# "Answer: ERROR!</s>".
instructblip.tokenizer.decode(inputs["input_ids"][0])

'<image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image></s> Question: Describe the image. Answer: ERROR!</s>'

In [None]:
# Replace the attention mask with all ones, shaped like the current input_ids,
# so every position is attended to.
inputs["attention_mask"] = torch.ones_like(inputs["input_ids"])

In [68]:
# Generate again from the current `inputs`. In the recorded output the decoded
# text is the input ("... Answer: ERROR!</s>") followed only by "<s>".
output = model_instructblip.generate(**inputs, max_new_tokens=50)
print(instructblip.batch_decode(output))



['<image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image></s> Question: Describe the image. Answer: ERROR!</s><s>']
