In [1]:
import torch
print(torch.cuda.is_available())  # should be True
print(torch.rand(2,3).to("cuda"))

True
tensor([[0.8509, 0.6640, 0.4625],
        [0.2430, 0.5951, 0.3591]], device='cuda:0')


In [2]:
from transformers import AutoProcessor, LlavaForConditionalGeneration, BitsAndBytesConfig
import torch
from PIL import Image
torch.backends.cudnn.enabled = False

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
model = LlavaForConditionalGeneration.from_pretrained(
                    "llava-hf/llava-1.5-7b-hf",
                    torch_dtype=torch.float16, 
                    low_cpu_mem_usage=True,
                    device_map="auto",
                    cache_dir="/home/chiragjishu/chirag/VLMs/llava-interp/models"
                )

processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")

Loading checkpoint shards: 100%|██████████| 3/3 [00:04<00:00,  1.43s/it]
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [15]:
processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")
text_question = "Describe the image"
prompt = f"USER: <image>\n{text_question} ASSISTANT:"
image = Image.open("/home/chiragjishu/chirag/VLMs/llava-interp/image_folder/casualty.jpg")
inputs = processor(text=prompt, return_tensors = "pt")
image_token_id = processor.image_token_id

print(image_token_id)
sum=0
for i in inputs['input_ids'][0]:
    # print(i.item().type())
    if i.item()==image_token_id:
        sum+=1
print(sum)
print(inputs['input_ids'])


32000
1
tensor([[    1,  3148,  1001, 29901, 29871, 32000, 29871,    13,  4002, 29581,
           278,  1967,   319,  1799,  9047, 13566, 29901]])


In [9]:
sum

576

In [None]:
print(inputs['input_ids'])

In [5]:

output = model(**inputs)

In [7]:
print(output.keys())

odict_keys(['logits', 'past_key_values', 'image_hidden_states'])


## checking for medgemma's tokenizer

In [17]:
model_id = "google/medgemma-4b-it"
processor = AutoProcessor.from_pretrained(model_id)

image = Image.open("/home/chiragjishu/chirag/VLMs/llava-interp/image_folder/casualty.jpg") 
messages = [
    {
        "role": "system",
        "content": [{"type": "text", "text": "You are an expert first response medic."}]
    },
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this image"},
            {"type": "image", "image": image}
        ]
    }
]
inputs = processor.apply_chat_template(
    messages, add_generation_prompt=True, tokenize=True,
    return_dict=True,
)
print(inputs['input_ids'])

[[2, 105, 2364, 107, 3048, 659, 614, 7710, 1171, 3072, 6992, 236761, 108, 82858, 672, 2471, 108, 255999, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144, 262144,

In [18]:
input_ids = inputs['input_ids'][0]
img_token_id = processor.image_token_id
token_labels = []
# image_token_count = 256
i = 0
for token_id in input_ids:
        if token_id == img_token_id:
            # One indexed because the HTML logic wants it that way
            # token_labels.extend([f"<IMG{(i+1):03d}>" for i in range(img_token_count)])
            token_labels.append(f"<IMG{(i+1):03d}>")
            i+=1
        else:
            token_labels.append(processor.tokenizer.decode([token_id]))
print(token_labels)

['<bos>', '<start_of_turn>', 'user', '\n', 'You', ' are', ' an', ' expert', ' first', ' response', ' medic', '.', '\n\n', 'Describe', ' this', ' image', '\n\n', '<start_of_image>', '<IMG001>', '<IMG002>', '<IMG003>', '<IMG004>', '<IMG005>', '<IMG006>', '<IMG007>', '<IMG008>', '<IMG009>', '<IMG010>', '<IMG011>', '<IMG012>', '<IMG013>', '<IMG014>', '<IMG015>', '<IMG016>', '<IMG017>', '<IMG018>', '<IMG019>', '<IMG020>', '<IMG021>', '<IMG022>', '<IMG023>', '<IMG024>', '<IMG025>', '<IMG026>', '<IMG027>', '<IMG028>', '<IMG029>', '<IMG030>', '<IMG031>', '<IMG032>', '<IMG033>', '<IMG034>', '<IMG035>', '<IMG036>', '<IMG037>', '<IMG038>', '<IMG039>', '<IMG040>', '<IMG041>', '<IMG042>', '<IMG043>', '<IMG044>', '<IMG045>', '<IMG046>', '<IMG047>', '<IMG048>', '<IMG049>', '<IMG050>', '<IMG051>', '<IMG052>', '<IMG053>', '<IMG054>', '<IMG055>', '<IMG056>', '<IMG057>', '<IMG058>', '<IMG059>', '<IMG060>', '<IMG061>', '<IMG062>', '<IMG063>', '<IMG064>', '<IMG065>', '<IMG066>', '<IMG067>', '<IMG068>', '<I

In [11]:
print(model)

LlavaForConditionalGeneration(
  (vision_tower): CLIPVisionModel(
    (vision_model): CLIPVisionTransformer(
      (embeddings): CLIPVisionEmbeddings(
        (patch_embedding): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False)
        (position_embedding): Embedding(577, 1024)
      )
      (pre_layrnorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (encoder): CLIPEncoder(
        (layers): ModuleList(
          (0-23): 24 x CLIPEncoderLayer(
            (self_attn): CLIPSdpaAttention(
              (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
            )
            (layer_norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            (mlp): CLIPMLP(
              (activation_fn): Q

In [16]:
print(model.language_model.model.norm.type)

<bound method Module.type of LlamaRMSNorm((4096,), eps=1e-05)>


In [None]:
hook_locn = "text_model_in"

In [2]:
from transformers import AutoProcessor, AutoModelForImageTextToText
from PIL import Image
import requests
import torch

model_id = "google/medgemma-4b-it"

model = AutoModelForImageTextToText.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(model_id)

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.41s/it]
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [2]:
print(model)

Gemma3ForConditionalGeneration(
  (vision_tower): SiglipVisionModel(
    (vision_model): SiglipVisionTransformer(
      (embeddings): SiglipVisionEmbeddings(
        (patch_embedding): Conv2d(3, 1152, kernel_size=(14, 14), stride=(14, 14), padding=valid)
        (position_embedding): Embedding(4096, 1152)
      )
      (encoder): SiglipEncoder(
        (layers): ModuleList(
          (0-26): 27 x SiglipEncoderLayer(
            (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
            (self_attn): SiglipAttention(
              (k_proj): Linear(in_features=1152, out_features=1152, bias=True)
              (v_proj): Linear(in_features=1152, out_features=1152, bias=True)
              (q_proj): Linear(in_features=1152, out_features=1152, bias=True)
              (out_proj): Linear(in_features=1152, out_features=1152, bias=True)
            )
            (layer_norm2): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
            (mlp): SiglipMLP(
            

In [3]:
image = Image.open("/home/chiragjishu/chirag/VLMs/llava-interp/image_folder/casualty.jpg") 
messages = [
    {
        "role": "system",
        "content": [{"type": "text", "text": "You are an expert first response medic."}]
    },
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this image"},
            {"type": "image", "image": image}
        ]
    }
]
inputs = processor.apply_chat_template(
    messages, add_generation_prompt=True, tokenize=True,
    return_dict=True, return_tensors="pt"
).to(model.device, dtype=torch.bfloat16)
# inputs  = processor(text=prompt, images=image, return_tensors="pt")

with torch.no_grad():
    output = model(**inputs, output_hidden_states=True, max_new_tokens=250)
hidden_states = output.hidden_states
print(len(hidden_states))

35


In [10]:
hidden_states[4].shape

torch.Size([1, 281, 2560])

In [11]:
processor.image_token_id

262144

In [12]:
model.vision_tower.config

SiglipVisionConfig {
  "_attn_implementation_autoset": true,
  "attention_dropout": 0.0,
  "hidden_act": "gelu_pytorch_tanh",
  "hidden_size": 1152,
  "image_size": 896,
  "intermediate_size": 4304,
  "layer_norm_eps": 1e-06,
  "model_type": "siglip_vision_model",
  "num_attention_heads": 16,
  "num_channels": 3,
  "num_hidden_layers": 27,
  "patch_size": 14,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.51.3",
  "vision_use_head": false
}

In [13]:
inputs.keys()

dict_keys(['input_ids', 'attention_mask', 'token_type_ids', 'pixel_values'])

In [26]:
print(len(inputs['input_ids'][0]))
sum=0
for i in inputs['input_ids'][0]:
    # print(i.item().type())
    if i.item()==262144:
        sum+=1
    


281


In [27]:
sum

256