In [1]:
import torch
from PIL import Image
from transformers import AutoProcessor, MllamaForConditionalGeneration

model_id = "nvidia/Llama-3.2-11B-Vision-Surgical-CholecT50"

# Load model + processor
model = MllamaForConditionalGeneration.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16,
    trust_remote_code=True,
)
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)

# Build a chat with an image turn
img = Image.open("/shared_data0/weiqiuy/real_drs/data/abdomen_exlib/images/cholec80_video20_006.png").convert("RGB")

messages = [{
    "role": "user",
    "content": [
        {"type": "image"},
        {"type": "text", "text": "Detect presence of gallbladder and describe its location if visible."}
    ],
}]

# 1) Build the chat prompt text (no tokenization yet)
prompt = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=False,
)

# 2) Now tokenize text + image together
inputs = processor(
    text=[prompt],          # keep batched shapes
    images=[img],
    return_tensors="pt",
).to(model.device)

with torch.no_grad():
    out_ids = model.generate(**inputs, max_new_tokens=128, do_sample=False)

print(processor.batch_decode(out_ids, skip_special_tokens=True)[0])

The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

user

Detect presence of gallbladder and describe its location if visible.assistant

The gallbladder is likely to be located in the upper right quadrant of the image, as this is the typical anatomical location for the gallbladder. It may appear as a pear-shaped organ, possibly with a greenish hue due to the bile it contains. The gallbladder's location would be indicated by the retraction of the surrounding tissues, which would expose the gallbladder's surface.
