In [None]:
from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration
import torch
from PIL import Image
import requests

# Checkpoint identifier passed to both from_pretrained calls below.
model_id = "NYU-OLAB/LLaVA-Next-Med-OLAB"

# The processor and the model are loaded from the same checkpoint so that the
# prompt formatting / image preprocessing match the weights.
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto",
)

# Load epidural hematoma CT from Radiopaedia
url = "https://prod-images-static.radiopaedia.org/images/64765614/eb6541731e66f04fc1e3a544fe55a7935646d39f886bee9aae3da8320c29b165.jpeg"
# requests applies no timeout by default, so an unresponsive server would
# block this cell indefinitely; the `with` block releases the connection.
with requests.get(url, stream=True, timeout=30) as response:
    # Fail here with a clear HTTP error instead of handing an error page to PIL.
    response.raise_for_status()
    # Have the raw stream undo any Content-Encoding so PIL sees image bytes.
    response.raw.decode_content = True
    image = Image.open(response.raw)
    # Force decoding while the response is still open.
    image.load()

# Single-turn chat: one user message carrying the question text and one image
# placeholder entry.
conversation = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "What abnormality is shown in this CT scan?"},
            {"type": "image"},
        ],
    },
]
prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
# Build the model inputs as PyTorch tensors and move them to the model's device.
inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device)

# Generate at most 100 new tokens and print the decoded first sequence.
output = model.generate(**inputs, max_new_tokens=100)
print(processor.decode(output[0], skip_special_tokens=True))


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Fetching 31 files:   0%|          | 0/31 [00:00<?, ?it/s]

model-00011-of-00031.safetensors:  49%|####8     | 4.25G/8.71G [00:00<?, ?B/s]

model-00008-of-00031.safetensors:  27%|##7       | 1.69G/6.16G [00:00<?, ?B/s]

model-00006-of-00031.safetensors:  27%|##6       | 1.62G/6.08G [00:00<?, ?B/s]

model-00001-of-00031.safetensors:  22%|##2       | 1.42G/6.35G [00:00<?, ?B/s]

model-00007-of-00031.safetensors:  40%|###9      | 2.93G/7.39G [00:00<?, ?B/s]

model-00005-of-00031.safetensors:  30%|##9       | 1.90G/6.36G [00:00<?, ?B/s]

model-00003-of-00031.safetensors:  32%|###2      | 2.13G/6.60G [00:00<?, ?B/s]

model-00013-of-00031.safetensors:   0%|          | 0.00/4.46G [00:00<?, ?B/s]

In [None]:
# Scratch cell: a bare literal expression; it defines nothing.
1