In [1]:
import torch

from transformers import AutoProcessor, PaliGemmaForConditionalGeneration

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Load the PaliGemma checkpoint with bfloat16 weights and move it to DEVICE.
# `dtype` is the current keyword for this; `torch_dtype` is deprecated by
# transformers and triggers a deprecation warning on every load.
model = PaliGemmaForConditionalGeneration.from_pretrained(
    "paligemma-3b-pt-224",
    dtype=torch.bfloat16,
    revision="bfloat16",
).to(DEVICE)

`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|██████████| 3/3 [02:18<00:00, 46.05s/it]


In [3]:
# Build the processor from the same checkpoint identifier used for the model,
# so that preprocessing matches what the model expects.
processor_checkpoint = "paligemma-3b-pt-224"
processor = AutoProcessor.from_pretrained(processor_checkpoint)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [4]:
from PIL import Image

# Task prompt passed to the model together with the image.
prompt = "caption es"
# Path of the image file to caption.
image = "images/car.jpg"

# Open the file in a context manager so the handle is released as soon as the
# pixels have been read; `convert` returns a separate RGB image that stays
# usable after the file is closed.
with Image.open(image) as source_image:
    img = source_image.convert("RGB")

# Prefix the text with the explicit `<image>` placeholder, as the processor
# recommends, instead of relying on it to infer the number of images per text.
# `prompt` itself is left unchanged for any later cell that reads it.
model_inputs = processor(
    text="<image>" + prompt,
    images=img,
    return_tensors="pt",
).to(DEVICE)

You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text. For this call, we will infer how many images each text has and add special tokens.


In [5]:
# Length of the last dimension of the input ids, i.e. how many token positions
# the processed prompt occupies.
input_len = model_inputs["input_ids"].size(-1)

# Deterministic decoding (sampling disabled), capped at 100 new tokens.
output_ids = model.generate(**model_inputs, max_new_tokens=100, do_sample=False)

# Keep only the positions after the first `input_len` of the first sequence
# (presumably this strips the echoed prompt -- confirm), then turn the
# remaining ids into text without special tokens.
generation = output_ids[0][input_len:]
decoded = processor.decode(generation, skip_special_tokens=True)

pixel_values tensor([[[[-0.4039, -0.3098, -0.2706,  ..., -0.8431, -0.8902, -0.9294],
          [-0.3961, -0.3412, -0.4353,  ..., -0.8980, -0.9216, -0.9451],
          [-0.5059, -0.4275, -0.3961,  ..., -0.9294, -0.9451, -0.9451],
          ...,
          [ 0.2000,  0.2000,  0.2078,  ...,  0.1922,  0.1608,  0.1451],
          [ 0.2157,  0.2157,  0.2235,  ...,  0.2157,  0.1922,  0.1765],
          [ 0.2314,  0.2314,  0.2235,  ...,  0.1373,  0.1373,  0.1451]],

         [[-0.5608, -0.4667, -0.4039,  ..., -0.7569, -0.8039, -0.8431],
          [-0.5608, -0.4980, -0.5765,  ..., -0.8118, -0.8353, -0.8588],
          [-0.6706, -0.5843, -0.5451,  ..., -0.8431, -0.8588, -0.8588],
          ...,
          [ 0.1529,  0.1529,  0.1608,  ...,  0.1529,  0.1216,  0.1059],
          [ 0.1686,  0.1686,  0.1765,  ...,  0.1765,  0.1529,  0.1373],
          [ 0.1843,  0.1843,  0.1765,  ...,  0.0980,  0.0980,  0.1059]],

         [[-0.5686, -0.4745, -0.4275,  ..., -0.7882, -0.8353, -0.8745],
          [-0.568

In [6]:
# Show the decoded caption as this cell's output.
decoded

'persona estacionada en una calle'