In [1]:
import torch
from transformers import AutoProcessor, AutoModel, CLIPModel, CLIPProcessor

# Single checkpoint identifier shared by the model and its processor so the two cannot drift apart.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"

# Model weights are requested in bfloat16 with the "sdpa" attention implementation.
model: CLIPModel = AutoModel.from_pretrained(
    CLIP_CHECKPOINT,
    torch_dtype=torch.bfloat16,
    attn_implementation="sdpa",
)
# The processor is used below for both text tokenization and image preprocessing.
processor: CLIPProcessor = AutoProcessor.from_pretrained(CLIP_CHECKPOINT)

  from .autonotebook import tqdm as notebook_tqdm
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [61]:
# Three prompts that differ only in the colour word.
texts = ["a gray cat", "a brown cat", "a blue cat"]

# Run the processor in text-only mode (no images) and keep just the token ids.
text_batch = processor(text=texts, images=None, return_tensors="pt", padding=True)
token_ids = text_batch.input_ids

# Show the ids as plain Python lists for easy side-by-side comparison.
token_ids.tolist()

[[49406, 320, 7048, 2368, 49407],
 [49406, 320, 2866, 2368, 49407],
 [49406, 320, 1746, 2368, 49407]]

In [62]:
# Run only the text sub-model of CLIP on the token ids, also requesting the
# intermediate hidden states.
encoder_outputs = model.text_model(token_ids, output_hidden_states=True)

# Keep the two outputs inspected in the following cells.
pooler_output, last_hidden_state = (
    encoder_outputs.pooler_output,
    encoder_outputs.last_hidden_state,
)

# Print the shape of the pooled output first, then of the per-token states.
for activations in (pooler_output, last_hidden_state):
    print(activations.shape)


torch.Size([3, 512])
torch.Size([3, 5, 512])


In [64]:
# Join all prompts into one string and display the tokenizer's sub-word pieces
# for it (no special tokens appear in this view).
joined_prompts = " ".join(texts)
processor.tokenizer.tokenize(joined_prompts)

['a</w>',
 'gray</w>',
 'cat</w>',
 'a</w>',
 'brown</w>',
 'cat</w>',
 'a</w>',
 'blue</w>',
 'cat</w>']

In [65]:
# Final-layer states for the first prompt in `texts`, one row per token position.
# NOTE(review): the recorded outputs show the first two rows are the same for every
# prompt, presumably because the text encoder attends causally - confirm.
first_prompt_states = last_hidden_state[0]
print(first_prompt_states)

tensor([[ 0.3379,  0.1177,  0.1021,  ...,  0.2480,  0.5898,  0.1011],
        [ 1.9688, -0.5977,  0.3770,  ...,  1.1719,  0.7930, -0.9922],
        [-1.1953, -0.7812, -0.2344,  ...,  0.3945,  0.0289, -1.9844],
        [ 1.1953, -1.5312,  0.3145,  ...,  0.0593,  0.6992, -0.1040],
        [ 0.2402, -1.6875,  0.5234,  ...,  0.6133, -0.2471,  0.1348]],
       dtype=torch.bfloat16, grad_fn=<SelectBackward0>)


In [66]:
# Final-layer states for the second prompt in `texts`, one row per token position.
second_prompt_states = last_hidden_state[1]
print(second_prompt_states)

tensor([[ 0.3379,  0.1177,  0.1021,  ...,  0.2480,  0.5898,  0.1011],
        [ 1.9688, -0.5977,  0.3770,  ...,  1.1719,  0.7930, -0.9922],
        [ 0.4688, -0.3223,  0.8359,  ...,  1.7422, -1.5391, -3.2344],
        [ 1.0547, -0.3945,  1.1875,  ...,  0.1758, -1.0156, -1.6719],
        [-0.4844, -0.1895,  0.3535,  ...,  0.3770, -1.3594, -2.3438]],
       dtype=torch.bfloat16, grad_fn=<SelectBackward0>)


In [67]:
# Final-layer states for the third prompt in `texts`, one row per token position.
third_prompt_states = last_hidden_state[2]
print(third_prompt_states)

tensor([[ 0.3379,  0.1177,  0.1021,  ...,  0.2480,  0.5898,  0.1011],
        [ 1.9688, -0.5977,  0.3770,  ...,  1.1719,  0.7930, -0.9922],
        [ 1.2578, -1.6406, -2.0625,  ..., -0.2715,  0.5508, -1.7188],
        [ 0.7539, -1.3984, -0.1396,  ..., -0.0053, -0.3633, -1.1484],
        [-0.0625, -1.3984, -1.1328,  ..., -0.1904, -0.3281, -0.0801]],
       dtype=torch.bfloat16, grad_fn=<SelectBackward0>)


In [None]:

import requests
from PIL import Image

# Zero-shot classification: score one COCO validation image against a few
# candidate captions and report the best-matching caption.
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
labels = ["a photo of a cat", "a photo of a dog", "a photo of a car"]

# Bound the download with a timeout, fail loudly on HTTP errors instead of
# handing an error page to PIL, and decode the image before the connection
# is released.
with requests.get(url, stream=True, timeout=30) as response:
    response.raise_for_status()
    image = Image.open(response.raw)
    image.load()

inputs = processor(text=labels, images=image, return_tensors="pt", padding=True)

# Inference only: skip building the autograd graph.
with torch.no_grad():
    outputs = model(**inputs)

# Image-to-text similarity scores; softmax over dim=1 turns the scores for the
# single image into a distribution over the candidate labels.
logits_per_image = outputs.logits_per_image
probs = logits_per_image.softmax(dim=1)
most_likely_idx = probs.argmax(dim=1).item()
most_likely_label = labels[most_likely_idx]
# The original f-string was missing its closing quote, which made the whole cell a SyntaxError.
print(f"Most likely label: {most_likely_label} with probability: {probs[0][most_likely_idx].item()}")