In [19]:
import torch
import clip
from PIL import Image

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

# Preprocess the image into a batch of one and tokenize the three candidate
# captions; both end up on `device`.
image = preprocess(Image.open("CLIP.png"))
image = image.unsqueeze(0).to(device)
text = clip.tokenize(["a diagram", "a dog", "a cat"])
text = text.to(device)

with torch.no_grad():
    # Stand-alone embeddings, kept so they can be inspected afterwards.
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)

    # Joint forward pass: image-vs-caption logits, normalized with a softmax
    # over the last axis and moved to NumPy for printing.
    logits_per_image, logits_per_text = model(image, text)
    probs = torch.softmax(logits_per_image, dim=-1).cpu().numpy()

print("Label probs:", probs)  # prints: [[0.9927937  0.00421068 0.00299572]]

Label probs: [[0.99279356 0.00421071 0.00299575]]


In [22]:
image_features.shape
# So the OpenAI CLIP model (ViT-B/32) encodes each image into a single
# 512-dimensional vector; the leading 1 is the batch of one image.

torch.Size([1, 512])

In [29]:
# Breakdown of the CLIPVisionTower class object
from transformers import CLIPVisionModel, CLIPImageProcessor 
from PIL import Image

img = Image.open("CLIP.png")

# The image processor and the vision-only CLIP encoder are loaded from the
# same checkpoint.
vision_tower_name = "openai/clip-vit-base-patch32"
image_processor = CLIPImageProcessor.from_pretrained(vision_tower_name)
vision_tower = CLIPVisionModel.from_pretrained(vision_tower_name)

# Request PyTorch tensors explicitly. Without `return_tensors="pt"` the
# processor does not return torch tensors, and its output cannot be passed
# to the model.
image = image_processor(images=img, return_tensors="pt")

# The processor output is a mapping (it carries "pixel_values"), so it is
# unpacked into keyword arguments rather than passed positionally.
vision_tower_outputs = vision_tower(**image)

In [50]:
# Hugging Face Transformers library --> CLIPVisionModel & CLIPImageProcessor
from PIL import Image
import requests
from transformers import AutoProcessor, CLIPVisionModel, CLIPImageProcessor

# Input image: the local CLIP diagram. `url` points at a COCO validation
# image that can be fetched instead:
#   image = Image.open(requests.get(url, stream=True).raw)
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open("CLIP.png")

# Vision-only CLIP encoder and its image processor, from the same checkpoint.
model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Preprocess into PyTorch tensors, then run the encoder while also asking it
# to return the hidden states.
inputs = processor(images=image, return_tensors="pt")
outputs = model(output_hidden_states=True, **inputs)

pooled_output = outputs.pooler_output  # pooled CLS states
last_hidden_state = outputs.last_hidden_state

In [24]:
from clip_encoder import CLIPVisionTower

from types import SimpleNamespace

# Initialize CLIPVisionTower
vision_tower_name = "openai/clip-vit-base-patch32"
# A plain attribute bag is all the tower needs here; SimpleNamespace replaces
# the throwaway class built with type(...).
args = SimpleNamespace(
    mm_vision_select_layer=-2,  # Typically the second to last layer
    mm_vision_select_feature="patch",
)

clip_vision_tower = CLIPVisionTower(vision_tower_name, args)

# Load the model only if the constructor has not already done so. The
# recorded run shows that an unconditional second load_model() call is
# skipped with an "already loaded" notice.
# NOTE(review): assumes the tower exposes an `is_loaded` flag -- confirm in
# clip_encoder. If the attribute is missing, this falls back to calling
# load_model() exactly as before.
if not getattr(clip_vision_tower, "is_loaded", False):
    clip_vision_tower.load_model()

# Now the CLIPVisionTower is initialized and ready to use
print(f"CLIPVisionTower initialized with vision tower: {clip_vision_tower.vision_tower_name}")
print(f"Select layer: {clip_vision_tower.select_layer}")
print(f"Select feature: {clip_vision_tower.select_feature}")

# Prepare an input for the CLIPVisionTower object: this cell only
# preprocesses the image; the tower's forward pass is not run here.
from PIL import Image
image = Image.open("CLIP.png")
# Use the tower's own image processor and ask for PyTorch tensors.
inputs = clip_vision_tower.image_processor(image, return_tensors="pt")
# Pull out the "pixel_values" entry of the processor output.
tensors = inputs["pixel_values"]

Loading vision tower: openai/clip-vit-base-patch32
openai/clip-vit-base-patch32 is already loaded, `load_model` called again, skipping.
CLIPVisionTower initialized with vision tower: openai/clip-vit-base-patch32
Select layer: -2
Select feature: patch
