In [1]:
from transformers import CLIPModel, CLIPProcessor, CLIPFeatureExtractor, CLIPVisionModel
import torch.nn as nn
import torch

import requests
from PIL import Image

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class CustomCLIPModel(nn.Module):
    """Classification head on top of a CLIP-style backbone.

    The backbone's text and image embeddings are each passed through their own
    linear projection, concatenated along the feature axis, and fed to a small
    MLP that produces class logits.

    Args:
        base_model: Callable backbone that accepts ``input_ids``,
            ``attention_mask``, ``pixel_values`` and ``return_dict`` keyword
            arguments and returns an object exposing ``text_embeds`` and
            ``image_embeds``.
        num_classes: Number of output logits.
        hidden_size: Width of the MLP's hidden layer.
    """

    def __init__(self, base_model, num_classes, hidden_size):
        super().__init__()
        self.base_model = base_model
        # Width of the backbone's joint embedding space. Read from the config
        # when one is available; otherwise fall back to the 512 this class
        # was originally hard-coded for.
        embed_dim = getattr(getattr(base_model, "config", None), "projection_dim", 512)
        self.text_projection = nn.Linear(embed_dim, embed_dim)
        self.visual_projection = nn.Linear(embed_dim, embed_dim)
        self.combine_projections = nn.Sequential(
            # forward() concatenates the two projections, so the MLP input is
            # twice the embedding width (a single width here would make every
            # forward pass fail with a shape mismatch).
            nn.Linear(2 * embed_dim, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, num_classes),
        )

    def forward(self, input_ids, attention_mask, pixel_values):
        """Return class logits for paired text/image inputs.

        The text and image batches must have the same leading size, because
        their projections are concatenated along the last axis.

        Returns:
            Tensor whose last dimension is ``num_classes``.
        """
        outputs = self.base_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            pixel_values=pixel_values,
            return_dict=True,
        )
        text_proj = self.text_projection(outputs.text_embeds)
        visual_proj = self.visual_projection(outputs.image_embeds)
        combined = torch.cat((text_proj, visual_proj), dim=-1)
        logits = self.combine_projections(combined)
        return logits

In [2]:
# One checkpoint id shared by the full CLIP model, its processor, and the
# stand-alone vision model.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"

base_model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)
processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)
vision_model = CLIPVisionModel.from_pretrained(CLIP_CHECKPOINT)

# Classifier hyper-parameters (currently disabled):
# num_classes = 10
# hidden_size = 128

# Build the custom classifier on top of the backbone (currently disabled):
# model = CustomCLIPModel(base_model, num_classes, hidden_size)

Some weights of the model checkpoint at openai/clip-vit-base-patch32 were not used when initializing CLIPVisionModel: ['text_model.encoder.layers.9.self_attn.v_proj.bias', 'text_model.encoder.layers.2.self_attn.v_proj.bias', 'text_model.encoder.layers.5.mlp.fc2.bias', 'text_model.encoder.layers.10.mlp.fc1.bias', 'text_model.encoder.layers.8.self_attn.v_proj.weight', 'text_model.encoder.layers.5.self_attn.v_proj.weight', 'text_model.encoder.layers.8.self_attn.k_proj.weight', 'text_model.encoder.layers.1.layer_norm1.weight', 'text_model.encoder.layers.4.layer_norm2.weight', 'text_model.encoder.layers.5.self_attn.k_proj.bias', 'text_model.encoder.layers.11.self_attn.out_proj.bias', 'text_model.encoder.layers.3.layer_norm1.bias', 'text_model.encoder.layers.10.self_attn.v_proj.weight', 'text_model.encoder.layers.5.mlp.fc1.bias', 'text_model.encoder.layers.2.self_attn.out_proj.weight', 'text_model.encoder.layers.11.self_attn.k_proj.weight', 'text_model.encoder.layers.3.layer_norm1.weight', '

In [7]:
# Feature extractor built with its default settings (no checkpoint is loaded here).
# NOTE(review): recent transformers releases appear to favour CLIPImageProcessor
# over CLIPFeatureExtractor; confirm against the installed version.
transform = CLIPFeatureExtractor()



In [23]:
# Inspect the hidden size recorded in the loaded model's text-encoder config.
base_model.config.text_config.hidden_size

512

In [3]:
# Sample image fetched over HTTP and decoded with PIL.
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
# Bound the wait so a stalled connection cannot hang the kernel indefinitely.
response = requests.get(url, stream=True, timeout=30)
# Fail here on an HTTP error instead of handing an error page to PIL.
response.raise_for_status()
image = Image.open(response.raw)

In [6]:
# Run the feature extractor on the image, take the first item of the batch,
# and reorder its axes (0, 1, 2) -> (1, 2, 0).
# presumably channels-first -> channels-last for plotting; verify the extractor's layout
image_np = transform(image, return_tensors="np")['pixel_values'][0].transpose(1, 2, 0)

NameError: name 'transform' is not defined

In [19]:
# Tokenise two text prompts (padded to a common length) and preprocess the
# image in a single processor call, returning PyTorch tensors.
inputs = processor(
    text=["a photo o as aassf a cat", "asd asd ad sad sadad a"],
    images=image,
    padding=True,
    return_tensors="pt",
)

In [21]:
# Shape of the token-id tensor the processor produced for the text prompts.
inputs['input_ids'].shape

torch.Size([2, 10])

In [17]:
# List the entries the processor returned.
inputs.keys()

dict_keys(['input_ids', 'attention_mask', 'pixel_values'])

In [21]:
# Run only the stand-alone vision model on the preprocessed image.
vis_output = vision_model(pixel_values=inputs["pixel_values"])

In [22]:
# Fields available on the vision-only model's output.
vis_output.keys()

odict_keys(['last_hidden_state', 'pooler_output'])

In [25]:
# Shape of the `last_hidden_state` field of the vision-only output.
vis_output['last_hidden_state'].shape

torch.Size([1, 50, 768])

In [26]:
# Shape of the `pooler_output` field of the vision-only output.
vis_output['pooler_output'].shape

torch.Size([1, 768])

In [24]:
# Forward everything the processor produced (input_ids, attention_mask,
# pixel_values). The text batch was padded, so the attention mask must
# accompany input_ids rather than being left out.
outputs = base_model(**inputs)

In [25]:
# Fields returned by the full CLIP forward pass.
outputs.keys()

odict_keys(['logits_per_image', 'logits_per_text', 'text_embeds', 'image_embeds', 'text_model_output', 'vision_model_output'])

In [26]:
# Display the text embeddings (attribute-style access to the `text_embeds` field).
outputs.text_embeds

tensor([[-0.0085, -0.0064, -0.0111,  ..., -0.0306, -0.0460, -0.0039],
        [-0.0177,  0.0063,  0.0113,  ...,  0.0069, -0.0118,  0.0119]],
       grad_fn=<DivBackward0>)

In [None]:
# NOTE: this cell held the incomplete expression `base_model[]`, which is a
# SyntaxError and stops "Restart & Run All". It is disabled until the
# intended lookup is filled in.
# base_model[]

In [15]:
# Freeze the backbone: switch off gradient tracking on every parameter.
for param in base_model.parameters():
    param.requires_grad = False

In [25]:
# Display the text embeddings (key-style access to the `text_embeds` field).
outputs['text_embeds']

tensor([[ 0.0148,  0.0070, -0.0234,  ..., -0.0508, -0.0438,  0.0033],
        [ 0.0087,  0.0258, -0.0387,  ..., -0.0547, -0.0242,  0.0112]],
       grad_fn=<DivBackward0>)

In [12]:
# Display the image embeddings.
# NOTE(review): this prints the whole tensor; a slice or `.shape` would keep the output short.
outputs['image_embeds']

tensor([[-9.7877e-03,  1.2770e-02, -2.7419e-02,  1.9675e-03, -5.9326e-03,
         -1.5613e-02, -1.2514e-02, -2.2667e-04,  4.3869e-02, -1.6322e-02,
          2.2630e-02, -3.5160e-02,  4.4748e-03, -1.2946e-02, -3.1524e-02,
         -1.1737e-02, -2.1543e-02, -2.7556e-02,  1.6562e-02,  4.5935e-03,
         -1.2106e-01, -3.0035e-03,  3.9024e-02, -3.0893e-02, -4.3866e-03,
          2.7598e-02,  2.2140e-02, -1.7065e-02,  1.4509e-02, -4.5195e-03,
         -7.1843e-03,  2.3971e-02, -6.8107e-03,  1.6382e-02, -5.3629e-02,
         -4.5550e-04,  2.5840e-02, -2.6581e-02,  1.7667e-02,  3.0216e-02,
         -9.3064e-03, -3.2082e-02,  6.6351e-04, -1.3654e-02, -1.7603e-02,
          5.3115e-05,  4.8170e-02,  1.3997e-02, -8.4859e-03,  1.6292e-02,
          1.5116e-02,  2.3294e-02,  1.0750e-02, -4.9806e-03,  2.1177e-02,
          1.7230e-02,  2.3855e-02,  5.5848e-02, -2.3911e-02, -1.5538e-02,
          3.9247e-02, -1.2918e-02, -5.9023e-03,  3.2934e-02, -6.8747e-03,
         -2.5089e-02,  2.0757e-02,  1.

In [29]:
# Shape of the text embeddings.
outputs['text_embeds'].shape

torch.Size([2, 512])

In [28]:
# Shape of the image embeddings.
outputs['image_embeds'].shape

torch.Size([1, 512])

In [8]:
# Image-to-text similarity logits, normalised across the candidate texts
# (the last axis of the 2-D logits tensor).
logits_per_image = outputs["logits_per_image"]
probs = logits_per_image.softmax(dim=-1)

In [9]:
# Display the raw image-to-text logits.
logits_per_image

tensor([[24.5701, 19.3049]], grad_fn=<TBackward0>)

In [10]:
from transformers import (CLIPFeatureExtractor, 
                          CLIPTokenizer,
                          CLIPProcessor,
                          )

In [19]:
# NOTE(review): the shape below is evaluated but never shown. A cell displays
# only its last expression, and here that is the debug print, so the recorded
# output is just "hi". Drop the print to see the shape.
inputs['pixel_values'].shape
print('hi')

hi


In [13]:
# Echo the imported class to see which module it resolves to.
CLIPTokenizer

transformers.models.clip.tokenization_clip.CLIPTokenizer

In [1]:
# Load the "RN50x4" model through the `clip` package with JIT disabled.
# `clip.load` returns a pair, unpacked here as the model and `preprocess`.
# NOTE(review): this import sits mid-notebook and the cell carries execution
# count 1, so it was presumably run in a separate kernel session. Confirm
# whether it belongs in this notebook.
import clip
clip_model, preprocess = clip.load("RN50x4", jit=False)

100%|███████████████████████████████████████| 402M/402M [01:16<00:00, 5.53MiB/s]


In [14]:
import numpy as np
import torch
# Convert the PIL image to a NumPy array and wrap it in a torch tensor.
# NOTE(review): this rebinds `image_np`, which an earlier cell assigned from
# the output of `transform(...)`.
image_np = torch.tensor(np.array(image))

In [18]:
# Prepend a batch axis of size 1 and inspect the resulting shape.
image_np[None, ...].shape

torch.Size([1, 480, 640, 3])

In [21]:
# Move the channel axis to the front (H, W, C) -> (C, H, W) and add a batch axis.
# `permute` reorders the axes, so each pixel keeps its channel values.
# `view(1, 3, 480, 640)` would only reinterpret the flat buffer: the shape is
# the same but the image content is scrambled.
image_np.permute(2, 0, 1)[None].shape

torch.Size([1, 3, 480, 640])