In [1]:
# Load model directly
from transformers import AutoProcessor, BlipForImageTextRetrieval
from PIL import Image
import torch
import os
import json


# Load the model and processor
#model_name = "HuggingFaceTB/SmolVLM-Base" #model card for image text matching
model_name = "Salesforce/blip-itm-base-coco" #model card for image text matching


model = BlipForImageTextRetrieval.from_pretrained(model_name)
model.config.with_projection = True #get the projections for the image and text
processor = AutoProcessor.from_pretrained(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)



BlipForImageTextRetrieval(
  (vision_model): BlipVisionModel(
    (embeddings): BlipVisionEmbeddings(
      (patch_embedding): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    )
    (encoder): BlipEncoder(
      (layers): ModuleList(
        (0-11): 12 x BlipEncoderLayer(
          (self_attn): BlipAttention(
            (dropout): Dropout(p=0.0, inplace=False)
            (qkv): Linear(in_features=768, out_features=2304, bias=True)
            (projection): Linear(in_features=768, out_features=768, bias=True)
          )
          (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): BlipMLP(
            (activation_fn): GELUActivation()
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=True)
          )
          (layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
      )
    )
    (post_layernorm): LayerNorm((768,), eps=1e-05, 

In [2]:
print(model.forward.__doc__)


   The [`BlipForImageTextRetrieval`] forward method, overrides the `__call__` special method.

    <Tip>

    Although the recipe for forward pass needs to be defined within this function, one should call the [`Module`]
    instance afterwards instead of this since the former takes care of running the pre and post processing steps while
    the latter silently ignores them.

    </Tip>

    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
            [`BlipImageProcessor`]. See [`BlipImageProcessor.__call__`] for details.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidde

In [3]:
model.config.vision_config.with_projection = True
model.config.text_config.with_projection = True


In [4]:
model.config

BlipConfig {
  "_attn_implementation_autoset": true,
  "_name_or_path": "Salesforce/blip-itm-base-coco",
  "architectures": [
    "BlipForImageTextRetrieval"
  ],
  "image_text_hidden_size": 256,
  "initializer_factor": 1.0,
  "initializer_range": 0.02,
  "label_smoothing": 0.0,
  "logit_scale_init_value": 2.6592,
  "model_type": "blip",
  "projection_dim": 512,
  "text_config": {
    "_attn_implementation_autoset": true,
    "initializer_factor": 1.0,
    "model_type": "blip_text_model",
    "num_attention_heads": 12,
    "with_projection": true
  },
  "torch_dtype": "float32",
  "transformers_version": "4.46.2",
  "vision_config": {
    "_attn_implementation_autoset": true,
    "dropout": 0.0,
    "initializer_factor": 1.0,
    "initializer_range": 0.02,
    "model_type": "blip_vision_model",
    "num_channels": 3,
    "with_projection": true
  },
  "with_projection": true
}

In [5]:
#create batches
#load metadata
meta_data_path = 'data/dataset_coco.json'
with open(meta_data_path, 'r') as f:
    meta_data = json.load(f)


#image path 
path_to_dataset_folder = '/Users/doruktarhan/Desktop/MSCOCO_Dataset' #dataset images folder path

meta_data['images'][0].keys()



dict_keys(['filepath', 'sentids', 'filename', 'imgid', 'split', 'sentences', 'cocoid'])

In [6]:
import matplotlib.pyplot as plt
from tqdm import tqdm
image_caption_pairs = []

for image_data in tqdm(meta_data['images'], desc="Processing images and captions"):
    #save the image id
    image_id = image_data['imgid']

    #save the image path
    image_folder = image_data['filepath']
    image_name = image_data['filename']
    image_path = os.path.join(path_to_dataset_folder, image_folder)
    image_path = os.path.join(image_path, image_name)

    #get the image
    try:
        image = Image.open(image_path)
    except FileNotFoundError:
        print(f"Image not found: {image_path}")
        continue
    except Exception as e:
        print(f"Error loading image {image_path}: {e}")
        continue
    
    for sentence_meta_data in image_data['sentences']:
        #get the sentence_id
        sentence_id = sentence_meta_data['sentid']
        #get the sentence
        caption = sentence_meta_data['raw']
        image_caption_pairs.append((image_id, sentence_id, image_path, caption))
        

    
print(f"Total image-caption pairs: {len(image_caption_pairs)}")
    

Processing images and captions: 100%|██████████| 123287/123287 [00:25<00:00, 4890.38it/s]

Total image-caption pairs: 616767





In [7]:
from torch.nn.functional import normalize


imgid,sentid, image_path, caption =  image_caption_pairs[0]
image = Image.open(image_path).convert("RGB")
inputs = processor(images=image, text=caption, return_tensors="pt").to(device)

inputs.keys()






dict_keys(['pixel_values', 'input_ids', 'attention_mask'])

In [8]:
inputs['pixel_values'].shape, inputs['input_ids'].shape, inputs['attention_mask'].shape

(torch.Size([1, 3, 384, 384]), torch.Size([1, 18]), torch.Size([1, 18]))

In [9]:
inputs['attention_mask']

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])

In [31]:
# Forward pass through the model
with torch.no_grad():
    outputs = model(**inputs, use_itm_head=False,return_dict=True)
    proj_text_embedding = normalize(model.text_proj(outputs.question_embeds[:,0,:]))
    proj_image_embedding = normalize(model.vision_proj(outputs.last_hidden_state[:,0,:]))

print(f"Image projection shape without projection head: {outputs.last_hidden_state.shape}")
print(f"Text projection shape without projection head: {outputs.question_embeds.shape}")

print(f"Image projection shape: {proj_image_embedding.shape}")
print(f"Text projection shape: {proj_text_embedding.shape}")

Image projection shape without projection head: torch.Size([1, 577, 768])
Text projection shape without projection head: torch.Size([1, 18, 768])
Image projection shape: torch.Size([1, 256])
Text projection shape: torch.Size([1, 256])


In [32]:
print(caption)

A man with a red helmet on a small moped on a dirt road. 


In [33]:
# Compute pairwise cosine similarity (matrix)
pairwise_cosine_sim = torch.matmul(proj_image_embedding, proj_text_embedding.T)

print(f"Pairwise cosine similarity:\n{pairwise_cosine_sim}")

Pairwise cosine similarity:
tensor([[0.4718]])


In [34]:
pairwise_cosine_sim = torch.matmul(outputs.question_embeds[:,0,:], outputs.last_hidden_state[:,0,:].T)

print(f"Pairwise cosine similarity:\n{pairwise_cosine_sim}")

Pairwise cosine similarity:
tensor([[-0.0889]])


In [35]:
import torch
print(torch.backends.mps.is_available())  # Should print True


True
