In [1]:
from datasets import load_dataset
from huggingface_hub import hf_hub_download
import json

# Load the precomputed BLIP dense embeddings (image and text tables) from the
# Hugging Face Hub; `with_format("numpy")` makes lookups return NumPy values.
dense_embs = load_dataset(
    "lsr42/mscoco-blip-dense",
    data_files={"img_emb": "img_embs.parquet", "text_emb": "text_embs.parquet"},
    keep_in_memory=True,
).with_format("numpy")

# Fetch the metadata JSON from the same dataset repo and parse it.
# The context manager closes the file handle deterministically instead of
# leaving an open handle behind for the garbage collector.
meta_data_file = hf_hub_download(
    repo_id="lsr42/mscoco-blip-dense", repo_type="dataset", filename="dataset_meta.json")
with open(meta_data_file) as f:
    meta_data = json.load(f)

In [20]:
# Names of the tables held in the loaded dataset dict
dense_embs.keys()

dict_keys(['img_emb', 'text_emb'])

In [2]:
# Shape of the image-embedding column: (number of rows, embedding width)
dense_embs['img_emb']['emb'].shape

(123287, 256)

In [3]:
# Shape of the text-embedding column: (number of rows, embedding width)
dense_embs['text_emb']['emb'].shape

(616767, 256)

In [4]:
# Summary of the image-embedding table: its columns and row count
dense_embs['img_emb']

Dataset({
    features: ['id', 'emb'],
    num_rows: 123287
})

In [5]:
# Top-level keys of the metadata JSON
meta_data.keys()

dict_keys(['images', 'dataset'])

In [6]:
# Value stored under the metadata's 'dataset' key
meta_data['dataset']


'coco'

In [7]:
# Field names of the first image record
meta_data['images'][0].keys()

dict_keys(['filepath', 'sentids', 'filename', 'imgid', 'split', 'sentences', 'cocoid'])

In [40]:
# Full metadata record of the image at list index 1
meta_data['images'][1]

{'filepath': 'val2014',
 'sentids': [681330, 686718, 688839, 693159, 693204],
 'filename': 'COCO_val2014_000000522418.jpg',
 'imgid': 1,
 'split': 'restval',
 'sentences': [{'tokens': ['a',
    'woman',
    'wearing',
    'a',
    'net',
    'on',
    'her',
    'head',
    'cutting',
    'a',
    'cake'],
   'raw': 'A woman wearing a net on her head cutting a cake. ',
   'imgid': 1,
   'sentid': 681330},
  {'tokens': ['a', 'woman', 'cutting', 'a', 'large', 'white', 'sheet', 'cake'],
   'raw': 'A woman cutting a large white sheet cake.',
   'imgid': 1,
   'sentid': 686718},
  {'tokens': ['a',
    'woman',
    'wearing',
    'a',
    'hair',
    'net',
    'cutting',
    'a',
    'large',
    'sheet',
    'cake'],
   'raw': 'A woman wearing a hair net cutting a large sheet cake.',
   'imgid': 1,
   'sentid': 688839},
  {'tokens': ['there',
    'is',
    'a',
    'woman',
    'that',
    'is',
    'cutting',
    'a',
    'white',
    'cake'],
   'raw': 'there is a woman that is cutting a

In [None]:
# Index the text embeddings by their id (stringified) so later cells can fetch
# an embedding by id without scanning the whole table.
text_emb_lookup = dict(
    (str(row['id']), row['emb'])
    for row in dense_embs['text_emb']
)




Image embedding shape: (256,)
Number of text embeddings found: 5
Text embedding shapes: [(256,), (256,), (256,), (256,), (256,)]


In [48]:
# Use the first image record in the metadata as a worked example
first_image_meta = meta_data['images'][0]

# Image side: the record's `imgid` is used as a positional row index into the
# image-embedding table.
# NOTE(review): this assumes row position == imgid; the table also has an `id`
# column that is not consulted here — confirm the two line up.
imgid = first_image_meta['imgid']
example_img_embed = dense_embs['img_emb'][imgid]['emb']

# Text side: collect the embedding for every sentence id listed on the record;
# ids that are absent from the lookup are skipped silently.
sentids = first_image_meta['sentids']
text_embeddings = []
for sentid in sentids:
    key = str(sentid)
    if key in text_emb_lookup:
        text_embeddings.append(text_emb_lookup[key])

# Report what was retrieved
print(f"Image embedding shape: {example_img_embed.shape}")
print(f"Number of text embeddings found: {len(text_embeddings)}")
print(f"Text embedding shapes: {[vec.shape for vec in text_embeddings]}")

Image embedding shape: (256,)
Number of text embeddings found: 5
Text embedding shapes: [(256,), (256,), (256,), (256,), (256,)]


In [49]:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import torch


def _cosine_similarity_1d(vec_a, vec_b):
    """Return the cosine similarity of two 1-D vectors.

    The dot product is divided by the product of the L2 norms, so the result
    is a true cosine whether or not the inputs are already unit-length (a bare
    dot product only equals the cosine for unit-norm vectors).

    Args:
        vec_a: 1-D NumPy array.
        vec_b: 1-D NumPy array with the same length as ``vec_a``.

    Returns:
        The cosine similarity as a scalar; 0.0 when either vector has zero
        norm, where the cosine is undefined.
    """
    norm_product = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    if norm_product == 0:
        return 0.0
    return np.matmul(vec_a, vec_b) / norm_product


# Compare the example image embedding with each of its caption embeddings.
for i, text_embedding in enumerate(text_embeddings):
    pairwise_cosine_sim = _cosine_similarity_1d(example_img_embed, text_embedding)
    print(f"Text embedding {i} cosine similarity: {pairwise_cosine_sim}")





Text embedding 0 cosine similarity: 0.43064552545547485
Text embedding 1 cosine similarity: 0.42722970247268677
Text embedding 2 cosine similarity: 0.33973366022109985
Text embedding 3 cosine similarity: 0.4240642786026001
Text embedding 4 cosine similarity: 0.4385703504085541


In [None]:
import os
from PIL import Image
import matplotlib.pyplot as plt
from glob import glob
import json

# Root folder that holds the MSCOCO images on disk
path_to_dataset_folder = '/Users/doruktarhan/Desktop/MSCOCO_Dataset' #dataset images folder path
# Recursively collect every .jpg under the root and index the paths by bare file name
image_files = glob(f'{path_to_dataset_folder}/**/*.jpg', recursive=True)
all_image_paths = dict(zip(map(os.path.basename, image_files), image_files))

# Read the caption/split metadata JSON
meta_data_path = 'data/dataset_coco.json' #metadata file path
with open(meta_data_path, 'r') as f:
    meta_data_kaggle = json.load(f)

# Walk the first five metadata records: print their details and show the image
for idx, metadata_item in enumerate(meta_data_kaggle['images'][:5]):
    filename = metadata_item["filename"]
    captions = [sent["raw"] for sent in metadata_item["sentences"]]

    # Resolve the file name to a path on disk; report and skip records
    # that have no matching local file
    image_path = all_image_paths.get(filename)
    if not image_path:
        print(f"Image not found: {filename}")
        continue

    image = Image.open(image_path)

    # Text summary of the record
    print(
        f"Metadata for instance {idx + 1}:",
        f"Filename: {filename}",
        f"Captions: {captions}",
        sep="\n",
    )

    # Render the picture with its first caption as the title
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.imshow(image)
    ax.axis('off')
    ax.set_title(f"Captions: {captions[0]}")
    plt.show()


In [None]:
# Field names of the first image record in the locally loaded metadata
meta_data_kaggle['images'][0].keys()

In [None]:
import pandas as pd

# Build a DataFrame from the list stored under the metadata's 'images' key
metadata_images = meta_data_kaggle['images']
df = pd.DataFrame(metadata_images)

# Preview the leading rows
print(df.head())

# Tally how many records fall into each value of the 'split' column
split_counts = df['split'].value_counts()

print("\nUnique Splits:", split_counts, sep="\n")


### BLIP Encoder Representations

In [None]:
from transformers import BlipProcessor, BlipForImageTextRetrieval
from PIL import Image
import torch
import os
import json


# Pick the GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the BLIP checkpoint together with its matching processor and place
# the model on the chosen device
model_name = "Salesforce/blip-itm-large-coco" #model card for image text matching
processor = BlipProcessor.from_pretrained(model_name)
model = BlipForImageTextRetrieval.from_pretrained(model_name).to(device)

# Read the caption/split metadata JSON
meta_data_path = 'data/dataset_coco.json'
with open(meta_data_path, 'r') as f:
    meta_data_kaggle = json.load(f)

# Root folder that holds the MSCOCO images on disk
path_to_dataset_folder = '/Users/doruktarhan/Desktop/MSCOCO_Dataset' #dataset images folder path




In [None]:
from PIL import Image
import torch

# Define paths
# NOTE: `glob`, `os`, `json` and `plt` are not imported in this cell; it relies on
# imports made by earlier cells.
path_to_dataset_folder = '/Users/doruktarhan/Desktop/MSCOCO_Dataset' #dataset images folder path
# Use glob to find all image files across subdirectories
image_files = glob(f'{path_to_dataset_folder}/**/*.jpg', recursive=True)
# Map each bare file name to its full path (a later duplicate name overwrites an earlier one)
all_image_paths = {os.path.basename(file): file for file in image_files}




# Load metadata
meta_data_path = 'data/dataset_coco.json' #metadata file path
with open(meta_data_path, 'r') as f:
    meta_data_kaggle = json.load(f)

# Display the image and metadata for the first instance only (the slice is [:1]).
# The loop variables `image` and `captions` stay defined after the loop and are
# read by the BLIP inference cells further down.
for idx, metadata_item in enumerate(meta_data_kaggle['images'][:1]):
    filename = metadata_item["filename"]  # e.g., 'COCO_val2014_000000522418.jpg'
    captions = [sentence["raw"] for sentence in metadata_item["sentences"]]  # Raw caption strings
    
    # Use the glob-based lookup dictionary to find the image path
    image_path = all_image_paths.get(filename)
    if not image_path:
        # No local file with this name: report it and move on
        print(f"Image not found: {filename}")
        continue

    # Load the image
    image = Image.open(image_path)

    # Display the metadata and image
    print(f"Metadata for instance {idx + 1}:")
    print(f"Filename: {filename}")
    print(f"Captions: {captions}")

    # Plot the image, titled with its first caption only
    plt.figure(figsize=(8, 6))
    plt.imshow(image)
    plt.axis('off')
    plt.title(f"Captions: {captions[0]}")
    plt.show()



In [None]:
from torch.nn.functional import softmax

# Score every caption of the current image with the BLIP image-text matching model.
# `image` and `captions` are the values left behind by the image-display loop.
for caption in captions:
    # Move the processed inputs to the same device as the model; without this
    # the forward pass fails with a device mismatch whenever the model is on CUDA.
    inputs = processor(images=image, text=caption, return_tensors="pt").to(device)

    # Perform inference; no gradients are needed, so skip building the autograd graph
    with torch.no_grad():
        outputs = model(**inputs)
    print(f'Caption: {caption}')
    print(outputs['itm_score'])
    # Turn the raw ITM scores into probabilities along the class dimension
    probabilities = softmax(outputs['itm_score'], dim=1)
    print(probabilities)
    print(outputs['question_embeds'].shape)
    print("\n")



In [None]:
# Fields available on the model output from the most recent forward pass
outputs.keys()

In [None]:
# Score a hand-written caption against the current image
caption = 'Man riding on a motorcycle with blue helmet'
# Move the processed inputs to the same device as the model; without this the
# forward pass fails with a device mismatch whenever the model is on CUDA.
inputs = processor(images=image, text=caption, return_tensors="pt").to(device)

# Perform inference; no gradients are needed, so skip building the autograd graph
with torch.no_grad():
    outputs = model(**inputs)
print(f'Caption: {caption}')
print(f"Output tensor itm matching score: {outputs['itm_score']}")
# Turn the raw ITM scores into probabilities along the class dimension
probabilities = softmax(outputs['itm_score'], dim=1)
print(f"Probabilities after softmax: {probabilities}")

print(f"Question embedding shape: {outputs['question_embeds'].shape}")
print(f"Image embedding shape: {outputs['last_hidden_state'].shape}")

print("\n")

In [None]:
from torch.nn.functional import normalize

# Project the first-position token of the text and image hidden states through
# the model's projection heads and L2-normalize the results. This is inference
# only, so run it without building an autograd graph.
with torch.no_grad():
    proj_text_embedding = normalize(model.text_proj(outputs.question_embeds[:,0,:]))
    proj_image_embedding = normalize(model.vision_proj(outputs.last_hidden_state[:,0,:]))


print(f"Image projection shape: {proj_image_embedding.shape}")
print(f"Text projection shape: {proj_text_embedding.shape}")

# Compute pairwise cosine similarity (matrix); both operands are unit-norm,
# so the matrix product of the rows is their cosine similarity
pairwise_cosine_sim = torch.matmul(proj_image_embedding, proj_text_embedding.T)

print(f"Pairwise cosine similarity:\n{pairwise_cosine_sim}")

In [3]:
import torch

# Load the saved embeddings.
# `weights_only=True` restricts unpickling to tensors and plain containers, which
# avoids executing arbitrary pickled code and silences the FutureWarning that
# `torch.load` emits when the argument is omitted.
# NOTE(review): assumes the file holds only tensors in a plain dict — confirm.
file_path = "test_embeddings/blip_test/test_batch.pt"
embeddings = torch.load(file_path, weights_only=True)

# Extract the embeddings
image_embeds = embeddings["image_embeds"]  # (batch_size, embedding_dim)
text_embeds = embeddings["text_embeds"]    # (batch_size, embedding_dim)

# L2-normalize each row so that a dot product equals the cosine similarity
image_embeds = torch.nn.functional.normalize(image_embeds, p=2, dim=1)  # Normalize along embedding dimension
text_embeds = torch.nn.functional.normalize(text_embeds, p=2, dim=1)

# Row-wise dot product: one cosine similarity per (image i, text i) pair
cosine_similarities = torch.sum(image_embeds * text_embeds, dim=1)

# Print the results
print("Cosine similarities for each image-text pair:")
for i, sim in enumerate(cosine_similarities):
    print(f"Pair {i+1}: {sim.item()}")


Cosine similarities for each image-text pair:
Pair 1: 0.4717683494091034
Pair 2: 0.4680973291397095
Pair 3: 0.4207531213760376
Pair 4: 0.5063284635543823
Pair 5: 0.4703652560710907
Pair 6: 0.4047545790672302
Pair 7: 0.46468424797058105
Pair 8: 0.46485379338264465
Pair 9: 0.4473761022090912
Pair 10: 0.5034165978431702
Pair 11: 0.42352989315986633
Pair 12: 0.5207198262214661
Pair 13: 0.4432687759399414
Pair 14: 0.44502633810043335
Pair 15: 0.5233748555183411
Pair 16: 0.44414687156677246


  embeddings = torch.load(file_path)


In [4]:
import pandas as pd
import numpy as np
import torch

# Locations of the image and text embedding tables
img_embs_path = "blip_trial_embeds/img_embs.parquet"
text_embs_path = "blip_trial_embeds/text_embs.parquet"

# Read both tables into DataFrames
img_embs_df = pd.read_parquet(img_embs_path)
text_embs_df = pd.read_parquet(text_embs_path)

# Row i of one table is compared with row i of the other, so the row counts must agree
assert len(img_embs_df) == len(text_embs_df), "Mismatch in image and text embeddings!"

# Stack the per-row 'embedding' vectors into 2-D arrays and wrap them as tensors
img_embs = torch.tensor(np.vstack(img_embs_df['embedding'].values))
text_embs = torch.tensor(np.vstack(text_embs_df['embedding'].values))

# L2-normalize each row so that dot products are cosine similarities
img_embs_normalized = torch.nn.functional.normalize(img_embs, dim=1)
text_embs_normalized = torch.nn.functional.normalize(text_embs, dim=1)

# All-pairs similarity matrix: entry (i, j) compares image i with text j
cosine_similarities = img_embs_normalized @ text_embs_normalized.T

# The diagonal holds the matched (image i, text i) similarities
one_to_one_cosine_similarities = torch.diag(cosine_similarities)

# Report each matched pair
for idx, sim in enumerate(one_to_one_cosine_similarities):
    print(f"Image {idx} and Text {idx} Cosine Similarity: {sim.item()}")


Image 0 and Text 0 Cosine Similarity: 0.4717683795442903
Image 1 and Text 1 Cosine Similarity: 0.4680973760820966
Image 2 and Text 2 Cosine Similarity: 0.42075334066719833
Image 3 and Text 3 Cosine Similarity: 0.5063282873348682
Image 4 and Text 4 Cosine Similarity: 0.47036502239183337
Image 5 and Text 5 Cosine Similarity: 0.4047548874558168
Image 6 and Text 6 Cosine Similarity: 0.46468458497358284
Image 7 and Text 7 Cosine Similarity: 0.4648540968885047
Image 8 and Text 8 Cosine Similarity: 0.44737641313792575
Image 9 and Text 9 Cosine Similarity: 0.5034165912001807
Image 10 and Text 10 Cosine Similarity: 0.42353017969376117
Image 11 and Text 11 Cosine Similarity: 0.5207200707088676
Image 12 and Text 12 Cosine Similarity: 0.44326884858981475
Image 13 and Text 13 Cosine Similarity: 0.4450265627638724
Image 14 and Text 14 Cosine Similarity: 0.5233751381856252
Image 15 and Text 15 Cosine Similarity: 0.44414677803119235
Image 16 and Text 16 Cosine Similarity: 0.4793473543447772
Image 17 a