In [1]:
from io import BytesIO
from pathlib import Path

import clip
import numpy as np
import pandas as pd
import requests
import torch
from PIL import Image
from sklearn.metrics.pairwise import cosine_similarity

# embeddings: https://deploy.laion.ai/8f83b608504d46bb81708ec86e912220/embeddings/img_emb/img_emb_0.npy
# parquet: https://deploy.laion.ai/8f83b608504d46bb81708ec86e912220/embeddings/metadata/metadata_0.parquet


In [2]:
# !aria2c https://deploy.laion.ai/8f83b608504d46bb81708ec86e912220/embeddings/img_emb/img_emb_0.npy
# !aria2c https://deploy.laion.ai/8f83b608504d46bb81708ec86e912220/embeddings/metadata/metadata_0.parquet
# !aria2c https://deploy.laion.ai/8f83b608504d46bb81708ec86e912220/embeddings/text_emb/text_emb_0.npy

In [3]:
# Read the image and text embedding arrays stored as .npy files and report
# their shapes so the two can be compared at a glance.
img_emb, text_emb = (
    np.load(emb_file)
    for emb_file in (
        'multimodal_embeddings/img_emb/img_emb_0.npy',
        'multimodal_embeddings/text_emb/text_emb_0.npy',
    )
)
print(img_emb.shape, text_emb.shape)

(1000448, 512) (1000448, 512)


In [4]:
# Element-wise combination for multimodal embeddings:
# w1 scales the image embeddings, w2 scales the text embeddings.
w1, w2 = 0.5, 0.5

def normalized(a, axis=-1, order=2):
    """Scale `a` so that its vectors along `axis` have unit norm.

    Args:
        a: Array to normalise.
        axis: Axis along which the norm is computed (default: last axis).
        order: Norm order forwarded to `np.linalg.norm` (default: 2).

    Returns:
        `a` divided by its norms along `axis`. Vectors whose norm is 0 are
        divided by 1 instead, so they are returned unchanged rather than
        producing NaN/inf. Note that a 1-D input comes back with shape
        `(1, n)` because the norm is promoted to at least 1-D before it is
        broadcast against `a`.
    """
    norms = np.atleast_1d(np.linalg.norm(a, ord=order, axis=axis))
    # Replace zero norms by 1 to avoid a division by zero below.
    zero_mask = norms == 0
    norms[zero_mask] = 1
    divisor = np.expand_dims(norms, axis)
    return a / divisor

# Weighted element-wise sum of the image and text embeddings, rescaled by
# `normalized` so that each combined row has unit L2 norm.
multimodal_emb = normalized(np.add(w1 * img_emb, w2 * text_emb))

In [5]:
# Store the combined embeddings in their own folder; the autofaiss
# `--embeddings` option in the index-building cell points at this directory.
combined_emb_dir = Path('multimodal_embeddings/combined_emb')
# np.save does not create missing parent directories, so create the folder
# first to keep a fresh run from failing here.
combined_emb_dir.mkdir(parents=True, exist_ok=True)
np.save(combined_emb_dir / 'combined_emb.npy', multimodal_emb)

In [7]:
## Build sample Index
!autofaiss build_index --embeddings="multimodal_embeddings/combined_emb" \
                    --index_path="combined.index" \
                    # --index_infos_path="infos.json" \
                    --metric_type="ip" \
                    --max_index_query_time_ms=5 
                    # \
                    # --max_index_memory_usage="1GB"

2024-07-08 20:34:56,113 [INFO]: Using 12 omp threads (processes), consider increasing --nb_cores if you have more
2024-07-08 20:34:56,113 [INFO]: Launching the whole pipeline 07/08/2024, 20:34:56
2024-07-08 20:34:56,113 [INFO]: Reading total number of vectors and dimension 07/08/2024, 20:34:56
100%|███████████████████████████████████████████| 1/1 [00:00<00:00, 6921.29it/s]
2024-07-08 20:34:56,132 [INFO]: There are 1000448 embeddings of dim 512
2024-07-08 20:34:56,132 [INFO]: >>> Finished "Reading total number of vectors and dimension" in 0.0191 secs
2024-07-08 20:34:56,132 [INFO]: 	Compute estimated construction time of the index 07/08/2024, 20:34:56
2024-07-08 20:34:56,132 [INFO]: 		-> Train: 16.7 minutes
2024-07-08 20:34:56,132 [INFO]: 		-> Add: 5.7 seconds
2024-07-08 20:34:56,133 [INFO]: 		Total: 16.8 minutes
2024-07-08 20:34:56,133 [INFO]: 	>>> Finished "Compute estimated construction time of the index" in 0.0002 secs
2024-07-08 20:34:56,133 [INFO]: 	Checking that your have enough 

In [6]:
import faiss
import numpy as np

# Load the three faiss indices queried in the cells below.
# TODO: check this for text and compare text-only results against the
# combined image+text index.
# NOTE(review): only "combined.index" is built in the cells shown in this
# notebook; "img.index" and "text.index" must already exist on disk.
img_ind, text_ind, combined_ind = (
    faiss.read_index(index_file)
    for index_file in ("img.index", "text.index", "combined.index")
)

In [None]:
from pathlib import Path
import pandas as pd

# Folder that holds the metadata parquet file(s).
data_dir = Path("embeddings")

# Path.glob yields files in arbitrary order. The query cells look rows up by
# the ids returned from faiss (caption_list[i], url_list[i]), so the row order
# must be deterministic. Sorting by file name makes it so.
# NOTE(review): assumes file-name order matches the order in which the
# embeddings were indexed - confirm when more than one parquet file is used.
parquet_files = sorted(data_dir.glob('*.parquet'))
if not parquet_files:
    # Fail with a clear message instead of pandas' "No objects to concatenate".
    raise FileNotFoundError(
        "No .parquet metadata files found in {}".format(data_dir.resolve())
    )

df = pd.concat(pd.read_parquet(parquet_file) for parquet_file in parquet_files)
print(df.head(2))

# Plain Python lists, indexed by position, used to display search results.
image_list = df["image_path"].tolist()
caption_list = df["caption"].tolist()
url_list = df["url"].tolist()

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# CLIP ViT-B/32 model plus the image preprocessing transform that goes with it.
model, preprocess = clip.load("ViT-B/32", device=device)

Text to Text query

In [None]:
# Text -> text search: encode the query with CLIP and look it up in the
# text-embedding index.
text = "Black kitten walking"
text_tokens = clip.tokenize([text], truncate=True)

# Inference only: no_grad avoids building an autograd graph for the encoder.
with torch.no_grad():
    text_features = model.encode_text(text_tokens.to(device))
    # Unit-normalise so the inner product returned by the index is a cosine similarity.
    text_features /= text_features.norm(dim=-1, keepdim=True)
# faiss works on float32 numpy arrays.
text_embeddings = text_features.cpu().numpy().astype('float32')

print("="*10)
print("text query = {}".format(text))

D, I = text_ind.search(text_embeddings, 5)
print("results :")
for d, i in zip(D[0], I[0]):
  if i < 0:
    # faiss pads the result with id -1 when it finds fewer than k neighbours;
    # indexing the lists with -1 would wrongly show the last row.
    continue
  print("Similarity= ", d)
  print("Index ={}".format(i))
  print("Caption ={}".format(caption_list[i]))
  print("Image url = {}".format(url_list[i]))
  print(" ")

Text to Image query

In [None]:
# Text -> image search: encode the query text with CLIP and look it up in the
# image-embedding index.
text = "Black kitten walking"
text_tokens = clip.tokenize([text], truncate=True)

# Inference only: no_grad avoids building an autograd graph for the encoder.
with torch.no_grad():
    text_features = model.encode_text(text_tokens.to(device))
    # Unit-normalise so the inner product returned by the index is a cosine similarity.
    text_features /= text_features.norm(dim=-1, keepdim=True)
# faiss works on float32 numpy arrays.
text_embeddings = text_features.cpu().numpy().astype('float32')

print("="*10)
print("text query = {}".format(text))

D, I = img_ind.search(text_embeddings, 5)
print("results :")
for d, i in zip(D[0], I[0]):
  if i < 0:
    # faiss pads the result with id -1 when it finds fewer than k neighbours;
    # indexing the lists with -1 would wrongly show the last row.
    continue
  print("Similarity= ", d)
  print("Index ={}".format(i))
  print("Caption ={}".format(caption_list[i]))
  print("Image url = {}".format(url_list[i]))
  print(" ")

Image to Image query

In [None]:
# Image -> image search: encode the query image with CLIP and look it up in
# the image-embedding index. `Image` is imported in the first cell.
image = Image.open("newcat.jpg")
image_tensor = preprocess(image)

# Inference only: no_grad avoids building an autograd graph for the encoder.
with torch.no_grad():
    # unsqueeze adds the batch dimension expected by encode_image.
    image_features = model.encode_image(torch.unsqueeze(image_tensor.to(device), dim=0))
    # Unit-normalise so the inner product returned by the index is a cosine similarity.
    image_features /= image_features.norm(dim=-1, keepdim=True)

# faiss works on float32 numpy arrays.
image_embeddings = image_features.cpu().numpy().astype('float32')

print("query :")
display(image)

D, I = img_ind.search(image_embeddings, 5)
print("results :")
for d, i in zip(D[0], I[0]):
  if i < 0:
    # faiss pads the result with id -1 when it finds fewer than k neighbours;
    # indexing the lists with -1 would wrongly show the last row.
    continue
  print("Similarity= ", d)
  print("Index ={}".format(i))
  print("Caption ={}".format(caption_list[i]) )
  print("Image url = {}".format(url_list[i]))
  print(" ")

Image to Text query

In [None]:
# Image -> text search: encode the query image with CLIP and look it up in
# the text-embedding index. `Image` is imported in the first cell.
image = Image.open("newcat.jpg")
image_tensor = preprocess(image)

# Inference only: no_grad avoids building an autograd graph for the encoder.
with torch.no_grad():
    # unsqueeze adds the batch dimension expected by encode_image.
    image_features = model.encode_image(torch.unsqueeze(image_tensor.to(device), dim=0))
    # Unit-normalise so the inner product returned by the index is a cosine similarity.
    image_features /= image_features.norm(dim=-1, keepdim=True)

# faiss works on float32 numpy arrays.
image_embeddings = image_features.cpu().numpy().astype('float32')

print("query :")
display(image)

D, I = text_ind.search(image_embeddings, 5)
print("results :")
for d, i in zip(D[0], I[0]):
  if i < 0:
    # faiss pads the result with id -1 when it finds fewer than k neighbours;
    # indexing the lists with -1 would wrongly show the last row.
    continue
  print("Similarity= ", d)
  print("Index ={}".format(i))
  print("Caption ={}".format(caption_list[i]) )
  print("Image url = {}".format(url_list[i]))
  print(" ")

Multimodal Image-Text query

In [None]:
# Multimodal search: combine an image query and a text query into a single
# vector and look it up in the combined (image + text) index.
# `Image`, `model`, `preprocess` and `device` come from the import cell and
# the CLIP-loading cell earlier in the notebook, so the model is not loaded a
# second time here.
image = Image.open("newcat.jpg")
image_tensor = preprocess(image)

# Relative weight of each modality in the combined query.
image_weight = 0.3
text_weight = 0.7

# Inference only: no_grad avoids building an autograd graph for the encoder.
with torch.no_grad():
    # unsqueeze adds the batch dimension expected by encode_image.
    image_features = model.encode_image(torch.unsqueeze(image_tensor.to(device), dim=0))
    image_features /= image_features.norm(dim=-1, keepdim=True)

# faiss works on float32 numpy arrays.
image_embeddings = image_features.cpu().numpy().astype('float32')

print("="*10)
print("query :")
display(image)

text = "black cat with yellow eyes"
text_tokens = clip.tokenize([text], truncate=True)

with torch.no_grad():
    text_features = model.encode_text(text_tokens.to(device))
    text_features /= text_features.norm(dim=-1, keepdim=True)
text_embeddings = text_features.cpu().numpy().astype('float32')

print("="*10)
print("text query = {}".format(text))

combined_embeddings = image_weight * image_embeddings + text_weight * text_embeddings
# The vectors stored in combined.index were rescaled to unit length after the
# weighted sum. Do the same for the query so the inner product reported as
# "Similarity" is a cosine similarity (the ranking itself is unaffected).
combined_embeddings /= np.linalg.norm(combined_embeddings, axis=-1, keepdims=True)

D, I = combined_ind.search(combined_embeddings, 5)
print("results :")
for d, i in zip(D[0], I[0]):
  if i < 0:
    # faiss pads the result with id -1 when it finds fewer than k neighbours;
    # indexing the lists with -1 would wrongly show the last row.
    continue
  print("Similarity= ", d)
  print("Index ={}".format(i))
  print("Caption ={}".format(caption_list[i]) )
  print("Image url = {}".format(url_list[i]))
  print(" ")