In [2]:
import os
import numpy as np
import pandas as pd
from PIL import Image

from sklearn.metrics.pairwise import cosine_similarity
from transformers import CLIPProcessor, CLIPModel

In [3]:
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

In [17]:
image_files = ["assets/sem-viz_obj_cat/images/" + f for f in sorted(os.listdir("assets/sem-viz_obj_cat/images/"))]

In [22]:
def get_image_feats(image_files, processor, model):
    images = [Image.open(f) for f in image_files]
    inputs = processor(images=images, return_tensors="pt")
    feats = model.get_image_features(**inputs).detach().numpy()
    return feats

In [23]:
feats = get_image_feats(image_files, processor, model)

In [24]:
NUM_CATS = 8
NUM_IMGS = 9

feats_mean = np.array([np.mean(feats[c*NUM_IMGS:((c+1)*NUM_IMGS)-1], axis=0) for c in range(NUM_CATS)])

In [25]:
img_sims = cosine_similarity(feats_mean)

In [26]:
img_sims

array([[0.99999994, 0.72129065, 0.74828994, 0.7112463 , 0.6963773 ,
        0.73268056, 0.7028516 , 0.70743144],
       [0.72129065, 0.9999998 , 0.8280212 , 0.828451  , 0.7770638 ,
        0.81577206, 0.8138706 , 0.8407951 ],
       [0.74828994, 0.8280212 , 1.0000001 , 0.9278395 , 0.7848074 ,
        0.8789254 , 0.7716011 , 0.81575674],
       [0.7112463 , 0.828451  , 0.9278395 , 1.0000001 , 0.79224056,
        0.87045276, 0.8131415 , 0.848372  ],
       [0.6963773 , 0.7770638 , 0.7848074 , 0.79224056, 0.9999996 ,
        0.8426869 , 0.84862626, 0.81467444],
       [0.73268056, 0.81577206, 0.8789254 , 0.87045276, 0.8426869 ,
        1.0000006 , 0.82269806, 0.87957287],
       [0.7028516 , 0.8138706 , 0.7716011 , 0.8131415 , 0.84862626,
        0.82269806, 0.9999995 , 0.89862996],
       [0.70743144, 0.8407951 , 0.81575674, 0.848372  , 0.81467444,
        0.87957287, 0.89862996, 0.9999989 ]], dtype=float32)