In [16]:
import os
import glob
import numpy as np
import pandas as pd
# from PIL import Image

from sklearn.metrics.pairwise import cosine_similarity
from datasets import Dataset, load_dataset, Image
from torch.utils.data import DataLoader
from transformers import CLIPProcessor, CLIPModel

In [3]:
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

In [19]:
SEM_VOC_ROOT = "assets/sem-viz_obj_cat/"
sem_voc_manifest = pd.read_csv(SEM_VOC_ROOT + "manifest.csv")
sem_voc_manifest["image"] = [SEM_VOC_ROOT+i for i in sem_voc_manifest["image"]]
sem_voc_ds = Dataset.from_pandas(sem_voc_manifest).cast_column("image", Image())

In [69]:
def image_collator(data):
    return {k: [ex[k] for ex in data] for k in data[0]}

In [70]:
sem_voc_dl = DataLoader(sem_voc_ds, batch_size=16, collate_fn=image_collator)

In [79]:
def get_image_feats(dataloader, processor, model):
    all_feats = []
    for d in dataloader:
        inputs = processor(images=d["image"], return_tensors="pt")
        feats = model.get_image_features(**inputs).detach().numpy()
        all_feats.append(feats)
    return np.concatenate(all_feats, axis=0)

In [80]:
feats = get_image_feats(sem_voc_dl, processor, model)

In [95]:
# FIXME: make robust based on manifest instead of hand coding

NUM_CATS = 8
NUM_IMGS = 9

feats_mean = np.array([np.mean(feats[c*NUM_IMGS:((c+1)*NUM_IMGS)-1], axis=0) for c in range(NUM_CATS)])

In [96]:
img_sims = cosine_similarity(feats_mean)

In [97]:
img_sims

array([[0.99999994, 0.72129065, 0.74828994, 0.7112463 , 0.6963773 ,
        0.73268056, 0.7028516 , 0.70743144],
       [0.72129065, 0.9999998 , 0.8280212 , 0.828451  , 0.7770638 ,
        0.81577206, 0.8138706 , 0.8407951 ],
       [0.74828994, 0.8280212 , 1.0000001 , 0.9278395 , 0.7848074 ,
        0.8789254 , 0.7716011 , 0.81575674],
       [0.7112463 , 0.828451  , 0.9278395 , 1.0000001 , 0.79224056,
        0.87045276, 0.8131415 , 0.848372  ],
       [0.6963773 , 0.7770638 , 0.7848074 , 0.79224056, 0.9999996 ,
        0.8426869 , 0.84862626, 0.81467444],
       [0.73268056, 0.81577206, 0.8789254 , 0.87045276, 0.8426869 ,
        1.0000006 , 0.82269806, 0.87957287],
       [0.7028516 , 0.8138706 , 0.7716011 , 0.8131415 , 0.84862626,
        0.82269806, 0.9999995 , 0.89862996],
       [0.70743144, 0.8407951 , 0.81575674, 0.848372  , 0.81467444,
        0.87957287, 0.89862996, 0.9999989 ]], dtype=float32)

In [98]:
np.save("evals/sem-viz_obj_cat/clip.npy", img_sims)

In [90]:
SEM_THINGS_ROOT = "assets/sem-things/"
sem_things_manifest = pd.read_csv(SEM_THINGS_ROOT + "manifest.csv")
sem_things_manifest["image"] = [SEM_THINGS_ROOT+i for i in sem_things_manifest["image"]]
sem_things_ds = Dataset.from_pandas(sem_things_manifest).cast_column("image", Image())

In [91]:
sem_things_dl = DataLoader(sem_things_ds, batch_size=16, collate_fn=image_collator)

In [92]:
things_feats = get_image_feats(sem_things_dl, processor, model)

In [99]:
things_sims = cosine_similarity(things_feats)

In [102]:
os.makedirs("evals/sem-things", exist_ok=True)

In [103]:
np.save("evals/sem-things/clip.npy", things_sims)