
### [Mastering the Huggingface CLIP Model: How to Extract Embeddings and Calculate Similarity for Text and Images](https://codeandlife.com/2023/01/26/mastering-the-huggingface-clip-model-how-to-extract-embeddings-and-calculate-similarity-for-text-and-images/)

In [None]:
from datasets import load_from_disk
import torch
from transformers import AutoTokenizer, CLIPModel, AutoProcessor
from PIL import Image
from tqdm import tqdm
import numpy as np
from sklearn.cluster import KMeans
import os

In [2]:
# Directory handed to `load_from_disk`. The default is the original workspace
# path; set the HF_DATASET_DIR environment variable to run the notebook on a
# machine where the data lives elsewhere.
HF_DATASET_DIR = os.environ.get(
    "HF_DATASET_DIR",
    "/workspaces/ASE-Model-Retrieval/data/imagenet/.cache/hf_datasets",
)
# Number of images sent through the CLIP image encoder per forward pass.
BATCH_SIZE = 128
# Number of k-means clusters fitted on the image features of one dataset.
# NOTE: only the first cluster centre is used downstream.
N_CLUSTERS = 1

In [3]:
# Load the dataset collection stored under HF_DATASET_DIR.
hf_dataset_dict = load_from_disk(HF_DATASET_DIR)
# Bare last expression so the notebook renders a summary of the loaded splits.
hf_dataset_dict

DatasetDict({
    subclass_EntleBucher-German_shepherd-bluetick-croquet_ball-tench: Dataset({
        features: ['image', 'label'],
        num_rows: 250
    })
    subclass_Border_terrier-Lakeland_terrier-Lhasa-Norfolk_terrier-Sussex_spaniel: Dataset({
        features: ['image', 'label'],
        num_rows: 250
    })
    subclass_Border_collie-Pomeranian-cairn-kelpie-terrapin: Dataset({
        features: ['image', 'label'],
        num_rows: 250
    })
    subclass_English_foxhound-Scottie-baseball-giant_schnauzer-minibus: Dataset({
        features: ['image', 'label'],
        num_rows: 250
    })
    subclass_Norwich_terrier-banana-bloodhound-kite-minivan: Dataset({
        features: ['image', 'label'],
        num_rows: 250
    })
    subclass_Norwich_terrier-Scottie-dingo-minivan-red_wolf: Dataset({
        features: ['image', 'label'],
        num_rows: 250
    })
    subclass_Bouvier-Hungarian_pointer-jackfruit-leopard-miniature_poodle: Dataset({
        features: ['image', 'la

In [4]:
# Single source of truth for the checkpoint so the tokenizer, image processor
# and model weights can never come from different checkpoints by accident.
CLIP_CHECKPOINT = "openai/clip-vit-large-patch14"

# Prefer the GPU when one is visible; everything below follows this choice.
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(CLIP_CHECKPOINT)
processor = AutoProcessor.from_pretrained(CLIP_CHECKPOINT)
# The model is only used for feature extraction, so make inference mode
# (no dropout etc.) explicit rather than relying on the loader's default.
model = CLIPModel.from_pretrained(CLIP_CHECKPOINT).to(device).eval()

In [18]:
def get_text_features(labels):
    """Embed class labels with the CLIP text encoder.

    Each label is wrapped in the prompt ``"a photo of a <label>"``, the
    prompts are tokenized as one padded batch, moved to ``device`` and passed
    to ``model.get_text_features`` with gradient tracking disabled.

    Args:
        labels: Iterable of values that can be formatted into the prompt.

    Returns:
        The output of ``model.get_text_features`` for the batch
        (presumably one row per label -- TODO confirm shape).
    """
    prompts = []
    for label in labels:
        prompts.append(f"a photo of a {label}")

    encoded = tokenizer(prompts, padding=True, return_tensors="pt")
    encoded = encoded.to(device)

    # Pure inference: skip building the autograd graph.
    with torch.no_grad():
        return model.get_text_features(**encoded)

def get_image_features(images):
    """Embed a batch of images with the CLIP image encoder.

    Args:
        images: Iterable whose items are either something ``PIL.Image.open``
            accepts (e.g. a file path) or an already-decoded
            ``PIL.Image.Image``.

    Returns:
        The output of ``model.get_image_features`` for the batch
        (presumably one row per image -- TODO confirm shape).
    """
    rgb_images = []
    for item in images:
        if isinstance(item, Image.Image):
            # Already decoded: just normalise the colour mode.
            rgb_images.append(item.convert("RGB"))
        else:
            # `Image.open` keeps the underlying file open; `convert` returns
            # an independent in-memory copy, so the handle can be released
            # right away instead of leaking one descriptor per image.
            with Image.open(item) as opened:
                rgb_images.append(opened.convert("RGB"))

    inputs = processor(images=rgb_images, return_tensors="pt").to(device)

    # Pure inference: skip building the autograd graph.
    with torch.no_grad():
        image_features = model.get_image_features(**inputs)

    return image_features

In [19]:
def compute_task_meta_features(features):
    """Summarise a set of feature vectors by a k-means cluster centre.

    Args:
        features: Torch tensor on any device, or an array-like that
            scikit-learn can consume directly (assumed 2-D,
            samples x feature dimension -- TODO confirm).

    Returns:
        A torch tensor on the global ``device`` holding the first cluster
        centre found by ``KMeans(n_clusters=N_CLUSTERS, random_state=42)``.
    """
    if torch.is_tensor(features):
        # scikit-learn works on host NumPy arrays. Convert every tensor, not
        # only CUDA ones, so CPU tensors, other accelerators and tensors that
        # still track gradients all take the same explicit path.
        features = features.detach().cpu().numpy()

    kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=42)
    kmeans.fit(features)
    # NOTE: only centre 0 is returned; with N_CLUSTERS > 1 the remaining
    # centres are dropped.
    return torch.tensor(kmeans.cluster_centers_[0], device=device)

In [22]:
def process_dataset(dataset_name, dataset):
    """Build one task-embedding vector for a single dataset.

    The embedding is the concatenation of
      1. the image-side summary from ``compute_task_meta_features`` applied
         to the image features of every row, and
      2. the mean (over labels) of the text features of the distinct labels.

    Args:
        dataset_name: Name used in progress messages.
        dataset: Mapping-like object with ``"image"`` and ``"label"`` columns;
            the images are passed in batches to ``get_image_features``.

    Returns:
        1-D torch tensor: image summary followed by the mean text feature.
    """
    print(f"Processing dataset: {dataset_name}")

    # Sort the distinct labels so the prompt batch is built in the same order
    # on every run (plain set iteration order is not stable for strings).
    unique_labels = sorted(set(dataset["label"]))
    print(f"Extracting text features for {dataset_name}")
    text_features = get_text_features(unique_labels)

    images = dataset["image"]

    # Encode the images in chunks of BATCH_SIZE to bound memory per forward pass.
    image_features_list = []
    for i in tqdm(range(0, len(images), BATCH_SIZE), desc=f"Extracting image features for {dataset_name}"):
        batch_images = images[i:i + BATCH_SIZE]
        batch_features = get_image_features(batch_images)
        image_features_list.append(batch_features)

    image_features = torch.cat(image_features_list, dim=0)
    task_meta_features = compute_task_meta_features(image_features)

    final_features = torch.cat((task_meta_features, text_features.mean(dim=0)), dim=0)
    # Hand the embedding back to the caller; previously it was computed and
    # then silently discarded.
    return final_features

# Keep every dataset's embedding, keyed by dataset name, so the work done in
# the loop is available to later cells.
task_embeddings = {}
for dataset_name, dataset in tqdm(hf_dataset_dict.items(), desc="Processing datasets", unit="dataset"):
    task_embeddings[dataset_name] = process_dataset(dataset_name, dataset)

Processing datasets:   0%|          | 0/312 [00:00<?, ?dataset/s]

Processing dataset: subclass_EntleBucher-German_shepherd-bluetick-croquet_ball-tench
Extracting text features for subclass_EntleBucher-German_shepherd-bluetick-croquet_ball-tench


Extracting image features for subclass_EntleBucher-German_shepherd-bluetick-croquet_ball-tench: 100%|██████████| 2/2 [00:02<00:00,  1.32s/it]
Processing datasets:   0%|          | 1/312 [00:04<22:12,  4.29s/dataset]

Processing dataset: subclass_Border_terrier-Lakeland_terrier-Lhasa-Norfolk_terrier-Sussex_spaniel
Extracting text features for subclass_Border_terrier-Lakeland_terrier-Lhasa-Norfolk_terrier-Sussex_spaniel


Extracting image features for subclass_Border_terrier-Lakeland_terrier-Lhasa-Norfolk_terrier-Sussex_spaniel: 100%|██████████| 2/2 [00:02<00:00,  1.27s/it]
Processing datasets:   1%|          | 2/312 [00:08<21:48,  4.22s/dataset]

Processing dataset: subclass_Border_collie-Pomeranian-cairn-kelpie-terrapin
Extracting text features for subclass_Border_collie-Pomeranian-cairn-kelpie-terrapin


Extracting image features for subclass_Border_collie-Pomeranian-cairn-kelpie-terrapin: 100%|██████████| 2/2 [00:02<00:00,  1.28s/it]
Processing datasets:   1%|          | 3/312 [00:12<21:43,  4.22s/dataset]

Processing dataset: subclass_English_foxhound-Scottie-baseball-giant_schnauzer-minibus
Extracting text features for subclass_English_foxhound-Scottie-baseball-giant_schnauzer-minibus


Extracting image features for subclass_English_foxhound-Scottie-baseball-giant_schnauzer-minibus:   0%|          | 0/2 [00:00<?, ?it/s]
Processing datasets:   1%|          | 3/312 [00:13<22:24,  4.35s/dataset]


KeyboardInterrupt: 