In [7]:
import torch
from transformers import CLIPModel, AutoTokenizer, AutoProcessor
import os
from sklearn.cluster import KMeans
from PIL import Image
import numpy as np
from datasets import load_from_disk

In [6]:
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Single source of truth for the checkpoint, so the model, tokenizer and
# processor are always loaded from the same Hugging Face repo id.
MODEL_NAME = "openai/clip-vit-large-patch14"
MODEL = CLIPModel.from_pretrained(MODEL_NAME).to(DEVICE)
TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME)
PROCESSOR = AutoProcessor.from_pretrained(MODEL_NAME)

# NOTE(review): these are absolute, machine-specific paths; consider deriving
# them from a configurable project root so the notebook runs elsewhere.
HF_DATASET_DIR = "/workspaces/ASE-Model-Retrieval/data/imagenet/.cache/hf_datasets"
OUTPUT_DIR = "/workspaces/ASE-Model-Retrieval/models/.cache/task_embeddings"

In [12]:
# Load the saved dataset collection and work with whichever entry comes first.
hf_datasets = load_from_disk(HF_DATASET_DIR)
dataset_name = tuple(hf_datasets.keys())[0]  # first key in stored order
dataset = hf_datasets[dataset_name]

# Distinct values of the "label" column, sorted for a stable ordering.
unique_labels = sorted({*dataset["label"]})
unique_labels

['Appenzeller',
 'Ibizan_hound',
 'Scottish_deerhound',
 'West_Highland_white_terrier',
 'baseball']

In [15]:
# Build a single prompt naming the first five labels, then embed it with the
# model's text encoder and scale the embedding to unit L2 norm.
label_prompt = f"photos of {', '.join(unique_labels[:5])}."
text_inputs = TOKENIZER([label_prompt], padding=True, return_tensors="pt").to(DEVICE)

with torch.no_grad():
    text_features = MODEL.get_text_features(**text_inputs)
    text_features = text_features / text_features.norm(dim=-1, keepdim=True)

# Only one prompt was encoded, so keep that single row as a NumPy vector.
text_features = text_features[0].cpu().numpy()
text_features.shape

(768,)

In [19]:
def preprocess_image(image_path):
    """Load one image and convert it into processor inputs on ``DEVICE``.

    Args:
        image_path: Location of the image; passed straight to ``Image.open``.

    Returns:
        The ``PROCESSOR`` output for the RGB-converted image, requested as
        PyTorch tensors (``return_tensors="pt"``) and moved to ``DEVICE``.
    """
    # The context manager closes the underlying file as soon as the pixels have
    # been copied into the RGB image, instead of leaving the handle to the GC.
    with Image.open(image_path) as source_image:
        image = source_image.convert("RGB")
    inputs = PROCESSOR(images=image, return_tensors="pt").to(DEVICE)
    return inputs

# Embed every image with the model's image encoder, one image per forward
# pass, and scale each embedding to unit L2 norm before collecting it.
image_paths = dataset["image"]
image_features_list = []

with torch.no_grad():
    for image_path in image_paths:
        image_inputs = preprocess_image(image_path)
        embedding = MODEL.get_image_features(**image_inputs)
        embedding = embedding / embedding.norm(dim=-1, keepdim=True)
        image_features_list.append(embedding.cpu().numpy())

# Stack the per-image rows into a single 2-D array.
image_features = np.vstack(image_features_list)
image_features.shape

(250, 768)

In [20]:
def compute_task_meta_features(features, n_splits=5):
    """Summarise a feature matrix as the concatenated centroids of its row chunks.

    The rows of ``features`` are divided, in their existing order, into
    ``n_splits`` contiguous chunks with ``np.array_split``. A single-cluster
    ``KMeans`` is fitted on each chunk and its one centroid is kept; the
    centroids are then concatenated chunk by chunk into a flat vector.

    Args:
        features: 2-D array-like of shape ``(n_samples, n_features)``.
        n_splits: Number of contiguous row chunks to summarise. Defaults to 5.

    Returns:
        1-D ``numpy.ndarray`` of length ``n_splits * n_features``.

    Raises:
        ValueError: If ``n_splits`` is smaller than 1, ``features`` is not
            2-D, or ``features`` has fewer rows than ``n_splits`` (which
            would leave at least one chunk empty).
    """
    features = np.asarray(features)

    # Fail early with a clear message instead of an opaque error raised from
    # inside the clustering call on an empty or mis-shaped chunk.
    if n_splits < 1:
        raise ValueError(f"n_splits must be at least 1, got {n_splits}")
    if features.ndim != 2:
        raise ValueError(
            f"features must be 2-D (n_samples, n_features), got shape {features.shape}"
        )
    if features.shape[0] < n_splits:
        raise ValueError(
            f"need at least {n_splits} rows to form {n_splits} non-empty splits, "
            f"got {features.shape[0]}"
        )

    # random_state is pinned so repeated runs produce the same centroids.
    centroids = [
        KMeans(n_clusters=1, random_state=42).fit(chunk).cluster_centers_[0]
        for chunk in np.array_split(features, n_splits)
    ]
    return np.concatenate(centroids)

In [21]:
# Collapse the per-image embeddings into one flat task descriptor.
task_meta_features = compute_task_meta_features(features=image_features)
task_meta_features.shape

(3840,)