In [5]:
!pip install git+https://github.com/openai/CLIP.git@a1d071733d7111c9c014f024669f959182114e33 --upgrade --no-deps
!pip install ftfy regex tqdm

Collecting git+https://github.com/openai/CLIP.git@a1d071733d7111c9c014f024669f959182114e33
  Cloning https://github.com/openai/CLIP.git (to revision a1d071733d7111c9c014f024669f959182114e33) to /tmp/pip-req-build-28xtepbm
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-28xtepbm
  Running command git rev-parse -q --verify 'sha^a1d071733d7111c9c014f024669f959182114e33'
  Running command git fetch -q https://github.com/openai/CLIP.git a1d071733d7111c9c014f024669f959182114e33
  Running command git checkout -q a1d071733d7111c9c014f024669f959182114e33
  Resolved https://github.com/openai/CLIP.git to commit a1d071733d7111c9c014f024669f959182114e33
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ftfy
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Downloading ftfy-6.3.1-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00

In [6]:
import clip
import torch
import numpy as np
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

In [7]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

100%|███████████████████████████████████████| 338M/338M [00:08<00:00, 41.4MiB/s]


In [8]:
transform = preprocess

test_dataset = datasets.Food101(
    root="./data", split="test", download=True, transform=transform
)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

class_names = test_dataset.classes  # the name of 101 dishes

100%|██████████| 5.00G/5.00G [00:37<00:00, 135MB/s]


In [9]:
# encode text queries
with torch.no_grad():
    text_tokens = clip.tokenize(class_names).to(device)
    text_features = model.encode_text(text_tokens)
    text_features /= text_features.norm(dim=-1, keepdim=True)

In [10]:
# encode image features
all_image_features = []
all_labels = []

with torch.no_grad():
    for images, labels in test_loader:
        images = images.to(device)
        features = model.encode_image(images)
        features /= features.norm(dim=-1, keepdim=True)
        all_image_features.append(features.cpu())
        all_labels.append(labels)

all_image_features = torch.cat(all_image_features, dim=0)  # [N, d]
all_labels = torch.cat(all_labels, dim=0)

similarity[i, j] = $∑_{k=1}^d$ text_features[i, k] . all_image_features[j, k]

In [19]:
# compute similarity
similarity = text_features.cpu() @ all_image_features.T

In [12]:
# calculate Recall@K
def recall_at_k(similarity, labels, k=1):
    num_queries = similarity.shape[0]
    correct = 0
    for i in range(num_queries):
        # Query text = class i
        sims = similarity[i]  # similarity scores with all images
        topk_indices = sims.topk(k).indices.numpy()
        # Check if any retrieved image has label = i
        if any(labels[idx].item() == i for idx in topk_indices):
            correct += 1
    return correct / num_queries

In [13]:
for k in [1, 5, 10]:
    r_at_k = recall_at_k(similarity, all_labels, k=k)
    print(f"Recall@{k}: {r_at_k:.4f}")

Recall@1: 0.9703
Recall@5: 1.0000
Recall@10: 1.0000


### **Compute nDCG (Normalized Discounted Cumulative Gain)**


DCG@K = $\sum_{i=1}^{K} \frac{rel_i}{\log_2(i+1)}$

Trong đó:
- $rel_i$ = độ liên quan (relevance)
- $i$ = vị trí của kết quả trong top-K  
- IDCG@K = DCG@K tốt nhất có thể đạt được (khi tất cả relevant items nằm trên đầu danh sách).
- nDCG@K:

    $nDCG@K$ = $\frac{DCG@K}{IDCG@K}$

In [34]:
def dcg_score(relevances, k):
    """Compute DCG at rank k"""
    relevances = np.asarray(relevances)[:k]
    return np.sum((2**relevances - 1) / np.log2(np.arange(2, relevances.size + 2)))


def ndcg_score(y_true, y_scores, k=10):
    """
    Compute nDCG@k for a single query
    y_true: binary relevance vector (1 if relevant, else 0)
    y_scores: predicted scores for each item
    """
    # sort by predicted scores
    order = np.argsort(y_scores)[::-1]
    y_true_sorted = np.take(y_true, order[:k])

    dcg = dcg_score(y_true_sorted, k)
    idcg = dcg_score(sorted(y_true, reverse=True), k)
    return dcg / idcg if idcg > 0 else 0.0

In [42]:
K = 1
n_classes = len(class_names)
all_ndcgs = []
# similarity = similarity.numpy()

for K in [1, 5, 10]:
    for class_idx in range(n_classes):
        scores = similarity[class_idx]  # similarity scores of this class to all images
        labels = (all_labels.numpy() == class_idx).astype(int)  # relevance vector

        ndcg = ndcg_score(labels, scores, k=K)
        all_ndcgs.append(ndcg)

    mean_ndcg = np.mean(all_ndcgs)
    print(f"Mean nDCG@{K}: {mean_ndcg:.4f}")

Mean nDCG@1: 0.9703
Mean nDCG@5: 0.9685
Mean nDCG@10: 0.9668
