In [1]:
!pip install open_clip_torch

Collecting open_clip_torch
  Downloading open_clip_torch-3.1.0-py3-none-any.whl.metadata (32 kB)
Collecting ftfy (from open_clip_torch)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Collecting timm>=1.0.17 (from open_clip_torch)
  Downloading timm-1.0.19-py3-none-any.whl.metadata (60 kB)
Downloading open_clip_torch-3.1.0-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 1.5/1.5 MB 16.4 MB/s eta 0:00:00
Downloading timm-1.0.19-py3-none-any.whl (2.5 MB)
   ---------------------------------------- 0.0/2.5 MB ? eta -:--:--
   ---------------------------------------- 2.5/2.5 MB 14.3 MB/s eta 0:00:00
Downloading ftfy-6.3.1-py3-none-any.whl (44 kB)
Installing collected packages: ftfy, timm, open_clip_torch

   ------------- -------------------------- 1/3 [timm]
   ------------- -------------------------- 1/3 [timm]
   ------------- -------------------------- 1/3 [timm]
   ------------- -----

In [None]:
# pip install -U torch torchvision timm open_clip_torch

import torch, open_clip
from torchvision import datasets
from torch.utils.data import DataLoader

# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# === Model selection (CLIP-family checkpoints) ===
# Default: a classic, strong baseline. The pretrained tag comes from OpenCLIP.
MODEL_NAME = "ViT-B-32"
PRETRAINED = "laion2b_s34b_b79k"

# More recent alternatives (only if available in your environment):
# MODEL_NAME, PRETRAINED = "ViT-SO400M-14-SigLIP", "webli"        # SigLIP family
# MODEL_NAME, PRETRAINED = "EVA02-L-14", "laion2b_s9b_b144k"      # EVA-CLIP family
# To list every available (model, pretrained) combination:
# import pprint; pprint.pp(open_clip.list_pretrained())

# --- Build the model, its image preprocessing, and the matching tokenizer ---
# create_model_and_transforms returns three values; the middle one is not needed here.
model, _, preprocess = open_clip.create_model_and_transforms(
    MODEL_NAME,
    pretrained=PRETRAINED,
)
tokenizer = open_clip.get_tokenizer(MODEL_NAME)

# Move the weights to the chosen device and switch to evaluation mode.
model = model.to(device)
model = model.eval()

# --- Zero-shot setup on CIFAR-10 (small and quick) ---
# Held-out split (train=False), downloaded into ./data; every image goes through
# the model's own preprocessing pipeline.
cifar = datasets.CIFAR10(
    root="./data",
    train=False,
    download=True,
    transform=preprocess,
)
# Fixed order (shuffle=False): only the aggregate accuracy is needed.
loader = DataLoader(
    cifar,
    batch_size=128,
    shuffle=False,
    num_workers=2,
)

# One natural-language prompt per class name.
classnames = cifar.classes
prompts = [f"a photo of a {c}" for c in classnames]

# Embed all prompts once, then scale each embedding to unit length so that a
# dot product with a unit-length image embedding is a cosine similarity.
with torch.no_grad():
    text_tokens = tokenizer(prompts).to(device)
    text_features = model.encode_text(text_tokens)
    text_features = text_features / text_features.norm(dim=-1, keepdim=True)

# Running tallies: correctly classified images and images seen so far.
total = 0
correct = 0

with torch.no_grad():
    for images, labels in loader:
        images = images.to(device)

        # Embed the batch and scale each embedding to unit length.
        image_features = model.encode_image(images)
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)

        # CLIP-style scaled cosine similarities against every class prompt.
        # The constant factor does not change which class scores highest.
        logits = torch.matmul(100.0 * image_features, text_features.T)

        # The labels from the loader were never moved to the device, so bring
        # the predictions back to the CPU before comparing.
        preds = logits.argmax(dim=-1).cpu()
        correct += (preds == labels).sum().item()
        total += labels.size(0)

print(f"Zero-shot CIFAR-10 accuracy: {100*correct/total:.2f}%")

open_clip_model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

100%|██████████| 170M/170M [00:26<00:00, 6.38MB/s] 


Zero-shot CIFAR-10 accuracy: 93.66%
