In [1]:
!pip install git+https://github.com/openai/CLIP.git

Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-jt2o6o_j
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-jt2o6o_j
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ftfy (from clip==1.0)
  Downloading ftfy-6.2.3-py3-none-any.whl.metadata (7.8 kB)
Downloading ftfy-6.2.3-py3-none-any.whl (43 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.0/43.0 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: clip
  Building wheel for clip (setup.py) ... [?25l[?25hdone
  Created wheel for clip: filename=clip-1.0-py3-none-any.whl size=1369490 sha256=f945f627d88412a922140f6cacfcdb1746d7e5f73cb5392eb43ca4b9bf636e33
  Stored in directory: /tmp/pip-ephem-wheel-cache-rs95mmun/wheels/da/2b/4c/d6691fa9597aac8bb

In [2]:
import torch
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
import clip
import numpy as np

In [3]:
clip.available_models()

['RN50',
 'RN101',
 'RN50x4',
 'RN50x16',
 'RN50x64',
 'ViT-B/32',
 'ViT-B/16',
 'ViT-L/14',
 'ViT-L/14@336px']

In [4]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)
model, preprocess = clip.load("ViT-B/32", device=device, jit=False)
model.to(device)

cuda


100%|███████████████████████████████████████| 338M/338M [00:07<00:00, 45.4MiB/s]


CLIP(
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): Sequential(
        (0): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (1): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          

In [6]:
train_dataset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=preprocess)
test_dataset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=preprocess)

train_loader = DataLoader(train_dataset, batch_size = 256, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size = 256, shuffle=False)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:07<00:00, 21824258.60it/s]


Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified


In [7]:
# Extract CIFAR-10 classes
class_names = train_dataset.classes
print(class_names)

['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']


In [8]:
##### CLIP Zero-Shot Classification for TRAIN Data of CIFAR10 #####
correct_pred_train = 0
total_pred_train = 0
tokenized_text_desp = clip.tokenize(class_names).to(device)
for image, labels in train_loader:
    images_input = torch.tensor(np.stack(image)).to(device)
    labels = labels.numpy()

    with torch.no_grad():
        image_features = model.encode_image(images_input).float()
        text_features = model.encode_text(tokenized_text_desp).float()

    image_features /= image_features.norm(dim=-1, keepdim=True)
    text_features /= text_features.norm(dim=-1, keepdim=True)
    similarity = text_features @ image_features.T
    preds = np.argmax(similarity.cpu().numpy(), axis=0)
    correct_pred_train += (preds == labels).sum()
    total_pred_train += len(labels)
print(f'Train Accuracy: {100*correct_pred_train / total_pred_train:.4f}')

Train Accuracy: 87.5020


In [9]:
##### CLIP Zero-Shot Classification for TEST Data of CIFAR10 #####
correct_pred = 0
total_pred = 0
tokenized_text_desp = clip.tokenize(class_names).to(device)
for image, labels in test_loader:
    images_input = torch.tensor(np.stack(image)).to(device)
    labels = labels.numpy()
    with torch.no_grad():
        image_features = model.encode_image(images_input).float()
        text_features = model.encode_text(tokenized_text_desp).float()

    image_features /= image_features.norm(dim=-1, keepdim=True)
    text_features /= text_features.norm(dim=-1, keepdim=True)
    similarity = text_features @ image_features.T
    preds = np.argmax(similarity.cpu().numpy(), axis=0)
    correct_pred += (preds == labels).sum()
    total_pred += len(labels)
print(f'Test Accuracy: {100*correct_pred / total_pred:.4f}')

Test Accuracy: 87.3800
