In [25]:
!pip install git+https://github.com/openai/CLIP.git

Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-bprg1fab
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-bprg1fab
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [26]:
import torch
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
import clip
import numpy as np

In [27]:
clip.available_models()

['RN50',
 'RN101',
 'RN50x4',
 'RN50x16',
 'RN50x64',
 'ViT-B/32',
 'ViT-B/16',
 'ViT-L/14',
 'ViT-L/14@336px']

In [28]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)
model, preprocess = clip.load("ViT-B/32", device=device, jit=False)
model.to(device)

cuda


CLIP(
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): Sequential(
        (0): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (1): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          

In [29]:
train_dataset = torchvision.datasets.CIFAR100(root='./data', train=True, download=True, transform=preprocess)
test_dataset = torchvision.datasets.CIFAR100(root='./data', train=False, download=True, transform=preprocess)

train_loader = DataLoader(train_dataset, batch_size = 256, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size = 256, shuffle=False)

Files already downloaded and verified
Files already downloaded and verified


In [33]:
#Extract CIFAR100 classes
class_names = train_dataset.classes
print(class_names)

['apple', 'aquarium_fish', 'baby', 'bear', 'beaver', 'bed', 'bee', 'beetle', 'bicycle', 'bottle', 'bowl', 'boy', 'bridge', 'bus', 'butterfly', 'camel', 'can', 'castle', 'caterpillar', 'cattle', 'chair', 'chimpanzee', 'clock', 'cloud', 'cockroach', 'couch', 'crab', 'crocodile', 'cup', 'dinosaur', 'dolphin', 'elephant', 'flatfish', 'forest', 'fox', 'girl', 'hamster', 'house', 'kangaroo', 'keyboard', 'lamp', 'lawn_mower', 'leopard', 'lion', 'lizard', 'lobster', 'man', 'maple_tree', 'motorcycle', 'mountain', 'mouse', 'mushroom', 'oak_tree', 'orange', 'orchid', 'otter', 'palm_tree', 'pear', 'pickup_truck', 'pine_tree', 'plain', 'plate', 'poppy', 'porcupine', 'possum', 'rabbit', 'raccoon', 'ray', 'road', 'rocket', 'rose', 'sea', 'seal', 'shark', 'shrew', 'skunk', 'skyscraper', 'snail', 'snake', 'spider', 'squirrel', 'streetcar', 'sunflower', 'sweet_pepper', 'table', 'tank', 'telephone', 'television', 'tiger', 'tractor', 'train', 'trout', 'tulip', 'turtle', 'wardrobe', 'whale', 'willow_tree',

In [31]:
##### CLIP Zero-Shot Classification for TRAIN Data of CIFAR100 #####
correct_pred_train = 0
total_pred_train = 0
tokenized_text_desp = clip.tokenize(class_names).to(device)
for image, labels in train_loader:
    images_input = torch.tensor(np.stack(image)).to(device)
    labels = labels.numpy()

    with torch.no_grad():
        image_features = model.encode_image(images_input).float()
        text_features = model.encode_text(tokenized_text_desp).float()

    image_features /= image_features.norm(dim=-1, keepdim=True)
    text_features /= text_features.norm(dim=-1, keepdim=True)
    similarity = text_features @ image_features.T
    preds = np.argmax(similarity.cpu().numpy(), axis=0)
    correct_pred_train += (preds == labels).sum()
    total_pred_train += len(labels)
print(f'Train Accuracy: {100*correct_pred_train / total_pred_train:.4f}')

Train Accuracy: 55.3420


In [32]:
##### CLIP Zero-Shot Classification for TEST Data of CIFAR100 #####
correct_pred = 0
total_pred = 0
tokenized_text_desp = clip.tokenize(class_names).to(device)
for image, labels in test_loader:
    images_input = torch.tensor(np.stack(image)).to(device)
    labels = labels.numpy()
    with torch.no_grad():
        image_features = model.encode_image(images_input).float()
        text_features = model.encode_text(tokenized_text_desp).float()

    image_features /= image_features.norm(dim=-1, keepdim=True)
    text_features /= text_features.norm(dim=-1, keepdim=True)
    similarity = text_features @ image_features.T
    preds = np.argmax(similarity.cpu().numpy(), axis=0)
    correct_pred += (preds == labels).sum()
    total_pred += len(labels)
print(f'Test Accuracy: {100*correct_pred / total_pred:.4f}')

Test Accuracy: 55.2700
