<a href="https://colab.research.google.com/github/chandini2595/comprehensive-transfer-learning-experiments/blob/main/Colab/Zero_shot_transfer_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Zero-Shot Classification with CLIP

In [1]:
!pip install ftfy regex tqdm
!pip install git+https://github.com/openai/CLIP.git

import torch
import clip
from PIL import Image
import requests
from io import BytesIO


Collecting ftfy
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Downloading ftfy-6.3.1-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ftfy
Successfully installed ftfy-6.3.1
Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-lf0sf5mf
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-lf0sf5mf
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->clip==1.0)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->clip==1.0)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-

In [2]:
# Download the demo picture and decode it with Pillow.
image_url = "https://images.unsplash.com/photo-1593642634367-d91a135587b5"
response = requests.get(image_url, timeout=30)  # never hang the kernel on a stalled download
response.raise_for_status()  # surface HTTP errors here instead of as a PIL decode failure
image = Image.open(BytesIO(response.content))

# `Image.show()` launches an external viewer, which renders nothing in a hosted
# notebook. Ending the cell with an image expression displays it inline; a
# down-scaled copy keeps the saved notebook small while `image` stays full size.
preview = image.copy()
preview.thumbnail((512, 512))  # in place, preserves aspect ratio
preview

In [3]:
# Candidate captions the image will be scored against (one per class).
texts = [
    "a photo of a dog",
    "a photo of a cat",
    "a photo of a laptop",
    "a photo of a mountain",
]


In [4]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# clip.load hands back the network together with its matching image transform.
model, preprocess = clip.load("ViT-B/32", device=device)

# Tokenize the candidate captions, and turn the PIL image into a batch of one
# (unsqueeze adds the leading batch axis) on the same device as the model.
text_tokens = clip.tokenize(texts).to(device)
image_input = preprocess(image).unsqueeze(0).to(device)


100%|███████████████████████████████████████| 338M/338M [00:03<00:00, 92.1MiB/s]


In [5]:
# Zero-shot scoring: compare the image embedding with every caption embedding.
with torch.no_grad():
    # Run each encoder exactly once and keep the raw embeddings around.
    image_features = model.encode_image(image_input)
    text_features = model.encode_text(text_tokens)

    # Reproduce what CLIP's forward() does with those embeddings instead of
    # calling model(...) and encoding everything a second time: L2-normalise,
    # then take the temperature-scaled cosine similarity. This also avoids
    # assigning to `_`, which would shadow IPython's last-output variable.
    image_unit = image_features / image_features.norm(dim=-1, keepdim=True)
    text_unit = text_features / text_features.norm(dim=-1, keepdim=True)
    logits_per_image = model.logit_scale.exp() * image_unit @ text_unit.t()

    # Softmax over the caption axis turns the similarities into probabilities.
    probs = logits_per_image.softmax(dim=-1).cpu().numpy()

print("Label probs:", list(zip(texts, probs[0])))


Label probs: [('a photo of a dog', np.float32(4.8781585e-05)), ('a photo of a cat', np.float32(6.233164e-05)), ('a photo of a laptop', np.float32(0.99977034)), ('a photo of a mountain', np.float32(0.00011859393))]


# Transfer Learning using TensorFlow Hub

In [6]:
!pip install -q tensorflow tensorflow_hub tensorflow_datasets

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_datasets as tfds

In [7]:
# Input-pipeline settings shared by the dataset cells and the model definition.
image_size = (224, 224)  # target size handed to tf.image.resize
batch_size = 32          # examples per batch

def format_image(image, label):
    """Resize one example to `image_size` and divide its pixels by 255.

    Args:
        image: image tensor from the dataset (presumably 8-bit pixel values,
            so the division lands in [0, 1] — confirm against the dataset).
        label: the example's label; passed through untouched.

    Returns:
        A `(resized_and_scaled_image, label)` pair.
    """
    resized = tf.image.resize(image, image_size)
    return resized / 255.0, label

# Carve an 80/20 train/validation partition out of the `train` split and keep
# the dataset metadata (`dataset_info`) for later use.
(train_data, val_data), dataset_info = tfds.load(
    'tf_flowers',
    split=['train[:80%]', 'train[80%:]'],
    as_supervised=True,
    with_info=True
)

# tf.data.AUTOTUNE lets the runtime pick the degree of map parallelism and the
# prefetch depth instead of a serial map and a fixed one-batch buffer. Element
# order and values are unchanged (map stays deterministic by default).
train_batches = (
    train_data
    .map(format_image, num_parallel_calls=tf.data.AUTOTUNE)
    .shuffle(1000)  # only the training stream is shuffled
    .batch(batch_size)
    .prefetch(tf.data.AUTOTUNE)
)
val_batches = (
    val_data
    .map(format_image, num_parallel_calls=tf.data.AUTOTUNE)
    .batch(batch_size)
    .prefetch(tf.data.AUTOTUNE)
)



Downloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to /root/tensorflow_datasets/tf_flowers/3.0.1...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/1 [00:00<?, ? splits/s]

Generating train examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/tf_flowers/incomplete.4WMWKP_3.0.1/tf_flowers-train.tfrecord*...:   0%|   …

Dataset tf_flowers downloaded and prepared to /root/tensorflow_datasets/tf_flowers/3.0.1. Subsequent calls will reuse this data.


In [13]:
import tensorflow as tf
import tensorflow_hub as hub

# TF Hub handle of the MobileNetV2 feature-vector model (a smaller backbone).
# NOTE: despite the `bit_` prefix in the variable name, this is MobileNetV2.
bit_model_url = "https://tfhub.dev/google/imagenet/mobilenet_v2_100_224/feature_vector/5"

# trainable=False freezes the hub layer, so only layers added on top of it
# are updated during training.
feature_extractor = hub.KerasLayer(
    bit_model_url,
    input_shape=image_size + (3,),
    trainable=False,
)

# Plain-function adapter around the hub layer so it can be dropped into a
# tf.keras.Sequential stack through a Lambda layer.
def feature_extractor_model(inputs):
    """Run `inputs` through the module-level `feature_extractor` and return its output."""
    features = feature_extractor(inputs)
    return features

# Size the classification head from the dataset metadata instead of a literal 5.
num_classes = dataset_info.features['label'].num_classes

# Frozen feature extractor + a single softmax layer on top.
# The input shape is declared with an explicit Input object: passing
# `input_shape=` to the first layer makes Keras emit a "do not pass an
# input_shape argument to a layer" warning.
tf_model = tf.keras.Sequential([
    tf.keras.Input(shape=image_size + (3,)),
    tf.keras.layers.Lambda(feature_extractor_model),  # hub layer wrapped as a Lambda
    tf.keras.layers.Dense(num_classes, activation='softmax'),
])

# Labels are integer class ids, hence the *sparse* categorical cross-entropy.
tf_model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

  super().__init__(**kwargs)


In [14]:
# Train the Keras classifier. It is named `tf_model` because `model` already
# refers to the CLIP network loaded earlier in the notebook.
history = tf_model.fit(
    train_batches,
    validation_data=val_batches,
    epochs=5,
)

Epoch 1/5
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m133s[0m 1s/step - accuracy: 0.5954 - loss: 1.0619 - val_accuracy: 0.8692 - val_loss: 0.4274
Epoch 2/5
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 1s/step - accuracy: 0.8870 - loss: 0.3658 - val_accuracy: 0.8869 - val_loss: 0.3502
Epoch 3/5
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m139s[0m 1s/step - accuracy: 0.9066 - loss: 0.3001 - val_accuracy: 0.9046 - val_loss: 0.2990
Epoch 4/5
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m134s[0m 1s/step - accuracy: 0.9246 - loss: 0.2454 - val_accuracy: 0.9046 - val_loss: 0.2889
Epoch 5/5
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 1s/step - accuracy: 0.9399 - loss: 0.2088 - val_accuracy: 0.9005 - val_loss: 0.2824
