<a href="https://colab.research.google.com/github/gantenandini/projects/blob/main/Nandini_clip_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets torch torchvision

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_

In [None]:
import torch
from transformers import CLIPProcessor, CLIPModel
from datasets import load_dataset
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm import tqdm

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


In [None]:

# Image/caption pairs used for fine-tuning (only the 'train' split is loaded).
dataset = load_dataset('zmao/food_img_caption_small', split='train')

# Tokenizer + image preprocessor for the same checkpoint that is loaded as `model`.
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")


def preprocess_data(examples):
    """Convert a batch of raw examples into CLIP model inputs.

    Args:
        examples: Batch handed over by ``dataset.map(..., batched=True)``; its
            'text' and 'image' columns are forwarded to the processor.

    Returns:
        The processor output for the batch. The columns consumed later are
        'input_ids', 'attention_mask' and 'pixel_values'.
    """
    return processor(
        text=examples['text'],
        images=examples['image'],
        return_tensors="pt",
        # Pad every caption to the tokenizer's fixed maximum length instead of the
        # longest caption in the current map() batch. With padding=True, rows coming
        # from different map() batches can have different lengths, and the DataLoader's
        # default collation cannot stack them. The attention mask marks the padding.
        padding="max_length",
        truncation=True,
    )


dataset = dataset.map(preprocess_data, batched=True)
# Expose only the model inputs, as torch tensors, when the dataset is indexed.
dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'pixel_values'])

# Shuffled mini-batches of 16 image/caption pairs.
train_loader = DataLoader(dataset, batch_size=16, shuffle=True)


In [None]:
# Load the pretrained CLIP checkpoint and move its weights to the selected device.
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
# Switch to training mode; as the last expression of the cell this also displays the module tree.
model.train()


CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), eps=1e-05,

In [None]:
# AdamW over all parameters of `model`, with a fixed learning rate of 5e-5.
optimizer = AdamW(params=model.parameters(), lr=5e-5)

In [None]:
import torch.nn.functional as F

# Fine-tune CLIP with the symmetric contrastive objective: within a batch, image i
# should score highest against caption i, and caption i against image i.
num_epochs = 7
for epoch in range(num_epochs):
    # Set training mode explicitly so this cell does not depend on the mode another
    # cell happened to leave the model in (e.g. after an evaluation run).
    model.train()

    # The epoch label is constant for the whole epoch, so set it once on the bar.
    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}", leave=True)
    for batch in loop:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        pixel_values = batch['pixel_values'].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, pixel_values=pixel_values)

        # Image-to-text and text-to-image similarity logits
        logits_per_image = outputs.logits_per_image
        logits_per_text = outputs.logits_per_text

        # Matching pairs sit on the diagonal, so the target class of row i is i.
        # Sized from the logits so a smaller final batch is handled as well.
        labels = torch.arange(logits_per_image.size(0), device=device)

        # Contrastive loss: average of the two cross-entropy directions
        loss_image = F.cross_entropy(logits_per_image, labels)
        loss_text = F.cross_entropy(logits_per_text, labels)
        loss = (loss_image + loss_text) / 2

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # Show the loss of the current batch on the progress bar
        loop.set_postfix(loss=loss.item())


Epoch 1: 100%|██████████| 51/51 [00:21<00:00,  2.40it/s, loss=0.0482]
Epoch 2: 100%|██████████| 51/51 [00:20<00:00,  2.44it/s, loss=0.133]
Epoch 3: 100%|██████████| 51/51 [00:21<00:00,  2.41it/s, loss=0.241]
Epoch 4: 100%|██████████| 51/51 [00:21<00:00,  2.34it/s, loss=0.0102]
Epoch 5: 100%|██████████| 51/51 [00:20<00:00,  2.49it/s, loss=0.0721]
Epoch 6: 100%|██████████| 51/51 [00:21<00:00,  2.39it/s, loss=0.0134]
Epoch 7: 100%|██████████| 51/51 [00:21<00:00,  2.39it/s, loss=0.0245]


In [None]:
def evaluate_model(data_loader):
    """Compute in-batch image-to-text matching accuracy of the global ``model``.

    For every batch, image i counts as correct when its highest-scoring caption
    within that same batch is caption i (matching pairs sit on the diagonal of
    ``logits_per_image``). Candidates are limited to the batch, so the score
    depends on the batch size and on which samples share a batch.

    The model is switched to eval mode for the duration of the call and its
    previous train/eval mode is restored afterwards.

    Args:
        data_loader: Iterable of batches, each a mapping holding 'input_ids',
            'attention_mask' and 'pixel_values' tensors.

    Returns:
        Fraction of correctly matched images, in [0, 1]. Returns 0.0 when the
        loader yields no samples.
    """
    correct = 0
    total = 0

    # Remember the current mode so a later training cell is not silently affected.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for batch in data_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                pixel_values = batch['pixel_values'].to(device)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask, pixel_values=pixel_values)
                logits_per_image = outputs.logits_per_image

                # Create labels (positive pairs are along the diagonal)
                labels = torch.arange(logits_per_image.size(0), device=device)

                # Index of the best-scoring caption for each image
                preds = logits_per_image.argmax(dim=1)

                # Compare predictions to labels
                correct += (preds == labels).sum().item()
                total += labels.size(0)
    finally:
        model.train(was_training)

    # An empty loader would otherwise raise ZeroDivisionError.
    if total == 0:
        return 0.0
    return correct / total


In [None]:
# Score the model on the same loader it was trained on and report the result as a percentage.
accuracy = evaluate_model(data_loader=train_loader)
print(f"Training Accuracy: {accuracy * 100:.2f}%")


Training Accuracy: 96.06%


In [None]:
from PIL import Image
import torch

# Load the local image
image = Image.open("/content/teaching")
# Candidate text descriptions to score against the image
descriptions = [
    "A boy is playing with ball",
    "A plate of spaghetti",
    "A teacher is teaching students",
    "A boy is eating fruits and vegetables"
]

# Reuse the `processor` and the fine-tuned `model` from the earlier cells.
# Calling CLIPModel.from_pretrained(...) again here would rebind `model` to the
# original checkpoint and silently discard the weights updated by the training loop.
model.eval()  # the training cells leave the model in train mode

# Preprocess text and image
inputs = processor(text=descriptions, images=image, return_tensors="pt", padding=True)
inputs = {k: v.to(device) for k, v in inputs.items()}

# Model inference
with torch.no_grad():
    outputs = model(**inputs)

# logits_per_text has one row per description and one column per image; with a single
# image, softmax over dim 0 normalises the scores across the descriptions.
logits_per_text = outputs.logits_per_text
probs = logits_per_text.softmax(dim=0)

# Print probabilities for each description
for i, prob in enumerate(probs):
    print(f"Description {i+1}: {descriptions[i]} - Probability: {prob.item()*100:.2f}%")


Description 1: A boy is playing with ball - Probability: 0.00%
Description 2: A plate of spaghetti - Probability: 0.00%
Description 3: A teacher is teaching students - Probability: 99.99%
Description 4: A boy is eating fruits and vegetables - Probability: 0.00%


In [None]:
from PIL import Image
import torch

# Load the local image
image = Image.open("/content/boy-ball.jpg")
# Candidate text descriptions to score against the image
descriptions = [
    "A boy is playing with ball",
    "A plate of spaghetti",
    "A teacher is teaching students",
    "A boy is eating fruits and vegetables"
]

# Reuse the `processor` and the fine-tuned `model` from the earlier cells.
# Calling CLIPModel.from_pretrained(...) again here would rebind `model` to the
# original checkpoint and silently discard the weights updated by the training loop.
model.eval()  # the training cells leave the model in train mode

# Preprocess text and image
inputs = processor(text=descriptions, images=image, return_tensors="pt", padding=True)
inputs = {k: v.to(device) for k, v in inputs.items()}

# Model inference
with torch.no_grad():
    outputs = model(**inputs)

# logits_per_text has one row per description and one column per image; with a single
# image, softmax over dim 0 normalises the scores across the descriptions.
logits_per_text = outputs.logits_per_text
probs = logits_per_text.softmax(dim=0)

# Print probabilities for each description
for i, prob in enumerate(probs):
    print(f"Description {i+1}: {descriptions[i]} - Probability: {prob.item()*100:.2f}%")


Description 1: A boy is playing with ball - Probability: 99.89%
Description 2: A plate of spaghetti - Probability: 0.00%
Description 3: A teacher is teaching students - Probability: 0.02%
Description 4: A boy is eating fruits and vegetables - Probability: 0.09%


In [None]:
from PIL import Image
import torch

# Load the local image
image = Image.open("/content/fruits.jpg")
# Candidate text descriptions to score against the image
descriptions = [
    "A boy is playing with ball",
    "A plate of spaghetti",
    "The picture contains various vegetables like carrots, potatos",
    "the picture contains only vegetabels",
    "the picture contains different types of fruits"
]

# Reuse the `processor` and the fine-tuned `model` from the earlier cells.
# Calling CLIPModel.from_pretrained(...) again here would rebind `model` to the
# original checkpoint and silently discard the weights updated by the training loop.
model.eval()  # the training cells leave the model in train mode

# Preprocess text and image
inputs = processor(text=descriptions, images=image, return_tensors="pt", padding=True)
inputs = {k: v.to(device) for k, v in inputs.items()}

# Model inference
with torch.no_grad():
    outputs = model(**inputs)

# logits_per_text has one row per description and one column per image; with a single
# image, softmax over dim 0 normalises the scores across the descriptions.
logits_per_text = outputs.logits_per_text
probs = logits_per_text.softmax(dim=0)

# Print probabilities for each description
for i, prob in enumerate(probs):
    print(f"Description {i+1}: {descriptions[i]} - Probability: {prob.item()*100:.2f}%")


Description 1: A boy is playing with ball - Probability: 0.00%
Description 2: A plate of spaghetti - Probability: 0.00%
Description 3: The picture contains various vegetables like carrots, potatos - Probability: 1.89%
Description 4: the picture contains only vegetabels - Probability: 0.17%
Description 5: the picture contains different types of fruits - Probability: 97.94%
