## Image Analysis (Vision AI)

### Option 1 - CLIP

In [1]:
from transformers import CLIPProcessor, CLIPModel
import torch
import requests
from PIL import Image
from io import BytesIO
import torch.nn.functional as F

#### Load the Pretrained Model

In [2]:
# Repeats the import from the first cell, so this cell does not depend on it.
from transformers import CLIPProcessor, CLIPModel

# Hugging Face Hub checkpoint id; the model and its processor are loaded from the same id.
model_name = "openai/clip-vit-large-patch14-336"
# The recorded cell output shows the checkpoint files being downloaded by these calls.
model = CLIPModel.from_pretrained(model_name)
# The processor is used later both for the text labels and for the images.
processor = CLIPProcessor.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/4.76k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.50, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/844 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

#### Define Categories (Text Labels)

In [4]:
# Candidate labels: each image is later assigned the label whose text
# embedding is most similar to the image embedding.
categories = ["cakes", "pancakes", "glass of beer", "not recognize"]

# Tokenize the labels; padding=True pads them so they form a single batch.
text_inputs = processor(text=categories, return_tensors="pt", padding=True)

# Inference only: disable autograd so the embeddings do not carry a
# computation graph (saves memory, consistent with the ResNet section below).
with torch.no_grad():
    text_features = model.get_text_features(**text_inputs)

# L2-normalize so a dot product with another unit vector is a cosine similarity.
text_features = F.normalize(text_features, p=2, dim=-1)

#### Load Images from URLs

In [5]:
def load_images_from_urls(image_urls):
    """Download and decode images from a list of URLs.

    Each URL is fetched with a 5-second timeout and decoded as an RGB image.
    A URL that cannot be downloaded *or* whose payload cannot be decoded as an
    image is reported on stdout and skipped, so the returned list may be
    shorter than ``image_urls``.

    Args:
        image_urls: Iterable of image URL strings.

    Returns:
        List of RGB images, in the order of the URLs that loaded successfully.
    """
    images = []
    for url in image_urls:
        try:
            response = requests.get(url, timeout=5)
            response.raise_for_status()
            img = Image.open(BytesIO(response.content)).convert("RGB")
            images.append(img)
        # OSError covers Pillow decode failures (e.g. an HTML error page served
        # with status 200), which previously aborted the whole loop.
        except (requests.exceptions.RequestException, OSError) as e:
            print(f"Error loading {url}: {e}")
    return images


In [6]:
# Sample photos to classify, hosted on TripAdvisor and Google user-content CDNs.
# NOTE(review): these are remote URLs and may stop resolving; the loader skips
# any URL it cannot load, which shifts the image numbering in the predictions.
image_urls =  [
    "https://dynamic-media-cdn.tripadvisor.com/media/photo-o/16/a3/95/6d/photo1jpg.jpg",
    "https://dynamic-media-cdn.tripadvisor.com/media/photo-o/0f/31/fd/bb/photo0jpg.jpg",
    "https://dynamic-media-cdn.tripadvisor.com/media/photo-o/07/eb/18/fc/gateaux-bakery.jpg",
    "https://dynamic-media-cdn.tripadvisor.com/media/photo-o/2d/12/a0/b8/caption.jpg",
    "https://lh3.googleusercontent.com/geougc-cs/AIHozJKWOKOWjvG3ZJvXhnZXCdPWx1v92oJfxR5nn3eoNY4nRSnWlvzo6rqWYQ8yBbibRYLR8nZJl_IOZK_khqH08cZtLjksyfcRkspU4P7uZDIl-6_RWj4fu-2lKMGECpu0TZUsP8M",
    "https://lh3.googleusercontent.com/geougc-cs/AIHozJI0Y0LLu7CJ289smZRbAE0gz5TTvmnYXKST8gMZGgWHo5hSs5gheRAzk9xdPwKcGH_8GApIbVziGKKY6wIHGVmPhsh2e0UkiRUPvyLVApsFdzcYQxaF-0dbJRGG5ipwM1CkjHyc"
    ]

In [7]:

# Download every URL; entries that fail to load are skipped by the loader.
images = load_images_from_urls(image_urls)

# Fail fast with an actionable message rather than handing an empty batch
# to the processor when every download failed.
if not images:
    raise RuntimeError("No images could be loaded; check the URLs and network access.")

# Predictions are numbered by position in `images`; if some URLs were skipped,
# that numbering no longer lines up with `image_urls`, so say so explicitly.
if len(images) != len(image_urls):
    print(
        f"Warning: loaded {len(images)} of {len(image_urls)} images; "
        "image numbers below refer to successfully loaded images only."
    )

# Preprocess images as a batch
image_inputs = processor(images=images, return_tensors="pt")

#### Extract Image Features for the Batch

In [8]:
# Extract image features for all images.
# Inference only: disable autograd so no computation graph is built for the
# whole batch (saves memory, consistent with the ResNet section below).
with torch.no_grad():
    image_features = model.get_image_features(**image_inputs)

# L2-normalize so a dot product with the normalized text features is a cosine similarity.
image_features = F.normalize(image_features, p=2, dim=-1)


#### Compute Similarity and Get Predictions

In [10]:
# Both feature matrices were L2-normalized above, so this matrix product is a
# table of cosine similarities: one row per image, one column per category.
similarities = torch.mm(image_features, text_features.T)

# For every image (row), the column index holding the largest similarity.
best_match_indices = similarities.argmax(dim=1)

# Report the winning label per image, numbering images from 1.
for image_number, label_index in enumerate(best_match_indices.tolist(), start=1):
    print(f"Image {image_number}: Predicted category -> {categories[label_index]}")


Image 1: Predicted category -> cakes
Image 2: Predicted category -> pancakes
Image 3: Predicted category -> cakes
Image 4: Predicted category -> cakes
Image 5: Predicted category -> glass of beer
Image 6: Predicted category -> pancakes


---

---

### Option 2 - Batch Classification with ResNet

#### Load Pretrained ResNet Model

In [1]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
from torchvision import models
from PIL import Image
import requests
from io import BytesIO
from tqdm import tqdm

# Load ResNet-50 with ImageNet weights. The explicit `weights=` enum replaces the
# deprecated `pretrained=True` flag; IMAGENET1K_V1 is the checkpoint that flag
# selected (resnet50-0676ba61.pth, as shown in the download log of this cell).
model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)  # You can use resnet18, resnet34, etc.
model.fc = nn.Identity()  # Remove the final classification layer to get feature vectors
model.eval()  # Set to evaluation mode


Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:01<00:00, 82.4MB/s]


ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

#### Define Image Transformations

In [2]:
# Per-channel mean / std handed to Normalize (the values torchvision documents
# for its ImageNet-pretrained models), in RGB channel order.
_CHANNEL_MEAN = [0.485, 0.456, 0.406]
_CHANNEL_STD = [0.229, 0.224, 0.225]

# Preprocessing steps, applied in order to each image:
#   1. resize to a fixed 224x224 (aspect ratio is not preserved),
#   2. convert the image to a tensor,
#   3. normalize each channel with the statistics above.
_preprocessing_steps = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=_CHANNEL_MEAN, std=_CHANNEL_STD),
]
transform = transforms.Compose(_preprocessing_steps)


#### Load and Preprocess Multiple Images

In [3]:
def load_images_from_urls(image_urls):
    """Download, decode and preprocess images into one batch tensor.

    Each URL is fetched with a 5-second timeout, decoded as RGB and passed
    through the module-level ``transform``. A URL that cannot be downloaded
    *or* whose payload cannot be decoded as an image is reported on stdout and
    skipped, so the batch may hold fewer rows than ``image_urls`` has entries.

    Args:
        image_urls: Iterable of image URL strings.

    Returns:
        Tensor stacking the transformed images along a new first dimension.

    Raises:
        RuntimeError: If no image could be loaded at all.
    """
    images = []
    for url in tqdm(image_urls, desc="Downloading images"):
        try:
            response = requests.get(url, timeout=5)
            response.raise_for_status()
            img = Image.open(BytesIO(response.content)).convert("RGB")
            images.append(transform(img))
        # OSError covers Pillow decode failures (e.g. an HTML error page served
        # with status 200), which previously aborted the whole loop.
        except (requests.exceptions.RequestException, OSError) as e:
            print(f"Error loading {url}: {e}")
    # torch.stack rejects an empty list with an opaque message; raise the same
    # exception type with a message that points at the real cause.
    if not images:
        raise RuntimeError("No images could be loaded from the given URLs.")
    return torch.stack(images)  # Stack tensors into a batch

# Image URLs
# Sample photos hosted on TripAdvisor and Google user-content CDNs.
# NOTE(review): these are remote URLs and may stop resolving; the loader skips
# any URL it cannot load, so the batch can end up smaller than this list.
image_urls =  [
    "https://dynamic-media-cdn.tripadvisor.com/media/photo-o/16/a3/95/6d/photo1jpg.jpg",
    "https://dynamic-media-cdn.tripadvisor.com/media/photo-o/0f/31/fd/bb/photo0jpg.jpg",
    "https://dynamic-media-cdn.tripadvisor.com/media/photo-o/07/eb/18/fc/gateaux-bakery.jpg",
    "https://dynamic-media-cdn.tripadvisor.com/media/photo-o/2d/12/a0/b8/caption.jpg",
    "https://lh3.googleusercontent.com/geougc-cs/AIHozJKWOKOWjvG3ZJvXhnZXCdPWx1v92oJfxR5nn3eoNY4nRSnWlvzo6rqWYQ8yBbibRYLR8nZJl_IOZK_khqH08cZtLjksyfcRkspU4P7uZDIl-6_RWj4fu-2lKMGECpu0TZUsP8M",
    "https://lh3.googleusercontent.com/geougc-cs/AIHozJI0Y0LLu7CJ289smZRbAE0gz5TTvmnYXKST8gMZGgWHo5hSs5gheRAzk9xdPwKcGH_8GApIbVziGKKY6wIHGVmPhsh2e0UkiRUPvyLVApsFdzcYQxaF-0dbJRGG5ipwM1CkjHyc",
    "https://lh3.googleusercontent.com/geougc-cs/AIHozJI71fSCXV-F5nQa5AC8lgL0mGQpWu00uRaaFupj87asyjnoLHi8DqoTLtWpHbSq2VhfYYD_8iDbgzYRqL56N6LJcG4tJTfqGeA6ZpGmNaozk5t0UdaARqYg0tY5r_f4iWDsXx1l",
    "https://dynamic-media-cdn.tripadvisor.com/media/photo-o/0e/27/6d/50/img-20170109-wa0000-largejpg.jpg",
    "https://lh3.googleusercontent.com/geougc-cs/AIHozJI8YtwHy1nzEkCExD8SUA_S478P1IDPKP7Vu3-8tWECLEQvcuwOq_m7BPMRyCo-9cN8qzR5hITf50aAEehKR-VYF5VRd7oqCDXY5oaeoAATBpjLB3Js4h4YaMAPMyHZxPjWFu-gYw"
    ]

# Load and preprocess images
# The loader applies `transform` to each image and stacks the results into one tensor.
image_batch = load_images_from_urls(image_urls)


Downloading images: 100%|██████████| 9/9 [00:03<00:00,  2.64it/s]


#### Extract Features with ResNet

In [6]:
# Pass images through ResNet
# The final `fc` layer was replaced with nn.Identity, so the model returns the
# feature vectors that would otherwise feed the classifier head.
# Gradients are disabled because the network is only used for inference here.
with torch.no_grad():
    features = model(image_batch)  # Shape: (batch_size, 2048)


#### Classify Images using a Custom Classifier

In [14]:
import torch.nn.functional as F

# Placeholder classifier: one random vector per category stands in for trained
# category embeddings, so the predictions printed below carry no real meaning
# until these vectors are replaced with learned ones.
# NOTE(review): "egges" looks like a typo for "eggs"; left unchanged because it is a runtime label.
categories = ["cake", "cupcakes", "beer", "coffee", "not recognize", "egges"]

# Seed a dedicated generator so the placeholder vectors, and therefore the
# printed predictions, are identical on every run.
CATEGORY_SEED = 0
generator = torch.Generator().manual_seed(CATEGORY_SEED)

# Take the embedding width from the extracted features instead of hard-coding
# 2048, so switching to a ResNet variant with a different feature size
# (e.g. resnet18) does not break the matrix product below.
category_vectors = torch.randn(len(categories), features.shape[-1], generator=generator)  # Fake embeddings, replace with trained ones

# Compute cosine similarity between images and categories:
# with both sides L2-normalized, the matrix product yields cosine similarities.
features = F.normalize(features, p=2, dim=-1)
category_vectors = F.normalize(category_vectors, p=2, dim=-1)
similarities = features @ category_vectors.T  # Shape: (batch_size, num_categories)

# Get the best match for each image
best_match_indices = similarities.argmax(dim=1)

# Print results, numbering images from 1; .tolist() yields plain ints for list indexing.
for image_number, label_index in enumerate(best_match_indices.tolist(), start=1):
    print(f"Image {image_number}: Predicted category -> {categories[label_index]}")


Image 1: Predicted category -> beer
Image 2: Predicted category -> cake
Image 3: Predicted category -> cake
Image 4: Predicted category -> beer
Image 5: Predicted category -> cake
Image 6: Predicted category -> beer
Image 7: Predicted category -> cake
Image 8: Predicted category -> beer
Image 9: Predicted category -> beer
