Task 1: Download the Flickr8k dataset.

You will get it from Kaggle.

After downloading, check that you have about 8,000 images and about 40,000 captions.

In [1]:
import os
from kaggle.api.kaggle_api_extended import KaggleApi

# Create the Kaggle client and authenticate it
api = KaggleApi()
api.authenticate()

# Local folder that receives the extracted dataset
output_dir = 'flickr8k_data'
os.makedirs(output_dir, exist_ok=True)

# Dataset identifier as it appears in the Kaggle URL
dataset = 'adityajn105/flickr8k'

# Marker file: if it is present, the archive is treated as already extracted
expected_file = os.path.join(output_dir, 'captions.txt')

if os.path.exists(expected_file):
    print(f"✅ Dataset already exists. Skipping download. Located at: {output_dir}")
else:
    print("Dataset not found locally. Downloading from Kaggle...")
    api.dataset_download_files(dataset, path=output_dir, unzip=True)
    print(f"✅ Download complete! Files saved to: {output_dir}")

✅ Dataset already exists. Skipping download. Located at: flickr8k_data


**Task 2: Dump and save CLIP embeddings as .pt**

Sample Code:

```
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch
```

Load the model and processor once:
```
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

def get_image_clip_embedding(image_path):
    """Returns the CLIP embedding for an image."""
    image = Image.open(image_path).convert("RGB")
    inputs = processor(images=image, return_tensors="pt")
    with torch.no_grad():
        outputs = model.get_image_features(**inputs)
    return outputs.squeeze().numpy()

def get_text_clip_embedding(text):
    """Returns the CLIP embedding for a text string."""
    inputs = processor(text=[text], return_tensors="pt")
    with torch.no_grad():
        outputs = model.get_text_features(**inputs)
    return outputs.squeeze().numpy()
```

In [2]:
!pip install transformers
!pip install torch pillow



In [4]:
import csv
import os

import torch
from PIL import Image
from tqdm import tqdm
from transformers import CLIPProcessor, CLIPModel

In [5]:
# Load the pretrained CLIP model and its matching processor from the same checkpoint
clip_checkpoint = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(clip_checkpoint)
processor = CLIPProcessor.from_pretrained(clip_checkpoint)

def get_image_clip_embedding(image_path):
    """Return the CLIP embedding for the image stored at ``image_path``.

    Uses the module-level ``model`` and ``processor``.

    Args:
        image_path: Path of an image file that PIL can open.

    Returns:
        numpy.ndarray: The output of ``model.get_image_features`` for this
        single image, with size-1 dimensions squeezed away.
    """
    # The context manager releases the file handle as soon as the pixels have
    # been converted, instead of leaving that to garbage collection.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")
    # Keep the inputs on the model's device so the call also works when the
    # model has been moved off the CPU.
    inputs = processor(images=image, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.get_image_features(**inputs)
    # .cpu() is a no-op for CPU tensors and is required before .numpy() otherwise.
    return outputs.squeeze().cpu().numpy()

def get_text_clip_embedding(text):
    """Return the CLIP embedding for a single text string.

    Uses the module-level ``model`` and ``processor``.

    Args:
        text: The string to embed.

    Returns:
        numpy.ndarray: The output of ``model.get_text_features`` for this
        single string, with size-1 dimensions squeezed away.
    """
    # truncation=True clips over-long text to the tokenizer's maximum length
    # rather than letting the model fail on it; shorter text is unaffected.
    inputs = processor(text=[text], return_tensors="pt", truncation=True).to(model.device)
    with torch.no_grad():
        outputs = model.get_text_features(**inputs)
    # .cpu() is a no-op for CPU tensors and is required before .numpy() otherwise.
    return outputs.squeeze().cpu().numpy()

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [None]:
# Dataset locations inside the download folder (built with os.path.join, like
# the marker-file path in the download cell)
flickr8k_images_path = os.path.join(output_dir, "Images")
flickr8k_captions_path = os.path.join(output_dir, "captions.txt")

# Load captions
def load_captions(captions_file):
    """Load the Flickr8k captions file into a mapping of image name -> captions.

    The file is parsed as CSV: the first row is skipped as a header and every
    following row holds an image file name followed by one caption. Quoted
    captions are unquoted by the CSV parser; an unquoted caption that itself
    contains commas is re-joined so no text is lost. Blank rows are ignored.

    Args:
        captions_file: Path to the captions file.

    Returns:
        dict: Image file name -> list of caption strings, in file order.

    Raises:
        ValueError: If a non-blank row has no caption field.
    """
    captions = {}
    # newline="" hands line handling to the csv module; the explicit encoding
    # avoids depending on the platform default.
    with open(captions_file, "r", encoding="utf-8", newline="") as file:
        reader = csv.reader(file)
        next(reader, None)  # Skip the header (an empty file yields no rows)
        for row in reader:
            if not any(field.strip() for field in row):
                continue  # blank or whitespace-only line
            if len(row) < 2:
                raise ValueError(
                    f"Line {reader.line_num} of {captions_file!r} has no caption: {row!r}"
                )
            image_name = row[0].strip()
            # Re-join in case an unquoted caption was split on its own commas.
            caption = ",".join(row[1:]).strip()
            captions.setdefault(image_name, []).append(caption)
    return captions

# Generate embeddings
def generate_embeddings(images_path, captions, output_path="./embeddings"):
    """Embed every image and its captions with CLIP, saving one .pt file each.

    For an image file ``<name>.<ext>`` the image embedding is written to
    ``<output_path>/<name>.pt`` and its captions to
    ``<output_path>/<name>_cap_<k>.pt``, with ``k`` starting at 1. Entries
    whose image file does not exist under ``images_path`` are skipped and
    reported at the end.

    Args:
        images_path: Directory containing the image files.
        captions: Mapping of image file name -> list of caption strings.
        output_path: Directory for the .pt files; created if missing.

    Returns:
        None. Results are written to disk.
    """
    os.makedirs(output_path, exist_ok=True)

    missing_images = []
    for image_name, captions_list in tqdm(captions.items(), desc="Processing images"):
        image_path = os.path.join(images_path, image_name)
        if not os.path.exists(image_path):
            # Remember the gap so it can be reported instead of vanishing silently.
            missing_images.append(image_name)
            continue

        base_name = os.path.splitext(image_name)[0]

        # NOTE(review): the embedding helpers return NumPy arrays, so torch.save
        # pickles an ndarray rather than a tensor. Recent torch.load defaults
        # (weights_only=True) may refuse such files -- confirm how they are
        # loaded downstream before changing the saved type.
        image_embedding = get_image_clip_embedding(image_path)
        image_embedding_path = os.path.join(output_path, f"{base_name}.pt")
        torch.save(image_embedding, image_embedding_path)

        # One file per caption, numbered from 1
        for idx, caption in enumerate(captions_list, start=1):
            text_embedding = get_text_clip_embedding(caption)
            caption_embedding_path = os.path.join(output_path, f"{base_name}_cap_{idx}.pt")
            torch.save(text_embedding, caption_embedding_path)

    print(f"All embeddings saved to '{output_path}'")
    if missing_images:
        print(
            f"Warning: skipped {len(missing_images)} image(s) not found in "
            f"'{images_path}' (first missing: {missing_images[0]})"
        )

# Parse the captions file, then embed every image with its captions and write the .pt files
captions = load_captions(flickr8k_captions_path)
generate_embeddings(images_path=flickr8k_images_path, captions=captions)

Processing images:   9%|▉         | 764/8091 [01:23<13:19,  9.17it/s]