# Download the Flickr8k dataset from Kaggle

In [2]:
!pip install kaggle
!pip install kagglehub

Collecting kaggle
  Obtaining dependency information for kaggle from https://files.pythonhosted.org/packages/14/83/7f29c7abe0d5dc769dad7da993382c3e4239ad63e1dd58414d129e0a4da2/kaggle-1.7.4.5-py3-none-any.whl.metadata
  Downloading kaggle-1.7.4.5-py3-none-any.whl.metadata (16 kB)
Collecting python-slugify (from kaggle)
  Obtaining dependency information for python-slugify from https://files.pythonhosted.org/packages/a4/62/02da182e544a51a5c3ccf4b03ab79df279f9c60c5e82d5e8bec7ca26ac11/python_slugify-8.0.4-py2.py3-none-any.whl.metadata
  Downloading python_slugify-8.0.4-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting text-unidecode (from kaggle)
  Obtaining dependency information for text-unidecode from https://files.pythonhosted.org/packages/a6/a5/c0b6468d3824fe3fde30dbb5e1f687b291608f9473681bbf7dabbf5a87d7/text_unidecode-1.3-py2.py3-none-any.whl.metadata
  Downloading text_unidecode-1.3-py2.py3-none-any.whl.metadata (2.4 kB)
Downloading kaggle-1.7.4.5-py3-none-any.whl (181 kB)

In [6]:
import kagglehub
# Authenticate with the Kaggle API and download the Flickr8k and Flickr30K datasets
import os
from kaggle.api.kaggle_api_extended import KaggleApi

# Initialize Kaggle API
api = KaggleApi()
api.authenticate()


def download_kaggle_dataset(dataset, download_path, label):
    """Download a Kaggle dataset and unzip it into ``download_path``.

    Args:
        dataset: Kaggle dataset identifier in ``owner/name`` form.
        download_path: Local directory that receives the extracted files;
            it is created if it does not exist yet.
        label: Human-readable dataset name used in the confirmation message.
    """
    # exist_ok=True replaces the separate exists() check and is safe on re-runs.
    os.makedirs(download_path, exist_ok=True)
    api.dataset_download_files(dataset, path=download_path, unzip=True)
    print(f"{label} dataset downloaded and extracted to:", download_path)


# Flickr8k: dataset identifier and download path
dataset_flickr8k = "adityajn105/flickr8k"
download_path_flickr8k = "./flickr8k"
download_kaggle_dataset(dataset_flickr8k, download_path_flickr8k, "Flickr8k")

# Flickr30K: dataset identifier and download path
dataset_flickr30k = "hsankesara/flickr-image-dataset"
download_path_flickr30k = "./flickr30k"
download_kaggle_dataset(dataset_flickr30k, download_path_flickr30k, "Flickr30K")

Dataset URL: https://www.kaggle.com/datasets/adityajn105/flickr8k
Flickr8k dataset downloaded and extracted to: ./flickr8k
Dataset URL: https://www.kaggle.com/datasets/hsankesara/flickr-image-dataset
Flickr30K dataset downloaded and extracted to: ./flickr30k


# Generate embeddings for images in the Flickr8k dataset using a pretrained CLIP model

In [7]:
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch

# Load the pretrained CLIP checkpoint a single time; the embedding helpers
# defined in this cell reuse these module-level objects on every call.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"
processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)
model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)

def get_image_clip_embedding(image_path):
    """Return the CLIP embedding for the image stored at ``image_path``.

    Args:
        image_path: Path to an image file that PIL can open.

    Returns:
        NumPy array obtained by squeezing the output of
        ``model.get_image_features`` for this single image.
    """
    # Open the file in a context manager so its handle is released right away;
    # convert() yields an independent RGB copy that stays valid afterwards.
    with Image.open(image_path) as source_image:
        image = source_image.convert("RGB")
    inputs = processor(images=image, return_tensors="pt")
    # Inference only, so skip gradient tracking.
    with torch.no_grad():
        outputs = model.get_image_features(**inputs)
    return outputs.squeeze().numpy()

def get_text_clip_embedding(text):
    """Return the CLIP embedding for a text string.

    Args:
        text: The caption or query string to embed.

    Returns:
        NumPy array obtained by squeezing the output of
        ``model.get_text_features`` for this single string.
    """
    # truncation=True clips over-long captions to the tokenizer's maximum
    # length instead of letting the text encoder fail on them; padding=True
    # is a no-op for a single string but keeps the call batch-safe.
    inputs = processor(text=[text], return_tensors="pt", padding=True, truncation=True)
    # Inference only, so skip gradient tracking.
    with torch.no_grad():
        outputs = model.get_text_features(**inputs)
    return outputs.squeeze().numpy()


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [13]:
# Load the Kaggle Flickr8k dataset and use the functions above to generate embeddings for each image and its captions

import os
import json
from tqdm import tqdm

# Locations of the extracted Flickr8k images and caption file,
# both derived from the dataset's download directory.
flickr8k_root = "./flickr8k"
flickr8k_captions_path = f"{flickr8k_root}/captions.txt"
flickr8k_images_path = f"{flickr8k_root}/Images"

# Load captions
def load_captions(captions_file):
    """Load captions from the Flickr8k captions file.

    The first line of the file is treated as a header and skipped. Every
    other line is split at its first comma into an image name and a caption.
    Lines without a comma (for example blank lines) are ignored.

    Args:
        captions_file: Path to the captions text file.

    Returns:
        Dict mapping each image name to the list of its captions, in file order.
    """
    captions = {}
    # Read as UTF-8 explicitly so parsing does not depend on the platform default.
    with open(captions_file, "r", encoding="utf-8") as file:
        # Skip the header; the default keeps an empty file from raising StopIteration.
        next(file, None)
        for line in file:
            # Split on the first comma only, because captions may contain commas.
            image_name, separator, caption = line.strip().partition(",")
            if not separator:
                # Blank or malformed line: nothing to record.
                continue
            captions.setdefault(image_name, []).append(caption)
    return captions

# Generate embeddings
def generate_embeddings(images_path, captions, output_path="./embeddings.json"):
    """Generate and save embeddings for images and their captions.

    For every image name in ``captions`` whose file exists under
    ``images_path``, computes one image embedding and one text embedding per
    caption, then writes all results to ``output_path`` as JSON. Images whose
    file is missing are skipped and reported in a summary message.

    Args:
        images_path: Directory containing the image files.
        captions: Dict mapping image file names to lists of caption strings.
        output_path: Destination JSON file; defaults to ``./embeddings.json``.
    """
    embeddings = {}
    missing_images = []
    for image_name, captions_list in tqdm(captions.items(), desc="Processing images"):
        image_path = os.path.join(images_path, image_name)
        if not os.path.exists(image_path):
            # Remember the skip so it can be reported instead of vanishing silently.
            missing_images.append(image_name)
            continue
        # Generate image embedding
        image_embedding = get_image_clip_embedding(image_path)
        # Generate text embeddings for all captions
        text_embeddings = [get_text_clip_embedding(caption) for caption in captions_list]
        # Store embeddings as plain lists so they are JSON-serializable
        embeddings[image_name] = {
            "image_embedding": image_embedding.tolist(),
            "text_embeddings": [embedding.tolist() for embedding in text_embeddings],
        }
    if missing_images:
        print(
            f"Skipped {len(missing_images)} image(s) not found under {images_path} "
            f"(first: {missing_images[0]})"
        )
    # Save embeddings to a JSON file
    with open(output_path, "w", encoding="utf-8") as file:
        json.dump(embeddings, file)
    print(f"Embeddings saved to {output_path}")

# Parse the Flickr8k caption file into a mapping of image name -> captions
captions = load_captions(captions_file=flickr8k_captions_path)

# Embed every image and its captions, writing the result to the default JSON path
generate_embeddings(images_path=flickr8k_images_path, captions=captions)

Processing images: 100%|██████████| 8091/8091 [16:31<00:00,  8.16it/s]


Embeddings saved to ./embeddings.json


In [1]:
# Load the generated text embeddings from the JSON file into memory.
# Then take the embedding of an image, find the most similar text embedding by cosine similarity, and write the cosine similarity value for each text embedding to a file.
