This notebook generates captions for the CIRR training-set images with either the BLIP or the BLIP2 model and saves them as JSON files (file name → caption) on Google Drive.

In [None]:
from PIL import Image
import torch
import os
from tqdm import tqdm

In [None]:
# Mount Google Drive at /content/drive (only possible when running in Colab).
# The import is guarded because the google.colab package exists only inside
# Colab; elsewhere this cell is a no-op instead of failing with ImportError.
try:
    from google.colab import drive
except ImportError:
    drive = None

if drive is not None:
    drive.mount('/content/drive')

In [None]:
# Function to generate captions for a batch of images
def generate_captions_batch(image_paths):
    """
    Generate captions for a batch of images with the currently loaded model.

    Uses the notebook-level globals ``processor``, ``model`` and ``device``,
    so it captions with whichever checkpoint (BLIP or BLIP2) was loaded last.

    Parameters:
        image_paths (list of str): List of paths to the image files in the batch.

    Returns:
        list of str: List of generated captions for the images in the batch,
        with leading/trailing whitespace removed. Empty list for an empty batch.
    """
    # Nothing to caption: do not hand an empty image list to the processor.
    if not image_paths:
        return []

    # Load every image as RGB. convert() returns an independent in-memory copy,
    # so the file handle can be closed right away instead of staying open
    # until garbage collection (matters when iterating over thousands of files).
    images = []
    for path in image_paths:
        with Image.open(path) as img:
            images.append(img.convert("RGB"))
    inputs = processor(images=images, return_tensors="pt", padding=True).to(device)

    # Inference only: no gradients are needed for generation.
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=32)

    # Decode the whole batch in one call and strip surrounding whitespace
    # (some checkpoints emit a trailing newline after the caption).
    decoded = processor.batch_decode(outputs, skip_special_tokens=True)
    return [caption.strip() for caption in decoded]


# Function to process all images in batches
def process_images_in_batches(image_folder, batch_size=16):
    """
    Process a large dataset of images in batches to generate captions.

    The expected layout is ``image_folder/<subfolder>/<image file>``: every
    regular, non-hidden file found directly inside a subfolder of
    ``image_folder`` is captioned. Entries are visited in sorted order so
    repeated runs process the images in the same sequence.

    Parameters:
        image_folder (str): Path to the folder containing the image subfolders.
        batch_size (int): Number of images per batch. Must be >= 1.

    Returns:
        dict: Dictionary with image file names as keys and captions as values.
        Keys are base names only, so files that share a name in different
        subfolders overwrite each other (a warning is printed if that happens).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # A non-positive batch size would otherwise either fail late inside
    # range() or silently produce an empty result.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Collect the image paths, one subfolder at a time.
    image_paths = []
    for sub_dir in tqdm(sorted(os.listdir(image_folder))):
        sub_dir_path = os.path.join(image_folder, sub_dir)
        # Stray files at the top level (e.g. .DS_Store) are not subfolders.
        if not os.path.isdir(sub_dir_path):
            continue
        for file_name in sorted(os.listdir(sub_dir_path)):
            file_path = os.path.join(sub_dir_path, file_name)
            # Skip hidden files and nested directories: they are not images
            # and would make the image loader fail in the middle of the run.
            if file_name.startswith(".") or not os.path.isfile(file_path):
                continue
            image_paths.append(file_path)

    print(len(image_paths))
    # Dictionary to store captions
    captions_dict = {}

    # Process images in batches
    for start in tqdm(range(0, len(image_paths), batch_size), desc="Processing batches"):
        batch_paths = image_paths[start:start + batch_size]
        captions = generate_captions_batch(batch_paths)
        captions_dict.update(
            {os.path.basename(path): caption for path, caption in zip(batch_paths, captions)}
        )

    # Fewer keys than paths means at least two files shared a base name.
    if len(captions_dict) != len(image_paths):
        print(
            f"Warning: {len(image_paths) - len(captions_dict)} captions were "
            "overwritten because of duplicate file names"
        )

    return captions_dict

In [None]:
# Number of images sent through the captioning model per forward pass.
batch_size = 64
# Folder holding the CIRR training images (relative to the working directory).
image_folder = "drive/MyDrive/CIRR/images/train/"

### Generating captions with BLIP

In [None]:
from transformers import BlipProcessor, BlipForConditionalGeneration

# Prefer the GPU when torch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Load the BLIP base captioning checkpoint: the processor prepares the model
# inputs, the model generates the caption tokens. The model lives on `device`.
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
model = model.to(device)

Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

In [None]:
import json

# Caption every training image with the model loaded above, then write the
# {file name: caption} mapping to Drive as pretty-printed JSON.
output_path = "drive/MyDrive/CIRR/train_captions_BLIP.json"
captions = process_images_in_batches(image_folder, batch_size=batch_size)
with open(output_path, "w") as f:
    json.dump(captions, f, indent=4)
# Report the file that was actually written (the old message named a
# different file, captions.json).
print(f"Captions saved to {output_path}")

In [12]:
# Number of entries in the caption dictionary (one per distinct image file name).
len(captions)

16939

### Generating captions with BLIP2

In [None]:
from transformers import Blip2Processor, Blip2ForConditionalGeneration

# Same device convention as the BLIP cell: a torch.device, not a bare string.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the BLIP2 (OPT-2.7B) checkpoint. On a GPU the whole model is placed on
# device 0 in half precision, exactly as before. Without a GPU the model is
# kept on the CPU in float32 instead of requesting a CUDA device that does
# not exist.
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-2.7b",
    device_map={"": 0 if device.type == "cuda" else "cpu"},
    torch_dtype=torch.float16 if device.type == "cuda" else torch.float32,
)

In [None]:
import json

# Caption every training image with the model loaded above, then write the
# {file name: caption} mapping to Drive as pretty-printed JSON.
output_path = "drive/MyDrive/CIRR/train_captions_blip2.json"
captions = process_images_in_batches(image_folder, batch_size=batch_size)
with open(output_path, "w") as f:
    json.dump(captions, f, indent=4)
# Report the file that was actually written (the old message named a
# different file, captions.json).
print(f"Captions saved to {output_path}")

100%|██████████| 100/100 [00:00<00:00, 485.48it/s]


16939


Processing batches: 100%|██████████| 265/265 [3:18:00<00:00, 44.83s/it]

Captions saved to captions.json





In [34]:
# Collapse the caption dictionary's values into a set to count how many
# distinct caption strings were generated across all images.
captions_set = set(captions.values())
print(f"There are : {len(captions_set)} unique captions")

There are : 14810 unique captions
