In [1]:
# Zero shot captioning with CLIP
#################################
# Have BLIP generate multiple captions
# Optionally, have GPT enhance these captions and make them more creative or longer
# Feed these captions into CLIP to evaluate which text embedding is most similar to the image embedding
# Give prompts such as "funny" or "professional" for different styles

import os
os.environ["WANDB_DISABLED"] = "true"

from transformers import BlipProcessor, BlipForConditionalGeneration, Trainer, TrainingArguments, DataCollatorForSeq2Seq
# from transformers import CLIPProcessor, CLIPModel
import torch
from PIL import Image

In [2]:
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")

if torch.cuda.is_available():
    model.to("cuda")

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/445 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/527 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.60k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

In [3]:
!pip install datasets
from datasets import load_dataset, Dataset

ds = load_dataset("Obscure-Entropy/ImageCaptioning_SmallParquets",
                  split="train",
                  streaming=True)

# ds = load_dataset("Obscure-Entropy/ImageCaptioning_SmallParquets", data_files="https://huggingface.co/datasets/Obscure-Entropy/ImageCaptioning_SmallParquets/blob/main/data/gbc_captions_0_100k.parquet")
# ds = load_dataset("Obscure-Entropy/ImageCaptioning_SmallParquets", split="train[:10]")

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

In [8]:
def transform(example):
    # Convert image to RGB in case it's not in that format
    image = example["img"].convert("RGB")
    caption = example["en_cap"]  # Get the caption text

    # Use the processor to process both image and caption
    inputs = processor(images=image, text=caption, return_tensors="pt", padding="max_length", truncation=True, max_length=64)

    # Return pixel_values (processed image tensor) and input_ids (tokenized caption)
    return {
        "pixel_values": inputs["pixel_values"].squeeze(0),  # Image tensor
        "input_ids": inputs["input_ids"].squeeze(0),        # Tokenized caption
        "labels": inputs["input_ids"].squeeze(0)            # Tokenized caption
    }

In [9]:
small_ds = ds.take(10)

processed_ds = [transform(sample) for sample in small_ds]

print(processed_ds[0])

{'pixel_values': tensor([[[1.9303, 1.9303, 1.9303,  ..., 1.9303, 1.9303, 1.9303],
         [1.9303, 1.9303, 1.9303,  ..., 1.9303, 1.9303, 1.9303],
         [1.9303, 1.9303, 1.9303,  ..., 1.9303, 1.9303, 1.9303],
         ...,
         [1.9303, 1.9303, 1.9303,  ..., 1.9303, 1.9303, 1.9303],
         [1.9303, 1.9303, 1.9303,  ..., 1.9303, 1.9303, 1.9303],
         [1.9303, 1.9303, 1.9303,  ..., 1.9303, 1.9303, 1.9303]],

        [[2.0749, 2.0749, 2.0749,  ..., 2.0749, 2.0749, 2.0749],
         [2.0749, 2.0749, 2.0749,  ..., 2.0749, 2.0749, 2.0749],
         [2.0749, 2.0749, 2.0749,  ..., 2.0749, 2.0749, 2.0749],
         ...,
         [2.0749, 2.0749, 2.0749,  ..., 2.0749, 2.0749, 2.0749],
         [2.0749, 2.0749, 2.0749,  ..., 2.0749, 2.0749, 2.0749],
         [2.0749, 2.0749, 2.0749,  ..., 2.0749, 2.0749, 2.0749]],

        [[2.1459, 2.1459, 2.1459,  ..., 2.1459, 2.1459, 2.1459],
         [2.1459, 2.1459, 2.1459,  ..., 2.1459, 2.1459, 2.1459],
         [2.1459, 2.1459, 2.1459,  ..., 2

In [10]:
# Apply preorocessing
# dataset = ds.map(transform, remove_columns=["img", "en_cap", "hu_cap"])

# # Set the format for PyTorch
# dataset.set_format(type="torch", columns=["pixel_values", "labels"])

# Define data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer=processor.tokenizer,
    model=model,
    padding=True,
    return_tensors="pt"
)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./blip-finetuned-captioning",
    per_device_train_batch_size=8,
    num_train_epochs=3,
    learning_rate=2e-6,
    save_steps=500,
    save_total_limit=2,
    logging_steps=100,
    evaluation_strategy="no",
    remove_unused_columns=False,
    fp16=torch.cuda.is_available(),
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_ds,
    data_collator=data_collator
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [11]:
trainer.train()

  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)


OutOfMemoryError: CUDA out of memory. Tried to allocate 164.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 106.12 MiB is free. Process 2375 has 14.63 GiB memory in use. Of the allocated memory 14.24 GiB is allocated by PyTorch, and 273.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [4]:
# Preprocess image and generate multiple captions
captions = []
image = Image.open("1922185828222748745.jpg")
inputs = processor(images=image, return_tensors="pt").to(model.device)

# 1. Greedy decoding
out_greedy = model.generate(**inputs)
captions.append(("Greedy", processor.decode(out_greedy[0], skip_special_tokens=True)))

# 2. Beam search
out_beam = model.generate(**inputs, num_beams=5, num_return_sequences=1)
captions.append(("Beam Search", processor.decode(out_beam[0], skip_special_tokens=True)))

# 3. Top-k sampling
out_topk = model.generate(**inputs, do_sample=True, top_k=50, max_length=50)
captions.append(("Top-k Sampling", processor.decode(out_topk[0], skip_special_tokens=True)))

# 4. Top-p (nucleus) sampling
out_topp = model.generate(**inputs, do_sample=True, top_p=0.9, max_length=50)
captions.append(("Top-p Sampling", processor.decode(out_topp[0], skip_special_tokens=True)))

# Display the results
for method, caption in captions:
    print(f"[{method}] {caption}")


[Greedy] there are two children laying on a bed holding a small piece of jewelry
[Beam Search] there are two young girls laying on a bed holding something in their hands
[Top-k Sampling] they wear matching clothing for little girls to wear with the necklace
[Top-p Sampling] laying next to each other child holding a silver scissors next to each other


In [17]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPTNeoForCausalLM

# Load GPT-2 model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")
enhancer_model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B")
enhancer_model.eval()

if torch.cuda.is_available():
    model.to("cuda")

In [20]:
# Enhance captions with GPT
enhanced_captions = []
for method, caption in captions:
    # Set prompt
    prompt = f"Rewrite the following caption to make it more fun, engaging, and suitable for Instagram, keeping the original context intact:\n\"{caption}\"\nImproved:"
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(enhancer_model.device)

    # Get output
    outputs = enhancer_model.generate(
        input_ids=input_ids,
        max_length=input_ids.shape[1] + 50, # The input prompt + tokens for output
        do_sample=True,
        top_p=0.85,
        temperature=0.5,
        repetition_penalty=1.2,
        no_repeat_ngram_size=2,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id, # Stop when the eos token is found
        early_stopping=True
    )

    generated = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Extract the new caption
    improved = generated.split("Improved:")[-1].strip()
    enhanced_captions.append(improved)

# Print results
for original, enhanced in zip(captions, enhanced_captions):
    print(f"Original: {original}")
    print(f"Enhanced: {enhanced}\n")



Original: ('Greedy', 'there are two children laying on a bed holding a small piece of jewelry')
Enhanced: "there is a man on the bed with a large amount of money"

Original: ('Beam Search', 'there are two young girls laying on a bed holding something in their hands')
Enhanced: "There are 2 girls holding a book in one hand and a phone in the other hand"

Original: ('Top-k Sampling', 'female and little girl laying on a bed making a funny face')
Enhanced: "female lying on bed with a blanket on her face making funny faces" (not really funny, but still funny)

I have a feeling this is going to be a very long post.
So I'm going ahead and doing it anyway

Original: ('Top-p Sampling', 'there is a girl with a bow around her wrists laying next to a little girl')
Enhanced: "there are two girls laying on the ground next" (not sure if this is the right word)

I've been trying to figure out how to write this caption for the past few days, but I can't seem to get it right

Original: ('Temperature Sam

In [31]:
import openai

# Initialize client with API key
client = OpenAI(
  api_key="<YOUR API KEY>"
)

In [32]:
def generate_caption(prompt):
    try:
        # Make the API call to generate a caption
        response = client.responses.create(
            model="gpt-4o",  # Specify the model you want to use
            input=f"Can you rewrite this caption to make it more fun and engaging for Instagram?\n{prompt}\nImproved:",
            temperature=0.7,
            top_p=0.9,
            max_output_tokens=50,  # Specify the maximum number of tokens for the generated response
        )

        # Extract the improved caption
        improved_caption = response.output_text

        return improved_caption
    except Exception as e:
        print(f"Error generating caption: {e}")
        return None


In [33]:
def generate_completion(prompt):
  completion = client.chat.completions.create(
    model="gpt-4o",
    store=True,
    messages=[
      {"role": "user", "content": f"Can you rewrite this caption to make it more fun and engaging for Instagram?\n{prompt}"}
    ]
  )

  return completion.choices[0].message;

In [30]:
# Take the greedy caption
caption = "there are two children laying on a bed holding a small piece of jewelry"

enhanced_caption = generate_completion(caption)

print(enhanced_caption)

ChatCompletionMessage(content='"Two little treasure hunters snuggled up with their shiny find! Who says you can\'t discover a little magic right in bed? ✨💍 #JewelryAdventures #CozyFinds #SiblingShenanigans"', refusal=None, role='assistant', annotations=[], audio=None, function_call=None, tool_calls=None)


In [34]:
print(enhanced_caption.content)

"Two little treasure hunters snuggled up with their shiny find! Who says you can't discover a little magic right in bed? ✨💍 #JewelryAdventures #CozyFinds #SiblingShenanigans"


In [6]:
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
import numpy as np

def load_clip():
    """
    Loads a pretrained CLIP model and processor

    Returns: model, processor
    """
    clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
    clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

    if torch.cuda.is_available():
        clip_model.to("cuda")

    return clip_model, clip_processor

def select_best_caption_with_clip(clip_model, clip_processor, image_path, candidate_captions):
    """
    Selects the caption most similar to the image using CLIP.

    Args:
        clip_model: pretrained CLIP model
        clip_processor: CLIP processor to use
        image_path (str): Path to the input image.
        candidate_captions (List[str]): List of caption strings.

    Returns:
        (best_caption, all_scores): Tuple of the best caption and all similarity scores.
    """
    # Load and process image
    image = Image.open(image_path).convert("RGB")

    # Tokenize inputs
    inputs = clip_processor(text=candidate_captions, images=image, return_tensors="pt", padding=True).to(clip_model.device)

    # Get image/text embeddings
    with torch.no_grad():
        outputs = clip_model(**inputs)
        image_embeds = outputs.image_embeds  # shape: (1, 512)
        text_embeds = outputs.text_embeds    # shape: (num_captions, 512)

    # Normalize
    image_embeds = image_embeds / image_embeds.norm(dim=-1, keepdim=True)
    text_embeds = text_embeds / text_embeds.norm(dim=-1, keepdim=True)

    # Cosine similarity
    similarity_scores = (image_embeds @ text_embeds.T).squeeze(0)  # shape: (num_captions,)

    # Select best
    best_idx = similarity_scores.argmax().item()
    best_caption = candidate_captions[best_idx]

    return best_caption, similarity_scores.tolist()

In [8]:
clip_model, clip_processor = load_clip()

best_caption, scores = select_best_caption_with_clip(clip_model, clip_processor, "1922185828222748745.jpg", captions)

print("Best Caption:", best_caption)
print("Scores:", scores)

Best Caption: ('Greedy', 'there are two children laying on a bed holding a small piece of jewelry')
Scores: [0.18859055638313293, 0.1702822744846344, 0.18619632720947266, 0.1824672669172287]
