In [None]:
# Zero shot captioning with CLIP
#################################
# Have BLIP generate multiple captions
# Optionally, have GPT enhance these captions and make them more creative or longer
# Feed these captions into CLIP to evaluate which text embedding is most similar to the image embedding
# Give prompts such as "funny" or "professional" for different styles

import os
os.environ["WANDB_DISABLED"] = "true"

from transformers import BlipProcessor, BlipForConditionalGeneration, Trainer, TrainingArguments, DataCollatorForSeq2Seq
# from transformers import CLIPProcessor, CLIPModel
import torch
from PIL import Image

In [None]:
# The processor (image preprocessing + tokenizer) and the captioning model are
# both loaded from the same pretrained BLIP checkpoint, processor first.
processor, model = (
    loader.from_pretrained("Salesforce/blip-image-captioning-large")
    for loader in (BlipProcessor, BlipForConditionalGeneration)
)

# Run the model on the GPU whenever PyTorch can see one.
if torch.cuda.is_available():
    model.to("cuda")

In [None]:
!pip install datasets
from datasets import load_dataset, Dataset

ds = load_dataset("Obscure-Entropy/ImageCaptioning_SmallParquets",
                  split="train",
                  streaming=True)

# ds = load_dataset("Obscure-Entropy/ImageCaptioning_SmallParquets", data_files="https://huggingface.co/datasets/Obscure-Entropy/ImageCaptioning_SmallParquets/blob/main/data/gbc_captions_0_100k.parquet")
# ds = load_dataset("Obscure-Entropy/ImageCaptioning_SmallParquets", split="train[:10]")

In [None]:
def transform(example):
    """Turn one raw dataset sample into BLIP training features.

    Args:
        example: Mapping with an ``"img"`` entry (an object exposing
            ``convert("RGB")``, e.g. a PIL image) and an ``"en_cap"`` entry
            holding the caption text.

    Returns:
        dict of un-batched tensors:
            ``pixel_values``   - processed image tensor,
            ``input_ids``      - caption token ids, padded/truncated to 64,
            ``attention_mask`` - the processor's mask for ``input_ids``
                                 (only when the processor returns one),
            ``labels``         - copy of ``input_ids`` with padding positions
                                 replaced by -100.
    """
    # Convert image to RGB in case it's not in that format
    image = example["img"].convert("RGB")
    caption = example["en_cap"]  # Get the caption text

    # Use the processor to process both image and caption
    inputs = processor(images=image, text=caption, return_tensors="pt", padding="max_length", truncation=True, max_length=64)

    # The processor returns batched tensors (batch size 1); drop the batch axis.
    input_ids = inputs["input_ids"].squeeze(0)

    # Every caption is padded to max_length, so a plain copy of input_ids would
    # make the loss score the padding too. -100 is the default ignore_index of
    # PyTorch's cross-entropy loss, so those positions are skipped.
    labels = input_ids.clone()
    pad_token_id = processor.tokenizer.pad_token_id
    if pad_token_id is not None:
        labels[labels == pad_token_id] = -100

    features = {
        "pixel_values": inputs["pixel_values"].squeeze(0),  # Image tensor
        "input_ids": input_ids,                             # Tokenized caption
        "labels": labels,                                   # Caption with padding masked out
    }

    # Keep the real attention mask so padded positions are not attended to.
    if "attention_mask" in inputs:
        features["attention_mask"] = inputs["attention_mask"].squeeze(0)

    return features

In [None]:
# Work on the first ten samples of the stream only.
small_ds = ds.take(10)

# Preprocess every sample eagerly into a plain Python list of feature dicts.
processed_ds = list(map(transform, small_ds))

# Sanity check: inspect the features produced for the first sample.
print(processed_ds[0])

In [None]:
# Apply preprocessing
# dataset = ds.map(transform, remove_columns=["img", "en_cap", "hu_cap"])

# # Set the format for PyTorch
# dataset.set_format(type="torch", columns=["pixel_values", "labels"])

# Define data collator (batches the per-sample feature dicts for the Trainer)
data_collator = DataCollatorForSeq2Seq(
    tokenizer=processor.tokenizer,
    model=model,
    padding=True,
    return_tensors="pt"
)

# Define training arguments
# NOTE: `evaluation_strategy="no"` was dropped on purpose. The keyword is
# deprecated in favour of `eval_strategy` in recent transformers releases,
# and "no" is the default under either name, so leaving it out keeps the same
# behaviour on every version.
training_args = TrainingArguments(
    output_dir="./blip-finetuned-captioning",
    per_device_train_batch_size=8,
    num_train_epochs=3,
    learning_rate=2e-6,
    save_steps=500,
    save_total_limit=2,
    logging_steps=100,
    # Keep every key produced by transform(); they are not dataset "columns"
    # the Trainer could match against the model signature.
    remove_unused_columns=False,
    # Mixed precision only makes sense when a CUDA device is available.
    fp16=torch.cuda.is_available(),
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_ds,
    data_collator=data_collator
)

In [None]:
# Run the fine-tuning loop configured on `trainer`.
trainer.train()

In [None]:
# Preprocess image and generate multiple captions
# Convert to RGB (as the training transform does) so that grayscale, palette
# or RGBA files are handled as well.
image = Image.open("1922185828222748745.jpg").convert("RGB")
inputs = processor(images=image, return_tensors="pt").to(model.device)

# Inference only: make sure the model is not left in training mode
# (e.g. after a preceding fine-tuning run).
model.eval()

# One entry per decoding strategy: display name -> extra generate() arguments
decoding_strategies = {
    # 1. Greedy decoding
    "Greedy": {},
    # 2. Beam search
    "Beam Search": {"num_beams": 5, "num_return_sequences": 1},
    # 3. Top-k sampling
    "Top-k Sampling": {"do_sample": True, "top_k": 50, "max_length": 50},
    # 4. Top-p (nucleus) sampling
    "Top-p Sampling": {"do_sample": True, "top_p": 0.9, "max_length": 50},
}

# `captions` is a list of (method name, caption text) pairs, in the order above
captions = []
for method, generate_kwargs in decoding_strategies.items():
    output_ids = model.generate(**inputs, **generate_kwargs)
    captions.append((method, processor.decode(output_ids[0], skip_special_tokens=True)))

# Display the results
for method, caption in captions:
    print(f"[{method}] {caption}")



In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPTNeoForCausalLM

# Load the GPT-Neo 1.3B model; its tokenizer is loaded through the GPT2Tokenizer class
tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")
enhancer_model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B")
enhancer_model.eval()  # inference only

# Move the *enhancer* to the GPU (previously the BLIP `model` was moved here by
# mistake, which left the enhancer on the CPU).
if torch.cuda.is_available():
    enhancer_model.to("cuda")

In [None]:
# Enhance captions with GPT
enhanced_captions = []
for method, caption in captions:
    # Set prompt
    prompt = f"Rewrite the following caption to make it more fun, engaging, and suitable for Instagram, keeping the original context intact:\n\"{caption}\"\nImproved:"

    # Calling the tokenizer (instead of .encode) also yields the attention
    # mask, which is forwarded to generate() together with the input ids.
    encoded = tokenizer(prompt, return_tensors="pt").to(enhancer_model.device)
    prompt_length = encoded["input_ids"].shape[1]

    # Get output
    outputs = enhancer_model.generate(
        **encoded,
        max_new_tokens=50,  # Budget for the continuation only (prompt not counted)
        do_sample=True,
        top_p=0.85,
        temperature=0.5,
        repetition_penalty=1.2,
        no_repeat_ngram_size=2,
        num_return_sequences=1,
        # The tokenizer's eos token id doubles as the padding id.
        pad_token_id=tokenizer.eos_token_id,
        # early_stopping is a beam-search option; it was removed because this
        # call samples without beams.
    )

    # Extract the new caption: decode only the tokens generated after the
    # prompt, rather than searching the full text for the "Improved:" marker.
    improved = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()
    enhanced_captions.append(improved)

# Print results (unpack the (method, caption) pairs so the text is shown
# rather than the tuple)
for (method, original), enhanced in zip(captions, enhanced_captions):
    print(f"Original ({method}): {original}")
    print(f"Enhanced: {enhanced}\n")

In [None]:
import os

import openai
from openai import OpenAI  # `import openai` alone does not bind the name `OpenAI`

# Initialize client with API key.
# The key is read from the OPENAI_API_KEY environment variable when it is set,
# so it does not have to be pasted into the notebook; the placeholder is only
# a fallback and must be replaced before any request can succeed.
client = OpenAI(
  api_key=os.environ.get("OPENAI_API_KEY", "<YOUR API KEY>")
)

In [None]:
def generate_caption(prompt):
    """Ask the OpenAI Responses API for a more engaging version of a caption.

    Args:
        prompt: The caption text to rewrite.

    Returns:
        The ``output_text`` of the response, or ``None`` when anything goes
        wrong while building or sending the request (the error is printed).
    """
    try:
        create_response = client.responses.create
        response = create_response(
            **{
                "model": "gpt-4o",  # model used for the rewrite
                "input": f"Can you rewrite this caption to make it more fun and engaging for Instagram?\n{prompt}\nImproved:",
                "temperature": 0.7,
                "top_p": 0.9,
                "max_output_tokens": 50,  # upper bound on the generated answer
            }
        )
        # The rewritten caption is the plain text of the response
        return response.output_text
    except Exception as err:
        # Best effort: report the problem and signal failure with None
        print(f"Error generating caption: {err}")
    return None


In [None]:
def generate_completion(prompt):
    """Ask the OpenAI Chat Completions API to rewrite a caption.

    Args:
        prompt: The caption text to rewrite.

    Returns:
        The message object of the first choice; its ``content`` attribute
        holds the rewritten caption text.
    """
    create_completion = client.chat.completions.create
    user_message = {
        "role": "user",
        "content": f"Can you rewrite this caption to make it more fun and engaging for Instagram?\n{prompt}",
    }
    completion = create_completion(
        model="gpt-4o",
        store=True,
        messages=[user_message],
    )

    first_choice = completion.choices[0]
    return first_choice.message

In [None]:
# Take the greedy caption
# (hard-coded here as a literal string rather than read from `captions`)
caption = "there are two children laying on a bed holding a small piece of jewelry"

# Ask the chat-completions helper for a rewritten version of the caption
enhanced_caption = generate_completion(caption)

# Prints the returned message object as a whole
print(enhanced_caption)

In [None]:
# Print only the `content` attribute of the returned message
print(enhanced_caption.content)

In [None]:
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
import numpy as np

def load_clip():
    """
    Load the pretrained CLIP model together with its processor.

    The model is moved to the GPU when CUDA is available.

    Returns: model, processor
    """
    checkpoint = "openai/clip-vit-base-patch32"
    clip_model = CLIPModel.from_pretrained(checkpoint)
    clip_processor = CLIPProcessor.from_pretrained(checkpoint)

    use_gpu = torch.cuda.is_available()
    if use_gpu:
        clip_model.to("cuda")

    return clip_model, clip_processor

def select_best_caption_with_clip(clip_model, clip_processor, image_path, candidate_captions):
    """
    Selects the caption most similar to the image using CLIP.

    Args:
        clip_model: pretrained CLIP model
        clip_processor: CLIP processor to use
        image_path (str): Path to the input image.
        candidate_captions (List[str]): List of caption strings (plain text,
            not (label, caption) pairs).

    Returns:
        (best_caption, all_scores): Tuple of the best caption and all similarity
        scores (cosine similarities, one per candidate, in input order).

    Raises:
        ValueError: If no candidate captions are given.
    """
    # Fail early with a clear message instead of an obscure processor error
    candidate_captions = list(candidate_captions) if candidate_captions is not None else []
    if not candidate_captions:
        raise ValueError("candidate_captions must contain at least one caption")

    # Load and process image; the context manager closes the file once the
    # RGB copy has been made
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")

    # Tokenize inputs; truncation keeps long (e.g. GPT-enhanced) captions
    # within the text length the model accepts
    inputs = clip_processor(
        text=candidate_captions,
        images=image,
        return_tensors="pt",
        padding=True,
        truncation=True,
    ).to(clip_model.device)

    # Get image/text embeddings
    with torch.no_grad():
        outputs = clip_model(**inputs)
        image_embeds = outputs.image_embeds  # one row for the single image
        text_embeds = outputs.text_embeds    # one row per caption

    # Normalize to unit length so the dot product below is a cosine similarity
    image_embeds = image_embeds / image_embeds.norm(dim=-1, keepdim=True)
    text_embeds = text_embeds / text_embeds.norm(dim=-1, keepdim=True)

    # Cosine similarity
    similarity_scores = (image_embeds @ text_embeds.T).squeeze(0)  # shape: (num_captions,)

    # Select best
    best_idx = similarity_scores.argmax().item()
    best_caption = candidate_captions[best_idx]

    return best_caption, similarity_scores.tolist()

In [None]:
clip_model, clip_processor = load_clip()

# `captions` holds (method, text) pairs, while the selector expects a list of
# plain caption strings, so pass only the text part.
caption_texts = [text for _, text in captions]

best_caption, scores = select_best_caption_with_clip(clip_model, clip_processor, "1922185828222748745.jpg", caption_texts)

print("Best Caption:", best_caption)
print("Scores:", scores)