In [9]:
!pip install datasets evaluate rouge_score absl-py

from transformers import BlipProcessor, BlipForConditionalGeneration
import torch
from PIL import Image
from datasets import load_dataset, Dataset
from evaluate import load
import nltk
from itertools import islice

import os
# Sets the WANDB_DISABLED environment variable for this process.
# NOTE(review): presumably consumed by the Hugging Face Trainer's Weights & Biases
# integration to switch off experiment logging — confirm against the transformers docs.
os.environ["WANDB_DISABLED"] = "true"

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=46a34feff4da2498e63763c07113395b868c76adfaf66d8466e3aa5990fbedee
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [2]:
# The processor and the captioning model are loaded from the same Hub checkpoint.
checkpoint = "dmccarthy1145/BLIP-Instagram"
processor = BlipProcessor.from_pretrained(checkpoint)
model = BlipForConditionalGeneration.from_pretrained(checkpoint)

# Move the model to the GPU only when CUDA is actually available.
if torch.cuda.is_available():
    model.to("cuda")

# streaming=True: the train split is consumed as an iterable rather than loaded whole.
ds = load_dataset(
    "Obscure-Entropy/ImageCaptioning_SmallParquets",
    split="train",
    streaming=True,
)

# Preprocessing function
def preprocess(example):
    """Turn one raw dataset record into tensors for the captioning model.

    Args:
        example: Mapping with an ``"img"`` entry (an object exposing
            ``.convert("RGB")``, e.g. a PIL image) and an ``"en_cap"``
            entry holding the caption text.

    Returns:
        dict with three entries, each with the processor's leading batch
        dimension squeezed away:
            ``pixel_values`` - processed image tensor,
            ``input_ids``    - caption token ids, padded/truncated to 64 tokens,
            ``labels``       - an independent copy of ``input_ids``.
    """
    # Convert image to RGB in case it's not in that format
    image = example["img"].convert("RGB")
    caption = example["en_cap"]  # Get the caption text

    # Use the processor to process both image and caption
    encoded = processor(images=image, text=caption, return_tensors="pt", padding="max_length", truncation=True, max_length=64)

    input_ids = encoded["input_ids"].squeeze(0)

    # NOTE(review): the processor output's attention mask is not returned and the
    # padded positions in `labels` are left as-is — confirm that whatever consumes
    # this dict does not need padding masked out.
    return {
        "pixel_values": encoded["pixel_values"].squeeze(0),  # Image tensor
        "input_ids": input_ids,                               # Tokenized caption
        # squeeze() returns a view, so without clone() the labels would share
        # storage with input_ids and an in-place edit of one would alter the other.
        "labels": input_ids.clone(),
    }

def greedy_caption(model, processor, image, **generate_kwargs):
    """Generate a caption for a single image and return it as plain text.

    Args:
        model: Captioning model exposing ``.device`` and ``.generate(...)``.
        processor: Callable that turns ``images=...`` into model inputs and
            offers ``.decode(...)`` for the generated token ids.
        image: The image to caption, in whatever form ``processor`` accepts.
        **generate_kwargs: Optional extra keyword arguments forwarded verbatim
            to ``model.generate`` (for example ``max_new_tokens=64``). With
            none given, the call is identical to the previous behaviour.

    Returns:
        The decoded text of the first generated sequence, special tokens removed.

    NOTE(review): no length or decoding options are set here, so output length
    and search strategy come entirely from the model's own generation settings
    unless the caller overrides them; the name suggests greedy decoding —
    confirm against the checkpoint's generation config.
    """
    # Inputs are moved to the model's device so generation runs where the weights are.
    inputs = processor(images=image, return_tensors="pt").to(model.device)

    out_greedy = model.generate(**inputs, **generate_kwargs)
    # Only the first returned sequence is decoded (one image in, one caption out).
    return processor.decode(out_greedy[0], skip_special_tokens=True)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/471 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/975 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/136 [00:00<?, ?B/s]

In [10]:
# Load evaluation metrics
rouge = load("rouge")
bleu = load("bleu")

# Evaluation window inside the streamed split: EVAL_SIZE records starting at
# offset EVAL_START (i.e. records 10000..10999, the same ones as before).
EVAL_START = 10_000
EVAL_SIZE = 1_000

# Take evaluation dataset and apply preprocessing.
# skip()/take() are the streaming dataset's own windowing methods; they select the
# same records as slicing the iterator by hand. The result is materialised into a
# list because it is iterated more than once below and in later cells.
eval_ds = list(ds.skip(EVAL_START).take(EVAL_SIZE))
eval_dataset = [preprocess(sample) for sample in eval_ds]

print(type(eval_dataset))

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

<class 'list'>


In [18]:
# Both lists are rebuilt from scratch so re-running the cell never appends duplicates.
predictions = []
references = []

# The sample count does not change during the loop, so compute it once.
total = len(eval_ds)

for i, row in enumerate(eval_ds):
    # Apply the same RGB conversion that preprocess() performs, so the model is
    # fed images in one consistent mode regardless of how the file was stored.
    image = row["img"].convert("RGB")
    pred = greedy_caption(model, processor, image)

    # Appended together so predictions[i] always lines up with references[i].
    predictions.append(pred)
    references.append(row["en_cap"])

    print(f"Prediction {i}/{total}: {pred}")

Prediction 0/100: the image captures a vibrant scene at what appears to be a amusement park or amusement park. two children
Prediction 1/100: the image captures a vibrant scene at dusk on a farm or farmyard. the house on the left
Prediction 2/100: the image captures a close - up view of a basket filled with small fish, possibly small fish or
Prediction 3/100: the image captures a vibrant scene inside a grocery store or grocery store. the store ' s interior is
Prediction 4/100: the image presents a close - up view of a pile of oatmeal, which appears to
Prediction 5/100: the image captures a moment between two individuals who are engaged in what appears to be a maternity photo or
Prediction 6/100: the image showcases two bicycles parked side by side on a black surface at what appears to be a trade
Prediction 7/100: the image features a man standing on a blue platform with a white background. he is dressed in a
Prediction 8/100: the image captures a vibrant scene at a swimming pool with a

In [19]:
# Verify references and predictions
# Only a short preview is printed: the full lists hold one long caption per
# evaluation sample and would bury the rest of the notebook in text.
PREVIEW_COUNT = 5

# The metrics below compare the lists position by position, so they must line up.
assert len(references) == len(predictions), "references and predictions differ in length"

print("References:")
print(references[:PREVIEW_COUNT])

print("\nPredictions:")
print(predictions[:PREVIEW_COUNT])

References:
["The image captures a lively scene at what appears to be a carnival or amusement park. Two children are seated on a vibrant green and purple ride with yellow accents. The child on the left is wearing a white shirt paired with blue shorts, while the child on the right sports a black shirt with white shorts. Both children are actively engaged with their surroundings, their arms raised high in excitement as they enjoy the ride. The ride itself is situated on a track, suggesting it's part of a larger amusement park setup. The background reveals a clear sky, indicating good weather for outdoor activities. The overall atmosphere suggests a fun and joyful day out for these two children.", "The image depicts a tranquil rural scene at dusk. A prominent blue house with white trim stands on a grassy lawn, its windows glowing with warm light. To the left of the house, there's a red barn with white doors, also illuminated from within. A black car is parked in front of the barn, adding 

In [20]:
# Calculate ROUGE scores
# Each entry of the result is printed on its own line, rounded to four decimals.
rouge_result = rouge.compute(references=references, predictions=predictions)
print("ROUGE:")
for metric_name in rouge_result:
    score = rouge_result[metric_name]
    print(f"{metric_name}: {score:.4f}")

# BLEU over the same prediction/reference pairs; only the 'bleu' entry is reported.
bleu_result = bleu.compute(references=references, predictions=predictions)
print("\nBLEU:")
print(f"BLEU: {bleu_result['bleu']:.4f}")

ROUGE:
rouge1: 0.2275
rouge2: 0.1230
rougeL: 0.1989
rougeLsum: 0.1988

BLEU:
BLEU: 0.0017
