# BLIP-2 COCO Captions

In [3]:
import torch
from transformers import Blip2Processor, Blip2ForConditionalGeneration
from datasets import COCODataset
from datasets import COCODataset
from tqdm import tqdm
from PIL import Image
from torch.utils.data import DataLoader

# Run on the GPU when CUDA is available, otherwise fall back to the CPU
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

# Fetch the BLIP-2 processor and model from the same checkpoint,
# and place the model on the selected device right after loading
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b").to(device)

# Build the COCO dataset from the val2017 caption annotations and image folder
coco_dataset = COCODataset(
    ann_file='./data/coco/annotations/captions_val2017.json',
    img_dir='./data/coco/val2017',
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

loading annotations into memory...
Done (t=0.04s)
creating index...
index created!


## Collect Inference Results

In [7]:
# Caption at most the first 1000 dataset entries (fewer if the dataset is smaller)
num_images = min(1000, len(coco_dataset))
results = []

# Pure inference: keep autograd switched off for the whole loop
with torch.no_grad():
    for i in tqdm(range(num_images)):
        # Each dataset item is a pair; only the image is needed here
        image, _ = coco_dataset[i]

        # Preprocess the image and move the tensors to the same device as the model
        inputs = processor(images=image, return_tensors="pt").to(device)
        out = model.generate(**inputs)

        # Decode the first generated sequence into plain text
        caption = processor.decode(out[0], skip_special_tokens=True).strip()

        # Store the caption under the dataset's id for this index
        results.append({"image_id": coco_dataset.ids[i], "caption": caption})

100%|███████████████████████████████████████████████████████████████████████| 1000/1000 [05:13<00:00,  3.19it/s]


## Save Results

In [8]:
import json
import os

# Create the output folder first: open(..., 'w') raises FileNotFoundError
# when './results' does not exist yet (e.g. on a fresh checkout).
os.makedirs('./results', exist_ok=True)

# Persist the collected {"image_id", "caption"} records as a JSON list
with open('./results/coco_results.json', 'w') as f:
    json.dump(results, f)

## Evaluate Results

In [9]:
# Put the working directory on the import path
# (presumably so a local pycocoevalcap folder can be imported below — confirm)
import os
import sys

# Directory the kernel is currently running in
current_dir = os.getcwd()

# Register it on the module search path only if it is not there already
if sys.path.count(current_dir) == 0:
    sys.path.append(current_dir)

# Show the resulting search path as a sanity check
print(sys.path)

['/home/gautom/Documents/lavis', '/home/gautom/anaconda3/envs/lavis/lib/python38.zip', '/home/gautom/anaconda3/envs/lavis/lib/python3.8', '/home/gautom/anaconda3/envs/lavis/lib/python3.8/lib-dynload', '', '/home/gautom/.local/lib/python3.8/site-packages', '/home/gautom/anaconda3/envs/lavis/lib/python3.8/site-packages']


In [10]:
from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
from pycocoevalcap.cider.cider import Cider

class SimpleCIDErEval:
    """Small wrapper that scores captions with pycocoevalcap's CIDEr metric.

    Captions are first run through ``PTBTokenizer`` and the tokenized
    output is then handed to ``Cider.compute_score``.
    """

    def __init__(self):
        """Create the tokenizer and the CIDEr scorer used by ``evaluate``."""
        self.tokenizer = PTBTokenizer()
        self.cider_scorer = Cider()

    def evaluate(self, predictions, references):
        """Compute the CIDEr score of predicted captions against references.

        Args:
            predictions: Iterable of caption strings, one per sample.
            references: Iterable aligned with ``predictions``; each entry is
                the collection of reference captions for that sample. A bare
                string is treated as a single reference caption instead of
                being iterated character by character.

        Returns:
            The ``(score, scores)`` pair returned by ``Cider.compute_score``
            (presumably the overall score and the per-sample scores — confirm
            against pycocoevalcap).

        Raises:
            ValueError: If the number of predictions differs from the number
                of reference sets.
        """
        # Materialize the inputs so generators can be measured and reused
        predictions = list(predictions)
        references = [[refs] if isinstance(refs, str) else refs for refs in references]

        # Fail fast with a clear message instead of a late failure in the scorer
        if len(predictions) != len(references):
            raise ValueError(
                f"predictions and references must have the same length "
                f"(got {len(predictions)} and {len(references)})"
            )

        # Format the input for the tokenizer: sample index -> list of caption dicts
        gts = {i: [{'caption': c} for c in refs] for i, refs in enumerate(references)}
        res = {i: [{'caption': p}] for i, p in enumerate(predictions)}

        # Tokenize
        gts_tokenized = self.tokenizer.tokenize(gts)
        res_tokenized = self.tokenizer.tokenize(res)

        # Compute CIDEr score
        score, scores = self.cider_scorer.compute_score(gts_tokenized, res_tokenized)

        return score, scores

In [13]:
# Load the COCO caption annotations used to look up reference captions
coco_dataset = COCODataset(ann_file='./data/coco/annotations/captions_val2017.json',
                           img_dir='./data/coco/val2017')

# Read the saved captions back; the context manager closes the file even
# when json.load raises, which a manual open()/close() pair does not.
with open('./results/coco_results.json') as f:
    results = json.load(f)

# Align each generated caption with the captions the dataset returns for its image id
candidates = [result['caption'] for result in results]
references = [coco_dataset.get_captions(result['image_id']) for result in results]

# Create evaluator
evaluator = SimpleCIDErEval()

overall_score, individual_scores = evaluator.evaluate(candidates, references)

print(f"Overall CIDEr score: {overall_score}")

loading annotations into memory...
Done (t=0.04s)
creating index...
index created!


PTBTokenizer tokenized 61766 tokens at 324387.99 tokens per second.
PTBTokenizer tokenized 9242 tokens at 88794.51 tokens per second.


Overall CIDEr score: 1.2852764152502318


In [15]:
# Score the same candidates/references a second time with the cidereval package
from cidereval import cider

# NOTE(review): the average printed here differs from the pycocoevalcap score
# printed earlier — possibly different document-frequency statistics; confirm
# against cidereval's defaults before comparing the two numbers.
cider_scores = cider(candidates, references)
avg_cider_score = cider_scores['avg_score']

print(f"Average CIDEr score: {avg_cider_score}")

PTBTokenizer tokenized 61766 tokens at 336821.45 tokens per second.
PTBTokenizer tokenized 9242 tokens at 90941.95 tokens per second.


Average CIDEr score: 1.422928345185765
