In [None]:
import torch
from tqdm import tqdm
import json


class InferencePipeline:
    """Run a generation model over a dataset and collect its text outputs.

    The constructor only stores its arguments:
        model: object whose ``generate(**inputs)`` returns a sequence of
            token-id sequences (only element 0 is decoded).
        processor: callable invoked as ``processor(images=..., return_tensors="pt")``
            whose result supports ``.to(device)`` and ``**`` unpacking; it must
            also provide ``decode(ids, skip_special_tokens=True)``.
        device: whatever the processor output's ``.to(...)`` accepts.
    """

    def __init__(self, model, processor, device):
        self.model = model
        self.processor = processor
        self.device = device

    def run_inference(self, dataset, task, max_samples=None):
        """Run the inference loop for ``task`` and return its results.

        Args:
            dataset: forwarded to the task-specific loop.
            task: only ``"image_captioning"`` is supported.
            max_samples: upper bound on the number of samples to process;
                ``None`` means the whole dataset.

        Raises:
            ValueError: if ``task`` is not a supported task name.
        """
        if task == "image_captioning":
            return self._run_image_captioning(dataset, max_samples)
        else:
            raise ValueError(f"Unsupported task: {task}")

    def _run_image_captioning(self, dataset, max_samples):
        """Generate one caption per sample for the first samples of ``dataset``.

        Args:
            dataset: supports ``len()`` and integer indexing; item ``i`` holds
                the image at position 0 and its reference captions at
                position 1. ``dataset.ids[i]`` is recorded as the image id.
            max_samples: ``None`` processes every sample; otherwise at most
                that many samples are processed (0 or a negative value
                processes none).

        Returns:
            ``{"predictions": [{"image_id", "caption"}, ...], "references": [...]}``
            with both lists in dataset order.
        """
        total = len(dataset)
        # Compare against None explicitly: a plain truthiness test would turn
        # max_samples=0 into "process everything".
        if max_samples is None:
            num_samples = total
        else:
            num_samples = max(0, min(total, max_samples))

        results = []
        references = []

        for i in tqdm(range(num_samples)):
            # Index the dataset once per sample; every __getitem__ call may
            # redo the work of loading the sample.
            sample = dataset[i]
            image = sample[0]
            captions = sample[1]
            img_id = dataset.ids[i]

            inputs = self.processor(images=image, return_tensors="pt").to(self.device)
            # Inference only: gradients are not needed.
            with torch.no_grad():
                out = self.model.generate(**inputs)

            caption = self.processor.decode(out[0], skip_special_tokens=True).strip()

            results.append({"image_id": img_id, "caption": caption})
            references.append(captions)

        return {"predictions": results, "references": references}

    def save_results(self, results, filename):
        """Write ``results`` to ``filename`` as indented JSON.

        Missing parent directories are created first, so a finished inference
        run is not lost to a ``FileNotFoundError`` at save time.
        """
        # Imported here so the method does not depend on the cell-level
        # import list being extended.
        import os

        parent = os.path.dirname(filename)
        if parent:
            os.makedirs(parent, exist_ok=True)

        with open(filename, "w") as f:
            json.dump(results, f, indent=2)

In [None]:
import json
from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.meteor.meteor import Meteor
from pycocoevalcap.rouge.rouge import Rouge
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.spice.spice import Spice


class ScoringPipeline:
    """Score saved captioning results with a fixed set of caption scorers.

    ``caption_scorers`` is a list of ``(scorer, name)`` pairs. ``name`` is a
    single metric name, or a list of names when the scorer's ``compute_score``
    returns one value per name (as configured here for ``Bleu(4)``).
    """

    def __init__(self):
        self.tokenizer = PTBTokenizer()
        bleu_names = ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]
        self.caption_scorers = [
            (Bleu(4), bleu_names),
            (Meteor(), "METEOR"),
            (Rouge(), "ROUGE_L"),
            (Cider(), "CIDEr"),
            (Spice(), "SPICE"),
        ]

    def load_results(self, filename):
        """Return the parsed JSON content of ``filename``."""
        with open(filename, "r") as handle:
            payload = json.load(handle)
        return payload

    def compute_scores(self, results, task):
        """Compute the metrics for ``task``.

        Only ``"image_captioning"`` is supported; any other value raises
        ``ValueError``.
        """
        if task != "image_captioning":
            raise ValueError(f"Unsupported task: {task}")
        return self._compute_image_captioning_scores(results)

    def _compute_image_captioning_scores(self, results):
        """Tokenize references and predictions, then run every configured scorer.

        ``results`` must provide ``"references"`` (one iterable of caption
        strings per sample) and ``"predictions"`` (one mapping with a
        ``"caption"`` entry per sample). Samples are keyed by their position
        in each list.

        Returns a dict that holds, for every metric name, the overall score
        under ``name`` and the per-caption scores under ``name_per_caption``.
        """
        # Ground truths: position -> list of {"caption": text} entries.
        gts = {}
        for idx, ref_group in enumerate(results["references"]):
            gts[idx] = [{"caption": text} for text in ref_group]

        # Predictions: position -> single-element list with the generated caption.
        res = {}
        for idx, prediction in enumerate(results["predictions"]):
            res[idx] = [{"caption": prediction["caption"]}]

        print("Tokenizing...")
        gts_tokenized = self.tokenizer.tokenize(gts)
        res_tokenized = self.tokenizer.tokenize(res)

        scores = {}
        print("Computing scores...")
        for scorer, method in self.caption_scorers:
            print(f"Computing {method} score...")
            overall, per_caption = scorer.compute_score(gts_tokenized, res_tokenized)

            if not isinstance(method, list):
                scores[method] = overall
                scores[f"{method}_per_caption"] = per_caption
                continue

            # Multi-valued scorer: pair each returned value with its name.
            for value, values_per_caption, name in zip(overall, per_caption, method):
                scores[name] = value
                scores[f"{name}_per_caption"] = values_per_caption

        return scores

In [None]:
import torch
from transformers import Blip2Processor, Blip2ForConditionalGeneration
from datasets import COCODataset
from tqdm import tqdm
from PIL import Image
from torch.utils.data import DataLoader
from utils import print_model_structure

# Select the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the BLIP-2 processor and model from the same checkpoint name, then
# move the model to the selected device.
model_name = "Salesforce/blip2-opt-2.7b"
processor = Blip2Processor.from_pretrained(model_name)
model = Blip2ForConditionalGeneration.from_pretrained(model_name).to(device)

# Dataset built from the val2017 image directory and its captions annotation file.
coco_dataset = COCODataset(
    img_dir="./data/coco/val2017",
    ann_file="./data/coco/annotations/captions_val2017.json",
)

In [None]:
# Stage 1: caption at most 20 samples and write the raw output to disk.
inferencer = InferencePipeline(model, processor, device)
print("Starting inference...")
results = inferencer.run_inference(
    coco_dataset,
    task="image_captioning",
    max_samples=20,
)
inferencer.save_results(results, "./results/coco_quantized_inference.json")

# Stage 2: score what was saved, reading it back from the file rather than
# reusing the in-memory dict.
scorer = ScoringPipeline()
loaded_results = scorer.load_results("./results/coco_quantized_inference.json")
scores = scorer.compute_scores(loaded_results, task="image_captioning")

# Stage 3: report the overall metrics; the "*_per_caption" entries are skipped.
for metric, score in scores.items():
    if metric.endswith("_per_caption"):
        continue
    print(f"{metric}: {score}")