# Demo of BLIP-2 Quantization, Inference, and Scoring

## 1. Load Model and Quantize

In [None]:
import torch
from transformers import Blip2ForConditionalGeneration, Blip2Processor, AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
import logging
from PIL import Image
import json
import os
import random

# Configure root logging: INFO level, timestamped records tagged with the logger name
logging.basicConfig(
    level=logging.INFO,
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
)

# Fetch the BLIP-2 checkpoint: its processor, then the model weights in float16
model_name = "Salesforce/blip2-opt-2.7b"
processor = Blip2Processor.from_pretrained(model_name)
model = Blip2ForConditionalGeneration.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
)

# Paths to your local COCO dataset
COCO_DIR = "../data/coco"  # Replace with your COCO dataset directory
IMAGE_DIR = os.path.join(COCO_DIR, "val2017")  # Adjust if your image directory is different
ANNOTATION_FILE = os.path.join(
    COCO_DIR, "annotations", "captions_val2017.json"
)  # Adjust if your annotation file is different

# Load COCO annotations.
# The encoding is passed explicitly so the JSON is decoded as UTF-8 even on
# platforms whose default text encoding is something else.
with open(ANNOTATION_FILE, "r", encoding="utf-8") as f:
    annotations = json.load(f)


def get_random_samples(annotations, num_samples=128, image_dir=None):
    """Pick ``num_samples`` random images and return one caption sample for each.

    Args:
        annotations: Parsed caption annotations; a dict with an ``"images"``
            list (entries carrying ``"id"`` and ``"file_name"``) and an
            ``"annotations"`` list (entries carrying ``"image_id"`` and
            ``"caption"``).
        num_samples: Number of distinct images to draw.
        image_dir: Directory that image file names are joined onto. Defaults
            to the module-level ``IMAGE_DIR``.

    Returns:
        A list of ``{"image_file": ..., "caption": ...}`` dicts, one per
        selected image, in the order the images first appear in
        ``annotations["annotations"]``. The caption is the first one listed
        for that image.

    Raises:
        ValueError: If ``num_samples`` exceeds the number of annotated images
            (raised by ``random.sample``).
        KeyError: If a selected image id has no entry in
            ``annotations["images"]``.
    """
    if image_dir is None:
        image_dir = IMAGE_DIR

    # Index file names once instead of scanning the image list per annotation.
    file_names = {img["id"]: img["file_name"] for img in annotations["images"]}

    image_ids = list({ann["image_id"] for ann in annotations["annotations"]})
    # Ids still waiting for a caption; a set keeps the membership test O(1).
    pending = set(random.sample(image_ids, num_samples))

    samples = []
    for ann in annotations["annotations"]:
        if not pending:
            break
        image_id = ann["image_id"]
        if image_id not in pending:
            continue
        # Take only the first caption of each image so the result covers
        # num_samples distinct images rather than repeated ones.
        pending.discard(image_id)
        samples.append(
            {
                "image_file": os.path.join(image_dir, file_names[image_id]),
                "caption": ann["caption"],
            }
        )
    return samples


# Draw 128 random COCO caption samples to serve as the calibration set
calibration_data = get_random_samples(
    annotations,
    num_samples=128,
)


# Process calibration data
def process_example(example):
    """Run one calibration sample through the BLIP-2 processor.

    Args:
        example: Dict with ``"image_file"`` (path to an image) and
            ``"caption"`` (its text), as produced by ``get_random_samples``.

    Returns:
        Dict with the ``input_ids`` and ``attention_mask`` tensors from the
        processor output; any other processor outputs are not returned.
    """
    # Image.open keeps the underlying file open; convert() yields an
    # independent in-memory copy, so the file can be closed right after.
    with Image.open(example["image_file"]) as raw_image:
        image = raw_image.convert("RGB")
    inputs = processor(images=image, text=example["caption"], return_tensors="pt", padding=True)
    return {
        "input_ids": inputs.input_ids,
        "attention_mask": inputs.attention_mask,
    }


# Turn every calibration sample into the tensors handed to the quantizer
examples = list(map(process_example, calibration_data))

# Export the language model on its own, with its tokenizer, into one directory
language_model_path = "./blip2_language_model"
language_model = model.language_model
language_model.save_pretrained(language_model_path)

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.save_pretrained(language_model_path)

# 4-bit quantization settings
quantize_config = BaseQuantizeConfig(bits=4, group_size=128, desc_act=False)

# Reload the exported language model and quantize it using the calibration examples
quantized_model = AutoGPTQForCausalLM.from_pretrained(language_model_path, quantize_config)
quantized_model.quantize(examples)

# print_model_structure(model)

## 2. Run Inference on Model and Generate a .json File

In [None]:
from inference_pipeline import InferencePipeline
from dataset import COCODataset

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
processor = Blip2Processor.from_pretrained(model_name)
# Evaluate on the same COCO files the calibration step loaded, instead of a
# second hard-coded path that can silently point somewhere else.
coco_dataset = COCODataset(
    ann_file=ANNOTATION_FILE,
    img_dir=IMAGE_DIR,
)

# NOTE(review): quantized_model was built from the language model alone
# (model.language_model); confirm InferencePipeline can caption images with it
# rather than needing the full BLIP-2 model.
inferencer = InferencePipeline(quantized_model, device, processor)
print("Starting inference...")
results = inferencer.run_inference(coco_dataset, task="image_captioning", max_samples=20)
print("Inference Finished, Saving Results...")
# Make sure the output directory exists before writing the results file.
os.makedirs("./results", exist_ok=True)
inferencer.save_results(results, "./results/coco_quantized_inference.json")

## 3. Score Results from .json File

In [None]:
from scoring_pipeline import ScoringPipeline

# Reload the saved predictions from disk and score them as image captions
scorer = ScoringPipeline()
loaded_results = scorer.load_results("./results/coco_quantized_inference.json")
scores = scorer.compute_scores(loaded_results, task="image_captioning")

# Print every metric except those whose name ends in "_per_caption"
for metric, score in scores.items():
    if metric.endswith("_per_caption"):
        continue
    print(f"{metric}: {score}")

## Sample Results

This step is optional; it gives a qualitative sense of how the predicted captions compare with the reference captions.

In [None]:
import json

# Read the saved inference results; the context manager closes the file even
# if JSON parsing fails.
with open("./results/coco_quantized_inference.json") as f:
    data = json.load(f)

# Show at most the first five entries; slicing avoids an IndexError when fewer
# predictions were saved.
for prediction, references in zip(data["predictions"][:5], data["references"][:5]):
    # NOTE(review): relies on each prediction dict holding exactly two values,
    # image id then caption, in that order — confirm against save_results.
    img_id, caption = prediction.values()
    print(f"Image Id: {img_id}\nPredicted Caption:{caption}")
    print(f"Reference Captions: {' '.join(references)}\n")

### Here is the image that the first predicted caption above describes:

In [None]:
# First element of the first dataset item, shown as the cell output.
# Presumably this is the image itself (COCODataset is defined elsewhere) — confirm.
coco_dataset[0][0]

## Cleanup

In [None]:
import gc

model.to("cpu")
# Drop every notebook-level reference that keeps model weights alive.
# pop() tolerates names that were never created (such as `evaluator`, or names
# from cells that were skipped). A plain `del` would raise NameError there and
# abort before the collector and the CUDA cache are cleared.
for _name in ("model", "evaluator", "language_model", "quantized_model", "inferencer"):
    globals().pop(_name, None)
gc.collect()
torch.cuda.empty_cache()