In [None]:
# Reload imported modules automatically before each cell executes, so edits to
# the local project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import torch
from transformers import (
    Blip2Processor,
    Blip2ForConditionalGeneration,
    AutoProcessor,
    Blip2ForImageTextRetrieval,
)
from dataset import COCODataset
from awq.quantizer import (
    Blip2ForConditionalGenerationAWQQuantizer,
    Blip2ForImageTextRetrievalAWQQuantizer,
)
from inference_pipeline import InferencePipeline
import time
from scoring_pipeline import ScoringPipeline

from dataset import Flickr30kEvalDataset
import torchvision.transforms as transforms

In [None]:
# Pick the compute backend: Apple MPS first, then CUDA, with CPU as the fallback.
# The conditional expression short-circuits, so CUDA is only probed when MPS is absent.
device = torch.device(
    "mps"
    if torch.backends.mps.is_available()
    else ("cuda" if torch.cuda.is_available() else "cpu")
)

device

In [None]:
def model_size(model):
    """Return the size of ``model``'s parameters and buffers in megabytes.

    One megabyte is taken as 1e6 bytes. Each tensor contributes
    ``nelement() * element_size()`` bytes.

    Args:
        model: a ``torch.nn.Module``.

    Returns:
        float: total size of all parameters and buffers, in megabytes.
    """
    # model.parameters() recurses through every submodule. Unlike a walk over
    # leaf modules only, it also includes parameters registered directly on
    # container modules, and it yields a shared (tied) tensor just once instead
    # of once per module that references it.
    param_bytes = sum(p.nelement() * p.element_size() for p in model.parameters())

    # model buffers (not quantized)
    buffer_bytes = sum(b.nelement() * b.element_size() for b in model.buffers())

    # bytes --> megabytes
    return (param_bytes + buffer_bytes) / 1e6

## AWQ Blip-2 Caption Generation

In [None]:
# SETUP
# Load the BLIP-2 captioning checkpoint named below and move it to the selected device.
model_name = "Salesforce/blip2-opt-2.7b"
model = Blip2ForConditionalGeneration.from_pretrained(model_name)
# Last expression of the cell, so its return value is echoed as the cell output.
model.to(device)

In [None]:
# Size of the unquantized model in megabytes; baseline for the post-quantization size.
model_size(model)

In [None]:
# Processor loaded from the same checkpoint name as the model.
processor = Blip2Processor.from_pretrained(model_name)

# NOTE: set paths as appropriate
# Will sample n_samples from dataset to create calibration set
# (the same dataset object is also used for inference and is later handed to the quantizer).
# NOTE(review): the directory is spelled "cocow" — confirm it matches the local data layout.
coco_dataset = COCODataset(
    ann_file="./data/cocow/annotations/captions_val2017.json",
    img_dir="./data/cocow/images/val2017",
)

In [None]:
# Peek at the first field of the first sample
# (presumably the image — confirm against COCODataset.__getitem__).
coco_dataset[0][0]

In [None]:
# Smoke test: caption a single sample before running the full evaluation.
pipeline = InferencePipeline(model, device, processor)
results = pipeline.run_inference(coco_dataset, task="image_captioning", max_samples=1)
# Echo the result dict; for one sample it is small enough to display in full.
results

{'predictions': [{'image_id': 397133,
   'caption': 'a woman in a kitchen with a man in a kitchen'}],
 'references': [['A man is in a kitchen making pizzas.',
   'Man in apron standing on front of oven with pans and bakeware',
   'A baker is working in the kitchen rolling dough.',
   'A person standing by a stove in a kitchen.',
   'A table with pies being made and a person standing near a wall with pots and pans hanging on the wall.']]}

In [None]:
# full precision
# Baseline run: caption the whole dataset with the unquantized model.
pipeline = InferencePipeline(model, device, processor)
results = pipeline.run_inference(coco_dataset, task="image_captioning")
# Preview only the first few predictions. Echoing the whole dict would write one
# entry per evaluated image (plus all reference captions) into the notebook output.
results["predictions"][:5]

In [None]:
# Score the full-precision captions and report the aggregate metrics.
scorer = ScoringPipeline()
scores = scorer.compute_scores(results, task="image_captioning")

for metric, score in scores.items():
    # Entries ending in "_per_caption" are skipped; only the summary values are printed.
    if metric.endswith("_per_caption"):
        continue
    print(f"{metric}: {score}")

In [None]:
# Sample quantization config (could be loaded from JSON instead).
# Layout: component group -> {model_part: bit_width}
config = {
    "vit_layers": {
        "self_attn": 4,
        "self_attn_output": 4,
        "fc1": 4,
        "fc2": 4,
    },
    "qformer_layers": {
        "self_attn": 4,
        "self_attn_output": 4,
        "intermediate_query": 4,
        "output_query": 4,
        "cross_attn": 4,
        "cross_attn_output": 4,
    },
    "llm_layers": {
        "self_attn": 4,
        "self_attn_output": 4,
        "fc1": 4,
        "fc2": 4,
    },
}

In [None]:
# Apply AWQ
# The quantizer receives the model, the dataset and the per-part bit widths from `config`.
quantizer = Blip2ForConditionalGenerationAWQQuantizer(model, device, processor, coco_dataset, config)

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() can jump if the system clock is adjusted mid-run.
start = time.perf_counter()
quantizer.quantize()
print(f"Quantization time: {time.perf_counter() - start:.2f} seconds")

In [None]:
# model size (mb)
# Dividing by 8e6 treats quantizer.model_size as a bit count (8 bits per byte,
# 1e6 bytes per MB), the same conversion model_size() applies — TODO confirm the unit.
quantizer.model_size / 8e6

In [None]:
# NOTE: make sure to move the model back to the device; quantizing moves layers
# around to save memory.
model.to(device)
# Smoke test on a single sample before running the full quantized evaluation.
pipeline = InferencePipeline(model, device, processor)
results = pipeline.run_inference(coco_dataset, task="image_captioning", max_samples=1)
results

In [None]:
# Full-dataset captioning with the quantized model, scored with the same metrics
# as the full-precision baseline.
results = pipeline.run_inference(coco_dataset, task="image_captioning")

scorer = ScoringPipeline()
scores = scorer.compute_scores(results, task="image_captioning")

for metric, score in scores.items():
    # Entries ending in "_per_caption" are skipped; only the summary values are printed.
    if metric.endswith("_per_caption"):
        continue
    print(f"{metric}: {score}")

## AWQ BLIP-2 Image-Text Retrieval

In [None]:
# Load the BLIP-2 image-text-retrieval checkpoint and its processor.
# This rebinds `model`, `model_name` and `processor`, replacing the captioning
# objects used in the cells above.
model_name = "Salesforce/blip2-itm-vit-g-coco"
model = Blip2ForImageTextRetrieval.from_pretrained(model_name)
model.to(device)
processor = AutoProcessor.from_pretrained(model_name)

# Echo the module structure as the cell output.
model

In [None]:
# Size of the unquantized retrieval model in megabytes (baseline).
model_size(model)

In [None]:
# Flickr30k test split; adjust the paths to the local data layout.
ann_file = "./data/flickr30k/annotations/test.json"
img_dir = "./data/flickr30k/images/flickr30k-images"

# Evaluation-time preprocessing: bicubic resize to 364x364, tensor conversion,
# then per-channel mean/std normalization.
img_transform = transforms.Compose(
    [
        # InterpolationMode is reached through the already-imported `transforms`
        # namespace; the bare name is never imported in this notebook and would
        # raise NameError.
        transforms.Resize((364, 364), interpolation=transforms.InterpolationMode.BICUBIC),
        transforms.ToTensor(),
        transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
    ]
)

flickr_dataset = Flickr30kEvalDataset(ann_file, img_dir, img_transform=img_transform)

In [None]:
# full-precision
# Baseline: run image-text retrieval with the unquantized model and score it.
pipeline = InferencePipeline(model, device, processor)
results = pipeline.run_inference(flickr_dataset, task="image_text_retrieval")
scorer = ScoringPipeline()
scores = scorer.compute_scores(results, task="image_text_retrieval")

In [None]:
# Display the full-precision retrieval scores.
scores

In [None]:
# Rebuild the dataset without an image transform for the quantization step
# (presumably the quantizer preprocesses raw images through `processor` — TODO confirm).
flickr_dataset = Flickr30kEvalDataset(ann_file, img_dir, img_transform=None)

In [None]:
# NOTE: full 4-bit vit
# Only the vision encoder gets bit-width entries in this run; there is no
# "qformer_layers" entry. Q-Former keys that were commented out of this config,
# kept for reference:
#   self_attn, self_attn_output, intermediate_query, output_query,
#   cross_attn, cross_attn_output
#   (additionally disabled: intermediate_txt, output_txt, vision_proj, txt_proj, itm_head)
config = {
    "vit_layers": {
        "self_attn": 4,
        "self_attn_output": 4,
        "fc1": 4,
        "fc2": 4,
    },
}

In [None]:
# Quantize the retrieval model with the ViT-only config; the dataset passed in
# is the one rebuilt with img_transform=None.
quantizer = Blip2ForImageTextRetrievalAWQQuantizer(model, device, processor, flickr_dataset, config)
quantizer.quantize()

In [None]:
# Quantized model size in megabytes, assuming quantizer.model_size is in bits — TODO confirm.
quantizer.model_size / 8e6

In [None]:
# Move the model back to the device after quantization.
model.to(device)
# Re-attach the evaluation transform that was dropped (img_transform=None) for quantization.
flickr_dataset.img_transform = img_transform

pipeline = InferencePipeline(model, device, processor)
results = pipeline.run_inference(flickr_dataset, task="image_text_retrieval")

In [None]:
"""
    Uniform Quant equivalent results:

    txt_r1:  59.7
    img_r1: 59.82
"""

# Score the retrieval results of the ViT-quantized model.
# NOTE(review): this calls the underscore-prefixed helper directly, whereas the
# full-precision baseline goes through compute_scores(..., task="image_text_retrieval");
# confirm both report the same metrics so the numbers are comparable.
scoring_pipeline = ScoringPipeline()
retrieval_metrics = scoring_pipeline._compute_retrieval_scores(results)
retrieval_metrics

In [None]:
# NOTE: (mostly) full 4-bit Blip-2
# "Mostly": vision_proj, txt_proj and itm_head stay commented out of the
# Q-Former section, so they get no bit-width entry.
config = {
    "vit_layers": {
        "self_attn": 4,
        "self_attn_output": 4,
        "fc1": 4,
        "fc2": 4,
    },
    "qformer_layers": {
        "self_attn": 4,
        "self_attn_output": 4,
        "intermediate_txt": 4,
        "output_txt": 4,
        "intermediate_query": 4,
        "output_query": 4,
        "cross_attn": 4,
        "cross_attn_output": 4,
    },
}

In [None]:
# The previous inference cell re-attached the evaluation transform to
# flickr_dataset. The ViT-only quantization ran on a dataset built with
# img_transform=None, so restore that state here so both quantization runs see
# the same kind of samples.
flickr_dataset.img_transform = None

# NOTE(review): `model` is not reloaded between the two quantization runs, so
# this pass presumably starts from ViT weights that were already quantized —
# confirm that is intended.
quantizer = Blip2ForImageTextRetrievalAWQQuantizer(model, device, processor, flickr_dataset, config)
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() can jump if the system clock is adjusted mid-run.
start = time.perf_counter()
quantizer.quantize()
print(f"Quantization time: {time.perf_counter() - start:.2f} seconds")

In [None]:
# Quantized model size in megabytes, assuming quantizer.model_size is in bits — TODO confirm.
quantizer.model_size / 8e6

In [None]:
# Move the model back to the device after quantization.
model.to(device)
# Attach the evaluation transform to the dataset before running inference.
flickr_dataset.img_transform = img_transform
pipeline = InferencePipeline(model, device, processor)
results = pipeline.run_inference(flickr_dataset, task="image_text_retrieval")

In [None]:
"""
    Uniform-Quant equivalent results:
        txt_r1:  18.2
        img_r1: 11.68

"""

# Score the retrieval results of the ViT + Q-Former quantized model.
# NOTE(review): this calls the underscore-prefixed helper directly, whereas the
# full-precision baseline goes through compute_scores(..., task="image_text_retrieval");
# confirm both report the same metrics so the numbers are comparable.
scoring_pipeline = ScoringPipeline()
retrieval_metrics = scoring_pipeline._compute_retrieval_scores(results)
retrieval_metrics