In [None]:
# Enable IPython's autoreload extension so edits to imported project modules
# are picked up without restarting the kernel (mode 2: reload all modules
# before executing each cell).
%load_ext autoreload
%autoreload 2

In [None]:
import torch
from transformers import AutoProcessor, LlavaForConditionalGeneration

from dataset import VQAv2Eval
# from inference_pipeline import InferencePipeline
# import time
# from scoring_pipeline import ScoringPipeline

from dataset import VQAv2Eval

# import os
from awq.llava_quantizer import LlavaAWQQuantizer
from transformers.models.llava.image_processing_llava import LlavaImageProcessor

In [None]:
# Choose the compute backend in priority order: Apple MPS, then CUDA, then CPU.
if torch.backends.mps.is_available():
    device_name = "mps"
elif torch.cuda.is_available():
    device_name = "cuda"
else:
    device_name = "cpu"
device = torch.device(device_name)

# Echo the selected device as the cell output.
device

In [None]:
# On-disk locations of the VQAv2 evaluation data (relative to the notebook).
image_root = "./data/vqav2/val2014"
q_root = "./data/vqav2/questions"
ann_root = "./data/vqav2/annotations"

# LLaVA chat-style prompt template with a single `{}` placeholder.
# NOTE(review): the placeholder is presumably filled with the question text by
# VQAv2Eval — confirm in dataset.py.
llava_prompt = (
    "USER: <image>\n"
    "{}\n"
    "Answer the question using a single word or phrase. ASSISTANT:"
)

dataset = VQAv2Eval(
    image_root=image_root,
    ann_root=ann_root,
    q_root=q_root,
    prompt=llava_prompt,
)

In [None]:
# Hugging Face checkpoint shared by the model, the processor and the image processor.
model_id = "llava-hf/llava-1.5-7b-hf"

# Load the LLaVA weights in half precision.
model = LlavaForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.float16)

# Slow (non-fast) processor with an explicit pad token.
processor = AutoProcessor.from_pretrained(model_id, pad_token="<pad>", use_fast=False)

# The processor's image processor is replaced by one created with do_pad=True, following
# "Note regarding reproducing original implementation" in
# https://huggingface.co/docs/transformers/en/model_doc/llava
image_processor = LlavaImageProcessor.from_pretrained(model_id, do_pad=True)
processor.image_processor = image_processor

# Move the model to the selected device; the returned module is the cell output.
model.to(device)

In [None]:
# FP output: generate once on a single web image before quantization is run.
image_part = {"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"}
text_part = {"type": "text", "text": "What is shown in this image?"}
conversation = [{"role": "user", "content": [image_part, text_part]}]

# Render the chat template and tokenize in one call, returning PyTorch tensors.
chat_inputs = processor.apply_chat_template(
    conversation,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
)
# Move the batch to the model's device, with float16 as the target dtype.
inputs = chat_inputs.to(model.device, torch.float16)

# Gradients are not needed for generation.
with torch.no_grad():
    generate_ids = model.generate(**inputs, max_new_tokens=30)
    fp_text = processor.batch_decode(generate_ids, skip_special_tokens=True)

print(fp_text)

In [None]:
# Settings passed to LlavaAWQQuantizer. Only the language-model layers are
# configured; a "vision_layers" entry of the same shape
# ({"self_attn": 16, "mlp": 16}) was tried previously and is currently disabled.
# NOTE(review): the integers are presumably bit-widths — confirm against the quantizer.
config = dict(llm_layers=dict(self_attn=4, mlp=4))

config

In [None]:
# Build the AWQ quantizer from the loaded model, the target device, the processor,
# the VQAv2 dataset and the per-layer settings in `config`.
# NOTE(review): the dataset is presumably used for calibration — confirm in awq.llava_quantizer.
quantizer = LlavaAWQQuantizer(model, device, processor, dataset, config)

In [None]:
# Override the quantizer's sample count after construction.
# NOTE(review): presumably the number of calibration samples — confirm in LlavaAWQQuantizer.
quantizer.n_samples = 128

In [None]:
# Show the sample count the quantizer will use.
print(quantizer.n_samples)

In [None]:
# Make sure the model is on the selected device, then run quantization.
# NOTE(review): quantize() presumably modifies `model` in place, since later cells
# keep using the same `model` object — confirm in LlavaAWQQuantizer.
model.to(device)
quantizer.quantize()

In [None]:
# Display the "image" entry of dataset sample 100 as the cell output.
dataset[100]["image"]

In [None]:
# Build the (image, prompt) pair for one dataset sample.
# The prompt is rendered from the shared `llava_prompt` template rather than a
# retyped copy, so it cannot drift from the template the dataset was built with.
sample_idx = 100
img = dataset[sample_idx]["image"]
prompt = llava_prompt.format(dataset.qa_pairs[sample_idx]["question"])

print(prompt)

In [None]:
# Use the device selected at the top of the notebook instead of a hard-coded
# "cuda", which raises on MPS-only or CPU-only machines.
model = model.to(device)

# Preprocess the single (image, prompt) pair and move it to the model's device.
# float16 is passed as the target dtype, as in the FP baseline cell, to match the
# half-precision weights the model was loaded with.
samples = processor(images=[img], text=[prompt], return_tensors="pt", padding=True).to(
    model.device, torch.float16
)

# Show which tensors the processor produced.
samples.keys()

In [None]:
# Generate an answer for the dataset sample with the current `model`.
# The token budget is set explicitly (same value as the FP baseline cell) rather
# than relying on whatever default length the generation config provides.
generate_ids = model.generate(**samples, max_new_tokens=30)
processor.batch_decode(generate_ids, skip_special_tokens=True)

In [None]:
from torch.utils.data import DataLoader
from inference_pipeline import InferencePipeline

# NOTE(review): presumably caps the dataset at 10 samples for a quick run —
# confirm in dataset.py. This mutates `dataset` for all later cells.
dataset.set_max_samples(10)

# Deterministic, in-order batches collated by the dataset's own collater.
loader_options = {
    "batch_size": 16,
    "num_workers": 1,
    "pin_memory": False,
    "shuffle": False,
    "collate_fn": dataset.collater,
}
dataloader = DataLoader(dataset, **loader_options)

inferencer = InferencePipeline(model, device, processor)

# Left padding, per the Hugging Face usage tips:
# https://huggingface.co/docs/transformers/en/model_doc/llava
processor.tokenizer.padding_side = "left"
processor_kwargs = {"padding": True}

# Greedy decoding: a single beam and no sampling.
generate_kwargs = dict(num_beams=1, do_sample=False)

results = inferencer.run_inference(
    dataloader,
    task="vqav2",
    processor_kwargs=processor_kwargs,
    generate_kwargs=generate_kwargs,
)

In [None]:
# Display the object returned by run_inference as the cell output.
results

In [None]:
# NOTE(review): this cell repeats the display of `results` from the cell before it;
# it looks like a leftover and can probably be removed.
results