In [None]:
import sys

sys.path.append("..")

from transformers import AutoProcessor, LlavaForConditionalGeneration
from transformers.models.llava.image_processing_llava import LlavaImageProcessor
import transformers

from dataset import VQAv2Eval, GQAEval
from inference_pipeline import InferencePipeline

import torch
from torch.utils.data import DataLoader

In [None]:
# Prompt template used for short-answer evaluation. Wording follows the LLaVA evaluation guide:
# https://github.com/haotian-liu/LLaVA/blob/main/docs/Evaluation.md
# The `{}` placeholder is presumably filled with the question text by the dataset class.
llava_prompt = "USER: <image>\n{}\nAnswer the question using a single word or phrase. ASSISTANT:"

# VQAv2 locations: validation images, question files and annotation files.
image_root = "./data/vqav2/val2014"
q_root = "./data/vqav2/questions"
ann_root = "./data/vqav2/annotations"

# NOTE(review): the GQA cell further down rebinds `image_root`, `q_root`, `llava_prompt` and
# `dataset`, so only the dataset cell that was executed last feeds the DataLoader.
dataset = VQAv2Eval(
    image_root=image_root,
    ann_root=ann_root,
    q_root=q_root,
    prompt=llava_prompt,
)

# Optional subsampling hook, currently disabled:
# dataset.set_max_samples(21435)

# Display the number of samples as the cell output.
len(dataset)

In [None]:
# Pick the compute device: Apple MPS if available, otherwise CUDA, otherwise CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Single source of truth for the checkpoint used by the model, processor and image processor.
model_id = "llava-hf/llava-1.5-7b-hf"

# Load the weights in half precision and move them to the device selected above.
# The device must not be hard-coded to "cuda": that fails on MPS/CPU-only machines and
# would disagree with the `device` object that is later passed to the inference pipeline.
model = LlavaForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.float16)
model.to(device)
processor = AutoProcessor.from_pretrained(model_id, pad_token="<pad>", use_fast=False)

# need to use this image processor w/ do_pad=True according to "Note regarding reproducing original implementation"
# https://huggingface.co/docs/transformers/en/model_doc/llava
image_processor = LlavaImageProcessor.from_pretrained(model_id, do_pad=True)

# Swap the padded image processor into the combined processor used for inference.
processor.image_processor = image_processor

In [None]:
# Locations of the GQA images and question files.
image_root = "./data/gqa/images"
q_root = "./data/gqa/questions"

# Same short-answer prompt template as recommended by the LLaVA evaluation guide:
# https://github.com/haotian-liu/LLaVA/blob/main/docs/Evaluation.md
llava_prompt = "USER: <image>\n{}\nAnswer the question using a single word or phrase. ASSISTANT:"

# Build the GQA evaluation dataset; this rebinds `dataset` for the cells that follow.
dataset = GQAEval(
    image_root,
    q_root,
    prompt=llava_prompt,
)

In [None]:
# Batched, in-order iteration over the evaluation set.
# Batches are assembled by the dataset's own `collater`.
dataloader = DataLoader(
    dataset,
    collate_fn=dataset.collater,
    batch_size=16,
    shuffle=False,  # keep the original sample order
    num_workers=1,
    pin_memory=False,
)

In [None]:
# Bundle the model, target device and processor into the project's inference helper.
inferencer = InferencePipeline(model, device, processor)

# Left-side padding for batched generation, following the Hugging Face LLaVA usage tips:
# https://huggingface.co/docs/transformers/en/model_doc/llava
processor.tokenizer.padding_side = "left"
processor_kwargs = {"padding": True}

# Optional greedy-decoding settings, currently disabled (None is passed below instead):
# generate_kwargs = {
#     'num_beams': 1,
#     'do_sample': False
# }

results = inferencer.run_inference(
    dataloader,
    task="gqa",
    processor_kwargs=processor_kwargs,
    generate_kwargs=None,
)

# Show the raw inference results as the cell output.
results

In [None]:
from scoring_pipeline import ScoringPipeline

# Keep only the text generated after the final assistant marker.
# The prompt template ends in "ASSISTANT:" with no trailing space, so the marker is matched
# without the space and the remainder is stripped. This way an empty generation, or one not
# preceded by a space, no longer leaves the whole prompt in the "answer" field.
for res in results:
    res["answer"] = res["answer"].rpartition("ASSISTANT:")[2].strip()


def compute_gqa_results(results, scorer, save_path=None):
    """Score GQA predictions, print the scores and optionally save them as JSON.

    Args:
        results: Predictions to score; passed unchanged to ``scorer.compute_scores``.
        scorer: Object exposing ``compute_scores(results, task)``.
        save_path: Optional file path. When truthy, the scores are written there with
            ``json.dump``, so they must be JSON-serialisable in that case.

    Returns:
        None. The scores are printed (and written to ``save_path`` when requested).
    """
    import json  # local import: only this function needs it

    gqa_results = scorer.compute_scores(results, "gqa")
    print(gqa_results)

    # Persist the scores only when the caller asked for it.
    if save_path:
        with open(save_path, "w", encoding="utf-8") as f:
            json.dump(gqa_results, f)

# Instantiate the project's scorer and report the GQA scores for the cleaned predictions.
scorer = ScoringPipeline()
compute_gqa_results(results=results, scorer=scorer)