In [2]:
import sys
sys.path.append('..')

from transformers import AutoProcessor, LlavaForConditionalGeneration
from transformers.models.llava.image_processing_llava import LlavaImageProcessor
import transformers

from dataset import VQAv2Eval, GQAEval
from inference_pipeline import InferencePipeline

import torch
from torch.utils.data import DataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
ann_root = '/fs/cfar-projects/low-bit-vision/datasets/vqav2/annotations'
q_root = '/fs/cfar-projects/low-bit-vision/datasets/vqav2/questions'
image_root = '/fs/cfar-projects/low-bit-vision/datasets/vqav2/val2014'
# short answer prompting according to: https://github.com/haotian-liu/LLaVA/blob/main/docs/Evaluation.md
llava_prompt = 'USER: <image>\n{}\nAnswer the question using a single word or phrase. ASSISTANT:'

dataset = VQAv2Eval(image_root=image_root,
                    ann_root=ann_root,
                    q_root=q_root,
                    prompt = llava_prompt)

# dataset.set_max_samples(21435)

len(dataset)


214354

In [2]:
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf", torch_dtype=torch.float16)
model.to('cuda')
processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf", pad_token = '<pad>', use_fast = False)

# need to use this image processor w/ do_pad=True according to "Note regarding reproducing original implementation"
# https://huggingface.co/docs/transformers/en/model_doc/llava
image_processor = LlavaImageProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf",
                                                        do_pad=True)

processor.image_processor = image_processor

Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 13.18it/s]


In [3]:
# short answer prompting according to: https://github.com/haotian-liu/LLaVA/blob/main/docs/Evaluation.md
llava_prompt = 'USER: <image>\n{}\nAnswer the question using a single word or phrase. ASSISTANT:'

# GQA dataset paths
image_root = '/fs/cfar-projects/low-bit-vision/datasets/gqa/images'
q_root = '/fs/cfar-projects/low-bit-vision/datasets/gqa/questions'

dataset = GQAEval(
        image_root,
        q_root,
        prompt=llava_prompt
)

In [4]:
dataloader = DataLoader(dataset,
                        batch_size=16,
                        num_workers=1,
                        pin_memory=False,
                        shuffle=False,
                        collate_fn = dataset.collater)

In [6]:
inferencer = InferencePipeline(model, device, processor)

# set this according to huggingface usage tips: https://huggingface.co/docs/transformers/en/model_doc/llava
processor.tokenizer.padding_side = "left"
processor_kwargs = dict(padding=True)

# greedy decoding
# generate_kwargs = {
#     'num_beams': 1,
#     'do_sample': False
# }

results = inferencer.run_inference(
    dataloader,
    task = 'gqa',
    processor_kwargs = processor_kwargs,
    generate_kwargs = None
)

results

100%|██████████| 787/787 [33:25<00:00,  2.55s/it]


[{'question_id': '201307251',
  'answer': 'USER: \nIs it overcast?\nAnswer the question using a single word or phrase. ASSISTANT: No',
  'gt_answer': 'no'},
 {'question_id': '201640614',
  'answer': 'USER: \nWho is wearing the dress?\nAnswer the question using a single word or phrase. ASSISTANT: Lady',
  'gt_answer': 'women'},
 {'question_id': '202225914',
  'answer': 'USER: \nDoes the utensil on top of the table look clean and black?\nAnswer the question using a single word or phrase. ASSISTANT: No',
  'gt_answer': 'no'},
 {'question_id': '2062325',
  'answer': 'USER: \nIs the surfer that looks wet wearing a wetsuit?\nAnswer the question using a single word or phrase. ASSISTANT: Yes',
  'gt_answer': 'yes'},
 {'question_id': '201303229',
  'answer': 'USER: \nHow tall is the chair in the bottom of the photo?\nAnswer the question using a single word or phrase. ASSISTANT: Tall',
  'gt_answer': 'short'},
 {'question_id': '201902997',
  'answer': 'USER: \nWhat kind of device is on top of th

In [7]:
from scoring_pipeline import ScoringPipeline

for res in results:
    res['answer'] = res['answer'].split('ASSISTANT: ')[-1]

def compute_gqa_results(results, scorer, save_path=None):
    gqa_results = scorer.compute_scores(results, "gqa")
    print(gqa_results)
# if save_path:
#     with open(save_path, "w") as f:
#         json.dump(gqa_results, f)

scorer = ScoringPipeline()
compute_gqa_results(results, scorer)

Adding current path to python system paths
{'agg_metrics': 61.47, 'acc': 61.47}
