https://github.com/haotian-liu/LLaVA \
https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md

In [1]:
import os
import sys

In [2]:
# Make the local LLaVA checkout (./LLaVA under the current working directory)
# importable, so the `llava.*` imports in the following cell resolve.
# The membership check keeps this cell idempotent: re-running it does not
# append the same directory to sys.path a second time.
llava_repo_dir = os.path.join(os.getcwd(), "LLaVA")
if llava_repo_dir not in sys.path:
    sys.path.append(llava_repo_dir)

In [3]:
# Set CUDA_VISIBLE_DEVICES to expose only device 0.
# This cell runs before the cell that imports torch.
# NOTE(review): presumably the variable has to be set before CUDA is
# initialised to take effect, so keep this ordering -- confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Alternative: set CUDA_VISIBLE_DEVICES to expose devices 0, 1, 2, and 3
# os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"

In [4]:
from llava.conversation import conv_templates, SeparatorStyle
from llava.model.builder import load_pretrained_model
from llava.utils import disable_torch_init
from llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN

from PIL import Image
import math
import json
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
def split_list(lst, n):
    """Split a list into at most ``n`` contiguous, roughly equal-sized chunks.

    Args:
        lst: The sequence to split (anything supporting ``len`` and slicing).
        n: Desired number of chunks.

    Returns:
        A list of slices of ``lst``. Each chunk holds ``ceil(len(lst) / n)``
        items except possibly the last, which holds the remainder. Fewer than
        ``n`` chunks are returned when ``lst`` has fewer than ``n`` items, and
        an empty ``lst`` yields ``[]``.
    """
    # An empty input would give chunk_size == 0, and range() rejects a zero
    # step with ValueError, so answer that case directly.
    if len(lst) == 0:
        return []
    chunk_size = math.ceil(len(lst) / n)  # ceiling division: round up
    return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]


def get_chunk(lst, n, k):
    """Return the chunk at index ``k`` of ``split_list(lst, n)``.

    Indexing follows normal list rules, so an out-of-range ``k`` raises
    ``IndexError``.
    """
    return split_list(lst, n)[k]

In [6]:
# Call LLaVA's disable_torch_init() helper before the model is built.
# NOTE(review): presumably this skips redundant default weight initialisation
# to speed up loading -- confirm against llava.utils.
disable_torch_init()

In [7]:
# Model identifier handed to load_pretrained_model in the next cell.
model_path = "liuhaotian/llava-v1.6-vicuna-7b"
# expanduser() only rewrites a leading "~", so the identifier above passes
# through unchanged; the call matters only if a "~/..." local path is used.
model_path = os.path.expanduser(model_path)

In [8]:
# Load the tokenizer, model, image processor and context length in one call.
# model_base is None here; model_name is derived from model_path by LLaVA's
# get_model_name_from_path helper.
model_base=None
model_name = get_model_name_from_path(model_path)
tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, model_base, model_name)

You are using a model of type llava to instantiate a model of type llava_llama. This is not supported for all configurations of models and can yield errors.
Loading checkpoint shards: 100%|██████████| 3/3 [00:06<00:00,  2.02s/it]
Some weights of LlavaLlamaForCausalLM were not initialized from the model checkpoint at liuhaotian/llava-v1.6-vicuna-7b and are newly initialized: ['model.layers.12.self_attn.rotary_emb.inv_freq', 'model.layers.7.self_attn.rotary_emb.inv_freq', 'model.layers.18.self_attn.rotary_emb.inv_freq', 'model.layers.21.self_attn.rotary_emb.inv_freq', 'model.layers.13.self_attn.rotary_emb.inv_freq', 'model.layers.23.self_attn.rotary_emb.inv_freq', 'model.layers.1.self_attn.rotary_emb.inv_freq', 'model.layers.8.self_attn.rotary_emb.inv_freq', 'model.layers.30.self_attn.rotary_emb.inv_freq', 'model.layers.9.self_attn.rotary_emb.inv_freq', 'model.layers.2.self_attn.rotary_emb.inv_freq', 'model.layers.4.self_attn.rotary_emb.inv_freq', 'model.layers.0.self_attn.rotary_emb.inv

In [9]:
# Load the questions: one JSON object per line (JSON Lines format).
question_file = "llava_questions.jsonl"
# The context manager closes the file as soon as parsing finishes (a bare
# open() inside the comprehension leaves the handle open). Blank lines are
# skipped so a trailing newline does not make json.loads raise.
with open(os.path.expanduser(question_file), "r") as question_fh:
    questions = [json.loads(q) for q in question_fh if q.strip()]

In [10]:
# Shard selection: keep only chunk `chunk_idx` out of `num_chunks` chunks.
# With a single chunk and index 0, a non-empty question list is kept whole.
num_chunks, chunk_idx = 1, 0
questions = get_chunk(questions, num_chunks, chunk_idx)

In [11]:
# Destination file for the generated answers.
answers_file = "./llava_answers.jsonl"
answers_file = os.path.expanduser(answers_file)
# Create the parent directory if needed. dirname() returns "" for a bare file
# name and os.makedirs("") raises FileNotFoundError, so only call makedirs
# when there is a directory component.
answers_dir = os.path.dirname(answers_file)
if answers_dir:
    os.makedirs(answers_dir, exist_ok=True)

In [12]:
# Open the answers file for writing; mode "w" truncates any existing file.
# NOTE(review): none of the cells visible here writes to or closes ans_file --
# confirm whether it is still needed, and close it (or use `with`) when done.
ans_file = open(answers_file, "w")

In [13]:
# Work through a single example: take the first question record.
line = questions[0]

In [14]:
# Pull the fields used below out of the question record.
idx = line["question_id"]
image_file = line["image"]
qs = line["text"]
# Keep a copy of the question text as read from the record; qs is reassigned
# in a later cell.
cur_prompt = qs

In [15]:
# Prefix the question with the image placeholder, wrapped in start/end
# tokens when the model config asks for them.
if model.config.mm_use_im_start_end:
    image_prefix = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN
else:
    image_prefix = DEFAULT_IMAGE_TOKEN
# Build from cur_prompt (the untouched question text) rather than from qs
# itself, so re-running this cell does not stack a second image placeholder
# onto an already-prefixed prompt.
qs = image_prefix + '\n' + cur_prompt

In [16]:
# Display the question text with the image placeholder prepended.
qs

'<image>\nExplain the image in detail'

In [17]:
# Select the conversation template by name and work on a copy, so the shared
# entry in conv_templates is not modified by the appends that follow.
conv_mode = "llava_v1"
conv = conv_templates[conv_mode].copy()

In [18]:
# Add the question under the first role, then the second role with None as
# its message; the resulting prompt (shown below) ends with "ASSISTANT:".
# NOTE: re-running this cell appends the pair again; re-create conv first.
conv.append_message(conv.roles[0], qs)
conv.append_message(conv.roles[1], None)

In [19]:
# Display the conversation object to inspect its roles, messages and separators.
conv

Conversation(system="A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.", roles=('USER', 'ASSISTANT'), messages=[['USER', '<image>\nExplain the image in detail'], ['ASSISTANT', None]], offset=0, sep_style=<SeparatorStyle.TWO: 2>, sep=' ', sep2='</s>', version='v1', skip_next=False)

In [20]:
# Render the conversation into a single prompt string and display it.
prompt = conv.get_prompt()
prompt

"A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. USER: <image>\nExplain the image in detail ASSISTANT:"

In [21]:
# Tokenize the prompt with LLaVA's tokenizer_image_token helper (returning a
# PyTorch tensor), add a leading batch dimension with unsqueeze(0), and move
# the result to the GPU.
input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()

In [22]:
# Display the token ids that will be fed to the model.
input_ids

tensor([[    1,   319, 13563,  1546,   263, 12758,  5199,   322,   385, 23116,
         21082, 20255, 29889,   450, 20255,  4076,  8444, 29892, 13173, 29892,
           322,  1248,   568,  6089,   304,   278,  5199, 29915, 29879,  5155,
         29889,  3148,  1001, 29901, 29871,  -200, 29871,    13,  9544,  7420,
           278,  1967,   297,  9493,   319,  1799,  9047, 13566, 29901]],
       device='cuda:0')

In [23]:
# Open the image named in the question record, resolved against image_folder,
# and convert it to RGB.
image_folder="./"
image = Image.open(os.path.join(image_folder, image_file)).convert('RGB')

In [24]:
# Preprocess the image with LLaVA's process_images helper. It is called with a
# one-element list, so take element 0 of the result.
image_tensor = process_images([image], image_processor, model.config)[0]

In [25]:
# Sampling temperature, named once so do_sample and temperature cannot drift
# apart; a value of 0 turns sampling off through do_sample below.
temperature = 0.2

# Generate the answer under inference mode.
with torch.inference_mode():
    output_ids = model.generate(
        input_ids,
        # Add a batch dimension, cast to half precision and move to the GPU.
        images=image_tensor.unsqueeze(0).half().cuda(),
        image_sizes=[image.size],
        do_sample=temperature > 0,
        temperature=temperature,
        top_p=None,
        num_beams=1,
        # no_repeat_ngram_size=3,
        max_new_tokens=1024,
        use_cache=True)

In [26]:
# Decode the generated ids to text without special tokens, take the first
# (only) sequence in the batch, and trim surrounding whitespace.
outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()

In [27]:
# Display the model's answer.
outputs

"The image captures a serene moment in the wild, featuring two majestic giraffes in a grassy field. The giraffe on the left, slightly ahead of its companion, is facing the camera, its long neck and legs a testament to its towering height. Its coat, a beautiful mosaic of brown and white patches, stands out against the greenery of the field.\n\nThe second giraffe, positioned slightly behind the first, is facing away from the camera, its attention seemingly drawn to something in the distance. Its coat mirrors that of the first, with similar patches of brown and white.\n\nThe field they stand in is a lush green, providing a stark contrast to the giraffes' brown and white coats. In the background, you can see trees and bushes, adding depth to the scene and hinting at the expansive nature of their habitat.\n\nThe sky above is a clear blue, suggesting a bright and sunny day. The overall composition of the image, with the giraffes centrally placed and the field and trees in the background, cre

In [None]:
# Put the LLaVA checkout on PYTHONPATH for processes started from this kernel.
# A `!export ...` line runs in a throwaway subshell, so the variable would not
# reach the kernel or later `!` commands; setting os.environ does, because
# child processes inherit it. (The original also misspelled $PYTHONPATH as
# $PYTHONPAT, which would have dropped any existing value.)
llava_pythonpath = "/data/mn27889/path-open-data/LLaVA"
existing_pythonpath = [
    entry
    for entry in os.environ.get("PYTHONPATH", "").split(os.pathsep)
    if entry and entry != llava_pythonpath  # avoid duplicates on re-run
]
os.environ["PYTHONPATH"] = os.pathsep.join([llava_pythonpath, *existing_pythonpath])

In [None]:
!CUDA_VISIBLE_DEVICES=0 python llava/eval/model_vqa.py --model-path liuhaotian/llava-v1.6-vicuna-7b --image-folder=/data/mn27889/path-open-data/DeepS\
eek-VL2/images --question-file ./../llava_questions.jsonl --answers-file ./../llava_answers.jsonl