In [1]:
import os
# Restrict this process to physical GPUs 1, 2 and 3. This is set ahead of the torch
# import further down in this cell.
os.environ["CUDA_VISIBLE_DEVICES"] = "1,2,3"

# One shared directory serves as the location for all three Hugging Face cache variables.
MODEL_PATH = "/media/data2/mgaliazzo/"
os.environ.update(
    {
        "HF_HOME": MODEL_PATH,
        "HF_DATASETS_CACHE": MODEL_PATH,
        "TRANSFORMERS_CACHE": MODEL_PATH,
    }
)

from llava.model.builder import load_pretrained_model
from llava.mm_utils import get_model_name_from_path, process_images, tokenizer_image_token
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, IGNORE_INDEX
from llava.conversation import conv_templates, SeparatorStyle

from PIL import Image
import copy
import torch
from tqdm import tqdm
import json

# Checkpoint and loader settings for the critic model.
pretrained = "lmms-lab/llava-critic-72b"
model_name = "llava_qwen"
device = "cuda"  # target device for the input tensors built in the scoring cell
device_map = "auto"

# Load the checkpoint with 4-bit weights. The second positional argument stays None
# (presumably the base-model path - confirm against llava.model.builder).
_loader_options = dict(device_map=device_map, load_4bit=True)
tokenizer, model, image_processor, max_length = load_pretrained_model(
    pretrained, None, model_name, **_loader_options
)

# Put the model in evaluation mode; as the cell's last expression this also displays
# the module tree.
model.eval()

  from .autonotebook import tqdm as notebook_tqdm


Please install pyav to use video processing functions.




Loaded LLaVA model: lmms-lab/llava-critic-72b


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
You are using a model of type llava to instantiate a model of type llava_qwen. This is not supported for all configurations of models and can yield errors.
Downloading shards: 100%|██████████| 31/31 [00:00<00:00, 13885.46it/s]


Loading vision tower: google/siglip-so400m-patch14-384


Loading checkpoint shards: 100%|██████████| 31/31 [00:42<00:00,  1.38s/it]


Model Class: LlavaQwenForCausalLM


LlavaQwenForCausalLM(
  (model): LlavaQwenModel(
    (embed_tokens): Embedding(151647, 8192)
    (layers): ModuleList(
      (0-79): 80 x Qwen2DecoderLayer(
        (self_attn): Qwen2FlashAttention2(
          (q_proj): Linear4bit(in_features=8192, out_features=8192, bias=True)
          (k_proj): Linear4bit(in_features=8192, out_features=1024, bias=True)
          (v_proj): Linear4bit(in_features=8192, out_features=1024, bias=True)
          (o_proj): Linear4bit(in_features=8192, out_features=8192, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear4bit(in_features=8192, out_features=29568, bias=False)
          (up_proj): Linear4bit(in_features=8192, out_features=29568, bias=False)
          (down_proj): Linear4bit(in_features=29568, out_features=8192, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm()
        (post_attention_layernorm): Qwen2RMSNorm()
      )
    )
    (no

In [2]:
# File names of the rendered views that are read from each object's folder.
image_names = [f"{view:03d}.png" for view in (0, 10, 15, 20)]
data_root = "media7link/gpt4point_test/"

object_ids = ["02797d5feaac4ccabfdf8b357fa2a13a"]

# Instruction text that produced the descriptions; the scoring cell embeds it verbatim
# as the "Question" part of the critic prompt.
llava_prompt = """You are a meticulous and precise visual analyst. Your task is to generate a single, factual, and objective paragraph describing a scene. The description's primary focus must be the spatial arrangement of the objects within it.

### Core Principles:
1.  Describe, Don't Interpret: Report only what you see. Do not infer actions, intentions, or history. Stick to concrete, observable facts.
2.  Focus on Spatial Relationships: While object attributes like color and shape are important for identification, the paragraph must be structured around *where things are* in relation to one another. Use clear prepositions (e.g. "on top of", "in front of", "next to") without using ambiguous terms such as "to the left/right of".
3. No Speculation: Avoid making assumptions. If you are uncertain about a material, describe its visual properties (e.g. "a dark, textured wood") rather than guessing a specific type (e.g. "oak"). If you cannot identify an object with certainty, describe its shape and color.
4. Literal and Unimaginative: Your goal is to be a camera, not a storyteller. Avoid creating a narrative or setting a mood. Stick to concrete, observable facts.

### Task:
Based on the provided image(s) and the principles above, generate a single, detailed paragraph. The paragraph should identify the Primary Objects, their key visual attributes for identification, and, most importantly, their spatial layout and relationships to each other.
Please analyze the scene using the given instructions."""

# Placeholder; the scoring cell overwrites it with each item's description.
llava_response = ""

# Stem of the descriptions file; ".json" is appended where the file is loaded.
DESCRIPTIONS_JSON_PATH = "llava72_desc_qna"
def load_json(file_path):
    """Parse the JSON document stored at ``file_path`` and return the decoded object.

    The file is read as UTF-8 rather than with the platform's locale default, so
    non-ASCII text decodes the same way on every machine.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The object produced by ``json.load`` for the file's content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        return json.load(file)

# Keep only the list stored under the top-level "items" key; a missing key yields [].
descriptions = load_json(DESCRIPTIONS_JSON_PATH + ".json").get("items", [])

In [4]:
# Pointwise scoring: for every stored description, show the critic model the object's
# rendered views together with the original instruction and the description, then keep
# the model's verdict on the item under the "critic" key.
#
# NOTE(review): the recorded run of this cell stopped with a CUDA OutOfMemoryError on
# the first item. The cache release at the end of each iteration only limits allocator
# fragmentation between items; the multi-image prompt probably still needs fewer or
# smaller visual inputs to fit on these GPUs - TODO confirm.

# Loop-invariant, so it is chosen once instead of on every iteration.
conv_template = "qwen_1_5"  # Make sure you use correct chat template for different models

for item in tqdm(descriptions):
    images_paths = [os.path.join(data_root, item.get("item_id", ""), img_name) for img_name in image_names]

    images = []
    for image_path in images_paths:
        # Close the file as soon as the pixels are decoded; only the converted RGB
        # image is kept.
        with Image.open(image_path) as image_file:
            image = image_file.convert("RGB")
        images.append(image)

    image_tensor = process_images(images, image_processor, model.config)
    image_tensor = [_image.to(dtype=torch.float16, device=device) for _image in image_tensor]

    llava_response = item.get("augmented_description", "")

    # pointwise scoring
    critic_prompt = f"Given an image and a corresponding question, please serve as an unbiased and fair judge to evaluate the quality of answer answers provided by a Large Multimodal Model (LMM). Score the response out of 100 and explain your reasoning with specific details.\nQuestion: {llava_prompt}\nThe LMM response: {llava_response}\n. Please evaluate the quality of this answer."

    # One image placeholder per view, followed by the textual request.
    image_tokens = [DEFAULT_IMAGE_TOKEN] * len(images)
    question = "".join(image_tokens) + "\n" + critic_prompt

    # Work on a private copy of the template because append_message mutates it.
    conv = copy.deepcopy(conv_templates[conv_template])
    conv.append_message(conv.roles[0], question)
    conv.append_message(conv.roles[1], None)
    prompt_question = conv.get_prompt()

    input_ids = tokenizer_image_token(prompt_question, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(device)
    image_sizes = [img.size for img in images]

    # Greedy decoding (sampling disabled).
    cont = model.generate(
        input_ids,
        images=image_tensor,
        image_sizes=image_sizes,
        do_sample=False,
        temperature=0,
        max_new_tokens=4096,
    )
    text_outputs = tokenizer.batch_decode(cont, skip_special_tokens=True)
    print(text_outputs[0])
    print(f"output len: {len(text_outputs)}")

    item["critic"] = text_outputs[0]

    # Return cached-but-unused allocator blocks to the driver before the next item.
    torch.cuda.empty_cache()

  0%|          | 0/26 [00:30<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 864.00 MiB. GPU 2 has a total capacty of 23.68 GiB of which 564.94 MiB is free. Including non-PyTorch memory, this process has 23.12 GiB memory in use. Of the allocated memory 21.71 GiB is allocated by PyTorch, and 1.09 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF