In [1]:

import argparse
import os
import sys
sys.path.append(os.path.abspath(".."))
from dataset_zoo import Controlled_Images
import gradio as gr
import torch
from accelerate import Accelerator
from huggingface_hub import HfFolder
from peft import PeftModel
from PIL import Image as PIL_Image
from transformers import MllamaForConditionalGeneration, MllamaProcessor

# Initialize accelerator
accelerator = Accelerator()
device = accelerator.device
def load_model_and_processor(model_name: str, finetuning_path: str = None):
    """Load model and processor with optional LoRA adapter"""
    print(f"Loading model: {model_name}")
    hf_token = get_hf_token()
    model = MllamaForConditionalGeneration.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        use_safetensors=True,
        device_map=device,
        # token=hf_token,
    )
    processor = MllamaProcessor.from_pretrained(
        model_name,  use_safetensors=True
    )

    if finetuning_path and os.path.exists(finetuning_path):
        print(f"Loading LoRA adapter from '{finetuning_path}'...")
        model = PeftModel.from_pretrained(
            model, finetuning_path, is_adapter=True, torch_dtype=torch.bfloat16
        )
        print("LoRA adapter merged successfully")

    model, processor = accelerator.prepare(model, processor)
    return model, processor

  from .autonotebook import tqdm as notebook_tqdm
Detected kernel version 5.4.250, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [2]:
model_name = '/lpai/dataset/llava-cot-model/0-0-2/Llama-3.2-11B-Vision-Instruct'
processor = MllamaProcessor.from_pretrained(
    model_name,  use_safetensors=True
)
controlled_a = Controlled_Images(image_preprocess=processor, subset="A", download=False)  
controlled_b = Controlled_Images(image_preprocess=processor, subset="B", download=False)

/lpai/volumes/ssai-xtuber-vol-lf/yuhaofu/eval/whatsup_vlms/data/controlled_images


In [3]:
print(controlled_a[0])

{'image_options': [{'pixel_values': array([[[[[[ 1.0982260e+00,  1.1128243e+00,  1.0982260e+00, ...,
             1.4485881e+00,  1.5069818e+00,  1.4777850e+00],
           [ 1.0690291e+00,  1.1128243e+00,  1.1274228e+00, ...,
             1.4485881e+00,  1.4777850e+00,  1.4923834e+00],
           [ 1.0982260e+00,  1.1274228e+00,  1.1128243e+00, ...,
             1.4777850e+00,  1.4777850e+00,  1.4485881e+00],
           ...,
           [ 5.2888733e-01,  5.1428890e-01,  5.1428890e-01, ...,
            -8.5796320e-01, -8.5796320e-01, -9.0175849e-01],
           [ 5.4348576e-01,  5.2888733e-01,  5.1428890e-01, ...,
            -8.5796320e-01, -8.8716006e-01, -8.8716006e-01],
           [ 5.5808419e-01,  5.2888733e-01,  5.1428890e-01, ...,
            -8.5796320e-01, -9.1635692e-01, -9.1635692e-01]],

          [[ 1.1293944e+00,  1.1444021e+00,  1.1293944e+00, ...,
             1.4745730e+00,  1.5346041e+00,  1.5045886e+00],
           [ 1.0993788e+00,  1.1444021e+00,  1.1594099e+00, ...,

In [3]:
import re
text = '*Answer*: A<|eot_id|>'
answer_pattern = r"\*Answer\*:?[\s](.*?)<\|eot_id\|>"
answer_match = re.search(answer_pattern, text, re.DOTALL | re.IGNORECASE)
if answer_match:
    conclusion_text = answer_match.group(1).strip()
    print(conclusion_text)
    print(conclusion_text.strip().upper().startswith("A"))

A
True


In [4]:
result_text = "The image shows a can positioned directly in front of a knife, with the can's label facing the camera and the knife's blade pointing towards the bottom of the image. This arrangement suggests that the can is placed in front of the knife. Correct option: B. A can in front of a knife<|eot_id|>"
correct_option_patterns = [
        r"\*\*Correct option:\*\*\s*([A-Da-d][\.\):]?(.*?))($|\n|\.|\,)",
        r"Correct option:\s*([A-Da-d][\.\):]?(.*?))($|\n|\.|\,)"
    ]
for pat in correct_option_patterns:
    m = re.search(pat, result_text, re.IGNORECASE)
    if m:
        conclusion_text =  m.group(1).strip()
        print(conclusion_text)
        print(conclusion_text.strip().upper().startswith("A"))

B. A can in front of a knife<|eot_id|>
False


In [None]:
from PIL import Image
import 
img_path = 'data/controlled_images/wineglass_right_of_chair.jpeg'
img = Image.open(img_path)
img.show()