In [3]:
import pandas as pd
import json
import os
from PIL import Image
from tqdm.notebook import tqdm
from transformers import Qwen2VLForConditionalGeneration, PaliGemmaForConditionalGeneration, MllamaForConditionalGeneration, AutoModelForCausalLM, LlavaNextProcessor, LlavaNextForConditionalGeneration, AutoProcessor, AutoTokenizer, AutoModel
from pathlib import Path
import torch
import torchvision.transforms as T
import torchvision.transforms.functional as TF
from torchvision.transforms import InterpolationMode
from vllm import LLM
from vllm.sampling_params import SamplingParams
import gc
import re

In [4]:
with open("./my2DlinePlots/metadata.json", 'r') as f:
    metadata = json.load(f)

df = pd.DataFrame.from_dict(metadata, orient='index')

df = df.reset_index().rename(columns={'index': 'filename'})

df['image_path'] = df['filename'].apply(lambda x: os.path.join("./my2DlinePlots", x + ".png"))

df = df.drop(columns=['grid_size'])

# Add prompt column
prompts = {
    "prompt1": "How many times do the blue and red lines touch each other? Answer with a number in curly brackets, e.g., {5}.",
    "prompt2": "Count the intersection points where the blue and red lines meet. Put your answer in curly brackets, e.g., {2}."
}

# Duplicate rows for each prompt
data = pd.concat([df, df], ignore_index=True)
data['prompt'] = [prompts["prompt1"] if i % 2 == 0 else prompts["prompt2"] for i in range(len(data))]


model_names = [
    "Qwen/Qwen2-VL-7B-Instruct",
    "google/paligemma-3b-pt-448",
    "microsoft/Florence-2-large",
    "meta-llama/Llama-3.2-11B-Vision-Instruct",
    "llava-hf/llava-v1.6-mistral-7b-hf",
    "OpenGVLab/InternVL2_5-8B-MPO",
    "microsoft/Phi-3.5-vision-instruct",
    "mistralai/Pixtral-12B-2409"
]

for model_name in model_names:
    data[model_name] = -1

data

Unnamed: 0,filename,gt,linewidth,resolution,distances,image_path,prompt,Qwen/Qwen2-VL-7B-Instruct,google/paligemma-3b-pt-448,microsoft/Florence-2-large,meta-llama/Llama-3.2-11B-Vision-Instruct,llava-hf/llava-v1.6-mistral-7b-hf,OpenGVLab/InternVL2_5-8B-MPO,microsoft/Phi-3.5-vision-instruct,mistralai/Pixtral-12B-2409
0,gt_1_image_0_thickness_2_resolution_384,1,2,100,"[1.0, 10.0, 0.0]",./my2DlinePlots/gt_1_image_0_thickness_2_resol...,How many times do the blue and red lines touch...,-1,-1,-1,-1,-1,-1,-1,-1
1,gt_1_image_0_thickness_4_resolution_384,1,4,100,"[1.0, 10.0, 0.0]",./my2DlinePlots/gt_1_image_0_thickness_4_resol...,Count the intersection points where the blue a...,-1,-1,-1,-1,-1,-1,-1,-1
2,gt_1_image_0_thickness_2_resolution_768,1,2,200,"[1.0, 10.0, 0.0]",./my2DlinePlots/gt_1_image_0_thickness_2_resol...,How many times do the blue and red lines touch...,-1,-1,-1,-1,-1,-1,-1,-1
3,gt_1_image_0_thickness_4_resolution_768,1,4,200,"[1.0, 10.0, 0.0]",./my2DlinePlots/gt_1_image_0_thickness_4_resol...,Count the intersection points where the blue a...,-1,-1,-1,-1,-1,-1,-1,-1
4,gt_1_image_0_thickness_2_resolution_1152,1,2,300,"[1.0, 10.0, 0.0]",./my2DlinePlots/gt_1_image_0_thickness_2_resol...,How many times do the blue and red lines touch...,-1,-1,-1,-1,-1,-1,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3595,gt_2_image_299_thickness_4_resolution_384,2,4,100,"[4.0, 6.0, 1.0]",./my2DlinePlots/gt_2_image_299_thickness_4_res...,Count the intersection points where the blue a...,-1,-1,-1,-1,-1,-1,-1,-1
3596,gt_2_image_299_thickness_2_resolution_768,2,2,200,"[4.0, 6.0, 1.0]",./my2DlinePlots/gt_2_image_299_thickness_2_res...,How many times do the blue and red lines touch...,-1,-1,-1,-1,-1,-1,-1,-1
3597,gt_2_image_299_thickness_4_resolution_768,2,4,200,"[4.0, 6.0, 1.0]",./my2DlinePlots/gt_2_image_299_thickness_4_res...,Count the intersection points where the blue a...,-1,-1,-1,-1,-1,-1,-1,-1
3598,gt_2_image_299_thickness_2_resolution_1152,2,2,300,"[4.0, 6.0, 1.0]",./my2DlinePlots/gt_2_image_299_thickness_2_res...,How many times do the blue and red lines touch...,-1,-1,-1,-1,-1,-1,-1,-1


In [33]:
def get_model_prediction(model, processor, image, prompt):
    if model_name == "Qwen/Qwen2-VL-7B-Instruct":
        conversation = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                    },
                    {"type": "text", "text": prompt},
                ],
            }
        ]

        # Preprocess the inputs
        text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
        # Excepted output: '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Describe this image.<|im_end|>\n<|im_start|>assistant\n'

        inputs = processor(
            text=[text_prompt], images=[image], padding=True, return_tensors="pt"
        )
        inputs = inputs.to("cuda")

        # Inference: Generation of the output
        output_ids = model.generate(**inputs, max_new_tokens=16)
        generated_ids = [
            output_ids[len(input_ids) :]
            for input_ids, output_ids in zip(inputs.input_ids, output_ids)
        ]
        output_text = processor.batch_decode(
            generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
        )
        return output_text.strip("[]{}")
    
    elif model_name == "google/paligemma-3b-pt-448":
        # Process inputs and move to MPS
        model_inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device)
        
        input_len = model_inputs["input_ids"].shape[-1]

        # Generate output
        with torch.inference_mode():
            generation = model.generate(**model_inputs, max_new_tokens=16, do_sample=False)
            generation = generation[0][input_len:]
            decoded = processor.decode(generation, skip_special_tokens=True)
            return decoded
    
    elif model_name == "microsoft/Florence-2-large":
        # Process inputs and move to MPS
        inputs = processor(text=prompt, images=image, return_tensors="pt").to("mps", torch.float16)

        # Generate output
        generated_ids = model.generate(
            input_ids=inputs["input_ids"],
            pixel_values=inputs["pixel_values"],
            max_new_tokens=16,
            num_beams=3,
            do_sample=False
        )

        # Decode and post-process the output
        generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]

        parsed_answer = processor.post_process_generation(generated_text, task=prompt, image_size=(image.width, image.height))
        return parsed_answer
    
    elif model_name == "meta-llama/Llama-3.2-11B-Vision-Instruct":
        messages = [
            {"role": "user", "content": [
                {"type": "image"},
                {"type": "text", "text": prompt}
            ]}
        ]
        input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
        inputs = processor(
            image,
            input_text,
            add_special_tokens=False,
            return_tensors="pt"
        ).to(model.device)

        output = model.generate(**inputs, max_new_tokens=32)
        raw = processor.decode(output[0])
        return raw
    
    elif model_name == "llava-hf/llava-v1.6-mistral-7b-hf":
        conversation = [
            {

            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image"},
                ],
            },
        ]
        prompt_processed = processor.apply_chat_template(conversation, add_generation_prompt=True)

        inputs = processor(images=image, text=prompt_processed, return_tensors="pt").to("mps")

        # autoregressively complete prompt
        output = model.generate(**inputs, max_new_tokens=16)

        out = processor.decode(output[0], skip_special_tokens=True)

        match = re.search(r"{(\d+)}", out)
        return int(match.group(1)) if match else out
    
    elif model_name == "OpenGVLab/InternVL2_5-8B-MPO":
        IMAGENET_MEAN = (0.485, 0.456, 0.406)
        IMAGENET_STD = (0.229, 0.224, 0.225)

        def build_transform(input_size):
            MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
            transform = T.Compose([
                T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
                T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
                T.ToTensor(),
                T.Normalize(mean=MEAN, std=STD)
            ])
            return transform

        def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
            best_ratio_diff = float('inf')
            best_ratio = (1, 1)
            area = width * height
            for ratio in target_ratios:
                target_aspect_ratio = ratio[0] / ratio[1]
                ratio_diff = abs(aspect_ratio - target_aspect_ratio)
                if ratio_diff < best_ratio_diff:
                    best_ratio_diff = ratio_diff
                    best_ratio = ratio
                elif ratio_diff == best_ratio_diff:
                    if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
                        best_ratio = ratio
            return best_ratio

        def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
            orig_width, orig_height = image.size
            aspect_ratio = orig_width / orig_height

            # calculate the existing image aspect ratio
            target_ratios = set(
                (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
                i * j <= max_num and i * j >= min_num)
            target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

            # find the closest aspect ratio to the target
            target_aspect_ratio = find_closest_aspect_ratio(
                aspect_ratio, target_ratios, orig_width, orig_height, image_size)

            # calculate the target width and height
            target_width = image_size * target_aspect_ratio[0]
            target_height = image_size * target_aspect_ratio[1]
            blocks = target_aspect_ratio[0] * target_aspect_ratio[1]

            # resize the image
            resized_img = image.resize((target_width, target_height))
            processed_images = []
            for i in range(blocks):
                box = (
                    (i % (target_width // image_size)) * image_size,
                    (i // (target_width // image_size)) * image_size,
                    ((i % (target_width // image_size)) + 1) * image_size,
                    ((i // (target_width // image_size)) + 1) * image_size
                )
                # split the image
                split_img = resized_img.crop(box)
                processed_images.append(split_img)
            assert len(processed_images) == blocks
            if use_thumbnail and len(processed_images) != 1:
                thumbnail_img = image.resize((image_size, image_size))
                processed_images.append(thumbnail_img)
            return processed_images

        def load_image(image, input_size=448, max_num=12):
            transform = build_transform(input_size=input_size)
            images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
            pixel_values = [transform(image) for image in images]
            pixel_values = torch.stack(pixel_values)
            return pixel_values
        
        
        width, _ = image.size
        if width <= 384:  # 384x384
            num = 1
        elif width <= 768:  # 768x768
            num = 4
        else:  # 1152x1152
            num = 9

        # set the max number of tiles in `max_num`
        pixel_values = load_image(image, max_num=num)
        pixel_values = pixel_values.to("cpu", torch.float16)
        generation_config = dict(max_new_tokens=16, do_sample=True)
    
        question = '<image>\n' + prompt
        response = model.chat(processor, pixel_values, question, generation_config)
        return response.strip("[]{}")

    elif model_name == "microsoft/Phi-3.5-vision-instruct":    
        message = [
            {"role": "user", "content": f"<|user|>\n<|image_1|>\n{prompt}<|end|>\n<|assistant|>\n"},
        ]

        prompt = processor.tokenizer.apply_chat_template(
        message, 
        tokenize=False, 
        add_generation_prompt=True
        )

        inputs = processor(prompt, image, return_tensors="pt").to("cpu") 

        generation_args = { 
            "max_new_tokens": 16, 
            "temperature": 0.0, 
            "do_sample": False, 
        } 

        generate_ids = model.generate(**inputs, 
        eos_token_id=processor.tokenizer.eos_token_id, 
        **generation_args
        ).to("cpu")

        # remove input tokens 
        generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
        response = processor.batch_decode(generate_ids, 
        skip_special_tokens=True, 
        clean_up_tokenization_spaces=False)[0] 

        return response.strip("[]{}")
    
    elif model_name == "mistralai/Pixtral-12B-2409":
        messages = [
            {
                "role": "user",
                "content": [{"type": "text", "text": prompt}, {"type": "image", "image": image}]
            },
        ]

        outputs = model.chat(messages, sampling_params=processor)

        return outputs[0].outputs[0].text


In [34]:
def get_huggingface_token():
    # Define the path to the token file (updated path)
    token_file = Path.home() / ".cache" / "huggingface" / "token"

    # Check if the token file exists
    if token_file.exists():
        with open(token_file, "r") as file:
            token = file.read().strip()
            return token
    else:
        raise FileNotFoundError("Hugging Face token file not found. Please run 'huggingface-cli login'.")

In [None]:
for model_name in model_names:
    if model_name == "Qwen/Qwen2-VL-7B-Instruct":
        model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name, torch_dtype="auto", device_map="auto"
        )
        processor = AutoProcessor.from_pretrained(model_name)
    
    elif model_name == "google/paligemma-3b-pt-448":
        model = PaliGemmaForConditionalGeneration.from_pretrained(
            model_name,
            torch_dtype=torch.float16,
            device_map="mps",
            revision="float16",  # Use float16 revision for better compatibility
            token = get_huggingface_token()
        ).eval()
        processor = AutoProcessor.from_pretrained(model_name)

    elif model_name == "microsoft/Florence-2-large":
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16,
            trust_remote_code=True
        ).to("mps")

        processor = AutoProcessor.from_pretrained(
            model_name,
            trust_remote_code=True
        )
   
    elif model_name == "meta-llama/Llama-3.2-11B-Vision-Instruct":
        model = MllamaForConditionalGeneration.from_pretrained(
            model_name,
            torch_dtype=torch.float16,
            device_map="auto",
        )
        processor = AutoProcessor.from_pretrained(model_name)

    elif model_name == "llava-hf/llava-v1.6-mistral-7b-hf":
        processor = LlavaNextProcessor.from_pretrained(model_name)
        model = LlavaNextForConditionalGeneration.from_pretrained(model_name, torch_dtype=torch.float16, low_cpu_mem_usage=True) 
        model.to("mps")

    elif model_name == "OpenGVLab/InternVL2_5-8B-MPO":
        model = AutoModel.from_pretrained(
            model_name,
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True,
            trust_remote_code=True
        ).to('cpu').eval()
    
        processor = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, use_fast=False)

    elif model_name == "microsoft/Phi-3.5-vision-instruct":
        model = AutoModelForCausalLM.from_pretrained(
            model_name, 
            device_map="cpu", 
            trust_remote_code=True, 
            torch_dtype=torch.float16, 
            _attn_implementation='eager',
            low_cpu_mem_usage=True
        )

        # for best performance, use num_crops=4 for multi-frame, num_crops=16 for single-frame.
        processor = AutoProcessor.from_pretrained(model_name, 
        trust_remote_code=True, 
        num_crops=16
        )
    
    elif model_name == "mistralai/Pixtral-12B-2409":
        processor = SamplingParams(max_tokens=16)
        model = LLM(model=model_name, tokenizer_mode="mistral", device="cpu")

    # Iterate through the DataFrame rows
    for index, row in tqdm(data.iterrows(), total=len(data), desc=f"{model_name}"):
        image_path = row['image_path']
        prompt = row['prompt']

        # Load the image
        image = Image.open(image_path)
        
        # Get the model prediction for the image and prompt using model specific inference
        prediction = get_model_prediction(model, processor, image, prompt) 

        # Store the prediction in the DataFrame
        data.loc[index, model_name] = prediction

    # delete model and free memory
    del model
    del processor

    gc.collect()

In [None]:
# after data is done we can run metrics using the dataframe