## Load Food Dataset

In [18]:
import pandas as pd
pd.set_option('display.max_colwidth', None)
import json

In [19]:
# Data Directory: 
# food_images_directory = '/shared/data/food_data/food_images/'
food_annotation_file_path = '/shared/data/stephen/car_data.json'


# Method 1: Using pandas.read_json directly
df = pd.read_json(food_annotation_file_path)

df.head(1)

Unnamed: 0,id,file_name,url,captions
0,512533,COCO_train2014_000000512533.jpg,http://images.cocodataset.org/train2014/COCO_train2014_000000512533.jpg,"[A group of people riding on the back of an elephant., Four people ride on top of an elephant, there are many people that are riding a elephant , A group of people riding on top of a large elephant., Several people sit on top of an elephant as another person watches.]"


In [20]:
df.shape

(100, 4)

## Generate Multiple Choice Question

In [21]:
# Generate random choice in [A, B, C, D]
import random

def generate_random_choice():
    return random.choice(['A', 'B', 'C', 'D'])

In [22]:
df['multiple_choice_solution'] = df.apply(lambda x: generate_random_choice(), axis=1)

In [23]:
df.head()

Unnamed: 0,id,file_name,url,captions,multiple_choice_solution
0,512533,COCO_train2014_000000512533.jpg,http://images.cocodataset.org/train2014/COCO_train2014_000000512533.jpg,"[A group of people riding on the back of an elephant., Four people ride on top of an elephant, there are many people that are riding a elephant , A group of people riding on top of a large elephant., Several people sit on top of an elephant as another person watches.]",C
1,315713,COCO_train2014_000000315713.jpg,http://images.cocodataset.org/train2014/COCO_train2014_000000315713.jpg,"[A young girl about to bite into a hotdog dotted with mustard and ketchup, A girl makes a face as she eats a hot dog., A little girl is attempting to eat a hot dog., THERE IS A GIRL THAT IS EATING A HOT DOG , A girl getting ready to bite into a hotdog ]",C
2,309791,COCO_train2014_000000309791.jpg,http://images.cocodataset.org/train2014/COCO_train2014_000000309791.jpg,"[A street on the side of which shops are decorated and flowering potted plants are kept., An outdoor city market selling trees, shrubs, and flowers., People at an outdoor market under a canopy., a market area with plants and people standing by, Market stands in city center with Christmas decorations.]",C
3,496434,COCO_train2014_000000496434.jpg,http://images.cocodataset.org/train2014/COCO_train2014_000000496434.jpg,"[A woman standing a tennis court holding a tennis racquet., A tennis player playing tennis in a tennis court., A woman on a tennis court serving the ball., a woman looking up at a tennis ball, A woman reaches up toward a tennis ball]",B
4,567630,COCO_train2014_000000567630.jpg,http://images.cocodataset.org/train2014/COCO_train2014_000000567630.jpg,"[A couple of girls with tennis rackets in a room., A couple of girls holding tennis racquets and a ball., Two young girls standing next to each other with racquets., The two girls are getting ready to play tennis., Two girlw holding tennis rackets in a room]",B


In [24]:
import os
from openai import OpenAI
client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),  
)


def generate_multiple_choice_question(reference_caption, correct_choice, level='medium'): 
    # Define the prompt to generate inferior choices
    if level == 'easy':
        level_message = "The distractors are obviously incorrect but still loosely related to the context."
    elif level == 'medium':
        level_message = "The distractors are somewhat related to the context but contain inaccuracies or non-fluent language."
    elif level == 'hard':
        level_message = "The distractors are closely related to the context but may confuse someone without careful observation."


    prompt = f"""
    Given the ground truth caption below:
    "{reference_caption}"
    Generate three plausible but incorrect distractors.
    "{level_message}"
    Format the result as a multiple-choice question. 
    Question title should be "Which of the following captions best describes the painting?".
    The correct choice should be placed at choice "{correct_choice}". 
    Do not generate special symbols such as '*'.
    """

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
        temperature=0.7,
        max_tokens=200,
    )

    # Extract the generated multiple-choice question
    question = response.choices[0].message.content    
    
    return question

In [None]:
# generate hard question 
df['multiple_choice_question_hard'] = df.apply(lambda x: generate_multiple_choice_question(x['captions'][0], x['multiple_choice_solution'], level='hard'), axis=1)

In [None]:
# generate medium question
df['multiple_choice_question_medium'] = df.apply(lambda x: generate_multiple_choice_question(x['captions'][0], x['multiple_choice_solution'], level='medium'), axis=1)

In [None]:
# generate easy question
df['multiple_choice_question_easy'] = df.apply(lambda x: generate_multiple_choice_question(x['captions'][0], x['multiple_choice_solution'], level='easy'), axis=1)

In [None]:
df.head(1)

In [None]:
df.iloc[0].multiple_choice_question_hard

In [None]:
# Save the annotation with multiple choice question to output file

# Convert DataFrame to a list of dictionaries
list_of_dicts = df.to_dict(orient="records")

# Save the list of dictionaries to a JSON file
output_file = "/shared/data/stephen/people_annotation_with_MCQ_3_difficulies.json"
with open(output_file, "w") as file:
    json.dump(list_of_dicts, file, indent=4)

print(f"DataFrame saved as a list of dictionaries in {output_file}")

## Perform Multiple Choice Selection

In [25]:
# Load annotation with multiple choice question data file
import pandas as pd
pd.set_option('display.max_colwidth', None)
import json


# Data Directory: 
#food_images_directory = '/shared/data/food_data/food_images/'
food_annotation_file_path = "/shared/data/stephen/people_annotation_with_MCQ_3_difficulies.json"


# Method 1: Using pandas.read_json directly
df = pd.read_json(food_annotation_file_path)

df.head(1)

Unnamed: 0,id,file_name,url,captions,multiple_choice_solution,multiple_choice_question_hard,multiple_choice_question_medium,multiple_choice_question_easy
0,512533,COCO_train2014_000000512533.jpg,http://images.cocodataset.org/train2014/COCO_train2014_000000512533.jpg,"[A group of people riding on the back of an elephant., Four people ride on top of an elephant, there are many people that are riding a elephant , A group of people riding on top of a large elephant., Several people sit on top of an elephant as another person watches.]",D,Question: Which of the following captions best describes the painting?\n\nA) A group of people riding horses across a field.\n\nB) A family having a picnic in a park.\n\nC) A group of people rowing a boat on a river.\n\nD) A group of people riding on the back of an elephant.,Which of the following captions best describes the painting?\n\nA. A group of children playing in a park with a large ball.\n\nB. Several tourists sitting inside a colorful hot air balloon.\n\nC. A family gathered around a table enjoying dinner together.\n\nD. A group of people riding on the back of an elephant.,Which of the following captions best describes the painting?\n\nA. A family having a picnic under a large oak tree.\n\nB. A group of friends playing soccer in a park.\n\nC. A couple dancing under the stars at a wedding.\n\nD. A group of people riding on the back of an elephant.


### Use llava model

In [26]:
from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration
import torch
from PIL import Image
import requests 

In [31]:
# Load model from local directory 
model_path = '/shared/model/llava-v1.6-mistral-7b-hf'

processor = LlavaNextProcessor.from_pretrained(model_path)

model = LlavaNextForConditionalGeneration.from_pretrained(model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True, load_in_4bit=True) 
#model.to("cuda:0")

Some kwargs in processor config are unused and will not have any effect: num_additional_image_tokens. 
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

#### Without image augmentation

In [32]:
from PIL import Image
import requests
from io import BytesIO

def perform_multiple_choice_task_llava(img_url, question):
    # 下载图片
    response = requests.get(img_url)
    response.raise_for_status()  # 检查下载是否成功
    image = Image.open(BytesIO(response.content))  # 将图片加载到内存中

    # 构建对话
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": question + "\nOnly return the correct choice with a single letter."},
                {"type": "image"},
            ],
        },
    ]
    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

    inputs = processor(images=image, text=prompt, return_tensors="pt").to("cuda:0")

    # 自动回归生成结果
    output = model.generate(**inputs, max_new_tokens=150)
    output = processor.decode(output[0], skip_special_tokens=True)

    # 提取答案
    mcq_answer = output.split('[/INST]')[1].strip()
    return mcq_answer


In [33]:
df['multiple_choice_prediction_easy'] = df.apply(lambda x: perform_multiple_choice_task_llava(x['url'], x['multiple_choice_question_easy']), axis=1)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Expanding inputs for image tokens in LLaVa-NeXT should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pa

In [34]:
df['multiple_choice_prediction_medium'] = df.apply(lambda x: perform_multiple_choice_task_llava(x['url'], x['multiple_choice_question_medium']), axis=1)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Settin

In [35]:
df['multiple_choice_prediction_hard'] = df.apply(lambda x: perform_multiple_choice_task_llava(x['url'], x['multiple_choice_question_hard']), axis=1)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Settin

In [36]:
df.head(1)

Unnamed: 0,id,file_name,url,captions,multiple_choice_solution,multiple_choice_question_hard,multiple_choice_question_medium,multiple_choice_question_easy,multiple_choice_prediction_easy,multiple_choice_prediction_medium,multiple_choice_prediction_hard
0,512533,COCO_train2014_000000512533.jpg,http://images.cocodataset.org/train2014/COCO_train2014_000000512533.jpg,"[A group of people riding on the back of an elephant., Four people ride on top of an elephant, there are many people that are riding a elephant , A group of people riding on top of a large elephant., Several people sit on top of an elephant as another person watches.]",D,Question: Which of the following captions best describes the painting?\n\nA) A group of people riding horses across a field.\n\nB) A family having a picnic in a park.\n\nC) A group of people rowing a boat on a river.\n\nD) A group of people riding on the back of an elephant.,Which of the following captions best describes the painting?\n\nA. A group of children playing in a park with a large ball.\n\nB. Several tourists sitting inside a colorful hot air balloon.\n\nC. A family gathered around a table enjoying dinner together.\n\nD. A group of people riding on the back of an elephant.,Which of the following captions best describes the painting?\n\nA. A family having a picnic under a large oak tree.\n\nB. A group of friends playing soccer in a park.\n\nC. A couple dancing under the stars at a wedding.\n\nD. A group of people riding on the back of an elephant.,D,D,D


In [37]:
# Save the MCQ result

# Convert DataFrame to a list of dictionaries
list_of_dicts = df.to_dict(orient="records")

# Save the list of dictionaries to a JSON file
output_file = "/shared/data/stephen/people_annotation_with_MCQ_result_3_difficulties.json"

with open(output_file, "w") as file:
    json.dump(list_of_dicts, file, indent=4)

print(f"DataFrame saved as a list of dictionaries in {output_file}")

DataFrame saved as a list of dictionaries in /shared/data/stephen/people_annotation_with_MCQ_result_3_difficulties.json


In [38]:
def calculate_multiple_choice_question_accuracy(df):
    # Calculate accuracy
    accuracy_easy = (df["multiple_choice_solution"] == df["multiple_choice_prediction_easy"]).mean()
    accuracy_medium = (df["multiple_choice_solution"] == df["multiple_choice_prediction_medium"]).mean()
    accuracy_hard = (df["multiple_choice_solution"] == df["multiple_choice_prediction_hard"]).mean()

    print(f"Prediction Accuracy Easy: {accuracy_easy * 100:.2f}%") 
    print(f"Prediction Accuracy Medium: {accuracy_medium * 100:.2f}%") 
    print(f"Prediction Accuracy Hard: {accuracy_hard * 100:.2f}%") 
    return accuracy_easy, accuracy_medium, accuracy_hard

In [39]:
calculate_multiple_choice_question_accuracy(df)

Prediction Accuracy Easy: 100.00%
Prediction Accuracy Medium: 89.00%
Prediction Accuracy Hard: 81.00%


(1.0, 0.89, 0.81)

#### With Image Augmentation

In [40]:
# This function will apply the augmentation to the image
# sin_aug: single augmentation, includes flip, rotate, crop, saturation, artStyle, noise, blur
# mul_aug: multiple augmentations in a list
# Return a dictionary. Key: augmentation name; Value: augmented image
from PIL import Image
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import os
import numpy as np
import cv2
from PIL import Image, ImageEnhance
import random

def image_augumentation(ori_img, sin_aug = None, mul_aug = None):
    img = ori_img.copy()
    aug_img = {}
    if sin_aug:
        if sin_aug == 'flip':
            aug_img['flip'] = flip_image(img)
        elif sin_aug == 'rotate':
            aug_img['rotate'] = rotate_image(img)
        elif sin_aug == 'crop':
            aug_img['crop'] = random_crop(img)
        elif sin_aug == 'saturation':
            aug_img['saturation'] = adjust_saturation(img)
        elif sin_aug == 'artStyle':
            aug_img['artStyle'] = convert_to_artStyle(img)
        elif sin_aug == 'noise':
            aug_img['noise'] = add_noise(img)
        elif sin_aug == 'blur':
            aug_img['blur'] = blur_image(img)
        else:
            aug_img['original'] = img

    elif mul_aug:
        for aug in mul_aug:
            if aug == 'flip':
                aug_img['flip'] = flip_image(img)
            elif aug == 'rotate':
                aug_img['rotate'] = rotate_image(img)
            elif aug == 'crop':
                aug_img['crop'] = random_crop(img)
            elif aug == 'saturation':
                aug_img['saturation'] = adjust_saturation(img)
            elif aug == 'artStyle':
                aug_img['artStyle'] = convert_to_artStyle(img)
            elif aug == 'noise':
                aug_img['noise'] = add_noise(img)
            elif aug == 'blur':
                aug_img['blur'] = blur_image(img)
            else:
                aug_img['original'] = img
    else:
        aug_img['original'] = img
    return aug_img

def flip_image(image):
    img = np.flip(image, axis=1)
    return img

def rotate_image(image, angle_range=(-30, 30)):
    angle = random.uniform(*angle_range)
    h, w = image.shape[:2]
    center = (w // 2, h // 2)
    rotation_matrix = cv2.getRotationMatrix2D(center, angle, scale=1.0)
    rotated_image = cv2.warpAffine(image, rotation_matrix, (w, h), flags=cv2.INTER_LINEAR)
    return rotated_image

def random_crop(image, percent=0.7):
    h, w = image.shape[:2]
    # crop_h, crop_w = crop_size
    crop_h = round(percent * h)
    crop_w = round(percent * w)

    top = random.randint(0, h - crop_h)
    left = random.randint(0, w - crop_w)
    cropped_image = image[top:top + crop_h, left:left + crop_w]
    return cropped_image

def adjust_saturation(image, factor=5):

    image_pil = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    enhancer = ImageEnhance.Color(image_pil)
    saturated_image = enhancer.enhance(factor)
    return cv2.cvtColor(np.array(saturated_image), cv2.COLOR_RGB2BGR)

def convert_to_artStyle(image):
    # Convert the image to grayscale
    gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Apply a binary threshold
    art_image = cv2.adaptiveThreshold(gray_image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                    cv2.THRESH_BINARY, 11, 2)
    return art_image

def add_noise(image, mean=0, stddev=25):
    noise = np.random.normal(mean, stddev, image.shape).astype(np.uint8)
    noisy_image = cv2.add(image, noise)
    return noisy_image

def blur_image(image, kernel_size=(5, 5)):
    return cv2.GaussianBlur(image, kernel_size, 0)

In [43]:
# helper function: 
def perform_multiple_choice_task_with_image_augmentation_llava(img_url, question, aug_type=None):
    response = requests.get(img_url)
    response.raise_for_status()  # 检查下载是否成功
    image = Image.open(BytesIO(response.content))  # 将图片加载到内存中

    # Convert the PIL Image to a NumPy array
    image = np.array(image)

    # Apply image augmentation
    if (aug_type is not None):
        image = image_augumentation(image, sin_aug = aug_type)[aug_type]  
    
    conversation = [
        {

          "role": "user",
          "content": [
              {"type": "text", "text": question + "\nOnly return the correct choice with a single letter."},
              {"type": "image"},
            ],
        },
    ]
    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

    inputs = processor(images=image, text=prompt, return_tensors="pt").to("cuda:0")

    # autoregressively complete prompt
    output = model.generate(**inputs, max_new_tokens=150)
    output = processor.decode(output[0], skip_special_tokens=True)

    # mcq_question = output.split('[/INST]')[0].split('[INST] ')[1].strip()
    mcq_answer = output.split('[/INST]')[1].strip()
    return mcq_answer

##### Cropping

In [44]:
# Cropping - easy
df['multiple_choice_prediction_easy_crop'] = df.apply(lambda x: perform_multiple_choice_task_with_image_augmentation_llava(x['url'], x['multiple_choice_question_easy'], 'crop'), axis=1)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Settin

In [45]:
# Cropping - medium
df['multiple_choice_prediction_medium_crop'] = df.apply(lambda x: perform_multiple_choice_task_with_image_augmentation_llava(x['url'], x['multiple_choice_question_medium'], 'crop'), axis=1)
# Cropping - hard
df['multiple_choice_prediction_hard_crop'] = df.apply(lambda x: perform_multiple_choice_task_with_image_augmentation_llava(x['url'], x['multiple_choice_question_hard'], 'crop'), axis=1)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Settin

##### saturation

In [46]:
# Saturation - easy
df['multiple_choice_prediction_easy_saturation'] = df.apply(lambda x: perform_multiple_choice_task_with_image_augmentation_llava(x['url'], x['multiple_choice_question_easy'], 'saturation'), axis=1)
# Saturation - medium
df['multiple_choice_prediction_medium_saturation'] = df.apply(lambda x: perform_multiple_choice_task_with_image_augmentation_llava(x['url'], x['multiple_choice_question_medium'], 'saturation'), axis=1)
# Saturation - hard
df['multiple_choice_prediction_hard_saturation'] = df.apply(lambda x: perform_multiple_choice_task_with_image_augmentation_llava(x['url'], x['multiple_choice_question_hard'], 'saturation'), axis=1)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Settin

##### Noise

In [47]:
# Noise - easy
df['multiple_choice_prediction_easy_noise'] = df.apply(lambda x: perform_multiple_choice_task_with_image_augmentation_llava(x['url'], x['multiple_choice_question_easy'], 'noise'), axis=1)
# Noise - medium
df['multiple_choice_prediction_medium_noise'] = df.apply(lambda x: perform_multiple_choice_task_with_image_augmentation_llava(x['url'], x['multiple_choice_question_medium'], 'noise'), axis=1)
# Noise - hard
df['multiple_choice_prediction_hard_noise'] = df.apply(lambda x: perform_multiple_choice_task_with_image_augmentation_llava(x['url'], x['multiple_choice_question_hard'], 'noise'), axis=1)


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Settin

##### Save the result

In [48]:
# Save the MCQ result

# Convert DataFrame to a list of dictionaries
list_of_dicts = df.to_dict(orient="records")

# Save the list of dictionaries to a JSON file
output_file = "/shared/data/stephen/people_annotation_with_MCQ_result_3_difficulties_with_image_augmentation.json"

with open(output_file, "w") as file:
    json.dump(list_of_dicts, file, indent=4)

print(f"DataFrame saved as a list of dictionaries in {output_file}")

DataFrame saved as a list of dictionaries in /shared/data/stephen/people_annotation_with_MCQ_result_3_difficulties_with_image_augmentation.json


##### Evaluation

In [49]:
# Load annotation with multiple choice question result data file
import pandas as pd
pd.set_option('display.max_colwidth', None)
import json


# Data Directory: 
food_annotation_file_path = '/shared/data/stephen/people_annotation_with_MCQ_result_3_difficulties_with_image_augmentation.json'


# Method 1: Using pandas.read_json directly
df = pd.read_json(food_annotation_file_path)

df.head(1)

Unnamed: 0,id,file_name,url,captions,multiple_choice_solution,multiple_choice_question_hard,multiple_choice_question_medium,multiple_choice_question_easy,multiple_choice_prediction_easy,multiple_choice_prediction_medium,multiple_choice_prediction_hard,multiple_choice_prediction_easy_crop,multiple_choice_prediction_medium_crop,multiple_choice_prediction_hard_crop,multiple_choice_prediction_easy_saturation,multiple_choice_prediction_medium_saturation,multiple_choice_prediction_hard_saturation,multiple_choice_prediction_easy_noise,multiple_choice_prediction_medium_noise,multiple_choice_prediction_hard_noise
0,512533,COCO_train2014_000000512533.jpg,http://images.cocodataset.org/train2014/COCO_train2014_000000512533.jpg,"[A group of people riding on the back of an elephant., Four people ride on top of an elephant, there are many people that are riding a elephant , A group of people riding on top of a large elephant., Several people sit on top of an elephant as another person watches.]",D,Question: Which of the following captions best describes the painting?\n\nA) A group of people riding horses across a field.\n\nB) A family having a picnic in a park.\n\nC) A group of people rowing a boat on a river.\n\nD) A group of people riding on the back of an elephant.,Which of the following captions best describes the painting?\n\nA. A group of children playing in a park with a large ball.\n\nB. Several tourists sitting inside a colorful hot air balloon.\n\nC. A family gathered around a table enjoying dinner together.\n\nD. A group of people riding on the back of an elephant.,Which of the following captions best describes the painting?\n\nA. A family having a picnic under a large oak tree.\n\nB. A group of friends playing soccer in a park.\n\nC. A couple dancing under the stars at a wedding.\n\nD. A group of people riding on the back of an elephant.,D,D,D,D,D,D,D,D,D,D,D,D


In [50]:
def calculate_multiple_choice_question_accuracy_with_augmentation(df):
    # Calculate accuracy

    # crop augmentation
    accuracy_easy_crop = (df["multiple_choice_solution"] == df["multiple_choice_prediction_easy_crop"]).mean()
    accuracy_medium_crop = (df["multiple_choice_solution"] == df["multiple_choice_prediction_medium_crop"]).mean()
    accuracy_hard_crop = (df["multiple_choice_solution"] == df["multiple_choice_prediction_hard_crop"]).mean()

    # saturation augmentation
    accuracy_easy_saturation = (df["multiple_choice_solution"] == df["multiple_choice_prediction_easy_saturation"]).mean()
    accuracy_medium_saturation = (df["multiple_choice_solution"] == df["multiple_choice_prediction_medium_saturation"]).mean()
    accuracy_hard_saturation = (df["multiple_choice_solution"] == df["multiple_choice_prediction_hard_saturation"]).mean()

    # noise augmentation
    accuracy_easy_noise = (df["multiple_choice_solution"] == df["multiple_choice_prediction_easy_noise"]).mean()
    accuracy_medium_noise = (df["multiple_choice_solution"] == df["multiple_choice_prediction_medium_noise"]).mean()
    accuracy_hard_noise = (df["multiple_choice_solution"] == df["multiple_choice_prediction_hard_noise"]).mean()


    print('***** Prediction Accuracy with Crop Augmentation *****')
    print(f"Prediction Accuracy Easy: {accuracy_easy_crop * 100:.2f}%") 
    print(f"Prediction Accuracy Medium: {accuracy_medium_crop * 100:.2f}%")
    print(f"Prediction Accuracy Hard: {accuracy_hard_crop * 100:.2f}%")

    print('***** Prediction Accuracy with Saturation Augmentation *****')
    print(f"Prediction Accuracy Easy: {accuracy_easy_saturation * 100:.2f}%")
    print(f"Prediction Accuracy Medium: {accuracy_medium_saturation * 100:.2f}%")
    print(f"Prediction Accuracy Hard: {accuracy_hard_saturation * 100:.2f}%")

    print('***** Prediction Accuracy with Noise Augmentation *****')
    print(f"Prediction Accuracy Easy: {accuracy_easy_noise * 100:.2f}%")
    print(f"Prediction Accuracy Medium: {accuracy_medium_noise * 100:.2f}%")
    print(f"Prediction Accuracy Hard: {accuracy_hard_noise * 100:.2f}%")


    return accuracy_easy_crop, accuracy_medium_crop, accuracy_hard_crop, accuracy_easy_saturation, accuracy_medium_saturation, accuracy_hard_saturation, accuracy_easy_noise, accuracy_medium_noise, accuracy_hard_noise

In [51]:
calculate_multiple_choice_question_accuracy_with_augmentation(df)

***** Prediction Accuracy with Crop Augmentation *****
Prediction Accuracy Easy: 97.00%
Prediction Accuracy Medium: 86.00%
Prediction Accuracy Hard: 83.00%
***** Prediction Accuracy with Saturation Augmentation *****
Prediction Accuracy Easy: 100.00%
Prediction Accuracy Medium: 88.00%
Prediction Accuracy Hard: 83.00%
***** Prediction Accuracy with Noise Augmentation *****
Prediction Accuracy Easy: 89.00%
Prediction Accuracy Medium: 78.00%
Prediction Accuracy Hard: 72.00%


(0.97, 0.86, 0.83, 1.0, 0.88, 0.83, 0.89, 0.78, 0.72)

### Use Phi

In [52]:
# Load question dataframe
import pandas as pd
pd.set_option('display.max_colwidth', None)
import json

# Data Directory: 
food_annotation_file_path = '/shared/data/stephen/people_annotation_with_MCQ_3_difficulies.json'

df = pd.read_json(food_annotation_file_path)

df.head(1)

Unnamed: 0,id,file_name,url,captions,multiple_choice_solution,multiple_choice_question_hard,multiple_choice_question_medium,multiple_choice_question_easy
0,512533,COCO_train2014_000000512533.jpg,http://images.cocodataset.org/train2014/COCO_train2014_000000512533.jpg,"[A group of people riding on the back of an elephant., Four people ride on top of an elephant, there are many people that are riding a elephant , A group of people riding on top of a large elephant., Several people sit on top of an elephant as another person watches.]",D,Question: Which of the following captions best describes the painting?\n\nA) A group of people riding horses across a field.\n\nB) A family having a picnic in a park.\n\nC) A group of people rowing a boat on a river.\n\nD) A group of people riding on the back of an elephant.,Which of the following captions best describes the painting?\n\nA. A group of children playing in a park with a large ball.\n\nB. Several tourists sitting inside a colorful hot air balloon.\n\nC. A family gathered around a table enjoying dinner together.\n\nD. A group of people riding on the back of an elephant.,Which of the following captions best describes the painting?\n\nA. A family having a picnic under a large oak tree.\n\nB. A group of friends playing soccer in a park.\n\nC. A couple dancing under the stars at a wedding.\n\nD. A group of people riding on the back of an elephant.


In [53]:
from PIL import Image 
import requests 
from transformers import AutoModelForCausalLM 
from transformers import AutoProcessor 

model_id = "/shared/model/Phi-3.5-vision-instruct" 

# Note: set _attn_implementation='eager' if you don't have flash_attn installed
model = AutoModelForCausalLM.from_pretrained(
  model_id, 
  device_map="cuda", 
  trust_remote_code=True, 
  torch_dtype="auto", 
  _attn_implementation='eager'    
)

# for best performance, use num_crops=4 for multi-frame, num_crops=16 for single-frame.
processor = AutoProcessor.from_pretrained(model_id, 
  trust_remote_code=True, 
  num_crops=4
) 

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



#### Without image augmentation

In [54]:
# helper function: 
from PIL import Image
import requests
from io import BytesIO

def perform_multiple_choice_task_Phi(img_url, question):
     # 下载图片
    response = requests.get(img_url)
    response.raise_for_status()  # 检查下载是否成功
    image = Image.open(BytesIO(response.content))  # 将图片加载到内存中

    images = []
    images.append(image)

    messages = [
        {"role": "user", "content": "<|image_1|>\n" + question + "\nOnly return the correct choice with a single letter."},
    ]

    prompt = processor.tokenizer.apply_chat_template(
        messages, 
        tokenize=False, 
        add_generation_prompt=True
    )

    inputs = processor(prompt, images, return_tensors="pt").to("cuda:0") 

    generation_args = { 
        "max_new_tokens": 10, 
        "temperature": 0.0, 
        "do_sample": False, 
    } 

    generate_ids = model.generate(**inputs, 
    eos_token_id=processor.tokenizer.eos_token_id, 
    **generation_args
    )

    # remove input tokens 
    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
    response = processor.batch_decode(generate_ids, 
    skip_special_tokens=True, 
    clean_up_tokenization_spaces=False)[0] 

    return response


# def perform_multiple_choice_task_llava(img_url, question):
#     # 下载图片
#     response = requests.get(img_url)
#     response.raise_for_status()  # 检查下载是否成功
#     image = Image.open(BytesIO(response.content))  # 将图片加载到内存中

#     # 构建对话
#     conversation = [
#         {
#             "role": "user",
#             "content": [
#                 {"type": "text", "text": question + "\nOnly return the correct choice with a single letter."},
#                 {"type": "image"},
#             ],
#         },
#     ]
#     prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

#     inputs = processor(images=image, text=prompt, return_tensors="pt").to("cuda:0")

#     # 自动回归生成结果
#     output = model.generate(**inputs, max_new_tokens=150)
#     output = processor.decode(output[0], skip_special_tokens=True)

#     # 提取答案
#     mcq_answer = output.split('[/INST]')[1].strip()
#     return mcq_answer


In [55]:
df['multiple_choice_prediction_easy_Phi'] = df.apply(lambda x: perform_multiple_choice_task_Phi(x['url'], x['multiple_choice_question_easy']), axis=1)

The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.
`get_max_cache()` is deprecated for all Cache classes. Use `get_max_cache_shape()` instead. Calling `get_max_cache()` will raise error from v4.48


In [56]:
df['multiple_choice_prediction_medium_Phi'] = df.apply(lambda x: perform_multiple_choice_task_Phi(x['url'], x['multiple_choice_question_medium']), axis=1)



In [57]:
df['multiple_choice_prediction_hard_Phi'] = df.apply(lambda x: perform_multiple_choice_task_Phi(x['url'], x['multiple_choice_question_hard']), axis=1)



In [None]:
# # Save the prediction results to output file
# # Convert DataFrame to a list of dictionaries
# list_of_dicts = df.to_dict(orient="records")

# # Save the list of dictionaries to a JSON file
# output_file = "llava_prediction_result_food_image.json"
# with open(output_file, "w") as file:
#     json.dump(list_of_dicts, file, indent=4)

# print(f"DataFrame saved as a list of dictionaries in {output_file}")

In [58]:
# Evaluate the performance of the model
def calculate_multiple_choice_question_accuracy_Phi(df):
    # Calculate accuracy
    accuracy_easy = (df["multiple_choice_solution"] == df["multiple_choice_prediction_easy_Phi"]).mean()
    accuracy_medium = (df["multiple_choice_solution"] == df["multiple_choice_prediction_medium_Phi"]).mean()
    accuracy_hard = (df["multiple_choice_solution"] == df["multiple_choice_prediction_hard_Phi"]).mean()

    print(f"Prediction Accuracy Easy: {accuracy_easy * 100:.2f}%") 
    print(f"Prediction Accuracy Medium: {accuracy_medium * 100:.2f}%") 
    print(f"Prediction Accuracy Hard: {accuracy_hard * 100:.2f}%") 
    return accuracy_easy, accuracy_medium, accuracy_hard

In [59]:
calculate_multiple_choice_question_accuracy_Phi(df)

Prediction Accuracy Easy: 98.00%
Prediction Accuracy Medium: 93.00%
Prediction Accuracy Hard: 90.00%


(0.98, 0.93, 0.9)

#### With Image Augmentation

In [60]:
# This function will apply the augmentation to the image
# sin_aug: single augmentation, includes flip, rotate, crop, saturation, artStyle, noise, blur
# mul_aug: multiple augmentations in a list
# Return a dictionary. Key: augmentation name; Value: augmented image
from PIL import Image
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import os
import numpy as np
import cv2
from PIL import Image, ImageEnhance
import random

def image_augumentation(ori_img, sin_aug = None, mul_aug = None):
    img = ori_img.copy()
    aug_img = {}
    if sin_aug:
        if sin_aug == 'flip':
            aug_img['flip'] = flip_image(img)
        elif sin_aug == 'rotate':
            aug_img['rotate'] = rotate_image(img)
        elif sin_aug == 'crop':
            aug_img['crop'] = random_crop(img)
        elif sin_aug == 'saturation':
            aug_img['saturation'] = adjust_saturation(img)
        elif sin_aug == 'artStyle':
            aug_img['artStyle'] = convert_to_artStyle(img)
        elif sin_aug == 'noise':
            aug_img['noise'] = add_noise(img)
        elif sin_aug == 'blur':
            aug_img['blur'] = blur_image(img)
        else:
            aug_img['original'] = img

    elif mul_aug:
        for aug in mul_aug:
            if aug == 'flip':
                aug_img['flip'] = flip_image(img)
            elif aug == 'rotate':
                aug_img['rotate'] = rotate_image(img)
            elif aug == 'crop':
                aug_img['crop'] = random_crop(img)
            elif aug == 'saturation':
                aug_img['saturation'] = adjust_saturation(img)
            elif aug == 'artStyle':
                aug_img['artStyle'] = convert_to_artStyle(img)
            elif aug == 'noise':
                aug_img['noise'] = add_noise(img)
            elif aug == 'blur':
                aug_img['blur'] = blur_image(img)
            else:
                aug_img['original'] = img
    else:
        aug_img['original'] = img
    return aug_img

def flip_image(image):
    img = np.flip(image, axis=1)
    return img

def rotate_image(image, angle_range=(-30, 30)):
    angle = random.uniform(*angle_range)
    h, w = image.shape[:2]
    center = (w // 2, h // 2)
    rotation_matrix = cv2.getRotationMatrix2D(center, angle, scale=1.0)
    rotated_image = cv2.warpAffine(image, rotation_matrix, (w, h), flags=cv2.INTER_LINEAR)
    return rotated_image

def random_crop(image, percent=0.7):
    h, w = image.shape[:2]
    # crop_h, crop_w = crop_size
    crop_h = round(percent * h)
    crop_w = round(percent * w)

    top = random.randint(0, h - crop_h)
    left = random.randint(0, w - crop_w)
    cropped_image = image[top:top + crop_h, left:left + crop_w]
    return cropped_image

def adjust_saturation(image, factor=5):

    image_pil = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    enhancer = ImageEnhance.Color(image_pil)
    saturated_image = enhancer.enhance(factor)
    return cv2.cvtColor(np.array(saturated_image), cv2.COLOR_RGB2BGR)

def convert_to_artStyle(image):
    # Convert the image to grayscale
    gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Apply a binary threshold
    art_image = cv2.adaptiveThreshold(gray_image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                    cv2.THRESH_BINARY, 11, 2)
    return art_image

def add_noise(image, mean=0, stddev=25):
    noise = np.random.normal(mean, stddev, image.shape).astype(np.uint8)
    noisy_image = cv2.add(image, noise)
    return noisy_image

def blur_image(image, kernel_size=(5, 5)):
    return cv2.GaussianBlur(image, kernel_size, 0)

In [61]:
def perform_multiple_choice_task_with_image_augmentation_phi(img_url, question, aug_type=None):
     # 下载图片
    response = requests.get(img_url)
    response.raise_for_status()  # 检查下载是否成功
    image = Image.open(BytesIO(response.content))  # 将图片加载到内存中

    # Convert the PIL Image to a NumPy array 
    image = np.array(image)

    # Apply image augmentation
    if (aug_type is not None):
        image = image_augumentation(image, sin_aug = aug_type)[aug_type]  
    
    image = Image.fromarray(image)

    images = []
    images.append(image)

    messages = [
        {"role": "user", "content": "<|image_1|>\n" + question + "\nOnly return the correct choice with a single letter."},
    ]

    prompt = processor.tokenizer.apply_chat_template(
        messages, 
        tokenize=False, 
        add_generation_prompt=True
    )

    inputs = processor(prompt, images, return_tensors="pt").to("cuda:0") 

    generation_args = { 
        "max_new_tokens": 10, 
        "temperature": 0.0, 
        "do_sample": False, 
    } 

    generate_ids = model.generate(**inputs, 
    eos_token_id=processor.tokenizer.eos_token_id, 
    **generation_args
    )

    # remove input tokens 
    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
    response = processor.batch_decode(generate_ids, 
    skip_special_tokens=True, 
    clean_up_tokenization_spaces=False)[0] 

    return response


##### Cropping

In [62]:
# apply cropping augmentation
df['multiple_choice_prediction_easy_crop_phi'] = df.apply(lambda x: perform_multiple_choice_task_with_image_augmentation_phi(x['url'], x['multiple_choice_question_easy'], 'crop'), axis=1)
df['multiple_choice_prediction_medium_crop_phi'] = df.apply(lambda x: perform_multiple_choice_task_with_image_augmentation_phi(x['url'], x['multiple_choice_question_medium'], 'crop'), axis=1)
df['multiple_choice_prediction_hard_crop_phi'] = df.apply(lambda x: perform_multiple_choice_task_with_image_augmentation_phi(x['url'], x['multiple_choice_question_hard'], 'crop'), axis=1)



##### Saturation

In [63]:
# apply saturation augmentation
df['multiple_choice_prediction_easy_saturation_phi'] = df.apply(lambda x: perform_multiple_choice_task_with_image_augmentation_phi(x['url'], x['multiple_choice_question_easy'], 'saturation'), axis=1)
df['multiple_choice_prediction_medium_saturation_phi'] = df.apply(lambda x: perform_multiple_choice_task_with_image_augmentation_phi(x['url'], x['multiple_choice_question_medium'], 'saturation'), axis=1)
df['multiple_choice_prediction_hard_saturation_phi'] = df.apply(lambda x: perform_multiple_choice_task_with_image_augmentation_phi(x['url'], x['multiple_choice_question_hard'], 'saturation'), axis=1)




##### Noise

In [64]:
# apply noise augmentation
df['multiple_choice_prediction_easy_noise_phi'] = df.apply(lambda x: perform_multiple_choice_task_with_image_augmentation_phi(x['url'], x['multiple_choice_question_easy'], 'noise'), axis=1)
df['multiple_choice_prediction_medium_noise_phi'] = df.apply(lambda x: perform_multiple_choice_task_with_image_augmentation_phi(x['url'], x['multiple_choice_question_medium'], 'noise'), axis=1)
df['multiple_choice_prediction_hard_noise_phi'] = df.apply(lambda x: perform_multiple_choice_task_with_image_augmentation_phi(x['url'], x['multiple_choice_question_hard'], 'noise'), axis=1)



##### Save the result

In [65]:
# Save the MCQ result

# Convert DataFrame to a list of dictionaries
list_of_dicts = df.to_dict(orient="records")

# Save the list of dictionaries to a JSON file
output_file = "/shared/data/stephen/people_annotation_with_MCQ_result_3_difficulties_with_image_augmentation_phi.json"

with open(output_file, "w") as file:
    json.dump(list_of_dicts, file, indent=4)

print(f"DataFrame saved as a list of dictionaries in {output_file}")

DataFrame saved as a list of dictionaries in /shared/data/stephen/people_annotation_with_MCQ_result_3_difficulties_with_image_augmentation_phi.json


##### Evalutation

In [66]:
# Load annotation with multiple choice question result data file
import pandas as pd
pd.set_option('display.max_colwidth', None)
import json


# Data Directory: 
food_annotation_file_path = '/shared/data/stephen/people_annotation_with_MCQ_result_3_difficulties_with_image_augmentation_phi.json'


# Method 1: Using pandas.read_json directly
df = pd.read_json(food_annotation_file_path)

df.head(1)

Unnamed: 0,id,file_name,url,captions,multiple_choice_solution,multiple_choice_question_hard,multiple_choice_question_medium,multiple_choice_question_easy,multiple_choice_prediction_easy_Phi,multiple_choice_prediction_medium_Phi,multiple_choice_prediction_hard_Phi,multiple_choice_prediction_easy_crop_phi,multiple_choice_prediction_medium_crop_phi,multiple_choice_prediction_hard_crop_phi,multiple_choice_prediction_easy_saturation_phi,multiple_choice_prediction_medium_saturation_phi,multiple_choice_prediction_hard_saturation_phi,multiple_choice_prediction_easy_noise_phi,multiple_choice_prediction_medium_noise_phi,multiple_choice_prediction_hard_noise_phi
0,512533,COCO_train2014_000000512533.jpg,http://images.cocodataset.org/train2014/COCO_train2014_000000512533.jpg,"[A group of people riding on the back of an elephant., Four people ride on top of an elephant, there are many people that are riding a elephant , A group of people riding on top of a large elephant., Several people sit on top of an elephant as another person watches.]",D,Question: Which of the following captions best describes the painting?\n\nA) A group of people riding horses across a field.\n\nB) A family having a picnic in a park.\n\nC) A group of people rowing a boat on a river.\n\nD) A group of people riding on the back of an elephant.,Which of the following captions best describes the painting?\n\nA. A group of children playing in a park with a large ball.\n\nB. Several tourists sitting inside a colorful hot air balloon.\n\nC. A family gathered around a table enjoying dinner together.\n\nD. A group of people riding on the back of an elephant.,Which of the following captions best describes the painting?\n\nA. A family having a picnic under a large oak tree.\n\nB. A group of friends playing soccer in a park.\n\nC. A couple dancing under the stars at a wedding.\n\nD. A group of people riding on the back of an elephant.,D,D,D,D,D,D,D,D,D,D,D,D


In [67]:
def calculate_multiple_choice_question_accuracy_with_augmentation(df):
    # Calculate accuracy

    # crop augmentation
    accuracy_easy_crop = (df["multiple_choice_solution"] == df["multiple_choice_prediction_easy_crop_phi"]).mean()
    accuracy_medium_crop = (df["multiple_choice_solution"] == df["multiple_choice_prediction_medium_crop_phi"]).mean()
    accuracy_hard_crop = (df["multiple_choice_solution"] == df["multiple_choice_prediction_hard_crop_phi"]).mean()

    # saturation augmentation
    accuracy_easy_saturation = (df["multiple_choice_solution"] == df["multiple_choice_prediction_easy_saturation_phi"]).mean()
    accuracy_medium_saturation = (df["multiple_choice_solution"] == df["multiple_choice_prediction_medium_saturation_phi"]).mean()
    accuracy_hard_saturation = (df["multiple_choice_solution"] == df["multiple_choice_prediction_hard_saturation_phi"]).mean()

    # noise augmentation
    accuracy_easy_noise = (df["multiple_choice_solution"] == df["multiple_choice_prediction_easy_noise_phi"]).mean()
    accuracy_medium_noise = (df["multiple_choice_solution"] == df["multiple_choice_prediction_medium_noise_phi"]).mean()
    accuracy_hard_noise = (df["multiple_choice_solution"] == df["multiple_choice_prediction_hard_noise_phi"]).mean()


    print('***** Prediction Accuracy with Crop Augmentation *****')
    print(f"Prediction Accuracy Easy: {accuracy_easy_crop * 100:.2f}%") 
    print(f"Prediction Accuracy Medium: {accuracy_medium_crop * 100:.2f}%")
    print(f"Prediction Accuracy Hard: {accuracy_hard_crop * 100:.2f}%")
    print('\n')

    print('***** Prediction Accuracy with Saturation Augmentation *****')
    print(f"Prediction Accuracy Easy: {accuracy_easy_saturation * 100:.2f}%")
    print(f"Prediction Accuracy Medium: {accuracy_medium_saturation * 100:.2f}%")
    print(f"Prediction Accuracy Hard: {accuracy_hard_saturation * 100:.2f}%")
    print('\n')

    print('***** Prediction Accuracy with Noise Augmentation *****')
    print(f"Prediction Accuracy Easy: {accuracy_easy_noise * 100:.2f}%")
    print(f"Prediction Accuracy Medium: {accuracy_medium_noise * 100:.2f}%")
    print(f"Prediction Accuracy Hard: {accuracy_hard_noise * 100:.2f}%")
    print('\n')


    return accuracy_easy_crop, accuracy_medium_crop, accuracy_hard_crop, accuracy_easy_saturation, accuracy_medium_saturation, accuracy_hard_saturation, accuracy_easy_noise, accuracy_medium_noise, accuracy_hard_noise

In [68]:
calculate_multiple_choice_question_accuracy_with_augmentation(df)

***** Prediction Accuracy with Crop Augmentation *****
Prediction Accuracy Easy: 96.00%
Prediction Accuracy Medium: 91.00%
Prediction Accuracy Hard: 83.00%


***** Prediction Accuracy with Saturation Augmentation *****
Prediction Accuracy Easy: 98.00%
Prediction Accuracy Medium: 91.00%
Prediction Accuracy Hard: 83.00%


***** Prediction Accuracy with Noise Augmentation *****
Prediction Accuracy Easy: 91.00%
Prediction Accuracy Medium: 84.00%
Prediction Accuracy Hard: 71.00%




(0.96, 0.91, 0.83, 0.98, 0.91, 0.83, 0.91, 0.84, 0.71)

## ~~Perform Image Captioning Task~~

In [None]:
# Load annotation with multiple choice question data file
import pandas as pd
pd.set_option('display.max_colwidth', None)
import json


# Data Directory: 
#food_images_directory = '/shared/data/food_data/food_images/'
food_annotation_file_path = '/shared/data/food_data/food_annotation_with_MCQ.json'


# Method 1: Using pandas.read_json directly
df = pd.read_json(food_annotation_file_path)

df.head(1)

In [None]:
# helper function: 
def perform_image_captioning_task_llava(img_url):
    image = Image.open(img_url)
    
    conversation = [
        {

          "role": "user",
          "content": [
              {"type": "text", "text": "Generate a caption for this image."},
              {"type": "image"},
            ],
        },
    ]
    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

    inputs = processor(images=image, text=prompt, return_tensors="pt").to("cuda:0")

    # autoregressively complete prompt
    output = model.generate(**inputs, max_new_tokens=150)
    output = processor.decode(output[0], skip_special_tokens=True)

    caption = output.split('[/INST]')[1].strip()
    print(caption)
    return caption

In [None]:
df['predicted_caption'] = df.apply(lambda x: perform_image_captioning_task_llava(x['img_url']), axis=1)

In [None]:
df[['reference_caption', 'predicted_caption']]

In [None]:
# Save the image_captioning result

# Convert DataFrame to a list of dictionaries
list_of_dicts = df.to_dict(orient="records")

# Save the list of dictionaries to a JSON file
output_file = "/shared/data/food_data/food_annotation_with_image_captioning_result.json"

with open(output_file, "w") as file:
    json.dump(list_of_dicts, file, indent=4)

print(f"DataFrame saved as a list of dictionaries in {output_file}")

## Evaluate results

In [None]:
# import libraries
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.spice.spice import Spice

### Multiple Choice Accuracy

In [None]:
# Load annotation with multiple choice question result data file
import pandas as pd
pd.set_option('display.max_colwidth', None)
import json


# Data Directory: 
food_annotation_file_path = '/shared/data/food_data/food_annotation_with_MCQ_result.json'


# Method 1: Using pandas.read_json directly
df = pd.read_json(food_annotation_file_path)

df.head(1)

In [None]:
def calculate_multiple_choice_question_accuracy(df):
    # Calculate accuracy
    accuracy_easy = (df["multiple_choice_solution"] == df["multiple_choice_prediction_easy"]).mean()
    accuracy_medium = (df["multiple_choice_solution"] == df["multiple_choice_prediction_medium"]).mean()
    accuracy_hard = (df["multiple_choice_solution"] == df["multiple_choice_prediction_hard"]).mean()

    print(f"Prediction Accuracy Easy: {accuracy_easy * 100:.2f}%") 
    print(f"Prediction Accuracy Medium: {accuracy_medium * 100:.2f}%") 
    print(f"Prediction Accuracy Hard: {accuracy_hard * 100:.2f}%") 
    return accuracy_easy, accuracy_medium, accuracy_hard

In [None]:
calculate_multiple_choice_question_accuracy(df)

### Caption Quality


In [None]:
# Load annotation with multiple choice question result data file
import pandas as pd
pd.set_option('display.max_colwidth', None)
import json


# Data Directory: 
food_annotation_file_path = '/shared/data/food_data/food_annotation_with_image_captioning_result.json'


# Method 1: Using pandas.read_json directly
df = pd.read_json(food_annotation_file_path)

df.head(1)

In [None]:
# BLEU

In [None]:
from nltk.translate.bleu_score import sentence_bleu


# BLEU Evaluation (Average across multiple references)
def evaluate_bleu(df):
    bleu_scores = []
    
    for _, row in df.iterrows():
        references = row['reference_caption']  # List of reference captions
        candidate = row['predicted_caption']  # Predicted caption

        # Tokenize the candidate and reference captions
        tokenized_references = [ref.strip('"').split() for ref in references]  # List of tokenized references
        tokenized_candidate = candidate.strip('"').split()  # Tokenized candidate

        # Compute BLEU for all references
        row_bleu_scores = [
            sentence_bleu([ref], tokenized_candidate) for ref in tokenized_references
        ]
        
        # Average across references
        bleu_scores.append(sum(row_bleu_scores) / len(row_bleu_scores))
    
    avg_bleu = sum(bleu_scores) / len(bleu_scores)
    print(f"Average BLEU: {avg_bleu:.4f}")
    return bleu_scores

In [None]:
evaluate_bleu(df)

In [None]:
# METEOR

In [None]:
import nltk
nltk.download('wordnet')
from nltk.translate.meteor_score import meteor_score

# METEOR Evaluation (Average across multiple references)
def evaluate_meteor(df):
    meteor_scores = []
    
    for _, row in df.iterrows():
        references = row['reference_caption']  # List of reference captions
        candidate = row['predicted_caption'].strip('"')  # Predicted caption (raw string)

        # Compute METEOR for all references
        row_meteor_scores = [
            meteor_score([[ref]], [candidate]) for ref in references
        ]
        
        # Average across references
        meteor_scores.append(sum(row_meteor_scores) / len(row_meteor_scores))
    
    avg_meteor = sum(meteor_scores) / len(meteor_scores)
    print(f"Average METEOR: {avg_meteor:.4f}")
    return meteor_scores


In [None]:
evaluate_meteor(df)

In [None]:
# ROUGE

In [None]:
# ROUGE Evaluation (Average across multiple references)
def evaluate_rouge(df):
    rouge1_scores, rouge2_scores, rougeL_scores = [], [], []
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    
    for _, row in df.iterrows():
        references = row['reference_caption']  # List of reference captions
        candidate = row['predicted_caption']  # Predicted caption
        
        # Compute ROUGE scores for all references
        row_rouge1_scores, row_rouge2_scores, row_rougeL_scores = [], [], []
        for ref in references:
            scores = scorer.score(ref, candidate.strip('"'))
            row_rouge1_scores.append(scores['rouge1'].fmeasure)
            row_rouge2_scores.append(scores['rouge2'].fmeasure)
            row_rougeL_scores.append(scores['rougeL'].fmeasure)
        
        # Average across references
        rouge1_scores.append(sum(row_rouge1_scores) / len(row_rouge1_scores))
        rouge2_scores.append(sum(row_rouge2_scores) / len(row_rouge2_scores))
        rougeL_scores.append(sum(row_rougeL_scores) / len(row_rougeL_scores))
    
    avg_rouge1 = sum(rouge1_scores) / len(rouge1_scores)
    avg_rouge2 = sum(rouge2_scores) / len(rouge2_scores)
    avg_rougeL = sum(rougeL_scores) / len(rougeL_scores)
    
    print(f"Average ROUGE-1: {avg_rouge1:.4f}")
    print(f"Average ROUGE-2: {avg_rouge2:.4f}")
    print(f"Average ROUGE-L: {avg_rougeL:.4f}")
    return rouge1_scores, rouge2_scores, rougeL_scores

In [None]:
evaluate_rouge(df)

In [None]:
# CIDEr and SPICE (unchanged, since they handle multiple references internally)

In [None]:
def evaluate_cider(df):
    ref_dict = {str(idx): row['reference_caption'] for idx, row in df.iterrows()}
    cand_dict = {str(idx): [row['predicted_caption']] for idx, row in df.iterrows()}
    
    cider_scorer = Cider()
    score, _ = cider_scorer.compute_score(ref_dict, cand_dict)
    print(f"Average CIDEr: {score:.4f}")
    return score

In [None]:
evaluate_cider(df)

In [None]:
def evaluate_spice(df):
    ref_dict = {str(idx): row['reference_caption'] for idx, row in df.iterrows()}
    cand_dict = {str(idx): [row['predicted_caption']] for idx, row in df.iterrows()}
    
    spice_scorer = Spice()
    score, _ = spice_scorer.compute_score(ref_dict, cand_dict)
    print(f"Average SPICE: {score:.4f}")
    return score

In [None]:
evaluate_spice(df)