## Load Cat Dataset

In [1]:
import pandas as pd
pd.set_option('display.max_colwidth', None)
import json

In [None]:
# Data Directory: 
upking_annotation_file_path = '/shared/data/upking/upking_data.json'


# Method 1: Using pandas.read_json directly
df = pd.read_json(upking_annotation_file_path)

df.head(1)

Unnamed: 0,id,img_url,reference_caption
0,1,/shared/data/food_data/food_images/Pork_ribs/Pork_ribs_0094.jpg,"[On the wooden chopping board, there is a cut up grilled and cooked pork rib.]"


In [3]:
df.shape

(100, 3)

## Generate Multiple Choice Question

In [4]:
# Generate random choice in [A, B, C, D]
import random

def generate_random_choice():
    return random.choice(['A', 'B', 'C', 'D'])

In [5]:
df['multiple_choice_solution'] = df.apply(lambda x: generate_random_choice(), axis=1)

In [6]:
df.head()

Unnamed: 0,id,img_url,reference_caption,multiple_choice_solution
0,1,/shared/data/food_data/food_images/Pork_ribs/Pork_ribs_0094.jpg,"[On the wooden chopping board, there is a cut up grilled and cooked pork rib.]",A
1,2,/shared/data/food_data/food_images/Pork_ribs/Pork_ribs_0096.jpg,[Many grilled pork ribs are arranged in a curiously shaped wooden cutlery.],D
2,3,/shared/data/food_data/food_images/Pork_ribs/Pork_ribs_0091.jpg,[Two grilled brown-red pork ribs on an oval white dinner plate.],A
3,4,/shared/data/food_data/food_images/Pork_ribs/Pork_ribs_0093.jpg,[Three pork ribs drizzled with a rich sauce and served on a white round dinner plate.],A
4,5,/shared/data/food_data/food_images/Pork_ribs/Pork_ribs_0084.jpg,"[Raw, long, fresh pork ribs next to yellow, good potatoes.]",A


In [7]:
import os
from openai import OpenAI
client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),  
)


def generate_multiple_choice_question(reference_caption, correct_choice, level='medium'): 
    # Define the prompt to generate inferior choices
    if level == 'easy':
        level_message = "The distractors are obviously incorrect but still loosely related to the context."
    elif level == 'medium':
        level_message = "The distractors are somewhat related to the context but contain inaccuracies or non-fluent language."
    elif level == 'hard':
        level_message = "The distractors are closely related to the context but may confuse someone without careful observation."


    prompt = f"""
    Given the ground truth caption below:
    "{reference_caption}"
    Generate three plausible but incorrect distractors.
    "{level_message}"
    Format the result as a multiple-choice question. 
    Question title should be "Which of the following captions best describes the painting?".
    The correct choice should be placed at choice "{correct_choice}". 
    Do not generate special symbols such as '*'.
    """

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
        temperature=0.7,
        max_tokens=200,
    )

    # Extract the generated multiple-choice question
    question = response.choices[0].message.content    
    
    return question

In [8]:
# generate hard question 
df['multiple_choice_question_hard'] = df.apply(lambda x: generate_multiple_choice_question(x['reference_caption'][0], x['multiple_choice_solution'], level='hard'), axis=1)

In [9]:
# generate medium question
df['multiple_choice_question_medium'] = df.apply(lambda x: generate_multiple_choice_question(x['reference_caption'][0], x['multiple_choice_solution'], level='medium'), axis=1)

In [10]:
# generate easy question
df['multiple_choice_question_easy'] = df.apply(lambda x: generate_multiple_choice_question(x['reference_caption'][0], x['multiple_choice_solution'], level='easy'), axis=1)

In [12]:
df.head(1)

Unnamed: 0,id,img_url,reference_caption,multiple_choice_solution,multiple_choice_question_hard,multiple_choice_question_medium,multiple_choice_question_easy
0,1,/shared/data/food_data/food_images/Pork_ribs/Pork_ribs_0094.jpg,"[On the wooden chopping board, there is a cut up grilled and cooked pork rib.]",A,"Question: Which of the following captions best describes the painting?\n\nA. On the wooden chopping board, there is a cut up grilled and cooked pork rib.\n\nB. On the wooden chopping board, there is a whole, uncooked pork rib.\n\nC. On the wooden chopping board, there is a cut up grilled and cooked chicken breast.\n\nD. On the wooden chopping board, there is a sliced loaf of bread.","Which of the following captions best describes the painting?\n\nA. On the wooden chopping board, there is a cut up grilled and cooked pork rib.\n\nB. On the wooden table, there is a whole raw pork rib with sauce.\n\nC. On the wooden chopping board, a raw fish is being sliced in half.\n\nD. On the chopping board, there is a grilled chicken leg with vegetables.","Question: Which of the following captions best describes the painting?\n\nA. On the wooden chopping board, there is a cut up grilled and cooked pork rib.\n\nB. On the wooden chopping board, there is a whole raw fish ready to be cooked.\n\nC. On the wooden chopping board, there is a freshly baked loaf of bread.\n\nD. On the wooden chopping board, there is a colorful assortment of fresh fruits."


In [15]:
df.iloc[0].multiple_choice_question_hard

'Question: Which of the following captions best describes the painting?\n\nA. On the wooden chopping board, there is a cut up grilled and cooked pork rib.\n\nB. On the wooden chopping board, there is a whole, uncooked pork rib.\n\nC. On the wooden chopping board, there is a cut up grilled and cooked chicken breast.\n\nD. On the wooden chopping board, there is a sliced loaf of bread.'

In [None]:
# Save the annotation with multiple choice question to output file

# Convert DataFrame to a list of dictionaries
list_of_dicts = df.to_dict(orient="records")

# Save the list of dictionaries to a JSON file
output_file = "/shared/data/upking/upking_annotation_with_MCQ.json"
with open(output_file, "w") as file:
    json.dump(list_of_dicts, file, indent=4)

print(f"DataFrame saved as a list of dictionaries in {output_file}")

## Perform Multiple Choice Selection

In [None]:
# Load annotation with multiple choice question data file
import pandas as pd
pd.set_option('display.max_colwidth', None)
import json


# Data Directory: 
#food_images_directory = '/shared/data/food_data/food_images/'
upking_annotation_file_path = '/shared/data/upking/upking_annotation_with_MCQ.json'


# Method 1: Using pandas.read_json directly
df = pd.read_json(upking_annotation_file_path)

df.head(1)

Unnamed: 0,id,img_url,reference_caption,multiple_choice_solution,multiple_choice_question_hard,multiple_choice_question_medium,multiple_choice_question_easy,multiple_choice_prediction_easy,multiple_choice_prediction_medium,multiple_choice_prediction_hard,multiple_choice_prediction_easy_crop,multiple_choice_prediction_medium_crop,multiple_choice_prediction_hard_crop
0,1,/shared/data/food_data/food_images/Pork_ribs/Pork_ribs_0094.jpg,"[On the wooden chopping board, there is a cut up grilled and cooked pork rib.]",A,"Question: Which of the following captions best describes the painting?\n\nA. On the wooden chopping board, there is a cut up grilled and cooked pork rib.\n\nB. On the wooden chopping board, there is a whole, uncooked pork rib.\n\nC. On the wooden chopping board, there is a cut up grilled and cooked chicken breast.\n\nD. On the wooden chopping board, there is a sliced loaf of bread.","Which of the following captions best describes the painting?\n\nA. On the wooden chopping board, there is a cut up grilled and cooked pork rib.\n\nB. On the wooden table, there is a whole raw pork rib with sauce.\n\nC. On the wooden chopping board, a raw fish is being sliced in half.\n\nD. On the chopping board, there is a grilled chicken leg with vegetables.","Question: Which of the following captions best describes the painting?\n\nA. On the wooden chopping board, there is a cut up grilled and cooked pork rib.\n\nB. On the wooden chopping board, there is a whole raw fish ready to be cooked.\n\nC. On the wooden chopping board, there is a freshly baked loaf of bread.\n\nD. On the wooden chopping board, there is a colorful assortment of fresh fruits.",A,A,A,A,A,A


### Use llava model

In [1]:
from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration
import torch
from PIL import Image
import skimage.io as io
import requests 

In [2]:
# Load model from local directory 
model_path = '/shared/model/llava-v1.6-mistral-7b-hf'

processor = LlavaNextProcessor.from_pretrained(model_path)

model = LlavaNextForConditionalGeneration.from_pretrained(model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True, load_in_4bit=True) 
#model.to("cuda:0")

Some kwargs in processor config are unused and will not have any effect: num_additional_image_tokens. 
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

#### Without image augmentation

In [None]:
# helper function: 
def perform_multiple_choice_task_llava(img_url, question):
    image = io.imread(img_url)
    
    conversation = [
        {

          "role": "user",
          "content": [
              {"type": "text", "text": question + "\nOnly return the correct choice with a single letter."},
              {"type": "image"},
            ],
        },
    ]
    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

    inputs = processor(images=image, text=prompt, return_tensors="pt").to("cuda:0")

    # autoregressively complete prompt
    output = model.generate(**inputs, max_new_tokens=150)
    output = processor.decode(output[0], skip_special_tokens=True)

    # mcq_question = output.split('[/INST]')[0].split('[INST] ')[1].strip()
    mcq_answer = output.split('[/INST]')[1].strip()
    return mcq_answer

In [21]:
df['multiple_choice_prediction_easy'] = df.apply(lambda x: perform_multiple_choice_task_llava(x['img_url'], x['multiple_choice_question_easy']), axis=1)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Expanding inputs for image tokens in LLaVa-NeXT should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pa

In [22]:
df['multiple_choice_prediction_medium'] = df.apply(lambda x: perform_multiple_choice_task_llava(x['img_url'], x['multiple_choice_question_medium']), axis=1)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Settin

In [23]:
df['multiple_choice_prediction_hard'] = df.apply(lambda x: perform_multiple_choice_task_llava(x['img_url'], x['multiple_choice_question_hard']), axis=1)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Settin

In [4]:
df.head(1)

Unnamed: 0,id,img_url,reference_caption,multiple_choice_solution,multiple_choice_question_hard,multiple_choice_question_medium,multiple_choice_question_easy,multiple_choice_prediction_easy,multiple_choice_prediction_medium,multiple_choice_prediction_hard,multiple_choice_prediction_easy_crop,multiple_choice_prediction_medium_crop,multiple_choice_prediction_hard_crop
0,1,/shared/data/food_data/food_images/Pork_ribs/Pork_ribs_0094.jpg,"[On the wooden chopping board, there is a cut up grilled and cooked pork rib.]",A,"Question: Which of the following captions best describes the painting?\n\nA. On the wooden chopping board, there is a cut up grilled and cooked pork rib.\n\nB. On the wooden chopping board, there is a whole, uncooked pork rib.\n\nC. On the wooden chopping board, there is a cut up grilled and cooked chicken breast.\n\nD. On the wooden chopping board, there is a sliced loaf of bread.","Which of the following captions best describes the painting?\n\nA. On the wooden chopping board, there is a cut up grilled and cooked pork rib.\n\nB. On the wooden table, there is a whole raw pork rib with sauce.\n\nC. On the wooden chopping board, a raw fish is being sliced in half.\n\nD. On the chopping board, there is a grilled chicken leg with vegetables.","Question: Which of the following captions best describes the painting?\n\nA. On the wooden chopping board, there is a cut up grilled and cooked pork rib.\n\nB. On the wooden chopping board, there is a whole raw fish ready to be cooked.\n\nC. On the wooden chopping board, there is a freshly baked loaf of bread.\n\nD. On the wooden chopping board, there is a colorful assortment of fresh fruits.",A,A,A,A,A,A


In [None]:
# Save the MCQ result

# Convert DataFrame to a list of dictionaries
list_of_dicts = df.to_dict(orient="records")

# Save the list of dictionaries to a JSON file
output_file = "/shared/data/upking/upking_annotation_with_MCQ_result_3_difficulties_llama.json"

with open(output_file, "w") as file:
    json.dump(list_of_dicts, file, indent=4)

print(f"DataFrame saved as a list of dictionaries in {output_file}")

DataFrame saved as a list of dictionaries in /shared/data/food_data/food_annotation_with_MCQ_result_3_difficulties.json


In [15]:
def calculate_multiple_choice_question_accuracy(df):
    # Calculate accuracy
    accuracy_easy = (df["multiple_choice_solution"] == df["multiple_choice_prediction_easy"]).mean()
    accuracy_medium = (df["multiple_choice_solution"] == df["multiple_choice_prediction_medium"]).mean()
    accuracy_hard = (df["multiple_choice_solution"] == df["multiple_choice_prediction_hard"]).mean()

    print(f"Prediction Accuracy Easy: {accuracy_easy * 100:.2f}%") 
    print(f"Prediction Accuracy Medium: {accuracy_medium * 100:.2f}%") 
    print(f"Prediction Accuracy Hard: {accuracy_hard * 100:.2f}%") 
    return accuracy_easy, accuracy_medium, accuracy_hard

In [16]:
calculate_multiple_choice_question_accuracy(df)

Prediction Accuracy Easy: 98.00%
Prediction Accuracy Medium: 93.00%
Prediction Accuracy Hard: 88.00%


(0.98, 0.93, 0.88)

#### With Image Augmentation

In [None]:
# This function will apply the augmentation to the image
# sin_aug: single augmentation, includes flip, rotate, crop, saturation, artStyle, noise, blur
# mul_aug: multiple augmentations in a list
# Return a dictionary. Key: augmentation name; Value: augmented image
from skimage import io
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import os
import numpy as np
import cv2
from PIL import Image, ImageEnhance
import random
import pandas as pd

def image_augumentation(ori_img, sin_aug = None, mul_aug = None):
    img = ori_img.copy()
    aug_img = {}
    if sin_aug:
        if sin_aug == 'flip':
            aug_img['flip'] = flip_image(img)
        elif sin_aug == 'rotate':
            aug_img['rotate'] = rotate_image(img)
        elif sin_aug == 'crop':
            aug_img['crop'] = random_crop(img)
        elif sin_aug == 'saturation':
            aug_img['saturation'] = adjust_saturation(img)
        elif sin_aug == 'artStyle':
            aug_img['artStyle'] = convert_to_artStyle(img)
        elif sin_aug == 'noise':
            aug_img['noise'] = add_noise(img)
        elif sin_aug == 'blur':
            aug_img['blur'] = blur_image(img)
        else:
            aug_img['original'] = img

    elif mul_aug:
        for aug in mul_aug:
            if aug == 'flip':
                aug_img['flip'] = flip_image(img)
            elif aug == 'rotate':
                aug_img['rotate'] = rotate_image(img)
            elif aug == 'crop':
                aug_img['crop'] = random_crop(img)
            elif aug == 'saturation':
                aug_img['saturation'] = adjust_saturation(img)
            elif aug == 'artStyle':
                aug_img['artStyle'] = convert_to_artStyle(img)
            elif aug == 'noise':
                aug_img['noise'] = add_noise(img)
            elif aug == 'blur':
                aug_img['blur'] = blur_image(img)
            else:
                aug_img['original'] = img
    else:
        aug_img['original'] = img
    return aug_img

def flip_image(image):
    img = np.flip(image, axis=1)
    return img

def rotate_image(image, angle_range=(-30, 30)):
    angle = random.uniform(*angle_range)
    h, w = image.shape[:2]
    center = (w // 2, h // 2)
    rotation_matrix = cv2.getRotationMatrix2D(center, angle, scale=1.0)
    rotated_image = cv2.warpAffine(image, rotation_matrix, (w, h), flags=cv2.INTER_LINEAR)
    return rotated_image

def random_crop(image, percent=0.7):
    h, w = image.shape[:2]
    # crop_h, crop_w = crop_size
    crop_h = round(percent * h)
    crop_w = round(percent * w)

    top = random.randint(0, h - crop_h)
    left = random.randint(0, w - crop_w)
    cropped_image = image[top:top + crop_h, left:left + crop_w]
    return cropped_image

def adjust_saturation(image, factor=5):

    image_pil = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    enhancer = ImageEnhance.Color(image_pil)
    saturated_image = enhancer.enhance(factor)
    return cv2.cvtColor(np.array(saturated_image), cv2.COLOR_RGB2BGR)

def convert_to_artStyle(image):
    # Convert the image to grayscale
    gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Apply a binary threshold
    art_image = cv2.adaptiveThreshold(gray_image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                    cv2.THRESH_BINARY, 11, 2)
    return art_image

def add_noise(image, mean=0, stddev=25):
    noise = np.random.normal(mean, stddev, image.shape).astype(np.uint8)
    noisy_image = cv2.add(image, noise)
    return noisy_image

def blur_image(image, kernel_size=(5, 5)):
    return cv2.GaussianBlur(image, kernel_size, 0)

In [7]:
# helper function: 
def perform_multiple_choice_task_with_image_augmentation_llava(img_url, question, aug_type=None):
    # image = Image.open(img_url)

    # # Convert the PIL Image to a NumPy array
    # image = np.array(image)

    image = io.imread(img_url)

    # Apply image augmentation
    if (aug_type is not None):
        image = image_augumentation(image, sin_aug = aug_type)[aug_type]  
    
    
    conversation = [
        {

          "role": "user",
          "content": [
              {"type": "text", "text": question + "\nOnly return the correct choice with a single letter."},
              {"type": "image"},
            ],
        },
    ]
    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

    inputs = processor(images=image, text=prompt, return_tensors="pt").to("cuda:0")

    # autoregressively complete prompt
    output = model.generate(**inputs, max_new_tokens=150)
    output = processor.decode(output[0], skip_special_tokens=True)

    # mcq_question = output.split('[/INST]')[0].split('[INST] ')[1].strip()
    mcq_answer = output.split('[/INST]')[1].strip()
    return mcq_answer

In [None]:
df = pd.read_json("/shared/data/upking/upking_annotation_with_MCQ_result_3_difficulties_llama.json")

In [8]:
df = pd.read_json("/shared/data/upking/upking_annotation_with_MCQ_result_3_difficulties_with_image_augmentation_llama.json")

##### Cropping

In [8]:
# Cropping - easy
df['multiple_choice_prediction_easy_crop'] = df.apply(lambda x: perform_multiple_choice_task_with_image_augmentation_llava(x['img_url'], x['multiple_choice_question_easy'], 'crop'), axis=1)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Expanding inputs for image tokens in LLaVa-NeXT should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pa

In [10]:
# Cropping - medium
df['multiple_choice_prediction_medium_crop'] = df.apply(lambda x: perform_multiple_choice_task_with_image_augmentation_llava(x['img_url'], x['multiple_choice_question_medium'], 'crop'), axis=1)
# Cropping - hard
df['multiple_choice_prediction_hard_crop'] = df.apply(lambda x: perform_multiple_choice_task_with_image_augmentation_llava(x['img_url'], x['multiple_choice_question_hard'], 'crop'), axis=1)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Settin

##### saturation

In [9]:
# Saturation - easy
df['multiple_choice_prediction_easy_saturation'] = df.apply(lambda x: perform_multiple_choice_task_with_image_augmentation_llava(x['img_url'], x['multiple_choice_question_easy'], 'saturation'), axis=1)
# Saturation - medium
df['multiple_choice_prediction_medium_saturation'] = df.apply(lambda x: perform_multiple_choice_task_with_image_augmentation_llava(x['img_url'], x['multiple_choice_question_medium'], 'saturation'), axis=1)
# Saturation - hard
df['multiple_choice_prediction_hard_saturation'] = df.apply(lambda x: perform_multiple_choice_task_with_image_augmentation_llava(x['img_url'], x['multiple_choice_question_hard'], 'saturation'), axis=1)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Expanding inputs for image tokens in LLaVa-NeXT should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pa

##### Noise

In [9]:
# Noise - easy
df['multiple_choice_prediction_easy_noise'] = df.apply(lambda x: perform_multiple_choice_task_with_image_augmentation_llava(x['img_url'], x['multiple_choice_question_easy'], 'noise'), axis=1)
# Noise - medium
df['multiple_choice_prediction_medium_noise'] = df.apply(lambda x: perform_multiple_choice_task_with_image_augmentation_llava(x['img_url'], x['multiple_choice_question_medium'], 'noise'), axis=1)
# Noise - hard
df['multiple_choice_prediction_hard_noise'] = df.apply(lambda x: perform_multiple_choice_task_with_image_augmentation_llava(x['img_url'], x['multiple_choice_question_hard'], 'noise'), axis=1)


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Expanding inputs for image tokens in LLaVa-NeXT should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pa

##### Save the result

In [11]:
# Save the MCQ result
import json
# Convert DataFrame to a list of dictionaries
list_of_dicts = df.to_dict(orient="records")

# Save the list of dictionaries to a JSON file
output_file = "/shared/data/upking/upking_annotation_with_MCQ_result_3_difficulties_with_image_augmentation_llama.json"

with open(output_file, "w") as file:
    json.dump(list_of_dicts, file, indent=4)

print(f"DataFrame saved as a list of dictionaries in {output_file}")

DataFrame saved as a list of dictionaries in /shared/data/upking/upking_annotation_with_MCQ_result_3_difficulties_with_image_augmentation_llama.json


##### Evaluation

In [12]:
# Load annotation with multiple choice question result data file
import pandas as pd
pd.set_option('display.max_colwidth', None)
import json


# Data Directory: 
upking_annotation_file_path = '/shared/data/upking/upking_annotation_with_MCQ_result_3_difficulties_with_image_augmentation_llama.json'


# Method 1: Using pandas.read_json directly
df = pd.read_json(upking_annotation_file_path)

df.head(1)

Unnamed: 0,id,file_name,img_url,reference_caption,multiple_choice_solution,multiple_choice_question_hard,multiple_choice_question_medium,multiple_choice_question_easy,multiple_choice_prediction_easy,multiple_choice_prediction_medium,multiple_choice_prediction_hard,multiple_choice_prediction_easy_crop,multiple_choice_prediction_medium_crop,multiple_choice_prediction_hard_crop,multiple_choice_prediction_easy_saturation,multiple_choice_prediction_medium_saturation,multiple_choice_prediction_hard_saturation,multiple_choice_prediction_easy_noise,multiple_choice_prediction_medium_noise,multiple_choice_prediction_hard_noise
0,140360,COCO_train2014_000000140360.jpg,http://images.cocodataset.org/train2014/COCO_train2014_000000140360.jpg,"[a black cat getting some water out of a bowl , A black cat drinking water from a bowl and his food bowl is next to it. , A cat that is standing near a bowl of water., A cat drinking water out of a water bowl., A cat standing by a bowl of water and food ]",C,**Which of the following captions best describes the image?**\n\nA) A black cat playing with a ball on the floor.\n\nB) A black cat sleeping next to a bowl of water.\n\nC) A black cat getting some water out of a bowl.\n\nD) A black cat sitting near a bowl of food.,Question: Which of the following captions best describes the image?\n\nA. a black dog drinking water from a cup\n\nB. a cat playing with a toy on the floor\n\nC. a black cat getting some water out of a bowl\n\nD. a black cat sleeping in a basket,Which of the following captions best describes the image?\n\nA. A black dog playing with a ball in the yard.\n\nB. A white cat sitting on a sunny windowsill.\n\nC. A black cat getting some water out of a bowl.\n\nD. A kitten chasing a butterfly in a garden.,C,C,D,C,C,C,C,C,D,C,A,C


In [13]:
def calculate_multiple_choice_question_accuracy_with_augmentation(df):
    # Calculate accuracy

    # crop augmentation
    accuracy_easy_crop = (df["multiple_choice_solution"] == df["multiple_choice_prediction_easy_crop"]).mean()
    accuracy_medium_crop = (df["multiple_choice_solution"] == df["multiple_choice_prediction_medium_crop"]).mean()
    accuracy_hard_crop = (df["multiple_choice_solution"] == df["multiple_choice_prediction_hard_crop"]).mean()

    # saturation augmentation
    accuracy_easy_saturation = (df["multiple_choice_solution"] == df["multiple_choice_prediction_easy_saturation"]).mean()
    accuracy_medium_saturation = (df["multiple_choice_solution"] == df["multiple_choice_prediction_medium_saturation"]).mean()
    accuracy_hard_saturation = (df["multiple_choice_solution"] == df["multiple_choice_prediction_hard_saturation"]).mean()

    # noise augmentation
    accuracy_easy_noise = (df["multiple_choice_solution"] == df["multiple_choice_prediction_easy_noise"]).mean()
    accuracy_medium_noise = (df["multiple_choice_solution"] == df["multiple_choice_prediction_medium_noise"]).mean()
    accuracy_hard_noise = (df["multiple_choice_solution"] == df["multiple_choice_prediction_hard_noise"]).mean()


    print('***** Prediction Accuracy with Crop Augmentation *****')
    print(f"Prediction Accuracy Easy: {accuracy_easy_crop * 100:.2f}%") 
    print(f"Prediction Accuracy Medium: {accuracy_medium_crop * 100:.2f}%")
    print(f"Prediction Accuracy Hard: {accuracy_hard_crop * 100:.2f}%")

    print('***** Prediction Accuracy with Saturation Augmentation *****')
    print(f"Prediction Accuracy Easy: {accuracy_easy_saturation * 100:.2f}%")
    print(f"Prediction Accuracy Medium: {accuracy_medium_saturation * 100:.2f}%")
    print(f"Prediction Accuracy Hard: {accuracy_hard_saturation * 100:.2f}%")

    print('***** Prediction Accuracy with Noise Augmentation *****')
    print(f"Prediction Accuracy Easy: {accuracy_easy_noise * 100:.2f}%")
    print(f"Prediction Accuracy Medium: {accuracy_medium_noise * 100:.2f}%")
    print(f"Prediction Accuracy Hard: {accuracy_hard_noise * 100:.2f}%")


    return accuracy_easy_crop, accuracy_medium_crop, accuracy_hard_crop, accuracy_easy_saturation, accuracy_medium_saturation, accuracy_hard_saturation, accuracy_easy_noise, accuracy_medium_noise, accuracy_hard_noise

In [14]:
calculate_multiple_choice_question_accuracy_with_augmentation(df)

***** Prediction Accuracy with Crop Augmentation *****
Prediction Accuracy Easy: 98.00%
Prediction Accuracy Medium: 90.00%
Prediction Accuracy Hard: 90.00%
***** Prediction Accuracy with Saturation Augmentation *****
Prediction Accuracy Easy: 98.00%
Prediction Accuracy Medium: 89.00%
Prediction Accuracy Hard: 83.00%
***** Prediction Accuracy with Noise Augmentation *****
Prediction Accuracy Easy: 81.00%
Prediction Accuracy Medium: 56.00%
Prediction Accuracy Hard: 58.00%


(0.98, 0.9, 0.9, 0.98, 0.89, 0.83, 0.81, 0.56, 0.58)

### Use Phi

In [16]:
# Load question dataframe
import pandas as pd
pd.set_option('display.max_colwidth', None)
import json

# Data Directory: 
upking_annotation_file_path = '/shared/data/upking/upking_annotation_with_MCQ.json'

df = pd.read_json(upking_annotation_file_path)

df.head(1)

Unnamed: 0,id,file_name,img_url,reference_caption,multiple_choice_solution,multiple_choice_question_hard,multiple_choice_question_medium,multiple_choice_question_easy
0,140360,COCO_train2014_000000140360.jpg,http://images.cocodataset.org/train2014/COCO_train2014_000000140360.jpg,"[a black cat getting some water out of a bowl , A black cat drinking water from a bowl and his food bowl is next to it. , A cat that is standing near a bowl of water., A cat drinking water out of a water bowl., A cat standing by a bowl of water and food ]",C,**Which of the following captions best describes the image?**\n\nA) A black cat playing with a ball on the floor.\n\nB) A black cat sleeping next to a bowl of water.\n\nC) A black cat getting some water out of a bowl.\n\nD) A black cat sitting near a bowl of food.,Question: Which of the following captions best describes the image?\n\nA. a black dog drinking water from a cup\n\nB. a cat playing with a toy on the floor\n\nC. a black cat getting some water out of a bowl\n\nD. a black cat sleeping in a basket,Which of the following captions best describes the image?\n\nA. A black dog playing with a ball in the yard.\n\nB. A white cat sitting on a sunny windowsill.\n\nC. A black cat getting some water out of a bowl.\n\nD. A kitten chasing a butterfly in a garden.


In [17]:
from PIL import Image 
import requests 
from transformers import AutoModelForCausalLM 
from transformers import AutoProcessor 

model_id = "/shared/model/Phi-3.5-vision-instruct" 

# Note: set _attn_implementation='eager' if you don't have flash_attn installed
model = AutoModelForCausalLM.from_pretrained(
  model_id, 
  device_map="cuda", 
  trust_remote_code=True, 
  torch_dtype="auto", 
  _attn_implementation='eager'    
)

# for best performance, use num_crops=4 for multi-frame, num_crops=16 for single-frame.
processor = AutoProcessor.from_pretrained(model_id, 
  trust_remote_code=True, 
  num_crops=4
) 

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



#### Without image augmentation

In [5]:
# helper function: 
def perform_multiple_choice_task_Phi(img_url, question):
    image = Image.open(img_url)

    images = []
    images.append(image)

    messages = [
        {"role": "user", "content": "<|image_1|>\n" + question + "\nOnly return the correct choice with a single letter."},
    ]

    prompt = processor.tokenizer.apply_chat_template(
        messages, 
        tokenize=False, 
        add_generation_prompt=True
    )

    inputs = processor(prompt, images, return_tensors="pt").to("cuda:0") 

    generation_args = { 
        "max_new_tokens": 10, 
        "temperature": 0.0, 
        "do_sample": False, 
    } 

    generate_ids = model.generate(**inputs, 
    eos_token_id=processor.tokenizer.eos_token_id, 
    **generation_args
    )

    # remove input tokens 
    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
    response = processor.batch_decode(generate_ids, 
    skip_special_tokens=True, 
    clean_up_tokenization_spaces=False)[0] 

    return response


In [6]:
df['multiple_choice_prediction_easy_Phi'] = df.apply(lambda x: perform_multiple_choice_task_Phi(x['img_url'], x['multiple_choice_question_easy']), axis=1)

The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.
`get_max_cache()` is deprecated for all Cache classes. Use `get_max_cache_shape()` instead. Calling `get_max_cache()` will raise error from v4.48


In [None]:
df['multiple_choice_prediction_medium_Phi'] = df.apply(lambda x: perform_multiple_choice_task_Phi(x['img_url'], x['multiple_choice_question_medium']), axis=1)

In [None]:
df['multiple_choice_prediction_hard_Phi'] = df.apply(lambda x: perform_multiple_choice_task_Phi(x['img_url'], x['multiple_choice_question_hard']), axis=1)

In [None]:
# # Save the prediction results to output file
# # Convert DataFrame to a list of dictionaries
# list_of_dicts = df.to_dict(orient="records")

# # Save the list of dictionaries to a JSON file
# output_file = "llava_prediction_result_food_image.json"
# with open(output_file, "w") as file:
#     json.dump(list_of_dicts, file, indent=4)

# print(f"DataFrame saved as a list of dictionaries in {output_file}")

In [7]:
# Evaluate the performance of the model
def calculate_multiple_choice_question_accuracy_Phi(df):
    # Calculate accuracy
    accuracy_easy = (df["multiple_choice_solution"] == df["multiple_choice_prediction_easy_Phi"]).mean()
    accuracy_medium = (df["multiple_choice_solution"] == df["multiple_choice_prediction_medium_Phi"]).mean()
    accuracy_hard = (df["multiple_choice_solution"] == df["multiple_choice_prediction_hard_Phi"]).mean()

    print(f"Prediction Accuracy Easy: {accuracy_easy * 100:.2f}%") 
    print(f"Prediction Accuracy Medium: {accuracy_medium * 100:.2f}%") 
    print(f"Prediction Accuracy Hard: {accuracy_hard * 100:.2f}%") 
    return accuracy_easy, accuracy_medium, accuracy_hard

In [8]:
calculate_multiple_choice_question_accuracy_Phi(df)

Prediction Accuracy: 87.00%


0.87

#### With Image Augmentation

In [None]:
# This function will apply the augmentation to the image
# sin_aug: single augmentation, includes flip, rotate, crop, saturation, artStyle, noise, blur
# mul_aug: multiple augmentations in a list
# Return a dictionary. Key: augmentation name; Value: augmented image
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import os
import numpy as np
import cv2
from PIL import Image, ImageEnhance
import random

def image_augumentation(ori_img, sin_aug = None, mul_aug = None):
    img = ori_img.copy()
    aug_img = {}
    if sin_aug:
        if sin_aug == 'flip':
            aug_img['flip'] = flip_image(img)
        elif sin_aug == 'rotate':
            aug_img['rotate'] = rotate_image(img)
        elif sin_aug == 'crop':
            aug_img['crop'] = random_crop(img)
        elif sin_aug == 'saturation':
            aug_img['saturation'] = adjust_saturation(img)
        elif sin_aug == 'artStyle':
            aug_img['artStyle'] = convert_to_artStyle(img)
        elif sin_aug == 'noise':
            aug_img['noise'] = add_noise(img)
        elif sin_aug == 'blur':
            aug_img['blur'] = blur_image(img)
        else:
            aug_img['original'] = img

    elif mul_aug:
        for aug in mul_aug:
            if aug == 'flip':
                aug_img['flip'] = flip_image(img)
            elif aug == 'rotate':
                aug_img['rotate'] = rotate_image(img)
            elif aug == 'crop':
                aug_img['crop'] = random_crop(img)
            elif aug == 'saturation':
                aug_img['saturation'] = adjust_saturation(img)
            elif aug == 'artStyle':
                aug_img['artStyle'] = convert_to_artStyle(img)
            elif aug == 'noise':
                aug_img['noise'] = add_noise(img)
            elif aug == 'blur':
                aug_img['blur'] = blur_image(img)
            else:
                aug_img['original'] = img
    else:
        aug_img['original'] = img
    return aug_img

def flip_image(image):
    img = np.flip(image, axis=1)
    return img

def rotate_image(image, angle_range=(-30, 30)):
    angle = random.uniform(*angle_range)
    h, w = image.shape[:2]
    center = (w // 2, h // 2)
    rotation_matrix = cv2.getRotationMatrix2D(center, angle, scale=1.0)
    rotated_image = cv2.warpAffine(image, rotation_matrix, (w, h), flags=cv2.INTER_LINEAR)
    return rotated_image

def random_crop(image, percent=0.7):
    h, w = image.shape[:2]
    # crop_h, crop_w = crop_size
    crop_h = round(percent * h)
    crop_w = round(percent * w)

    top = random.randint(0, h - crop_h)
    left = random.randint(0, w - crop_w)
    cropped_image = image[top:top + crop_h, left:left + crop_w]
    return cropped_image

def adjust_saturation(image, factor=5):

    image_pil = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    enhancer = ImageEnhance.Color(image_pil)
    saturated_image = enhancer.enhance(factor)
    return cv2.cvtColor(np.array(saturated_image), cv2.COLOR_RGB2BGR)

def convert_to_artStyle(image):
    # Convert the image to grayscale
    gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Apply a binary threshold
    art_image = cv2.adaptiveThreshold(gray_image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                    cv2.THRESH_BINARY, 11, 2)
    return art_image

def add_noise(image, mean=0, stddev=25):
    noise = np.random.normal(mean, stddev, image.shape).astype(np.uint8)
    noisy_image = cv2.add(image, noise)
    return noisy_image

def blur_image(image, kernel_size=(5, 5)):
    return cv2.GaussianBlur(image, kernel_size, 0)

In [7]:
def perform_multiple_choice_task_with_image_augmentation_phi(img_url, question, aug_type=None):
    image = Image.open(img_url)

    # Convert the PIL Image to a NumPy array 
    image = np.array(image)

    # Apply image augmentation
    if (aug_type is not None):
        image = image_augumentation(image, sin_aug = aug_type)[aug_type]  
    
    image = Image.fromarray(image)

    images = []
    images.append(image)

    messages = [
        {"role": "user", "content": "<|image_1|>\n" + question + "\nOnly return the correct choice with a single letter."},
    ]

    prompt = processor.tokenizer.apply_chat_template(
        messages, 
        tokenize=False, 
        add_generation_prompt=True
    )

    inputs = processor(prompt, images, return_tensors="pt").to("cuda:0") 

    generation_args = { 
        "max_new_tokens": 10, 
        "temperature": 0.0, 
        "do_sample": False, 
    } 

    generate_ids = model.generate(**inputs, 
    eos_token_id=processor.tokenizer.eos_token_id, 
    **generation_args
    )

    # remove input tokens 
    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
    response = processor.batch_decode(generate_ids, 
    skip_special_tokens=True, 
    clean_up_tokenization_spaces=False)[0] 

    return response


##### Cropping

In [8]:
# apply cropping augmentation
df['multiple_choice_prediction_easy_crop_phi'] = df.apply(lambda x: perform_multiple_choice_task_with_image_augmentation_phi(x['img_url'], x['multiple_choice_question_easy'], 'crop'), axis=1)
df['multiple_choice_prediction_medium_crop_phi'] = df.apply(lambda x: perform_multiple_choice_task_with_image_augmentation_phi(x['img_url'], x['multiple_choice_question_medium'], 'crop'), axis=1)
df['multiple_choice_prediction_hard_crop_phi'] = df.apply(lambda x: perform_multiple_choice_task_with_image_augmentation_phi(x['img_url'], x['multiple_choice_question_hard'], 'crop'), axis=1)

The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.
`get_max_cache()` is deprecated for all Cache classes. Use `get_max_cache_shape()` instead. Calling `get_max_cache()` will raise error from v4.48


##### Saturation

In [9]:
# apply saturation augmentation
df['multiple_choice_prediction_easy_saturation_phi'] = df.apply(lambda x: perform_multiple_choice_task_with_image_augmentation_phi(x['img_url'], x['multiple_choice_question_easy'], 'saturation'), axis=1)
df['multiple_choice_prediction_medium_saturation_phi'] = df.apply(lambda x: perform_multiple_choice_task_with_image_augmentation_phi(x['img_url'], x['multiple_choice_question_medium'], 'saturation'), axis=1)
df['multiple_choice_prediction_hard_saturation_phi'] = df.apply(lambda x: perform_multiple_choice_task_with_image_augmentation_phi(x['img_url'], x['multiple_choice_question_hard'], 'saturation'), axis=1)




##### Noise

In [10]:
# apply noise augmentation
df['multiple_choice_prediction_easy_noise_phi'] = df.apply(lambda x: perform_multiple_choice_task_with_image_augmentation_phi(x['img_url'], x['multiple_choice_question_easy'], 'noise'), axis=1)
df['multiple_choice_prediction_medium_noise_phi'] = df.apply(lambda x: perform_multiple_choice_task_with_image_augmentation_phi(x['img_url'], x['multiple_choice_question_medium'], 'noise'), axis=1)
df['multiple_choice_prediction_hard_noise_phi'] = df.apply(lambda x: perform_multiple_choice_task_with_image_augmentation_phi(x['img_url'], x['multiple_choice_question_hard'], 'noise'), axis=1)



##### Save the result

In [None]:
# Save the MCQ result

# Convert DataFrame to a list of dictionaries
list_of_dicts = df.to_dict(orient="records")

# Save the list of dictionaries to a JSON file
output_file = "/shared/data/upking/upking_annotation_with_MCQ_result_3_difficulties_with_image_augmentation_phi.json"

with open(output_file, "w") as file:
    json.dump(list_of_dicts, file, indent=4)

print(f"DataFrame saved as a list of dictionaries in {output_file}")

DataFrame saved as a list of dictionaries in /shared/data/food_data/food_annotation_with_MCQ_result_3_difficulties_with_image_augmentation_phi.json


##### Evalutation

In [None]:
# Load annotation with multiple choice question result data file
import pandas as pd
pd.set_option('display.max_colwidth', None)
import json


# Data Directory: 
upking_annotation_file_path = '/shared/data/upking/upking_annotation_with_MCQ_result_3_difficulties_with_image_augmentation_phi.json'


# Method 1: Using pandas.read_json directly
df = pd.read_json(upking_annotation_file_path)

df.head(1)

In [12]:
def calculate_multiple_choice_question_accuracy_with_augmentation(df):
    # Calculate accuracy

    # crop augmentation
    accuracy_easy_crop = (df["multiple_choice_solution"] == df["multiple_choice_prediction_easy_crop_phi"]).mean()
    accuracy_medium_crop = (df["multiple_choice_solution"] == df["multiple_choice_prediction_medium_crop_phi"]).mean()
    accuracy_hard_crop = (df["multiple_choice_solution"] == df["multiple_choice_prediction_hard_crop_phi"]).mean()

    # saturation augmentation
    accuracy_easy_saturation = (df["multiple_choice_solution"] == df["multiple_choice_prediction_easy_saturation_phi"]).mean()
    accuracy_medium_saturation = (df["multiple_choice_solution"] == df["multiple_choice_prediction_medium_saturation_phi"]).mean()
    accuracy_hard_saturation = (df["multiple_choice_solution"] == df["multiple_choice_prediction_hard_saturation_phi"]).mean()

    # noise augmentation
    accuracy_easy_noise = (df["multiple_choice_solution"] == df["multiple_choice_prediction_easy_noise_phi"]).mean()
    accuracy_medium_noise = (df["multiple_choice_solution"] == df["multiple_choice_prediction_medium_noise_phi"]).mean()
    accuracy_hard_noise = (df["multiple_choice_solution"] == df["multiple_choice_prediction_hard_noise_phi"]).mean()


    print('***** Prediction Accuracy with Crop Augmentation *****')
    print(f"Prediction Accuracy Easy: {accuracy_easy_crop * 100:.2f}%") 
    print(f"Prediction Accuracy Medium: {accuracy_medium_crop * 100:.2f}%")
    print(f"Prediction Accuracy Hard: {accuracy_hard_crop * 100:.2f}%")
    print('\n')

    print('***** Prediction Accuracy with Saturation Augmentation *****')
    print(f"Prediction Accuracy Easy: {accuracy_easy_saturation * 100:.2f}%")
    print(f"Prediction Accuracy Medium: {accuracy_medium_saturation * 100:.2f}%")
    print(f"Prediction Accuracy Hard: {accuracy_hard_saturation * 100:.2f}%")
    print('\n')

    print('***** Prediction Accuracy with Noise Augmentation *****')
    print(f"Prediction Accuracy Easy: {accuracy_easy_noise * 100:.2f}%")
    print(f"Prediction Accuracy Medium: {accuracy_medium_noise * 100:.2f}%")
    print(f"Prediction Accuracy Hard: {accuracy_hard_noise * 100:.2f}%")
    print('\n')


    return accuracy_easy_crop, accuracy_medium_crop, accuracy_hard_crop, accuracy_easy_saturation, accuracy_medium_saturation, accuracy_hard_saturation, accuracy_easy_noise, accuracy_medium_noise, accuracy_hard_noise

In [13]:
calculate_multiple_choice_question_accuracy_with_augmentation(df)

***** Prediction Accuracy with Crop Augmentation *****
Prediction Accuracy Easy: 97.00%
Prediction Accuracy Medium: 89.00%
Prediction Accuracy Hard: 84.00%


***** Prediction Accuracy with Saturation Augmentation *****
Prediction Accuracy Easy: 93.00%
Prediction Accuracy Medium: 81.00%
Prediction Accuracy Hard: 76.00%


***** Prediction Accuracy with Noise Augmentation *****
Prediction Accuracy Easy: 86.00%
Prediction Accuracy Medium: 80.00%
Prediction Accuracy Hard: 66.00%




(0.97, 0.89, 0.84, 0.93, 0.81, 0.76, 0.86, 0.8, 0.66)