In [1]:
def official_gpt4eval_baseline_prompt():
    """Return the (system, user) prompt pair for the GPT-4 eval baseline.

    The user message is a template: it contains the literal placeholder
    ``{img_caption}``, which this function does not fill in.

    Returns:
        tuple[str, str]: an empty system message, followed by the
        user-message template asking for a 1-100 score in JSON form.
    """
    # Task description, split per sentence; each piece keeps its trailing space.
    task_description = (
        "Your task is to evaluate whether a given text caption accurately represents the main content and objects of an associated image. "
        "While the caption need not describe every detail of the image, it should convey the overall theme or subject. "
        "After your evaluation, rate the quality of the text caption’s match to the image on a scale of 1-100, with 100 being a perfect match. "
    )
    # Slot for the caption under evaluation (left unformatted here).
    caption_slot = "Caption: '{img_caption}' \n"
    # Required shape of the model's answer.
    answer_format = "Provide your evaluation in JSON format, including keys for 'score' and 'reasoning'."
    return "", "".join((task_description, caption_slot, answer_format))

import os

from evaluation import GPT4V_Winoground_Evals as evals

# Arguments
master_folder_path = "./"
save_folder_name= "logs/eqben-mini/official_gpt4eval_baseline-log-100-140/"            # Log folder path (change me)
system_msg, user_msg = official_gpt4eval_baseline_prompt()      # Defines prompt (change me)

# Never commit credentials to a notebook: the key is taken from the environment.
# Any key that was ever saved in this cell should be treated as compromised and rotated.
openai_api_key = os.environ.get("OPENAI_API_KEY")
if not openai_api_key:
    raise RuntimeError("Set the OPENAI_API_KEY environment variable before running this cell.")

# Sample ids to evaluate (change me).
# NOTE(review): id_list is not passed to the evaluator in this cell; presumably it is
# consumed by a later call -- confirm.
id_list = list(range(100, 140))

# Define evaluator & Run evaluation
evaluator = evals(master_folder_path, save_folder_name, openai_api_key, dataset_name="eqben-mini",
                  post_processing_fn=None, # Post processing function to apply to the generated text. Default: None
                  system_prompt=system_msg, user_prompt=user_msg,
                  api_max_retries=2, # Number of times to retry API call before giving up
                  )


In [2]:
import base64
import random
import os
import time
import winoground
import numpy as np
import traceback
import ast
from io import BytesIO


# Read the file path and process the file
def process_results(name: str, file_path: str, return_entirety=False, num_samples=None):
    """Parse an evaluation log into sample ids and a per-sample score matrix.

    The log is read two physical lines at a time; each pair is joined into a
    single Python dict literal that must contain the keys ``'id'`` and
    ``'scores'``. A ``'scores'`` value written in numpy ``repr`` form
    (``array([[1., 0.], ...])``) is rewritten into a plain nested list before
    parsing. A record whose id was already seen is reported and skipped, so
    the first occurrence wins. An unpaired trailing line is ignored.

    Args:
        name: Unused; kept so existing callers keep working.
        file_path: Path of the log file to read.
        return_entirety: When true, also return every parsed record keyed by id.
        num_samples: Number of rows of the score matrix. When ``None`` the size
            is taken from ``len(evaluator.dataset)``, i.e. the notebook-global
            ``evaluator`` must already exist.

    Returns:
        ``(sorted_ids, score_matrix)``, or
        ``(sorted_ids, score_matrix, all_data)`` when ``return_entirety`` is
        true. ``score_matrix`` has shape ``(num_samples, 2, 2)``; rows of ids
        absent from the log stay zero.

    Raises:
        ValueError / SyntaxError: from ``ast.literal_eval`` on a malformed record.
        IndexError: if a record's id does not fit in the score matrix.
    """
    if num_samples is None:
        # Backward-compatible default: size the matrix from the global evaluator.
        num_samples = len(evaluator.dataset)

    score_matrix = np.zeros((num_samples, 2, 2))
    seen_ids = set()
    all_data = {}

    with open(file_path, 'r') as log_file:
        # zip() over the same file iterator yields consecutive line pairs and
        # stops as soon as either line of a pair is missing (end of file).
        for first_line, second_line in zip(log_file, log_file):
            # Turn the numpy repr into a literal: drop "array(" and its closing
            # ")", then strip the trailing dots of floats such as "1." -> "1".
            # NOTE(review): these replacements are purely textual, so any other
            # field containing "])", ".," or ".]" is altered as well.
            record_text = (
                (first_line.strip() + second_line.strip())
                .replace("array(", "")
                .replace("])", "]")
                .replace(".,", ",")
                .replace(".]", "]")
            )
            data = ast.literal_eval(record_text)

            sample_id = data['id']
            if sample_id in seen_ids:
                print(f"Dup found! ID {sample_id}. skipping...")
                continue
            seen_ids.add(sample_id)

            if return_entirety:
                all_data[sample_id] = data

            score_matrix[sample_id] = data['scores']

    sorted_ids = sorted(seen_ids)
    if return_entirety:
        return sorted_ids, score_matrix, all_data
    return sorted_ids, score_matrix

In [3]:
# Parse the baseline run's evaluation log, then show how many distinct ids it
# contained and the 2x2 score block stored for sample 60.
log_file_path = './logs/eqben-mini/official_gpt4eval_baseline-log/res_evaluations_log.txt'
ids, score_matrix = process_results(name="eqben mini gpt4v", file_path=log_file_path)
print(len(ids), score_matrix[60], sep="\n")

139
[[0. 0.]
 [0. 0.]]


In [4]:
# Score the parsed matrix with the dataset's own evaluator. This is the cell's
# last expression, so its return value is displayed in addition to whatever
# the call prints.
evaluator.dataset.evaluate_scores(score_matrix)

EQBen_Mini performance (overall)
Dataset                                                                Text       Image      Group     
EQBen_Mini                                                             42.86      40.00      35.00     
EQBen_Mini eqbensd                                                     65.00      75.00      65.00     
EQBen_Mini eqbenk                                                      30.00      23.33      21.67     
EQBen_Mini eqbeng                                                      30.00      30.00      25.00     
EQBen_Mini eqbenag                                                     30.00      25.00      15.00     
EQBen_Mini eqbeny                                                      85.00      80.00      75.00     


({'all': {'text': 42.8571, 'image': 40.0, 'group': 35.0},
  'eqbensd': {'text': 65.0, 'image': 75.0, 'group': 65.0},
  'eqbenk': {'text': 30.0, 'image': 23.3333, 'group': 21.6667},
  'eqbeng': {'text': 30.0, 'image': 30.0, 'group': 25.0},
  'eqbenag': {'text': 30.0, 'image': 25.0, 'group': 15.0},
  'eqbeny': {'text': 85.0, 'image': 80.0, 'group': 75.0}},
 35.0)