In [40]:
import json
import os
import time
from copy import deepcopy
from pathlib import Path

import numpy as np
import openai
import requests
import yaml
from loguru import logger as eval_logger
from openai import OpenAI

# Pause (in seconds) between retries of a failed judge request.
NUM_SECONDS_TO_SLEEP = 5

# Suffixes appended to the prompt depending on the question type.
MULTIPLE_CHOICE_PROMPT = "\nAnswer with the option's letter from the given choices directly."
OPEN_ENDED_PROMPT = ""


In [143]:
from datasets import load_dataset

# Location of the AGRIVQA parquet shards and the task to inspect.
task_name = 'PlantNet_Multi_Identification'
folder = '/workdir/important_datasets/AGRIVQA/'
dataset_name = "parquet"

# One parquet shard per split; the "test" shard is deliberately left out:
#   "test": folder+task_name+"/test-00000-of-00001.parquet"
data_files = {
    "dev": f"{folder}{task_name}/dev-00000-of-00001.parquet",
    "validation": f"{folder}{task_name}/validation-00000-of-00001.parquet",
}
split = "dev"

dataset = load_dataset(dataset_name, data_files=data_files, split=split)

Generating dev split: 16 examples [00:00, 760.62 examples/s]
Generating validation split: 4067 examples [00:02, 1649.43 examples/s]


In [190]:
from datasets import load_dataset

task_name = 'PlantNet_Multi_Identification'
folder = '/workdir/important_datasets/AGRIVQA/'
dataset_name = "parquet"

# Build the shard path for every split in use ("test" is intentionally not loaded).
data_files = {
    split_name: f"{folder}{task_name}/{split_name}-00000-of-00001.parquet"
    for split_name in ("dev", "validation")
}
split = "dev"

dataset = load_dataset(dataset_name, data_files=data_files, split=split)

#### Here, the .yaml file is read and converted into a dictionary named config.

In [191]:
# Emulate the module path that utils.py would have inside lmms-eval, so that
# sibling files (CABBAGE.yaml, rule.json) can be located relative to it.
__file__ = "/workdir/lmms-eval/lmms_eval/tasks/CABBAGE/CABBAGE.yaml"

#rule_dict = json.load(open(os.path.join(os.path.dirname(os.path.abspath(__file__)), "rule.json"), "r"))

# Read the task YAML; the file is closed as soon as its lines are in memory.
with open(Path(__file__).parent / "CABBAGE.yaml", "r") as f:
    raw_data = f.readlines()

# Remove function definitions ("!function" tags) since yaml.safe_load cannot handle them.
safe_data = [line for line in raw_data if "!function" not in line]

config = yaml.safe_load("".join(safe_data))

In [192]:
config['task'][0]

'CABBAGE_AgriExam'

In [193]:
config

{'group': 'CABBAGE',
 'task': ['CABBAGE_AgriExam', 'CABBAGE_Agri500P', 'CABBAGE_wikiHow'],
 'metadata': {'version': 0.0}}

#### Later


#### Here, as with all other tasks, the visuals and text are prepared for the models.

In [194]:
def replace_images_tokens(input_string):
    """Collapse numbered image placeholders into the generic ``<image>`` token.

    Every occurrence of ``<image 1>`` through ``<image 7>`` in ``input_string``
    is replaced by ``<image>``; any other text is returned untouched.
    """
    generic_token = "<image>"
    text = input_string
    for index in range(1, 8):
        # str.replace is a no-op when the numbered token is absent.
        text = text.replace(f"<image {index}>", generic_token)
    return text

def parse_options(options):
    """Format answer options as lettered lines: ``A. ...``, ``B. ...``, one per line."""
    lettered_lines = []
    for position, option_text in enumerate(options):
        letter = chr(ord("A") + position)
        lettered_lines.append(f"{letter}. {option_text}")
    return "\n".join(lettered_lines)

def CABBAGE_doc_to_text(doc, lmms_eval_specific_kwargs=None):
    """Build the text prompt for one dataset record.

    Args:
        doc: dataset record; reads ``question`` and ``question_type`` and,
            when present, ``context`` and ``options``.
        lmms_eval_specific_kwargs: accepted for interface compatibility;
            not used when building the prompt.

    Returns:
        The prompt string: an optional ``Context:`` line, the question with
        numbered image tokens normalised to ``<image>``, the lettered options
        (if any) and the suffix matching the question type.
    """
    question = replace_images_tokens(doc['question'])

    # Suffix depends on the question type; unknown types get no suffix.
    question_type = doc['question_type']
    if question_type == 'multiple-choice':
        post_prompt = MULTIPLE_CHOICE_PROMPT
    elif question_type == 'open-ended':
        post_prompt = OPEN_ENDED_PROMPT
    else:
        post_prompt = ''

    # Read the value first instead of nesting the same quote type inside the
    # f-string, which is a SyntaxError on Python versions before 3.12.
    context = doc.get('context')
    pre_prompt = f"Context: {context}\n" if context else ''

    if doc.get('options'):
        options = parse_options(doc['options'])
        return f"{pre_prompt}Question: {question}\n\nOptions:\n{options}\n{post_prompt}"
    return f"{pre_prompt}Question: {question}{post_prompt}"

In [195]:
dataset[0]

{'id': 'dev_PlantNet_Multi_Identification_1',
 'question': 'What is the scientific name of the plant shown in <image 1>?',
 'options': [],
 'explanation': '',
 'image_1': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=675x900>,
 'image_2': None,
 'image_3': None,
 'image_4': None,
 'image_5': None,
 'img_type': ['Picture'],
 'answer': 'Borago officinalis',
 'topic_difficulty': 5,
 'question_type': 'open-ended',
 'subfield': 'scientific_name_5',
 'metadata': {'author': 'PlantNet',
  'eppo_code': ['BOROF'],
  'event_date': ['2021-04-29T14:34:32Z'],
  'gbif_id': [3949249620],
  'gbif_key': [2926110],
  'kingdom': 'Plantae',
  'language': 'English',
  'license': 'CC BY-SA 4.0',
  'region': ['Europe'],
  'source': 'PlantNet',
  'tag': ['habit'],
  'url': ['https://bs.plantnet.org/image/o/06437fe7f3d1e038e4179e2b8ca30f878c97e317']}}

In [196]:
# Preview the prompt built for the first record of the loaded split,
# calling CABBAGE_doc_to_text without any task-specific kwargs.
doc = dataset[0]
print(CABBAGE_doc_to_text(doc)) #config['task'][0]['lmms_eval_specific_kwargs']['default']

Question: What is the scientific name of the plant shown in <image>?


In [197]:
def CABBAGE_doc_to_visual(doc):
    """Collect the record's images as a list of RGB images.

    Looks at the keys ``image_1`` .. ``image_5`` in order and, for each one
    holding a truthy value, appends ``value.convert("RGB")`` to the result.
    """
    image_keys = [f'image_{slot}' for slot in range(1, 6)]
    return [doc[key].convert("RGB") for key in image_keys if doc.get(key)]

In [199]:
doc

{'id': 'dev_PlantNet_Multi_Identification_1',
 'question': 'What is the scientific name of the plant shown in <image 1>?',
 'options': [],
 'explanation': '',
 'image_1': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=675x900>,
 'image_2': None,
 'image_3': None,
 'image_4': None,
 'image_5': None,
 'img_type': ['Picture'],
 'answer': 'Borago officinalis',
 'topic_difficulty': 5,
 'question_type': 'open-ended',
 'subfield': 'scientific_name_5',
 'metadata': {'author': 'PlantNet',
  'eppo_code': ['BOROF'],
  'event_date': ['2021-04-29T14:34:32Z'],
  'gbif_id': [3949249620],
  'gbif_key': [2926110],
  'kingdom': 'Plantae',
  'language': 'English',
  'license': 'CC BY-SA 4.0',
  'region': ['Europe'],
  'source': 'PlantNet',
  'tag': ['habit'],
  'url': ['https://bs.plantnet.org/image/o/06437fe7f3d1e038e4179e2b8ca30f878c97e317']}}

In [198]:
CABBAGE_doc_to_visual(doc)

[<PIL.Image.Image image mode=RGB size=675x900>]

In [71]:
def CABBAGE_process_results_exact_match(doc, results):
    """Score a prediction by case-insensitive exact match against ``doc['answer']``.

    Args:
        doc: dataset record; ``doc['answer']`` is the reference answer.
        results: list whose first element is the model prediction.

    Returns:
        ``{"exact_match": 1.0}`` when the prediction equals the answer
        (ignoring case and surrounding whitespace), or when the prediction
        looks like ``"X. some text"`` and the leading letter ``X`` equals the
        answer; ``{"exact_match": 0.0}`` otherwise.
    """
    # I know this is weird, but it's how llava parse it.
    target = doc['answer'].strip().lower()
    # Keep the original casing: the option-letter check below needs it.
    # Lower-casing here made `pred[0].isupper()` always False, so answers
    # such as "B. Zea mays" could never match.
    pred = results[0].strip()
    if pred.lower() == target:
        return {"exact_match": 1.0}
    # pattern: ^[A-Z]\. .*
    if len(pred) >= 2 and pred[0].isupper() and pred[1] == ".":
        result = 1.0 if pred[0].lower() == target else 0.0
        return {"exact_match": result}
    return {"exact_match": 0.0}


In [None]:
# Judge model used for the GPT-based review
# (alternative source: config["metadata"]["gpt_eval_model_name"]).
GPT_EVAL_MODEL_NAME = "gpt-4o-mini"

# Backend selector, pinned to OpenAI in this notebook
# (alternative source: os.getenv("API_TYPE", "openai")).
API_TYPE = 'openai'

# Endpoint, key and request headers for the selected backend; values come from
# the environment with placeholder fallbacks.
if API_TYPE == "azure":
    API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken")
    API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY")
    headers = {"api-key": API_KEY, "Content-Type": "application/json"}
elif API_TYPE == "openai":
    API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
    API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY")
    headers = {"Authorization": f"Bearer {API_KEY}", "Content-Type": "application/json"}

In [200]:
dataset[0]

{'id': 'dev_PlantNet_Multi_Identification_1',
 'question': 'What is the scientific name of the plant shown in <image 1>?',
 'options': [],
 'explanation': '',
 'image_1': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=675x900>,
 'image_2': None,
 'image_3': None,
 'image_4': None,
 'image_5': None,
 'img_type': ['Picture'],
 'answer': 'Borago officinalis',
 'topic_difficulty': 5,
 'question_type': 'open-ended',
 'subfield': 'scientific_name_5',
 'metadata': {'author': 'PlantNet',
  'eppo_code': ['BOROF'],
  'event_date': ['2021-04-29T14:34:32Z'],
  'gbif_id': [3949249620],
  'gbif_key': [2926110],
  'kingdom': 'Plantae',
  'language': 'English',
  'license': 'CC BY-SA 4.0',
  'region': ['Europe'],
  'source': 'PlantNet',
  'tag': ['habit'],
  'url': ['https://bs.plantnet.org/image/o/06437fe7f3d1e038e4179e2b8ca30f878c97e317']}}

In [None]:
# Load the judge settings (read below via the "role" and "prompt" keys) from
# rule.json next to __file__; the context manager closes the file handle,
# which the previous bare open() call leaked.
with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), "rule.json"), "r") as rule_file:
    rule_dict = json.load(rule_file)

def get_eval(content: str, max_tokens: int, retries: int = 5):
    """Send ``content`` to the judge model and return its review.

    Args:
        content: text sent as the user message.
        max_tokens: forwarded as ``max_tokens`` in the request payload.
        retries: maximum number of POST attempts.

    Returns:
        ``(review, model_name)`` where ``review`` is the stripped message
        content and ``model_name`` is the ``"model"`` field of the response.
        ``("", "")`` is returned when every attempt raised, or when the judge
        answered with an empty message.
    """
    messages = [
        {
            "role": "system",
            "content": "You are a helpful and precise agronomy assistant for checking the quality of the answer.",
        },
        {"role": "user", "content": content},
    ]

    payload = {
        "model": GPT_EVAL_MODEL_NAME,
        "messages": messages,
        "temperature": 0.2,
        "max_tokens": max_tokens,
    }

    # The "model" field is not sent when the Azure backend is selected.
    if API_TYPE == "azure":
        payload.pop("model")

    for attempt in range(retries):
        try:
            response = requests.post(API_URL, headers=headers, json=payload, timeout=60)
            response.raise_for_status()
            response_data = response.json()

            # Separate name so the `content` argument is not shadowed.
            review = response_data["choices"][0]["message"]["content"].strip()
            if review != "":
                return review, response_data["model"]
            break  # The request went through but the reply was empty: give up without retrying

        except Exception as e:
            eval_logger.info(f"Attempt {attempt + 1} failed with error: {e}")
            # `attempt` runs from 0 to retries - 1, so the last attempt is
            # retries - 1. The previous `attempt < retries` test was always
            # true: it slept after the final failure and never logged the error.
            if attempt < retries - 1:  # If we have retries left, sleep and then continue to next attempt
                time.sleep(NUM_SECONDS_TO_SLEEP)
            else:  # If this was the last attempt, log and return empty
                eval_logger.error(f"All {retries} attempts failed. Last error message: {e}")
                return "", ""
    return "", ""


def parse_score(review):
    """Extract the numeric score from the first line of a judge review.

    Commas on the first line are turned into spaces before conversion.
    Returns the score as a float, or -1 when the line is not a valid number.
    """
    first_line, _, _ = review.partition("\n")
    candidate = first_line.replace(",", " ")
    try:
        parsed = float(candidate)
    except ValueError:
        eval_logger.debug(f"Score not parsed: {review}. Returning -1")
        return -1
    return parsed

def CABBAGE_process_results_gpt_eval(doc, results):
    """Score one prediction: exact match for multiple-choice, GPT judge otherwise.

    Args:
        doc: a instance of the eval dataset
        results: [pred]
    Returns:
        a dictionary with two entries:
          - 'CABBAGE_gpt_eval': review record with the keys question, ans1
            (reference), ans2 (prediction), difficulty, review, score,
            eval_model and content (the prompt sent to the judge);
          - 'exact_match': 1.0 or 0.0. For multiple-choice questions this is
            the exact-match result; otherwise it is 1.0 when the judge score
            is greater than 6.
    """
    question = doc.get("question", "")
    ans1 = doc.get("answer", "")
    ans2 = results[0] if results else ""

    if doc['question_type'] == 'multiple-choice':
        # Pass the normalised prediction so an empty `results` list scores 0
        # instead of raising IndexError inside the exact-match helper.
        exact_match_result = CABBAGE_process_results_exact_match(doc, [ans2])
        review_dict = {'CABBAGE_gpt_eval': {"question": question, "ans1": ans1, "ans2": ans2, "difficulty": doc.get('options_difficulty'), "review": '', "score": exact_match_result['exact_match'] * 10, "eval_model": '', "content": ''},
                       'exact_match': exact_match_result['exact_match']}
        return review_dict

    # Defined before the try block so the record below can always be built,
    # even when the failure happens before the prompt has been assembled.
    content = ""
    try:
        role1 = "Expert"
        role2 = rule_dict.get("role", "user")

        # TODO add the context label to the dataset (docs)
        #context = config['lmms_eval_specific_kwargs']['default']['context_prompt'].format(book_title=book_title, chapter_title=chapter_title)
        # Values are read into locals first: nesting the same quote type
        # inside an f-string is a SyntaxError before Python 3.12.
        context = ''
        category = doc.get('category')
        if category:
            context += f"Category: {category}\n"
        doc_context = doc.get('context')
        if doc_context:
            context += f"Context: {doc_context}\n"

        prompt = rule_dict.get("prompt", "")
        content = (
            f"[Context]\n{context}\n"
            f"[Question]\n{question}\n\n"
            f"[{role1}]\n{ans1}\n\n[End of {role1}]\n\n"
            f"[{role2}]\n{ans2}\n\n[End of {role2}]\n\n"
            f"[System]\n{prompt}\n\n"
        )
        review, model_name = get_eval(content, 1024)
        score = parse_score(review)
    except Exception as e:
        # Records shown in this notebook carry 'id'; 'question_id' is still
        # preferred when present.
        question_id = doc.get('question_id', doc.get('id', 'Unknown'))
        eval_logger.error(f"Error for Question ID: {question_id}: {e}")
        review = "Failed to Get a Proper Review."
        model_name = "Failed Request"
        score = -1

    # A judge score above 6 counts as a correct answer.
    value = 1.0 if score > 6 else 0.0

    review_dict = {'CABBAGE_gpt_eval': {"question": question, "ans1": ans1, "ans2": ans2, "difficulty": doc.get('options_difficulty'), "review": review, "score": score, "eval_model": model_name, "content": content},
                   'exact_match': value}

    return review_dict

In [135]:
# Smoke test: score the second record with the placeholder prediction 'Prova'
# so the returned review record (including the judge prompt) can be inspected.
result=CABBAGE_process_results_gpt_eval(dataset[1],['Prova'])

In [140]:
print(result['CABBAGE_gpt_eval']['content'])

[Context]
Category: Nutrition, Health Benefits, and Food Processing
Context: This question is sourced from the book titled 'Peanut', specifically found in the chapter 'Quality and Safety of Foods Derived from Peanut'.


[Question]
Can contaminated grains be used to feed animals?

[Expert]
No. The carcinogenic effects are also observed in birds, mammals, and fish. Therefore, it is necessary to be cautious when feeding livestock with peanut cake.
According to Ordinance No. 7, dated 9/11/1988, from the MAPA (BRASIL, 1988), animal feed ingredients must contain, at most, 50 ppb or 50 µg/kg of aflatoxins (B_1+B_2+G_1+G_2).

[End of Expert]

[Assistant]
Prova

[End of Assistant]

[System]
We would like to request your feedback on the performance of an AI assistant in response to the user question displayed above, which pertains to agronomic knowledge. Please compare the assistant's response to a provided expert response to assess alignment and accuracy.
Rate the assistant's performance based 

In [None]:
def gpt_eval_aggregation(results):
    """Aggregate per-sample review records into a single number.

    Records whose ``"score"`` is -1 (unparsed or failed reviews) are ignored.
    The mean of the remaining scores is rounded to three decimals and then
    multiplied by 10. Returns None if anything raises along the way.
    """
    try:
        valid_scores = [record["score"] for record in results if record["score"] != -1]
        mean_score = np.asarray(valid_scores).mean(0).tolist()
        return round(mean_score, 3) * 10
    except Exception as e:
        eval_logger.info(f"Error in aggregation: {e}")
        return None