In [None]:
import os
import multiprocessing as mp
from typing import Tuple

import openai
from openai import AzureOpenAI, RateLimitError
import pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv
from datasets import load_dataset

from prompts import TYPE_1, TYPE_2, TYPE_3, TYPE_4
from logger import logger

from langchain_openai import AzureChatOpenAI
from langchain_core.output_parsers import BaseOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain.schema.output_parser import StrOutputParser
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate, HumanMessagePromptTemplate, SystemMessagePromptTemplate
from langchain_openai import AzureChatOpenAI

import logging
logger = logging.getLogger('Korean_LLM_Benchmark_Logger')

class CustomStrOutputParser(StrOutputParser):
    def parse(self, text: str) -> str:
        response = text.strip().replace('"', "").replace("'", "")
        if response.startswith("A"):
            pred = "A"
        elif response.startswith("B"):
            pred = "B"
        elif response.startswith("C"):
            pred = "C"
        elif response.startswith("D"):
            pred = "D"
        elif response.startswith("E"):
            pred = "E"
        else:
            pred = ""  # Wrong answer

        return pred, response
    
# class CustomOutputParser(BaseOutputParser):
#     def parse(self, text: str):
#         cleaned_text = text.strip()
#         return {"question": cleaned_text}
    
def get_prompt(x) -> str:
    num_choices = len(x["choices"])
    if num_choices == 4:
        if x["paragraph"] != "":  # Use Type 1 Prompt
            return TYPE_1.format(
                CONTEXT=x["paragraph"],
                QUESTION=x["question"],
                A=x["choices"][0],
                B=x["choices"][1],
                C=x["choices"][2],
                D=x["choices"][3],
            )
        else:
            return TYPE_2.format(
                QUESTION=x["question"],
                A=x["choices"][0],
                B=x["choices"][1],
                C=x["choices"][2],
                D=x["choices"][3],
            )
    elif num_choices == 5:
        if x["paragraph"] != "":
            return TYPE_3.format(
                CONTEXT=x["paragraph"],
                QUESTION=x["question"],
                A=x["choices"][0],
                B=x["choices"][1],
                C=x["choices"][2],
                D=x["choices"][3],
                E=x["choices"][4],
            )
        else:
            return TYPE_4.format(
                QUESTION=x["question"],
                A=x["choices"][0],
                B=x["choices"][1],
                C=x["choices"][2],
                D=x["choices"][3],
                E=x["choices"][4],
            )
    else:
        raise ValueError(f"Invalid number of choices: {num_choices} (ID: {x['id']})")

def get_prompt_template():
    system_prompt = "You are an AI assistant who reads a given question and solves multiple choice questions."
    system_message_template = SystemMessagePromptTemplate.from_template(system_prompt)
    human_prompt = [
        {
            "type": "text",
            "text": "{question}"
        },
    ]
    human_message_template = HumanMessagePromptTemplate.from_template(human_prompt)

    prompt = ChatPromptTemplate.from_messages(
        [
            system_message_template,
            human_message_template
        ]
    )
    return prompt

def get_answer(x) -> str:
    # 왜 이렇게 .strip() 처리를 해주었는지는 README에 issue 파트 참고 부탁드립니다.
    answer_idx = [xx.strip() for xx in x["choices"]].index(x["answer"].strip())
    if answer_idx == -1:
        raise ValueError(f"Answer not found in choices: {x['answer']} (ID: {x['id']})")
    return chr(0x41 + answer_idx)  # answer_idx = 0 -> answer = "A"


def get_pred(x) -> Tuple[str, str]:

    try:
        response = (
            client.chat.completions.create(
                model=os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME"),
                messages=[{"role": "user", "content": get_prompt(x)}],
                temperature=0.0,
            )
            .choices[0]
            .message.content.strip()
            .replace('"', "")  # Remove double quotes
            .replace("'", "")  # Remove single quotes
        )
    except openai.BadRequestError as e:
        print(e)
        response = "BAD-REQUEST"

    if response.startswith("A"):
        pred = "A"
    elif response.startswith("B"):
        pred = "B"
    elif response.startswith("C"):
        pred = "C"
    elif response.startswith("D"):
        pred = "D"
    elif response.startswith("E"):
        pred = "E"
    else:
        pred = ""  # Wrong answer

    return pred, response


if __name__ == "__main__":
    #MODEL_VERSION = "gpt-4o-mini"
    MODEL_VERSION = os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME")

    client = AzureOpenAI(
        azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT"),
        api_key        = os.getenv("AZURE_OPENAI_API_KEY"),
        api_version    = os.getenv("AZURE_OPENAI_API_VERSION"),
        max_retries    = 3
    )


    llm = AzureChatOpenAI(
        temperature=0, 
        max_tokens=128,
        openai_api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
        azure_deployment=MODEL_VERSION,                   
    )

    click_ds = load_dataset("EunsuKim/CLIcK")["train"]

    IS_DEBUG = True
    if IS_DEBUG:
        click_ds = click_ds.select(range(5))

    batch_size = 4
    MAX_RETRIES = 3
    DELAY_INCREMENT = 30

    all_questions = [get_prompt(x) for x in tqdm(click_ds)]
    all_answers = []
    prompt_template = get_prompt_template()
    chain = prompt_template | llm | CustomStrOutputParser()

    with tqdm(total=len(all_questions), desc="Processing Answers") as pbar:
        for i in range(0, len(all_questions), batch_size):
            minibatch = all_questions[i:i+batch_size]

            retries = 0
            while retries <= MAX_RETRIES:
                try:
                    answers = chain.batch(minibatch, {"max_concurrency": batch_size})
                    break  # Exit the retry loop once successful
                except RateLimitError as rate_limit_error:
                    delay = (retries + 1) * DELAY_INCREMENT
                    logger.warning(f"{rate_limit_error}. Retrying in {delay} seconds...")
                    #time.sleep(delay)
                    retries += 1

                    if retries > MAX_RETRIES:
                        logger.error(f"Max retries reached this batch. Skipping to next batch.")
                        break
                except Exception as e:
                    logger.error(f"Error in process_inputs: {e}")
                    break            
            
            all_answers.extend(answers)
            pbar.set_postfix({"current_batch": f"{i//batch_size + 1}/{(len(all_questions) + (batch_size-1))//batch_size}"})
            pbar.update(len(minibatch))

    result = []
    for x in tqdm(click_ds):
        try:
            content = get_prompt(x)
            print(content)
            answer = get_answer(x)

            # for trial in range(3):
            #     pred, response = get_pred(x)
            #     logger.debug(
            #         f"id: {x['id']} ({trial}), answer: {answer}, pred: {pred}, response: {response}"
            #     )
            #     result.append([x["id"], trial, answer, pred, response])
        except ValueError as e:
            logger.error(e)
            continue

    # df = pd.DataFrame(result, columns=["id", "trial", "answer", "pred", "response"])

    # os.makedirs("results", exist_ok=True)
    # df.to_csv(f"results/{MODEL_VERSION}.csv", index=False)


In [1]:
import os
import time
import logging
from typing import Tuple

import openai
from openai import AzureOpenAI, RateLimitError
import pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv
from datasets import load_dataset

from prompts import TYPE_1, TYPE_2, TYPE_3, TYPE_4
from langchain_openai import AzureChatOpenAI
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain.schema.output_parser import StrOutputParser
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate, HumanMessagePromptTemplate, SystemMessagePromptTemplate
from langchain_openai import AzureChatOpenAI

logger = logging.getLogger('Korean_LLM_Benchmark_Logger')

class CustomStrOutputParser(StrOutputParser):
    def parse(self, text: str) -> str:
        response = text.strip().replace('"', "").replace("'", "")
        if response.startswith("A"):
            pred = "A"
        elif response.startswith("B"):
            pred = "B"
        elif response.startswith("C"):
            pred = "C"
        elif response.startswith("D"):
            pred = "D"
        elif response.startswith("E"):
            pred = "E"
        else:
            pred = ""  # Wrong answer

        return pred, response

def get_prompt(x) -> str:
    num_choices = len(x["choices"])
    if num_choices == 4:
        if x["paragraph"] != "":  # Use Type 1 Prompt
            return TYPE_1.format(
                CONTEXT=x["paragraph"],
                QUESTION=x["question"],
                A=x["choices"][0],
                B=x["choices"][1],
                C=x["choices"][2],
                D=x["choices"][3],
            )
        else:
            return TYPE_2.format(
                QUESTION=x["question"],
                A=x["choices"][0],
                B=x["choices"][1],
                C=x["choices"][2],
                D=x["choices"][3],
            )
    elif num_choices == 5:
        if x["paragraph"] != "":
            return TYPE_3.format(
                CONTEXT=x["paragraph"],
                QUESTION=x["question"],
                A=x["choices"][0],
                B=x["choices"][1],
                C=x["choices"][2],
                D=x["choices"][3],
                E=x["choices"][4],
            )
        else:
            return TYPE_4.format(
                QUESTION=x["question"],
                A=x["choices"][0],
                B=x["choices"][1],
                C=x["choices"][2],
                D=x["choices"][3],
                E=x["choices"][4],
            )
    else:
        raise ValueError(f"Invalid number of choices: {num_choices} (ID: {x['id']})")

def get_prompt_template():
    system_prompt = "You are an AI assistant who reads a given question and solves multiple choice questions."
    system_message_template = SystemMessagePromptTemplate.from_template(system_prompt)
    human_prompt = [
        {
            "type": "text",
            "text": "{question}"
        },
    ]
    human_message_template = HumanMessagePromptTemplate.from_template(human_prompt)

    prompt = ChatPromptTemplate.from_messages(
        [
            system_message_template,
            human_message_template
        ]
    )
    return prompt

def get_answer(x) -> str:
    # 왜 이렇게 .strip() 처리를 해주었는지는 README에 issue 파트 참고 부탁드립니다.
    answer_idx = [xx.strip() for xx in x["choices"]].index(x["answer"].strip())
    if answer_idx == -1:
        raise ValueError(f"Answer not found in choices: {x['answer']} (ID: {x['id']})")
    return chr(0x41 + answer_idx)  # answer_idx = 0 -> answer = "A"


def benchmark(args):

    IS_DEBUG = args.is_debug
    batch_size = args.batch_size
    MAX_RETRIES = args.max_retries
    DELAY_INCREMENT = 30


    llm = AzureChatOpenAI(
        temperature=0, 
        max_tokens=128,
        openai_api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
        azure_deployment=os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME"),                   
    )

    click_ds = load_dataset("EunsuKim/CLIcK")["train"]

    if IS_DEBUG:
        click_ds = click_ds.select(range(5))

    all_batch = [{"id": x["id"], "question": get_prompt(x), "answer": get_answer(x)} for x in tqdm(click_ds)]
    responses = []
    prompt_template = get_prompt_template()
    chain = prompt_template | llm | CustomStrOutputParser()

    with tqdm(total=len(all_batch), desc="Processing Answers") as pbar:
        for i in range(0, len(all_batch), batch_size):
            mini_batch = all_batch[i:i+batch_size]

            retries = 0
            while retries <= MAX_RETRIES:
                try:
                    preds = chain.batch(mini_batch, {"max_concurrency": batch_size})
                    # If no exception, add questions and answers to all_answers
                    for qna, pred in zip(mini_batch, preds):
                        responses.append({"id": qna["id"], "trial": 0, "answer": qna["answer"], "pred": pred[0], "response": pred[1]})
                    break  # Exit the retry loop once successful
                except RateLimitError as rate_limit_error:
                    delay = (retries + 1) * DELAY_INCREMENT
                    logger.warning(f"{rate_limit_error}. Retrying in {delay} seconds...")
                    time.sleep(delay)
                    retries += 1

                    if retries > MAX_RETRIES:
                        logger.error(f"Max retries reached this batch. Skipping to next batch.")
                        break
                except openai.BadRequestError as e:
                    logger.error(f"BadRequestError: {e}. Skipping this batch.")
                    break
                except Exception as e:
                    logger.error(f"Error in process_inputs: {e}")
                    break            
            
            pbar.set_postfix({"current_batch": f"{i//batch_size + 1}/{(len(all_batch) + (batch_size-1))//batch_size}"})
            pbar.update(len(mini_batch))

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Options')

    parser.add_argument("--is_debug", type=bool, default=True)
    parser.add_argument("--batch_size", type=int, default=4)
    parser.add_argument("--max_retries", type=int, default=3)

    args = parser.parse_args()

    print(args)
    benchmark(args)

  from .autonotebook import tqdm as notebook_tqdm
100%|██████████| 5/5 [00:00<00:00, 2928.98it/s]
Processing Answers: 100%|██████████| 5/5 [00:01<00:00,  3.45it/s, current_batch=2/2]


In [2]:
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Options')

    parser.add_argument("--is_debug", type=bool, default=True)
    parser.add_argument("--batch_size", type=int, default=4)
    parser.add_argument("--max_retries", type=int, default=3)

    args = parser.parse_args()    
    print(args)

usage: ipykernel_launcher.py [-h] [--is_debug IS_DEBUG]
                             [--batch_size BATCH_SIZE]
                             [--max_retries MAX_RETRIES]
ipykernel_launcher.py: error: unrecognized arguments: --f=/Users/daekeun/Library/Jupyter/runtime/kernel-v2-10936ttYGB3GRGfsN.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
df = pd.DataFrame(responses)
os.makedirs("results", exist_ok=True)
MODEL_VERSION = os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME")
df.to_csv(f"results/{MODEL_VERSION}.csv", index=False)
