<a href="https://colab.research.google.com/github/frank-morales2020/MLxDL/blob/main/MMLU.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook uses the chain-of-thought-hub repository: https://github.com/FranxYao/chain-of-thought-hub/tree/main

# Environment Preparation

In [None]:
# Clone the chain-of-thought-hub repository into the current working directory.
# Later cells read prompt files from /content/chain-of-thought-hub, so this
# presumably runs with /content as the working directory -- TODO confirm.
!git clone https://github.com/FranxYao/chain-of-thought-hub.git

In [None]:
!pip install -q transformers
!pip install -q tqdm
!pip install -q pandas
!pip install -q tensor_parallel
!pip install -q argparse
!pip install -q einops
!pip install -q accelerate
#!pip install -q torch==2.0.0+cu118
!pip install -q torch

!pip install colab-env --upgrade -q
!pip install openai -q

!pip install datasets -q
!pip install utils -q

In [19]:
import colab_env
import os
import openai
import IPython
import pytz
from datetime import datetime
import json
from pathlib import Path

In [None]:
from openai import OpenAI

# Current wall-clock time in the New York time zone, rendered as dd/mm/YYYY HH:MM:SS.
newYorkTz = pytz.timezone("America/New_York")
now = datetime.now(tz=newYorkTz)
dt_string = now.strftime("%d/%m/%Y %H:%M:%S")

# Both the module-level key and the explicit client read the same environment variable.
openai.api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Banner line framed by one blank line above and below.
banner = 'TEST - OPENAI  - BY FRANK MORALES - %s' % dt_string
print()
print(banner)
print()

In [22]:
# Make `os` and the OpenAI client class available in this cell, then set the
# module-level key from the OPENAI_API_KEY environment variable.
import os
from openai import OpenAI

openai.api_key = os.getenv("OPENAI_API_KEY")

In [None]:
!mkdir /content/outputs
API_KEY=OpenAI(api_key = os.getenv("OPENAI_API_KEY"))

# GPT-3.5-Turbo
#!python /content/chain-of-thought-hub/MMLU/run_mmlu_gpt_3.5_turbo.py --api_key=${API_KEY}

# Claude-v1.3
#!python /content/chain-of-thought-hub/MMLU/run_mmlu_claude.py  --api_key=${API_KEY} --engine=claude-v1.3


# Evaluating the GPT-3.5 Turbo model on MMLU

In [None]:
# evaluating GPT-3.5 turbo model on MMLU

import openai
import re
import time
import json

import numpy as np

from tqdm import tqdm
from datasets import load_dataset
from tenacity import retry, stop_after_attempt, wait_chain, wait_fixed
from utils import *

# Command-line argument parsing is disabled in this notebook; `args` is simply
# bound to the `api_key` attribute of the client created in an earlier cell.
args = client.api_key

# Subject names passed as the dataset configuration for each evaluation run,
# one per line in alphabetical order.
TASKS = [
    'abstract_algebra',
    'anatomy',
    'astronomy',
    'business_ethics',
    'clinical_knowledge',
    'college_biology',
    'college_chemistry',
    'college_computer_science',
    'college_mathematics',
    'college_medicine',
    'college_physics',
    'computer_security',
    'conceptual_physics',
    'econometrics',
    'electrical_engineering',
    'elementary_mathematics',
    'formal_logic',
    'global_facts',
    'high_school_biology',
    'high_school_chemistry',
    'high_school_computer_science',
    'high_school_european_history',
    'high_school_geography',
    'high_school_government_and_politics',
    'high_school_macroeconomics',
    'high_school_mathematics',
    'high_school_microeconomics',
    'high_school_physics',
    'high_school_psychology',
    'high_school_statistics',
    'high_school_us_history',
    'high_school_world_history',
    'human_aging',
    'human_sexuality',
    'international_law',
    'jurisprudence',
    'logical_fallacies',
    'machine_learning',
    'management',
    'marketing',
    'medical_genetics',
    'miscellaneous',
    'moral_disputes',
    'moral_scenarios',
    'nutrition',
    'philosophy',
    'prehistory',
    'professional_accounting',
    'professional_law',
    'professional_medicine',
    'professional_psychology',
    'public_relations',
    'security_studies',
    'sociology',
    'us_foreign_policy',
    'virology',
    'world_religions',
]

from tenacity import retry_if_exception_type

# Only transient failures are retried: connectivity problems, rate limiting and
# server-side errors. Anything else (bad request, invalid key, programming
# errors) is raised immediately instead of being retried without end.
_RETRYABLE_OPENAI_ERRORS = (
    openai.APIConnectionError,
    openai.RateLimitError,
    openai.InternalServerError,
)

# Lazily created client shared by all requests issued from this cell.
_openai_client = None


def _get_openai_client():
    """Return the shared OpenAI client, creating it on first use.

    The client is constructed without an explicit key, so the SDK reads the
    OPENAI_API_KEY environment variable.
    """
    global _openai_client
    if _openai_client is None:
        _openai_client = openai.OpenAI()
    return _openai_client


@retry(retry=retry_if_exception_type(_RETRYABLE_OPENAI_ERRORS),
       wait=wait_chain(*[wait_fixed(3) for i in range(3)] +
                       [wait_fixed(5) for i in range(2)] +
                       [wait_fixed(10)]))
def completion_with_backoff(**kwargs):
    """Create a chat completion, retrying transient API errors with back-off.

    Args:
        **kwargs: forwarded unchanged to ``client.chat.completions.create``
            (e.g. ``model``, ``messages``, ``temperature``).

    Returns:
        The chat completion response object returned by the OpenAI SDK.

    Raises:
        Any non-transient error raised by the SDK, without retrying.
    """
    # openai.ChatCompletion.create no longer exists in openai>=1.0; the
    # client-based call below is its replacement.
    return _get_openai_client().chat.completions.create(**kwargs)


def main(tasks=TASKS):
    """Evaluate gpt-3.5-turbo on the test split of each MMLU task.

    For every task, each question is appended to that task's few-shot
    chain-of-thought prompt and sent to the model. The question, the extracted
    model answer and the reference answer are written to
    ``/content/outputs/test_gpt_3.5_turbo_<task>.txt`` and the task accuracy is
    printed.

    Args:
        tasks: iterable of MMLU task names; defaults to ``TASKS``.
    """
    # Load the few-shot prompts once and close the file deterministically.
    with open('/content/chain-of-thought-hub/MMLU/lib_prompt/mmlu-cot.json') as prompt_file:
        mmlu_prompt = json.load(prompt_file)

    for task in tasks:
        print('Testing %s ...' % task)
        acc = 0
        task_data = load_dataset("lukaemon/mmlu", task, trust_remote_code=True)
        test_set = task_data['test']

        with open('/content/outputs/test_gpt_3.5_turbo_%s.txt' % task, 'w') as fd:
            for q_ in tqdm(test_set, total=len(test_set)):
                # Question text followed by the four lettered choices.
                q = q_['input'] + '\n'
                for letter in ['A', 'B', 'C', 'D']:
                    q += '(' + letter + ') ' + q_[letter] + ' '
                q += "\nA: Let's think step by step."

                prompt_q = mmlu_prompt[task] + "\n\n" + q

                response = completion_with_backoff(
                    model="gpt-3.5-turbo",
                    messages=[
                        {"role": "system", "content": "Follow the given examples and answer the question."},
                        {"role": "user", "content": prompt_q},
                    ],
                    temperature=0,
                )
                # openai>=1.0 returns an object, not a dict: use attribute access.
                ans_model = response.choices[0].message.content
                ans_, _ = extract_ans(ans_model)

                a = q_['target']
                fd.write('Q: %s\nA_model:\n%s\nA:\n%s\n\n' % (q, ans_, a))

                if test_answer_mmlu_(ans_, a):
                    acc += 1

        # Guard against an empty split so the report never divides by zero.
        task_acc = acc / len(test_set) if len(test_set) else 0.0
        print('%s acc %.4f' % (task, task_acc))


if __name__ == '__main__':
    main()

# Evaluating the Claude 3 model on MMLU

In [None]:
!pip install anthropic -q

import anthropic
import os
import colab_env
import json

In [26]:
# The Claude key is read from the CLAUDE3_API_KEY environment variable;
# indexing os.environ raises KeyError when the variable is not set.
api_key = os.environ["CLAUDE3_API_KEY"]
model = "claude-3-opus-20240229"

client = anthropic.Anthropic(api_key=api_key)

In [27]:
# Single one-turn request carrying a greeting; the reply is kept in `message`.
message = client.messages.create(
    messages=[{"role": "user", "content": "Hello, Claude"}],
    max_tokens=1024,
    model="claude-3-opus-20240229",
)

In [28]:
# Print the text of the first content block of the reply.
print(message.content[0].text)

Hello! It's nice to meet you. How are you doing today?


In [None]:
# Evaluating the Claude model on MMLU converted to a Claude prompt.
# Only the single-round prompt variant is active; the multiple-round option is commented out below.

import anthropic
import json
import numpy as np
from tqdm import tqdm
from datasets import load_dataset
from utils import *


# Subject names passed as the dataset configuration for each evaluation run,
# one per line in alphabetical order.
TASKS = [
    'abstract_algebra',
    'anatomy',
    'astronomy',
    'business_ethics',
    'clinical_knowledge',
    'college_biology',
    'college_chemistry',
    'college_computer_science',
    'college_mathematics',
    'college_medicine',
    'college_physics',
    'computer_security',
    'conceptual_physics',
    'econometrics',
    'electrical_engineering',
    'elementary_mathematics',
    'formal_logic',
    'global_facts',
    'high_school_biology',
    'high_school_chemistry',
    'high_school_computer_science',
    'high_school_european_history',
    'high_school_geography',
    'high_school_government_and_politics',
    'high_school_macroeconomics',
    'high_school_mathematics',
    'high_school_microeconomics',
    'high_school_physics',
    'high_school_psychology',
    'high_school_statistics',
    'high_school_us_history',
    'high_school_world_history',
    'human_aging',
    'human_sexuality',
    'international_law',
    'jurisprudence',
    'logical_fallacies',
    'machine_learning',
    'management',
    'marketing',
    'medical_genetics',
    'miscellaneous',
    'moral_disputes',
    'moral_scenarios',
    'nutrition',
    'philosophy',
    'prehistory',
    'professional_accounting',
    'professional_law',
    'professional_medicine',
    'professional_psychology',
    'public_relations',
    'security_studies',
    'sociology',
    'us_foreign_policy',
    'virology',
    'world_religions',
]


def get_response(**kwargs):
    """Send a request to the Claude Messages API and return the response.

    Keyword arguments are forwarded to ``client.messages.create``. ``model``,
    ``max_tokens`` and ``messages`` fall back to the values this helper used to
    hard-code, so a call with no arguments sends the same greeting as before.

    Args:
        **kwargs: request parameters overriding the defaults below.

    Returns:
        The response object returned by ``client.messages.create``.
    """
    claude_client = anthropic.Anthropic(api_key=api_key)

    # Defaults mirror the previously hard-coded request; caller values win.
    request = {
        "model": "claude-3-opus-20240229",
        "max_tokens": 1024,
        "messages": [{"role": "user", "content": "Hello, Claude"}],
    }
    request.update(kwargs)
    return claude_client.messages.create(**request)

model="claude-3-opus-20240229"

# Only the single-round prompt file is loaded by main(); this label is used to
# name the accuracy summary file.
PROMPT_TYPE = 'single'

# Subjects whose question is followed by a "completes the question" instruction.
_COMPLETION_SUBJECTS = ("business ethics", "computer security", "marketing")

# Subjects whose question is followed by a bare "Choices:" header.
_BARE_CHOICES_SUBJECTS = (
    "college medicine",
    "high school biology",
    "high school european history",
    "high school geography",
    "high school government and politics",
    "high school macroeconomics",
    "moral disputes",
)


def _format_question(q_, task):
    """Build the question text (stem, instruction, choices, CoT cue) for one item."""
    q = 'Q: ' + q_['input'] + '\n'
    task_mod = task.replace('_', ' ')

    # The instruction wording depends on the subject.
    if task_mod in _COMPLETION_SUBJECTS:
        q += "Which one of the four choices completes the question correctly, (A), (B), (C) or (D)?" + "\nChoices:" + "\n"
    elif task_mod in _BARE_CHOICES_SUBJECTS:
        # No trailing newline here, exactly as in the original prompt construction.
        q += "Choices:"
    elif task_mod == "college physics":
        q += "Which one of the four choices is correct about the question, (A), (B), (C) or (D)?" + "\nChoices:" + "\n"
    else:
        q += "Which one of the four choices is correct, (A), (B), (C) or (D)?" + "\nChoices:" + "\n"

    for letter in ['A', 'B', 'C', 'D']:
        q += '(' + letter + ') ' + q_[letter] + ' '

    # Step-by-step cue followed by the answer slot.
    q += "\nLet's think step by step."
    q += "\nA:"
    return q


def main(tasks=TASKS):
    """Evaluate the Claude model on the test split of each MMLU task.

    For every task, each question is appended to that task's single-round
    few-shot prompt and sent to the Messages API. The question, model answer
    and reference answer are written to
    ``/content/outputs/test_<model>_<task>.txt``. Per-task accuracy, and the
    average over the tasks evaluated in this call, are appended to
    ``/content/outputs/test_<model>_<PROMPT_TYPE>_acc.txt``.

    Args:
        tasks: iterable of MMLU task names; defaults to ``TASKS``.
    """
    # Build the client here rather than relying on whatever the global
    # `client` happens to be bound to when this cell runs.
    claude_client = anthropic.Anthropic(api_key=api_key)

    # Load the few-shot prompts once instead of once per question.
    with open('/content/chain-of-thought-hub/MMLU/lib_prompt/mmlu-cot-claude-single.json') as prompt_file:
        mmlu_prompt = json.load(prompt_file)

    # Previously this path was built from `args.engine` / `args.prompt_type`,
    # attributes that do not exist in the notebook, so the run crashed after
    # the first task.
    acc_path = '/content/outputs/test_%s_%s_acc.txt' % (model, PROMPT_TYPE)
    acc_list = []

    for task in tasks:
        print('Testing %s ...' % task)
        acc = 0
        task_data = load_dataset("lukaemon/mmlu", task, trust_remote_code=True)
        test_set = task_data['test']

        with open('/content/outputs/test_%s_%s.txt' % (model, task), 'w') as fd:
            for q_ in tqdm(test_set, total=len(test_set)):
                q = _format_question(q_, task)

                # The few-shot prompt plus the question is wrapped in the legacy
                # Human/Assistant markers and sent as a single user turn.
                prompt_q = mmlu_prompt[task] + "\n\n" + q
                claude_prompt = anthropic.HUMAN_PROMPT + prompt_q + anthropic.AI_PROMPT

                response = claude_client.messages.create(
                    model=model,
                    max_tokens=1024,
                    # temperature=0 matches the GPT-3.5 evaluation settings.
                    temperature=0,
                    messages=[{"role": "user", "content": claude_prompt}],
                )
                ans_ = response.content[0].text
                a = q_['target']
                fd.write('%s\nA_model:\n%s\nA:\n%s\n\n' % (q, ans_, a))

                # check answer
                if test_answer_mmlu_claude_instant(ans_, a):
                    acc += 1

        # Guard against an empty split so the report never divides by zero.
        task_acc = acc / len(test_set) if len(test_set) else 0.0
        print('%s acc %.4f' % (task, task_acc))
        acc_list.append(task_acc)

        # write accuracy to file
        with open(acc_path, 'a') as fd:
            fd.write('%s acc %.4f\n' % (task, task_acc))

    # Average over this run's tasks only. Re-reading the append-mode file would
    # also pick up earlier runs and previously written "Average" lines.
    if acc_list:
        with open(acc_path, 'a') as fd:
            fd.write('Average acc %.4f\n' % (np.mean(acc_list)))


if __name__ == '__main__':
    main()

Testing abstract_algebra ...


 54%|█████▍    | 54/100 [14:19<13:52, 18.09s/it]