<a href="https://colab.research.google.com/github/frank-morales2020/Cloud_curious/blob/master/mmlu_eval_openai.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Building an MMLU Eval

This notebook shows how to:
- Build and run an eval
- Load the results and into a Pandas Dataframe

We use the `evals.elsuite.basic.match:Match` Eval class here to check whether new completions match the correct answer. Under the hood, it will generate a completion with the choice of model for each prompt, check if the completion matches the true answer, then logs a result.

In [1]:
!git clone https://github.com/openai/evals.git

Cloning into 'evals'...
remote: Enumerating objects: 7848, done.[K
remote: Counting objects: 100% (325/325), done.[K
remote: Compressing objects: 100% (217/217), done.[K
remote: Total 7848 (delta 119), reused 275 (delta 100), pack-reused 7523[K
Receiving objects: 100% (7848/7848), 6.56 MiB | 14.34 MiB/s, done.
Resolving deltas: 100% (3937/3937), done.
Updating files: 100% (1738/1738), done.
Filtering content: 100% (670/670), 785.10 MiB | 30.46 MiB/s, done.


In [None]:
# Install, and download MMLU if you haven't already
%cd /content/evals/
%pip install -e .

In [2]:


!curl -O https://people.eecs.berkeley.edu/~hendrycks/data.tar
!tar -xf data.tar
data_path = "data"

Obtaining file:///
[31mERROR: file:/// does not appear to be a Python project: neither 'setup.py' nor 'pyproject.toml' found.[0m[31m
[0m  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  158M  100  158M    0     0  1662k      0  0:01:37  0:01:37 --:--:-- 2135k


In [26]:
import pandas as pd
import os

# Assuming this notebook is in examples/
registry_path = os.path.join(os.getcwd(), "/content/evals/registry")

In [27]:
# Build the prompts using Chat format. We support converting Chat conversations to text for non-Chat models

choices = ["A", "B", "C", "D"]
sys_msg = "The following are multiple choice questions (with answers) about {}."
def create_chat_prompt(sys_msg, question, answers, subject):
    user_prompt = f"{question}\n" + "\n".join([f"{choice}. {answer}" for choice, answer in zip(choices, answers)]) + "\nAnswer:"
    return [
        {"role": "system", "content": sys_msg.format(subject)},
        {"role": "user", "content": user_prompt}
    ]

def create_chat_example(question, answers, correct_answer):
    """
    Form few-shot prompts in the recommended format: https://github.com/openai/openai-python/blob/main/chatml.md#few-shot-prompting
    """
    user_prompt = f"{question}\n" + "\n".join([f"{choice}. {answer}" for choice, answer in zip(choices, answers)]) + "\nAnswer:"
    return [
        {"role": "system", "content": user_prompt, "name": "example_user"},
        {"role": "system", "content": correct_answer, "name": "example_assistant"},
    ]

In [28]:
data_path='/content/data'

In [29]:
import yaml
subjects = sorted([f.split("_test.csv")[0] for f in os.listdir(os.path.join(data_path, "test")) if "_test.csv" in f])

registry_yaml = {}

for subject in subjects:
    subject_path = os.path.join(registry_path, "data", "mmlu", subject)
    os.makedirs(subject_path, exist_ok=True)

    # Create few-shot prompts
    dev_df = pd.read_csv(os.path.join(data_path, "dev", subject + "_dev.csv"), names=("Question", "A", "B", "C", "D", "Answer"))
    dev_df["sample"] = dev_df.apply(lambda x: create_chat_example(x["Question"], x[["A", "B", "C", "D"]], x["Answer"]), axis=1)
    few_shot_path = os.path.join(subject_path, "few_shot.jsonl")
    dev_df[["sample"]].to_json(few_shot_path, lines=True, orient="records")

    # Create test prompts and ideal completions
    test_df = pd.read_csv(os.path.join(data_path, "test", subject + "_test.csv"), names=("Question", "A", "B", "C", "D", "Answer"))
    test_df["input"] = test_df.apply(lambda x: create_chat_prompt(sys_msg, x["Question"], x[["A", "B", "C", "D"]], subject), axis=1)
    test_df["ideal"] = test_df.Answer
    samples_path = os.path.join(subject_path, "samples.jsonl")
    test_df[["input", "ideal"]].to_json(samples_path, lines=True, orient="records")

    eval_id = f"match_mmlu_{subject}"

    registry_yaml[eval_id] = {
        "id": f"{eval_id}.test.v1",
        "metrics": ["accuracy"]
    }
    registry_yaml[f"{eval_id}.test.v1"] = {
        "class": "evals.elsuite.basic.match:Match",
        "args": {
            "samples_jsonl": samples_path,
            "few_shot_jsonl": few_shot_path,
            "num_few_shot": 4,
        }
    }
registry_path = '/content'
with open(os.path.join(registry_path, "evals", "mmlu.yaml"), "w") as f:
    yaml.dump(registry_yaml, f)

In [30]:
!pip install colab-env --upgrade -q
!pip install openai -q
import colab_env
import os
import openai
from openai import OpenAI
client = OpenAI(api_key = os.getenv("OPENAI_API_KEY"))

In [None]:
from openai import OpenAI
client = OpenAI()

modellist=client.models.list()
modellist.data

In [38]:
# This will generate a JSONL which will record samples and logs and store it in /tmp/evallogs
!oaieval gpt-3.5-turbo match_mmlu_anatomy

[2024-05-28 05:35:31,959] [registry.py:271] Loading registry from /content/evals/evals/registry/evals
[2024-05-28 05:35:32,848] [registry.py:271] Loading registry from /root/.evals/evals
[2024-05-28 05:35:32,860] [registry.py:160] eval 'match_mmlu_anatomy' not found. Closest matches: ['mmlu-anatomy']
Traceback (most recent call last):
  File "/usr/local/bin/oaieval", line 8, in <module>
    sys.exit(main())
  File "/content/evals/evals/cli/oaieval.py", line 304, in main
    run(args)
  File "/content/evals/evals/cli/oaieval.py", line 133, in run
    eval_spec is not None
AssertionError: Eval match_mmlu_anatomy not found. Available: ['2d_movement', '2d_movement.dev.v0', '3d_globe_movement', '3d_globe_movement.dev.v0', '3d_object_manipulation', '3d_object_manipulation.dev.v0', 'Chinese_character_riddles', 'Chinese_character_riddles.dev.v0', 'GPT-model-text-detection', 'GPT-model-text-detection.dev.v0', 'Unfamiliar-Chinese-Character', 'Unfamiliar-Chinese-Character.dev.v0', 'ab', 'ab.dev.v

In [37]:
# How to process the log events generated by oaieval
events = "/tmp/evallogs/{log_name}"

with open(events, "r") as f:
    events_df = pd.read_json(f, lines=True)

matches_df = events_df[events_df.type == "match"].reset_index(drop=True)
matches_df = matches_df.join(pd.json_normalize(matches_df.data))
matches_df.correct.value_counts().plot.bar(title="Correctness of generated answers", xlabel="Correctness", ylabel="Count")

FileNotFoundError: [Errno 2] No such file or directory: '/tmp/evallogs/{log_name}'

In [19]:
# Inspect samples
for i, r in pd.json_normalize(events_df[events_df.type == "sampling"].data).iterrows():
    print(f"Prompt: {r.prompt}")
    print(f"Sampled: {r.sampled}")
    print("-" * 25)

NameError: name 'events_df' is not defined