In [2]:
import os
import pandas as pd

# Never truncate cell text when displaying DataFrames, so the full chat prompts are readable.
pd.set_option("display.max_colwidth", None)

# Location of the evals registry, relative to the directory this notebook runs from.
registry_pth = os.path.join(os.pardir, "evals", "registry")

In [3]:
def create_chat_prompt(word):
    """Build the chat messages that ask for the syllable count of ``word``.

    Args:
        word: The value placed verbatim in the user message's ``content``.

    Returns:
        A two-element list of ``{"role", "content"}`` dicts: the system
        instruction first, then the user message carrying ``word``.
    """
    instruction = "Please state the number of syllables in the input word. Reply only with a number and nothing else."
    system_message = dict(role="system", content=instruction)
    user_message = dict(role="user", content=word)
    return [system_message, user_message]

# Turn each word list (5- and 6-syllable words) into a JSONL samples file with one
# record per word: "input" is the chat prompt, "ideal" is the syllable count as a string.
samples_dir = os.path.join(registry_pth, "data", "syllables_long_words")
# DataFrame.to_json does not create missing parent directories, so make sure the
# target folder exists before the first write (no-op when it is already there).
os.makedirs(samples_dir, exist_ok=True)

for i in range(5, 7):
    words_path = os.path.join("syllables_dataset", f"{i}_syllables_sorted_by_prevalence.txt")
    df = pd.read_csv(words_path, header=None, names=["word"], lineterminator='\n')
    df["input"] = df["word"].apply(create_chat_prompt)
    df["ideal"] = str(i)  # every word in file i has i syllables
    df = df[["input", "ideal"]]
    df.to_json(os.path.join(samples_dir, f"{i}_syllables.jsonl"), orient="records", lines=True)

# Preview the last frame written. Inside the loop body this expression was evaluated
# and discarded; only the final expression of a cell is displayed.
df.head()


In [4]:

# Registry entry for the eval: "syllables_long_words" resolves to the id
# "syllables.dev.v1", which is scored with the basic Match class against the
# samples file named below.
# NOTE(review): samples_jsonl points at long_word_samples.jsonl, while the
# data-generation loop in this notebook writes 5_syllables.jsonl / 6_syllables.jsonl;
# confirm long_word_samples.jsonl is produced elsewhere.
eval_yaml = """
syllables_long_words:
  id: syllables.dev.v1
  metrics: [accuracy]
syllables.dev.v1:
  class: evals.elsuite.basic.match:Match
  args:
    samples_jsonl: syllables_long_words/long_word_samples.jsonl
""".strip()

yaml_path = os.path.join(registry_pth, "evals", "syllables_long_words.yaml")

# Normalise any CRLF to LF and write raw bytes: opening in binary mode keeps
# Windows from translating the line endings back on write.
yaml_bytes = eval_yaml.replace('\r\n', '\n').encode('utf-8')
with open(yaml_path, "wb") as f:
    f.write(yaml_bytes)


The local checkout of the evals module is used to run the `oaieval` task. To make it importable, create a file with the extension `.pth` in your virtual environment's `site-packages` directory, containing the full path to the evals Python code. For example, a file named `module.pth` with the single line `C:\dev\play\evals`.

In [5]:
# Shell out to the oaieval CLI script in the local evals checkout: evaluate
# gpt-3.5-turbo on the syllables_long_words eval, capped at 100 samples.
# NOTE(review): `py` looks like the Windows Python launcher — confirm, and use
# `python` instead when running on other platforms.
!py ../evals/cli/oaieval.py gpt-3.5-turbo syllables_long_words --max_samples 100

[2023-05-23 17:04:58,324] [registry.py:249] Loading registry from C:\dev\play\evals\evals\registry\evals
[2023-05-23 17:04:59,042] [registry.py:249] Loading registry from C:\Users\ekane\.evals\evals
[2023-05-23 17:04:59,047] [oaieval.py:110] [1;35mRun started: 2305232104592TSYCYBQ[0m
[2023-05-23 17:04:59,052] [data.py:75] Fetching syllables_long_words/long_word_samples.jsonl
[2023-05-23 17:04:59,063] [eval.py:34] Evaluating 100 samples
[2023-05-23 17:04:59,076] [eval.py:153] Running in threaded mode with 10 threads!

  0%|          | 0/100 [00:00<?, ?it/s]
  1%|          | 1/100 [00:00<01:32,  1.07it/s]
  5%|▌         | 5/100 [00:01<00:15,  6.12it/s]
  8%|▊         | 8/100 [00:01<00:09,  9.55it/s]
 11%|█         | 11/100 [00:01<00:10,  8.59it/s]
 13%|█▎        | 13/100 [00:01<00:10,  8.16it/s]
 15%|█▌        | 15/100 [00:02<00:09,  8.81it/s]
 17%|█▋        | 17/100 [00:02<00:09,  8.38it/s]
 20%|██        | 20/100 [00:02<00:08,  9.52it/s]
 22%|██▏       | 22/100 [00:02<00:08,  9.65it/