In [1]:
import os
# Load the OpenAI API key from a local file and expose it through the environment.
with open("./oai-config/OAI_API_KEY") as f:
    # Key files usually end with a newline; strip surrounding whitespace so the
    # stored value is exactly the key and nothing else.
    os.environ['OPENAI_API_KEY'] = f.read().strip()

In [2]:
import autogen

# Build the list of endpoint configurations that is later passed as `config_list`
# to the tuning and completion calls.
endpoint_list = autogen.config_list_openai_aoai()
# Display the endpoints with the credentials masked, so the API key is never
# written into the notebook's saved output. `endpoint_list` itself is unchanged.
[({**endpoint, "api_key": "***"} if "api_key" in endpoint else endpoint) for endpoint in endpoint_list]

  from .autonotebook import tqdm as notebook_tqdm


[{'api_key': '***'}]

In [3]:
# Load endpoint configurations from the OAI_CONFIG_LIST environment variable or file,
# keeping only the entries whose "model" is one of the names listed below.
config_list = autogen.config_list_from_json(
    env_or_file="OAI_CONFIG_LIST",
    filter_dict={
        "model": {
            "gpt-3.5-turbo",
            "gpt-3.5-turbo-16k",
            "gpt-3.5-turbo-0301",
            "chatgpt-35-turbo-0301",
            "gpt-35-turbo-v0301",
            "gpt",
        },
    },
)
# Display the selected configurations with the credentials masked, so the API key is
# never written into the notebook's saved output. `config_list` itself is unchanged.
[({**entry, "api_key": "***"} if "api_key" in entry else entry) for entry in config_list]

[{'model': 'gpt-4',
  'api_key': '***'},
 {'model': 'gpt-3.5-turbo-0613',
  'api_key': '***'}]

## Load dataset

First, we load the HumanEval dataset, which contains 164 examples, and shuffle it with a fixed seed. We use the first 20 shuffled examples for tuning the generation hyperparameters and the remaining 144 for evaluation. In each example, "prompt" is the prompt string for eliciting the code generation (renamed to "definition"), "test" is the Python code of the unit tests for the example, and "entry_point" is the name of the function to be tested.

In [4]:
import datasets

seed = 40
n_tune_data = 20


def _to_example(row):
    """Keep only the three fields of a dataset row that the notebook uses.

    The row's "prompt" is stored under the key "definition"; "test" and
    "entry_point" are copied through unchanged.
    """
    return {
        "definition": row["prompt"],
        "test": row["test"],
        "entry_point": row["entry_point"],
    }


# Shuffle with a fixed seed so the tune/test split is the same on every run.
data = datasets.load_dataset("openai_humaneval")["test"].shuffle(seed=seed)

# The first `n_tune_data` shuffled rows are used for tuning, the rest for evaluation.
# Each row is looked up once and converted by the shared helper.
tune_data = [_to_example(data[x]) for x in range(n_tune_data)]
test_data = [_to_example(data[x]) for x in range(n_tune_data, len(data))]


In [5]:
# Show the prompt ("definition") and the unit-test code of the second tuning example,
# each on its own line.
print(tune_data[1]["definition"], tune_data[1]["test"], sep="\n")


def sorted_list_sum(lst):
    """Write a function that accepts a list of strings as a parameter,
    deletes the strings that have odd lengths from it,
    and returns the resulted list with a sorted order,
    The list is always a list of strings and never an array of numbers,
    and it may contain duplicates.
    The order of the list should be ascending by length of each word, and you
    should return the list sorted by that rule.
    If two words have the same length, sort the list alphabetically.
    The function should return a list of strings in sorted order.
    You may assume that all words will have the same length.
    For example:
    assert list_sort(["aa", "a", "aaa"]) => ["aa"]
    assert list_sort(["ab", "a", "aaa", "cd"]) => ["ab", "cd"]
    """

def check(candidate):

    # Check some simple cases
    assert candidate(["aa", "a", "aaa"]) == ["aa"]
    assert candidate(["school", "AI", "asdf", "b"]) == ["AI", "asdf", "school"]
    assert candidate(["d", "b", "c", "a

In [6]:
# Install pyautogen with the "blendsearch" extra, constrained to releases compatible with 0.1.0.
# NOTE(review): `autogen` is already imported in an earlier cell; on a fresh environment this
# install presumably has to run first — consider moving it to the top of the notebook.
%pip install "pyautogen[blendsearch]~=0.1.0"

Note: you may need to restart the kernel to use updated packages.


In [7]:
from functools import partial

# Assertion generator bound to the filtered endpoint configurations in `config_list`.
_assertion_generator = partial(autogen.code_utils.generate_assertions, config_list=config_list)

# Evaluation function used for tuning: scores function completions against the
# generated assertions.
eval_with_generated_assertions = partial(
    autogen.code_utils.eval_function_completions,
    # Running generated code directly is less safe than running it in docker;
    # switch to use_docker=True whenever docker is available.
    use_docker=False,
    assertions=_assertion_generator,
)


In [8]:
# Configure the completion cache with the same seed that was used to shuffle the dataset.
autogen.Completion.set_cache(seed=seed)

In [9]:
# Search for the best generation configuration on the tuning examples.
config, analysis = autogen.Completion.tune(
    # --- data and objective ---
    data=tune_data,
    eval_func=eval_with_generated_assertions,  # returns the success metrics
    metric="success",  # metric to optimize ...
    mode="max",  # ... in this direction
    # --- choices to search over ---
    # Prompt templates to choose from.
    prompt=[
        "{definition}",
        "# Python 3{definition}",
        "Complete the following Python function:{definition}",
    ],
    # Stop sequences to choose from.
    stop=[["\nclass", "\ndef", "\nif", "\nprint"], None],
    # Whether to allow format string templates.
    allow_format_str_template=True,
    # --- budgets ---
    inference_budget=0.05,  # dollars per instance
    optimization_budget=1,  # dollars in total
    # Further cap on the number of trials for different hyperparameter
    # configurations; -1 leaves it to the optimization budget alone.
    num_samples=-1,
    # --- endpoints (optional) ---
    config_list=endpoint_list,
    # Optional: pass log_file_name="logs/humaneval.log" to set the log file name.
)


INFO:flaml.tune.searcher.blendsearch:No low-cost partial config given to the search algorithm. For cost-frugal search, consider providing low-cost values for cost-related hps via 'low_cost_partial_config'. More info can be found at https://microsoft.github.io/FLAML/docs/FAQ#about-low_cost_partial_config-in-tune
[I 2023-12-04 10:05:16,131] A new study created in memory with name: optuna
INFO:flaml.tune.searcher.blendsearch:No low-cost partial config given to the search algorithm. For cost-frugal search, consider providing low-cost values for cost-related hps via 'low_cost_partial_config'. More info can be found at https://microsoft.github.io/FLAML/docs/FAQ#about-low_cost_partial_config-in-tune
[I 2023-12-04 10:05:16,135] A new study created in memory with name: optuna


[flaml.tune.tune: 12-04 10:05:16] {805} INFO - trial 1 config: {'prompt': 1, 'stop': 0, 'allow_format_str_template': True, 'subspace': {'model': 'text-ada-001', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}}
[flaml.tune.tune: 12-04 10:05:29] {197} INFO - result: {'index_selected': 26.0, 'succeed_assertions': 0.0, 'success': 0.0, 'gen_cost': 0.0038099999999999996, 'assertions': "I'm sorry, but I can't provide the assertions without the actual function definition and examples in the docstring you mentioned. Please provide the function signature and docstring so I can help you better.", 'total_cost': 0.010623200000000001, 'cost': 0.010623200000000001, 'inference_cost': 0.00025844, 'training_iteration': 0, 'config': {'prompt': 1, 'stop': 0, 'allow_format_str_template': True, 'subspace': {'model': 'text-ada-001', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}}, 'config/prompt': 1, 'config/stop': 0, 'config/allow_format_s

KeyboardInterrupt: 

In [None]:
# Report the configuration chosen by tuning and the best result recorded on the tuning data.
for label, value in (
    ("optimized config", config),
    ("best result on tuning data", analysis.best_result),
):
    print(label, value)

optimized config {'prompt': '# Python 3{definition}', 'stop': ['\nclass', '\ndef', '\nif', '\nprint'], 'allow_format_str_template': True, 'model': 'text-davinci-003', 'max_tokens': 148, 'n': 27, 'top_p': 0.755486898036596}
best result on tuning data {'index_selected': 26.0, 'succeed_assertions': 0.0, 'success': 0.5, 'gen_cost': 0.002910000000000001, 'assertions': "Without an actual function definition and docstring, it's impossible for me to write related assertions. Please provide the necessary information.", 'total_cost': 0.8951251999999998, 'cost': 0.8645799999999999, 'inference_cost': 0.042286000000000004, 'training_iteration': 0, 'config': {'prompt': 1, 'stop': 0, 'allow_format_str_template': True, 'subspace': {'model': 'text-davinci-003', 'max_tokens': 148, 'temperature_or_top_p': {'top_p': 0.755486898036596}, 'n': 27}}, 'config/prompt': 1, 'config/stop': 0, 'config/allow_format_str_template': True, 'config/subspace': {'model': 'text-davinci-003', 'max_tokens': 148, 'temperature_

In [None]:
# Request completions for one tuning example with the tuned configuration,
# then score the returned texts with the assertion-based evaluation function.
example = tune_data[1]
response = autogen.Completion.create(context=example, config_list=endpoint_list, **config)
print(response)
completions = autogen.Completion.extract_text(response)
print(eval_with_generated_assertions(completions, **example))


{
  "id": "cmpl-8RzM4MDLIJgu7h4gGimsNzDZ0ez7r",
  "object": "text_completion",
  "created": 1701681396,
  "model": "text-davinci-003",
  "choices": [
    {
      "text": "    result = []\n    for i in range(len(game)):\n        result.append(abs(game[i] - guess[i]))\n    return result",
      "index": 0,
      "logprobs": null,
      "finish_reason": "stop"
    },
    {
      "text": "    results = []\n    for i in range(len(game)):\n        if game[i] == guess[i]:\n            results.append(0)\n        else:\n            results.append(abs(game[i] - guess[i]))\n    return results",
      "index": 1,
      "logprobs": null,
      "finish_reason": "stop"
    },
    {
      "text": "    results = []\n    for i in range(len(game)):\n        if game[i] == guess[i]:\n            results.append(0)\n        else:\n            results.append(abs(game[i]-guess[i]))\n    return results",
      "index": 2,
      "logprobs": null,
      "finish_reason": "stop"
    },
    {
      "text": "    # de