In [5]:
import getpass
import glob
import itertools
import json
import os
import random
import re
from multiprocessing import Pool

import openai
import torch
from tqdm import tqdm
# Prefer the OPENAI_API_KEY environment variable; otherwise prompt with getpass so the
# key is not echoed into the cell output (and therefore not saved with the notebook).
openai.api_key = os.environ.get("OPENAI_API_KEY") or getpass.getpass("Enter your OpenAI API key: ")

In [6]:
def _retry_openai_call(request, attempts=3):
    """Call ``request()`` up to ``attempts`` times and return its first successful result.

    Raises:
        RuntimeError: if every attempt raised; the last error is attached as ``__cause__``.
    """
    last_error = None
    for _ in range(attempts):
        try:
            return request()
        # Catch Exception (not a bare except) so KeyboardInterrupt / kernel interrupts
        # still stop the cell instead of being retried.
        except Exception as error:
            last_error = error
    raise RuntimeError("Failed to generate") from last_error


def generate(prompt, max_tokens=256, temperature=0.0, model="gpt-3.5-turbo"):
    """Return the text the OpenAI model ``model`` generates for ``prompt``.

    "gpt-3.5-turbo" and "gpt-4" go through the chat completion API with ``prompt`` as a
    single user message; any other model name goes through the plain completion API.

    Args:
        prompt: Prompt text sent to the model.
        max_tokens: Generation limit; capped at 1024 on the completion-API path.
        temperature: Sampling temperature passed through to the API.
        model: OpenAI model name.

    Returns:
        The generated text of the first choice.

    Raises:
        RuntimeError: with message "Failed to generate" if three consecutive attempts
            fail, on either API path.
    """
    if model in ("gpt-3.5-turbo", "gpt-4"):
        chat_params = {
            "model": model,
            "max_tokens": max_tokens,
            "temperature": temperature,
            "messages": [{"role": "user", "content": prompt}],
        }
        return _retry_openai_call(
            lambda: openai.ChatCompletion.create(**chat_params)["choices"][0]["message"]["content"]
        )

    # Other models use the completion API; the request is capped at 1024 tokens.
    completion_params = {
        "model": model,
        "max_tokens": min(max_tokens, 1024),
        "temperature": temperature,
        "prompt": prompt,
    }
    return _retry_openai_call(
        lambda: openai.Completion.create(**completion_params)["choices"][0]["text"]
    )

In [7]:
def get_task(task, data_dir="bbh", train_size=5, val_size=5):
    """Load ``{data_dir}/{task}.json`` and split its examples into train/val/test.

    The file must contain an ``"examples"`` list whose items have ``"input"`` and
    ``"target"`` keys. Examples are split in file order: the first ``train_size`` go to
    train, the next ``val_size`` to val, and the remainder to test.

    Args:
        task: Task name, used as the JSON file stem.
        data_dir: Directory containing the task files.
        train_size: Number of leading examples placed in the train split.
        val_size: Number of following examples placed in the val split.

    Returns:
        ``(train, val, test)``: three lists of ``{"question": ..., "answer": ...}`` dicts.

    Raises:
        ValueError: if a ``dyck_languages`` example has no ``"Input: "`` marker.
    """
    with open(os.path.join(data_dir, f"{task}.json"), encoding="utf-8") as f:
        data = json.load(f)

    # For the dyck languages task, remove the spaces in the bracket sequence (and in the
    # target) to avoid unnecessary issues with tokenization.
    if task == "dyck_languages":
        for example in data["examples"]:
            # Split on the LAST marker so a description that itself mentions "Input: "
            # does not break the parse; only the trailing sequence is compacted.
            desc, marker, sequence = example["input"].rpartition("Input: ")
            if not marker:
                raise ValueError(
                    f"dyck_languages example has no 'Input: ' marker: {example['input']!r}"
                )
            example["input"] = f"{desc}{marker}{sequence.replace(' ', '')}"
            example["target"] = example["target"].replace(" ", "")

    samples = [
        {"question": example["input"], "answer": example["target"]}
        for example in data["examples"]
    ]
    val_end = train_size + val_size
    return samples[:train_size], samples[train_size:val_end], samples[val_end:]

In [8]:
# Instruction appended after the training question/answer pairs in `tool_making`:
# asks the model for a generic Python function wrapped in a ```python fence, which is
# the fence `tool_making` extracts with its regex.
tool_maker_prompt = """Please write a generic Python function to solve this type of problems using only standard python libraries. The output of the function can later be converted to the answer (option for multiple choice question). All the function should be wrapped by
```python
```"""

# Instruction appended after the validation question/answer pairs in `tool_making`:
# asks for unit tests that call the function and assert on the provided answers.
# The {...} spans are literal template hints for the model; nothing in the visible code
# passes this string through str.format.
tool_verification_prompt = """Write unit tests to verify the correctness of the function on the questions above using the following format:
```python
{parse the question into the arguments of the function}
{call the function and save the return value in a variable named `ret`}
{for multiple choice question, parse the options}
{convert the return value `ret` to the answer (if the question is a multiple choice question, convert to an option) and save it in a variable named `ans`, otherwise}
{assert ans == the provided answer (if the question is a multiple choice question, assert ans == option)}
```"""

# Sent by `tool_making` only after verification succeeded: asks the model to summarize
# the function plus one worked use case per verified question. As above, the {...}
# spans are literal text shown to the model.
tool_wrapper_prompt = """Success! The function is correct. We will need to summarize the function and use cases up for further use. Please extract the information from the history in the following format:

Here is a function to solve a class of problems:
```python
{the function, including necessary imports}
```

Use cases:
Question: {question (including options)}
Solution:
```python
{parse the question into the arguments of the function}
{call the function and save the return value in a variable named `ret`}
{for multiple choice question, parse the options}
{convert the return value `ret` to the answer (if the question is a multiple choice question, convert to an option) and save it in a variable named `ans`, otherwise}
```
Do this for all the questions in the verification step.
"""


def _extract_python_blocks(text):
    """Return the bodies of all ```python fenced blocks in ``text``, joined by blank lines."""
    return "\n\n".join(re.findall(r"```python\n(.*?)```", text, re.DOTALL))


def _chat_reply(model, temperature, message):
    """Send the conversation ``message`` to the chat API and return the reply text."""
    params = {
        "model": model,
        "max_tokens": 2048,
        "temperature": temperature,
        "messages": message,
    }
    return openai.ChatCompletion.create(**params)["choices"][0]["message"]["content"]


def tool_making(train, val, train_samples=3, val_samples=3, model="gpt-4", temperature=0.3):
    """Ask a chat model to write, verify and summarize a Python tool for a task.

    The conversation runs in three stages:
      1. Tool generation: show ``train_samples`` training examples and ask for a
         function; the returned code is executed, with up to 3 attempts and the error
         fed back to the model after each failure.
      2. Verification: show ``val_samples`` validation examples and ask for unit tests;
         tool + tests are executed together, again with up to 3 attempts.
      3. Wrapping: only if verification passed, ask for a summary of the tool and its
         use cases.

    If stage 1 never produces code that executes, the function returns early: running
    the tests against a tool that cannot execute could not succeed.

    NOTE(security): the model-generated code is run with ``exec`` in this notebook's
    global namespace. Only run this in an environment where that is acceptable.

    Args:
        train: List of ``{"question", "answer"}`` dicts used to prompt for the tool.
        val: List of ``{"question", "answer"}`` dicts used to prompt for the tests.
        train_samples: How many leading ``train`` items to show.
        val_samples: How many leading ``val`` items to show.
        model: Chat model name.
        temperature: Sampling temperature.

    Returns:
        ``(tool, verification, success, message)``: the extracted tool code, the
        extracted test code (``""`` if verification was never reached), whether the
        tests passed, and the full chat history as a list of role/content dicts.
    """
    prompt1 = "\n\n".join([f"Question: {sample['question']}\nAnswer: {sample['answer']}" for sample in train[:train_samples]]) + "\n\n" + tool_maker_prompt
    message = [{"role": "user", "content": prompt1}]

    # Bound up front so every return path has defined values, even if all API calls fail.
    tool = ""
    verification = ""
    success = False

    # Stage 1: tool generation.
    tool_ready = False
    for _ in range(3):
        try:
            response = _chat_reply(model, temperature, message)
            message.append({"role": "assistant", "content": response})
            tool = _extract_python_blocks(response)
            if not tool:
                # exec("") would "succeed" and hide the fact that no tool was produced.
                raise ValueError("the response did not contain a ```python code block")
            exec(tool, globals())
            tool_ready = True
            break
        except Exception as e:
            print("ERROR: failed to generate tool", e)
            message.append({"role": "user", "content": f"Failed to execute the function due to the error: {type(e).__name__} {e}. Please fix it and try again."})

    if not tool_ready:
        print("ERROR: giving up on tool generation after 3 attempts")
        return tool, verification, success, message

    # The assistant reply is already in `message` (appended inside the loop); it must
    # not be appended a second time or the saved history contains a duplicate turn.
    print("Tool:", response)

    # Stage 2: verification.
    prompt2 = "\n\n".join([f"Question: {sample['question']}\nAnswer: {sample['answer']}" for sample in val[:val_samples]]) + "\n\n" + tool_verification_prompt
    message.append({"role": "user", "content": prompt2})

    for _ in range(3):
        try:
            response = _chat_reply(model, temperature, message)
            message.append({"role": "assistant", "content": response})
            verification = _extract_python_blocks(response)
            if not verification:
                # Without any test code, executing the tool alone would pass vacuously.
                raise ValueError("the response did not contain a ```python code block")
            exec(tool + "\n" + verification, globals())
            success = True
            break
        except Exception as e:
            print("ERROR: failed to verify", e)
            message.append({"role": "user", "content": f"Failed to verify the function due to the error: {type(e).__name__} {e}. Please fix it and try again."})

    print("Verification:", message[-1]["content"])

    # Stage 3: wrapping (best effort; a failure here does not change `success`).
    if success:
        message.append({"role": "user", "content": tool_wrapper_prompt})
        try:
            response = _chat_reply(model, temperature, message)
            message.append({"role": "assistant", "content": response})
            print("Wrapper:", response)
        except Exception as e:
            print("ERROR: failed to generate wrapper", e)
    return tool, verification, success, message

In [None]:
# Task to build a tool for. Other task names that can be selected here:
#   "tracking_shuffled_objects_five_objects"
#   "tracking_shuffled_objects_seven_objects"
#   "logical_deduction_five_objects"
#   "logical_deduction_seven_objects"
#   "dyck_languages"
#   "word_sorting"
#   "chinese_remainder_theorem"
#   "schedule_meeting"
task = "schedule_meeting"
model = "gpt-4"

# Load the task's examples, already split into train / val / test.
train, val, test = get_task(task)

# Run tool generation, verification and wrapping; `message` holds the whole chat.
tool, verification, success, message = tool_making(
    train,
    val,
    train_samples=3,
    val_samples=3,
    model=model,
)

In [140]:
# Persist the full tool-making conversation as JSON under tools/.
# Create the directory first so a fresh checkout does not fail, and use a context
# manager so the file is flushed and closed deterministically.
os.makedirs("tools", exist_ok=True)
with open(f"tools/{task}.json", "w") as f:
    json.dump(message, f)