In [1]:
%%time
import pal
import json
import random
import io
import pandas as pd
from pal.prompt import math_prompts, colored_object_prompt, penguin_prompt, date_understanding_prompt, algorithmic_prompt
from tqdm.notebook import tqdm

CPU times: user 1.07 s, sys: 183 ms, total: 1.25 s
Wall time: 1.71 s


In [2]:
create_function_prompt = """What other action should this library have? Remember: First, decide on an appropriate action. Write instructions for this action, by putting it in between INSTRUCTIONS_START and INSTRUCTIONS_END. Then, write python code defining a function that takes in the string 'action' as input and returns the desired output. Only write one additional action. I need only basic mathematical functions that are necessary to solve math problems. If you think there are no further actions that need to be written, write 'FINISHED'. Do not repeat actions."""
question_prompt = """Here are 4 math questions:
(1) Kylar went to the store to buy glasses for his new apartment. One glass costs $5, but every second glass costs only 60% of the price. Kylar wants to buy 16 glasses. How much does he need to pay for them?
(2) James decides to run 3 sprints 3 times a week.  He runs 60 meters each sprint.  How many total meters does he run a week?
(3) Toulouse has twice as many sheep as Charleston. Charleston has 4 times as many sheep as Seattle. How many sheep do Toulouse, Charleston, and Seattle have together if Seattle has 20 sheep?
(4) Carla is downloading a 200 GB file. Normally she can download 2 GB per minute, but 40% of the way through the download, Windows forces a restart to install updates, which takes 20 minutes. Then Carla has to restart the download from the beginning. How long does it take to download the file?
Write actions that you will need to solve these questions. Use the above example of ADD for reference. Remember: First, decide on an appropriate action. There are 4 basic math operations: ADD, SUBTRACT, MULTIPLY, and DIVIDE. For each of these operations, write actions for them. Write instructions for this action, by putting it in between INSTRUCTIONS_START and INSTRUCTIONS_END. Then, write python code defining a function that takes in the string 'action' as input and returns the desired output. Only write one additional action. If you think there are no further actions that need to be written, write 'FINISHED'.
"""
create_function_example = [
    {"role": "user", "content": "I want to create an Arithmetic library that contains actions. First, decide on an appropriate action. Write instructions for this action, by putting it in between INSTRUCTIONS_START and INSTRUCTIONS_END. Then, write python code defining a function that takes in the string 'action' as input and returns the desired output."},
    {"role": "assistant", "content": """I will create an action called ADD, which takes in any amount of numbers, separated by semicolons, and returns their sum.
I first write an instruction for this.
INSTRUCTIONS_START
ADD_START[a1;a2;...;an]ADD_END: ADD takes in n numbers, a1, a2, ..., an, separated by semicolons, and returns their sum. For example, ADD_START[5;3;2]ADD_END would return the sum 5+3+2=10.]
INSTRUCTIONS_END 
Now I write python code to define a function ADD that takes in a string called "action", extracts the expression inside the ADD_START[]ADD_END, and returns the correct value.
```python
def ADD(action):
    expression = action[action.find('ADD_START[') + len('ADD_START['):action.find(']ADD_END')]
    expression = expression.split(';')
    for i in range(len(expression)):
        expression[i] = float(expression[i])
    total_sum = 0
    for number in expression:
        total_sum += number
    return total_sum
```
"""},
    {"role": "user", "content": create_function_prompt}
]
question_example = [
    {"role": "user", "content": "I want to create an Arithmetic library that contains actions. First, decide on an appropriate action. Write instructions for this action, by putting it in between INSTRUCTIONS_START and INSTRUCTIONS_END. Then, write python code defining a function that takes in the string 'action' as input and returns the desired output."},
    {"role": "assistant", "content": """I will create an action called ADD, which takes in any amount of numbers, separated by semicolons, and returns their sum.
I first write an instruction for this.
INSTRUCTIONS_START
ADD_START[a1;a2;...;an]ADD_END: ADD takes in n numbers, a1, a2, ..., an, separated by semicolons, and returns their sum. For example, ADD[5;3;2] would return the sum 5+3+2=10.]
INSTRUCTIONS_END 
Now I write python code to define a function ADD that takes in a string called "action", extracts the expression inside the ADD_START[]ADD_END, and returns the correct value.
```python
def ADD(action):
    expression = action[action.find('ADD_START[') + len('ADD_START['):action.find(']ADD_END')]
    expression = expression.split(';')
    for i in range(len(expression)):
        expression[i] = float(expression[i])
    total_sum = 0
    for number in expression:
        total_sum += number
    return total_sum
```
"""},
    {"role": "user", "content": question_prompt}
]

In [3]:
interface = pal.interface.ProgramInterface(
  model='gpt-4o-mini',
  stop='\n\n\n', # stop generation str for Codex API
  get_answer_expr='solution()' # python expression evaluated after generated code to obtain answer 
)
max_count = 10
libraryFunctions = {
    "ADD": [
        "ADD_START[a1;a2;...;an]ADD_END: ADD takes in n numbers, a1, a2, ..., an, separated by semicolons, and returns their sum. For example, ADD[5;3;2] would return the sum 5+3+2=10.", 
        """
def ADD(action):
    expression = action[action.find('ADD_START[') + len('ADD_START['):action.find(']ADD_END')]
    expression = expression.split(';')
    for i in range(len(expression)):
        expression[i] = float(expression[i])
    total_sum = 0
    for number in expression:
        total_sum += number
    return total_sum""".strip()
    ]
}
all_instructions = ["ADD_START[a1;a2;...;an]ADD_END: ADD takes in n numbers, a1, a2, ..., an, separated by semicolons, and returns their sum. For example, ADD[5;3;2] would return the sum 5+3+2=10."]

for i in range(max_count):
    messages = []
    for message in create_function_example:
        messages.append(message)
    if len(all_instructions) != 0:
        new_prompt = "You have already made these actions:\n"
        for i in range(len(all_instructions)):
            new_prompt += '(' + str(i + 1) + ") "
            new_prompt += all_instructions[i] 
            new_prompt += '\n'
        #print("NEW PROMPT")
        #print(new_prompt)
        #print("---------------------------------------")
        messages.append({"role": "user", "content": new_prompt})
    gens = interface.generate_history(messages=messages, max_tokens=1024)
    output = gens[0]
    print("OUTPUT:")
    print(output)
    print("######################################")
    if "FINISHED" in output:
        break
    curInstruct = output[output.find("INSTRUCTIONS_START") + len("INSTRUCTIONS_START") + 1:output.find("INSTRUCTIONS_END")].strip()
    all_instructions.append(curInstruct)
    pyStart = output.find("```python") + len("```python")
    pyEnd = None
    for i in range(len(output)):
        if output[i:i + 3] == "```":
            pyEnd = i
    pyCode = output[pyStart:pyEnd]
    libraryFunctions[pyCode[pyCode.find("def ") + len("def "):pyCode.find('(')].strip()] = [curInstruct, output[pyStart:pyEnd]]
    messages.append({"role": "assistant", "content": output})
all_instructions.append("FINISH_START[answer]FINISH_END: When you finish a question, write your answer as FINISH_START[answer]FINISH_END and terminate your response.")
"""
json_list = None
max_count = 1
counter = 0
responses = []
with open('datasets/reasoning_about_colored_objects.json', 'r') as json_file:
    json_list = json.load(json_file)
json_list = json_list["examples"]
jsonSamples = random.sample(range(0, len(json_list)), max_count)
print(jsonSamples)
for json_pos in tqdm(jsonSamples, desc="Going through Colored Object questions"):
    result = json_list[json_pos]
    question = result["input"]
    target = result["target_scores"]
    curResponse = {"question": question, "target": target}
    messages = []
"""

OUTPUT:
I will create an action called SUBTRACT, which takes in two numbers and returns their difference.

INSTRUCTIONS_START
SUBTRACT_START[a;b]SUBTRACT_END: SUBTRACT takes in two numbers, a and b, and returns the result of a - b. For example, SUBTRACT_START[10;4]SUBTRACT_END would return 10 - 4 = 6.
INSTRUCTIONS_END

Now I will write Python code to define a function that implements this action.

```python
def SUBTRACT(action):
    expression = action[action.find('SUBTRACT_START[') + len('SUBTRACT_START['):action.find(']SUBTRACT_END')]
    a, b = map(float, expression.split(';'))
    return a - b
```
######################################
OUTPUT:
I will create an action called MULTIPLY, which takes in any amount of numbers, separated by semicolons, and returns their product.

INSTRUCTIONS_START
MULTIPLY_START[a1;a2;...;an]MULTIPLY_END: MULTIPLY takes in n numbers, a1, a2, ..., an, separated by semicolons, and returns their product. For example, MULTIPLY_START[5;3;2]MULTIPLY_END would 

'\njson_list = None\nmax_count = 1\ncounter = 0\nresponses = []\nwith open(\'datasets/reasoning_about_colored_objects.json\', \'r\') as json_file:\n    json_list = json.load(json_file)\njson_list = json_list["examples"]\njsonSamples = random.sample(range(0, len(json_list)), max_count)\nprint(jsonSamples)\nfor json_pos in tqdm(jsonSamples, desc="Going through Colored Object questions"):\n    result = json_list[json_pos]\n    question = result["input"]\n    target = result["target_scores"]\n    curResponse = {"question": question, "target": target}\n    messages = []\n'

In [4]:
for instruct in all_instructions:
    print(instruct)
    print("")

ADD_START[a1;a2;...;an]ADD_END: ADD takes in n numbers, a1, a2, ..., an, separated by semicolons, and returns their sum. For example, ADD[5;3;2] would return the sum 5+3+2=10.

SUBTRACT_START[a;b]SUBTRACT_END: SUBTRACT takes in two numbers, a and b, and returns the result of a - b. For example, SUBTRACT_START[10;4]SUBTRACT_END would return 10 - 4 = 6.

MULTIPLY_START[a1;a2;...;an]MULTIPLY_END: MULTIPLY takes in n numbers, a1, a2, ..., an, separated by semicolons, and returns their product. For example, MULTIPLY_START[5;3;2]MULTIPLY_END would return the product 5*3*2=30.

DIVIDE_START[a;b]DIVIDE_END: DIVIDE takes in two numbers, a and b, and returns the result of a / b. For example, DIVIDE_START[10;2]DIVIDE_END would return 10 / 2 = 5.

POWER_START[base;exponent]POWER_END: POWER takes in two numbers, base and exponent, and returns the result of base raised to the power of exponent. For example, POWER_START[2;3]POWER_END would return 2^3 = 8.

SQRT_START[a]SQRT_END: SQRT takes in one num

In [6]:
for code in libraryFunctions:
    print(code[1])
    print("")

ADD

SUBTRACT

MULTIPLY

DIVIDE

POWER

SQRT



In [7]:
ask_prompt = """Whenever you synthesize information and perform reasoning, it is a Thought. When you need to calculate a value, e.g., 5+3, it is an Action. There are """ + str(len(all_instructions)) + " actions:\n"
for i in range(len(all_instructions)):
    ask_prompt += '(' + str(i + 1) + ") "
    ask_prompt += all_instructions[i] 
    ask_prompt += '\n'
ask_prompt += """Terminate your response after an action. Then, an Observation is returned by the user. IMPORTANT: YOU DO NOT WRITE OBSERVATIONS. THEY WILL BE PROVIDED TO YOU. You will be given a question and you will perform Thoughts and Actions until you get the answer. When you get the answer, you will perform the Action called Finish and write Finish[answer]. While you are reasoning out solutions, you will continually refer back to relevant portions of the question and employ logical reasoning to determine your next action."""

In [45]:
json_list = None
max_count = 100
counter = 0
responses = []
with open('datasets/gsm.jsonl', 'r') as json_file:
    json_list = list(json_file)

jsonSamples = random.sample(range(0, len(json_list)), max_count)
#jsonSamples = [15]
print(jsonSamples)

for json_pos in tqdm(range(len(json_list)), desc="Going through math questions"):
    result = json.loads(json_list[json_pos])
    #question = "Toulouse has twice as many sheep as Charleston. Charleston has 4 times as many sheep as Seattle. How many sheep do Toulouse, Charleston, and Seattle have together if Seattle has 20 sheep?"
    question = result["input"]
    target = result["target"]
    curResponse = {"question": question, "target": target}
    finished = False
    messages = [
        {"role": "system", "content": "You are a helpful assistant that accurately solves mathematical reasoning questions by acting as an agent who will perform Thoughts and Actions."},
        {"role": "user", "content": ask_prompt + '\n' + "Question: What is 1 + 1 + 2?"},
        {"role": "assistant", "content": """Thought: I will first calculate 1 + 1 + 2.
Action: ADD_START[1;1;2]ADD_END"""},
        {"role": "user", "content": "Observation: 4"},
        {"role": "assistant", "content": """Thought: Therefore, 1 + 1 + 2 = 4, so my final answer is 4.
Action: FINISH_START[4]FINISH_END"""},
        {"role": "user", "content": ask_prompt + '\n' + "Question: " + question}
    ]
    numIters = 0
    maxIters = 40
    while not finished:
        if numIters >= maxIters:
            print("#################################################")
            print("Maximum number of iterations reached")
            print("Question " + str(counter))
            print("")
            curResponse["received"] = -987654321
            break
        numIters += 1
        gens = interface.generate_history(messages=messages, max_tokens=1024)
        output = gens[0]
        #print("Output:")
        #print(output)
        #print("")
        messages.append({"role": "assistant", "content": output})
        curAction = output[output.find("Action: ") + len("Action: "):]
        if "FINISH" in curAction:
            finished = True
            expression = curAction[curAction.find("FINISH_START[") + len("FINISH_START["):curAction.find("]FINISH_END")]
            try:
                curResponse["received"] = float(eval(expression))
            except:
                print("#################################################")
                print("Finish Error")
                print("Question " + str(counter))
                print(output)
                curResponse["received"] = -987654321
        else:
            evaled = None
            worked = False
            for function in libraryFunctions.keys():
                if function in curAction:
                    try:
                        exec(libraryFunctions[function][1] + '\n' + "evaled = " + function + '("' + curAction + '")')
                        worked = True
                        break
                    except:
                        pass
            if not worked:
                evaled = "That was not a valid input. Please try again."
            evaled = str(evaled)
            messages.append({"role": "user", "content": "Observation: " + evaled})
            #print("Observation: " + evaled)
    curResponse["messages"] = messages
    responses.append(curResponse)
    counter += 1
numCorrect = 0
for i in range(len(responses)):
    response = responses[i]
    #print("Problem: " + str(i))
    #print("Target: " + str(response["target"]))
    #print("Received: " + str(response["received"]))
    #print("")
    if abs(response["received"] - response["target"]) < 1e-6:
        numCorrect += 1
print(numCorrect)

[1216, 1212, 759, 278, 1044, 334, 975, 812, 511, 785, 186, 333, 670, 1177, 1023, 156, 1233, 1031, 377, 1094, 1294, 523, 1245, 1242, 352, 791, 80, 1156, 1030, 8, 426, 183, 316, 1261, 863, 996, 146, 737, 667, 614, 1250, 842, 1127, 542, 872, 905, 587, 97, 118, 771, 419, 931, 315, 158, 403, 395, 293, 571, 1215, 1264, 948, 646, 1123, 1120, 710, 801, 115, 253, 548, 899, 1007, 387, 447, 125, 577, 978, 733, 510, 672, 201, 647, 101, 436, 446, 519, 22, 736, 553, 1074, 907, 95, 1122, 937, 138, 943, 1296, 618, 480, 772, 178]


Going through math questions:   0%|          | 0/1319 [00:00<?, ?it/s]

#################################################
Maximum number of iterations reached
Question 39

#################################################
Maximum number of iterations reached
Question 144

#################################################
Maximum number of iterations reached
Question 184

#################################################
Maximum number of iterations reached
Question 236

#################################################
Maximum number of iterations reached
Question 585

#################################################
Finish Error
Question 830
Thought: The remaining time is 48 minutes. Therefore, Britany spends a total of 18 hours and 48 minutes on TikTok in a month. 

Now, I will finalize the answer.

Action: FINISH_START[18 hours and 48 minutes]FINISH_END
#################################################
Finish Error
Question 906
Thought: Tim has 6 red shoe boxes left. Now I have the final counts: he has 4 blue shoe boxes and 6 red shoe boxes remaining. 