# Setup

In [2]:
import os
import openai
import requests
 

client = openai.OpenAI(api_key=input())

def llm(prompt, stop=["\n"]):
    response = client.chat.completions.create(
      model="gpt-3.5-turbo",
      messages=[{'role' : 'user', 'content' : prompt}],
      temperature=0,
      max_tokens=100,
      top_p=1,
      frequency_penalty=0.0,
      presence_penalty=0.0,
      stop=stop
    )
    return response.choices[0].message.content

In [3]:
%pip install gym

Note: you may need to restart the kernel to use updated packages.


In [4]:
import wikienv, wrappers
env = wikienv.WikiEnv()
env = wrappers.HotPotQAWrapper(env, split="dev")
env = wrappers.LoggingWrapper(env)

def step(env, action):
    attempts = 0
    while attempts < 10:
        try:
            return env.step(action)
        except requests.exceptions.Timeout:
            attempts += 1

# ReAct

In [5]:
import json
import sys

folder = './prompts/'
prompt_file = 'prompts_naive.json'
with open(folder + prompt_file, 'r') as f:
    prompt_dict = json.load(f)

webthink_examples = prompt_dict['webthink_simple6']
instruction = """Solve a question answering task with interleaving Thought, Action, Observation steps. Thought can reason about the current situation, and Action can be three types: 
(1) Search[entity], which searches the exact entity on Wikipedia and returns the first paragraph if it exists. If not, it will return some similar entities to search.
(2) Lookup[keyword], which returns the next sentence containing keyword in the current passage.
(3) Finish[answer], which returns the answer and finishes the task.
Here are some examples.
"""
webthink_prompt = instruction + webthink_examples
print(webthink_prompt)
def webthink(idx=None, prompt=webthink_prompt, to_print=True):
    question = env.reset(idx=idx)
    question = 'What did the U.S. use to shoot down the Chinese spy balloon?'
    if to_print:
        print(idx, question)
    prompt += question + "\n"
    n_calls, n_badcalls = 0, 0
    for i in range(1, 8):
        n_calls += 1
        thought_action = llm(prompt + f"Thought {i}:", stop=[f"\nObservation {i}:"])
        try:
            thought, action = thought_action.strip().split(f"\nAction {i}: ")
        except:
            print('ohh...', thought_action)
            n_badcalls += 1
            n_calls += 1
            thought = thought_action.strip().split('\n')[0]
            action = llm(prompt + f"Thought {i}: {thought}\nAction {i}:", stop=[f"\n"]).strip()
        obs, r, done, info = step(env, action[0].lower() + action[1:])
        obs = obs.replace('\\n', '')
        step_str = f"Thought {i}: {thought}\nAction {i}: {action}\nObservation {i}: {obs}\n"
        prompt += step_str
        if to_print:
            print(step_str)
        if done:
            break
    if not done:
        obs, r, done, info = step(env, "finish[]")
    if to_print:
        print(info, '\n')
    info.update({'n_calls': n_calls, 'n_badcalls': n_badcalls, 'traj': prompt})
    return r, info

Solve a question answering task with interleaving Thought, Action, Observation steps. Thought can reason about the current situation, and Action can be three types: 
(1) Search[entity], which searches the exact entity on Wikipedia and returns the first paragraph if it exists. If not, it will return some similar entities to search.
(2) Lookup[keyword], which returns the next sentence containing keyword in the current passage.
(3) Finish[answer], which returns the answer and finishes the task.
Here are some examples.

Question: What is the elevation range for the area that the eastern sector of the Colorado orogeny extends into?
Thought 1: I need to search Colorado orogeny, find the area that the eastern sector of the Colorado orogeny extends into, then find the elevation range of the area.
Action 1: Search[Colorado orogeny]
Observation 1: The Colorado orogeny was an episode of mountain building (an orogeny) in Colorado and surrounding areas.
Thought 2: It does not mention the eastern se

In [6]:
import random
import time
from tqdm import tqdm
import json

idxs = list(range(7405))
# random.Random(233).shuffle(idxs)

rs = []
infos = []
old_time = time.time()
ID = 7
for i in tqdm(idxs[ID:ID+1]):
    r, info = webthink(i, to_print=True)
    rs.append(info['em'])
    infos.append(info)
    print(sum(rs), len(rs), sum(rs) / len(rs), (time.time() - old_time) / len(rs))
    print('-----------')
    print()

with open('react_baseline_2.json', 'w') as f:
    json.dump(infos, f)

  0%|          | 0/1 [00:00<?, ?it/s]

7 What did the U.S. use to shoot down the Chinese spy balloon?
Thought 1: I need to search for information on what the U.S. used to shoot down the Chinese spy balloon.
Action 1: Search[U.S. shoot down Chinese spy balloon]
Observation 1: Could not find U.S. shoot down Chinese spy balloon. Similar: ['2023 Chinese balloon incident', '2023 Alaska high-altitude object (redirect from 2023 Alaska shoot down)', 'List of high-altitude object events in 2023 (redirect from 2023 balloon incidents)', '2023 Lake Huron high-altitude object (redirect from Lake Huron shoot down)', 'Chinese intelligence activity abroad'].

Thought 2: I need to search for the specific incident involving the U.S. shooting down a Chinese spy balloon.
Action 2: Search[U.S. shoots down Chinese spy balloon]
Observation 2: Could not find U.S. shoots down Chinese spy balloon. Similar: ['2023 Chinese balloon incident', 'List of high-altitude object events in 2023 (redirect from 2023 balloon incidents)', '2023 Alaska high-altitud

100%|██████████| 1/1 [00:17<00:00, 17.30s/it]

Thought 7: I am unable to find the specific incident involving the U.S. shooting down a Chinese spy balloon. I will need to look for more information or try a different search query to find out what they used. Let me try another search query.
Action 7: Search[U.S. shoots down Chinese spy balloon specific weapon]
Observation 7: Could not find U.S. shoots down Chinese spy balloon specific weapon. Similar: ['2023 Chinese balloon incident', 'China–United States relations', '2016 in aviation', '2022–2023 Pentagon document leaks (redirect from 2023 US classified information leak)', 'History of aerial warfare (section Balloon warfare)'].

OURS: 
CORRECT: 3677 seated
{'steps': 8, 'answer': '', 'gt_answer': '3,677 seated', 'question_idx': 7, 'reward': False, 'em': False, 'f1': 0} 

0 1 0.0 17.3018741607666
-----------




