# Setup

In [23]:
import os

In [24]:
import os
import openai
import requests
from openai import OpenAI
openai.api_key = os.environ["OPENAI_API_KEY"]
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

def llm(prompt, stop=["\n"]):
    response = client.chat.completions.create(
      model="gpt-4o",
      messages=[
        {
            "role": "user",
            "content": prompt,
        }
      ],
      temperature=0,
      max_tokens=100,
      top_p=1,
      frequency_penalty=0.0,
      presence_penalty=0.0,
      stop=stop
    )
    return response.choices[0].message.content

def llm_first_call(prompt,image, stop=["\n"]):
  response = client.chat.completions.create(
    model="gpt-4o",
    messages= [
      {
        "role": "user",
        "content": [
          {
            "type": "text",
            "text": f'{prompt}'
          },
          {
            "type": "image_url",
            "image_url": {
              "url": f"data:image/jpeg;base64,{image}"
            }
          }
        ]
      }
    ],
    temperature=0,
    max_tokens=100,
    top_p=1,
    frequency_penalty=0.0,
    presence_penalty=0.0,
    stop=["\n"]
  )
  return response.choices[0].message.content

In [25]:
import wikienv, wrappers
env = wikienv.WikiEnv()
env = wrappers.HotPotQAWrapperImage(env, split="repurposed")
env = wrappers.LoggingWrapper(env)

def step(env, action):
    attempts = 0
    while attempts < 10:
        try:
            return env.step(action)
        except requests.exceptions.Timeout:
            attempts += 1

In [26]:
import json
import base64

folder = ''
file_name = 'aokvqa_repurposed.json'
with open(folder + file_name, 'r') as f:
    data = json.load(f)

first_entry = data[2]
question = first_entry['question']
image = first_entry['context']
try:
    base64.b64decode(image)
except base64.binascii.Error:
    print("Invalid base64 encoded image")
    exit()
prompt = f"Question: {question}"

response = client.chat.completions.create(
  model="gpt-4o",
  messages= [
    {
      "role": "user",
      "content": [
        {
          "type": "text",
          "text": f'{prompt}'
        },
        {
          "type": "image_url",
          "image_url": {
            "url": f"data:image/jpeg;base64,{image}"
          }
        }
      ]
    }
  ],
  temperature=0,
  max_tokens=100,
  top_p=1,
  frequency_penalty=0.0,
  presence_penalty=0.0,
  stop=["\n"]
)

answer = response.choices[0].message.content
print(answer)


FileNotFoundError: [Errno 2] No such file or directory: 'aokvqa_repurposed.json'

# ReAct

In [27]:
import json
import sys

folder = './prompts/'
prompt_file = 'prompts_naive.json'
with open(folder + prompt_file, 'r') as f:
    prompt_dict = json.load(f)

webthink_examples = prompt_dict['webthink_simple_image']
instruction = """Solve a question answering task, by looking at the given image, doing an ImageObserve, with interleaving Thought, Action, Observation steps. Use the provided image as context required to solve the question. Thought can reason about the current situation, and Action can be three types: 
(1) Search[entity], which searches the exact entity on Wikipedia and returns the first paragraph if it exists. If not, it will return some similar entities to search.
(2) Lookup[keyword], which returns the next sentence containing keyword in the current passage.
(3) Finish[answer], which returns the answer and finishes the task.
Here are some examples.
"""
webthink_prompt = instruction + webthink_examples

def webthink(idx=None, prompt=webthink_prompt, to_print=True):
    (question, info) = env.reset(idx=idx, return_info=True)
    image = info['image']
    if to_print:
        print(idx, question)
    prompt += question + "\n"
    n_calls, n_badcalls = 0, 0
    for i in range(1, 8):
        n_calls += 1
        if i==1:
            thought_action = llm_first_call(prompt + f"Thought {i}:", image=image, stop=[f"\nObservation {i}:"])
        else:
            thought_action = llm(prompt + f"Thought {i}:", stop=[f"\nObservation {i}:"])
        try:
            thought, action = thought_action.strip().split(f"\nAction {i}: ")
        except:
            print('ohh...', thought_action)
            n_badcalls += 1
            n_calls += 1
            thought = thought_action.strip().split('\n')[0]
            action = llm(prompt + f"Thought {i}: {thought}\nAction {i}:", stop=[f"\n"]).strip()
        obs, r, done, info = step(env, action[0].lower() + action[1:])
        obs = obs.replace('\\n', '')
        step_str = f"Thought {i}: {thought}\nAction {i}: {action}\nObservation {i}: {obs}\n"
        prompt += step_str
        if to_print:
            print(step_str)
        if done:
            break
    if not done:
        obs, r, done, info = step(env, "finish[]")

    info.update({'n_calls': n_calls, 'n_badcalls': n_badcalls, 'traj': prompt})
    return r, info

In [28]:
import random
import time
idxs = list(range(30))
random.Random(233).shuffle(idxs)

rs = []
infos = []
old_time = time.time()
for i in idxs[:10]:
    r, info = webthink(i, to_print=True)
    rs.append(info['em'])
    infos.append(info)
    print(sum(rs), len(rs), sum(rs) / len(rs), (time.time() - old_time) / len(rs))
    print('-----------')
    print()

13 Question: What animals are these?
ohh... The image shows a group of horses grazing on a hillside.
Thought 1: The image shows a group of horses grazing on a hillside.
Action 1: Finish[horses]
Observation 1: Episode finished, reward = 0


0 1 0.0 2.2608985900878906
-----------

11 Question: In which country is this bus located?
ohh... The bus in the image is a red double-decker bus with the destination "Gallions Reach" and route number "101." This type of bus is commonly associated with the United Kingdom, specifically London.
Thought 1: The bus in the image is a red double-decker bus with the destination "Gallions Reach" and route number "101." This type of bus is commonly associated with the United Kingdom, specifically London.
Action 1: Search[Gallions Reach]
Observation 1: Gallions Reach is a stretch of the River Thames between Woolwich and Thamesmead.[1] The area is named for the Galyons, a 14th-century family who owned property along this stretch of the river,[2] and places, inc

KeyboardInterrupt: 