> NOTE: **The point of this notebook is to show a negligible 'n_badcalls' value (where the model outputs something unparseable) at HotPotQA.**
> **With proper web-search tools, and proper formatting, there'd be a much higher success rate than what you'd get with this notebook. For comparison purposes, I've done my best to maintain the code from the original authors.**

In [1]:
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv())

True

In [3]:
import os
import random
import time

import openai
import requests
from grammarflow import *
from pydantic import BaseModel, Field

# Setup

In [5]:
class LLM:
    """Thin wrapper around the OpenAI chat-completions client.

    The instance is callable: ``llm(prompt)`` sends ``prompt`` as a single
    user message and returns the text of the first choice.
    """

    def __init__(self):
        # The key is read from the environment; a missing key raises KeyError.
        self.client = openai.OpenAI(
            api_key=os.environ["OPENAI_API_KEY"],
        )

    def invoke(self, config: dict):
        """Fill a prompt from ``config`` and send it to the model.

        ``config`` is handed to ``PromptContextManager`` unchanged; the value
        the context manager yields is used as the prompt text.

        Returns:
            The model's reply text, as returned by ``__call__``.
        """
        with PromptContextManager(config) as filled_prompt:
            # Route through __call__: this class defines no `request` method,
            # so the previous `self.request(...)` always raised AttributeError.
            return self(filled_prompt, temperature=0.01)

    def __call__(self, prompt, temperature=0.2, context=None):
        """Send ``prompt`` as one user message and return the reply text.

        Args:
            prompt: Text placed in the ``content`` of the user message.
            temperature: Sampling temperature forwarded to the API.
            context: Accepted for interface compatibility; currently unused.

        Returns:
            ``message.content`` of the first returned choice.
        """
        response = self.client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": prompt}],
            temperature=temperature,
        )
        return response.choices[0].message.content


# Single shared model client; `webthink()` calls it once per reasoning step.
llm = LLM()

In [6]:
import wikienv, wrappers
# Build the environment stack in one expression: the base Wikipedia
# environment is wrapped by the HotPotQA wrapper (dev split), which is in
# turn wrapped by the logging wrapper.
env = wrappers.LoggingWrapper(
    wrappers.HotPotQAWrapper(wikienv.WikiEnv(), split="dev")
)

def step(env, action, max_attempts=10):
    """Call ``env.step(action)``, retrying when the request times out.

    Args:
        env: Object exposing ``step(action)``.
        action: Action passed through to ``env.step`` unchanged.
        max_attempts: How many times to try before giving up (default 10,
            the previously hard-coded limit).

    Returns:
        Whatever ``env.step(action)`` returns on the first attempt that does
        not time out.

    Raises:
        requests.exceptions.Timeout: If every attempt timed out. Previously
            the function fell off the end and returned ``None``, which made
            callers fail later while unpacking the result.
        ValueError: If ``max_attempts`` is less than 1.

    Note:
        ``requests`` must be imported at module level for the ``except``
        clause to resolve.
    """
    last_timeout = None
    for _ in range(max_attempts):
        try:
            return env.step(action)
        except requests.exceptions.Timeout as exc:
            # Keep the exception so it can be re-raised once retries run out.
            last_timeout = exc
    if last_timeout is None:
        raise ValueError("max_attempts must be at least 1")
    raise last_timeout

# Using GrammarFlow

> HotpotQA is a question answering dataset featuring natural, multi-hop questions, with strong supervision for supporting facts to enable more explainable question answering systems.

[ReAct](https://github.com/ysymyth/ReAct) adds a bunch of useful helper functions to their evaluation code for prompting against this dataset. This sample directory cleans up their code and adds GrammarFlow to it. 

All we need to do is: 
1. Create a Step model to constrain our thinking state
2. Create a Prompt for our ReAct flow. Our `make_prompt()` function takes care of it. 
3. Within the original `webthink()` function, we make changes along the lines of: 
  - Tracking and logging all steps between iterations
  - Using `Constrain()` from GrammarFlow to make the extraction of 'Thought', 'Action' and 'Action_Input' pythonic and less error-prone.

Note: Remember that prompts are stateful, and thus need to be discarded after every `Constrain()` block. That's why we use `make_prompt()`

Our goal was to show that we can easily infuse existing prompting code with GrammarFlow! 

# GrammarFlow Setup 

In [7]:
class Step(BaseModel):
    # Schema for one reasoning step requested from the model. All three
    # fields are required strings; the descriptions are part of the schema
    # and are reproduced verbatim.
    thought: str = Field(
        ...,
        description=(
            "Concisely describe your thought process in your current thinking step. "
            "Use your previous step's output to guide your current step."
        ),
    )
    action: str = Field(
        ...,
        description="You only have 3 options: search | lookup | finish",
    )
    action_input: str = Field(
        ...,
        description="Your input to the above action.",
    )

In [8]:
def check_previous_interaction(id_):
    """Return True when the step counter ``id_`` is greater than 1."""
    return id_ > 1

def make_prompt():
    """Build and return a fresh ``PromptBuilder`` for one reasoning step.

    Four sections are added in order: the task/tool description (placeholder
    ``question``), the grammar definition, the history context (placeholder
    ``history``, registered with ``enable_on=check_previous_interaction``),
    and the closing instruction (placeholder ``id_``).
    """
    # The prompt text is kept verbatim, including indentation and the
    # trailing spaces at the end of the `search` and `lookup` lines.
    task_text = """
  {question}

  Your goal is to solve the above QA task in steps. First, think about what information you need to answer the question. You have access to tools as defined below:
  1. `search`: to get the information you need from WIKIPEDIA. This is not a search engine. This needs a single keyword or noun as input. 
  2. `lookup`: to find more information from the paragraph returned by search. This matches the action_input you give to setences in search. 
  3. `finish`: to return your final complete answer as input field. Use this if you believe you have enough information to completely answer the task."""
    history_text = "Use the below Similar: [] keywords for search. Context: \n{history}"
    closing_text = "Create the next [Step] {id_} using the information available above: "

    builder = PromptBuilder()
    builder.add_section(text=task_text, placeholders=["question"])
    builder.add_section(define_grammar=True)
    builder.add_section(
        text=history_text,
        placeholders=["history"],
        enable_on=check_previous_interaction,
    )
    builder.add_section(text=closing_text, placeholders=["id_"])
    return builder

In [9]:
# Example text exposed by the wrappers module; `webthink()` passes it to the
# prompt as the 'example' placeholder.
example = wrappers.EXAMPLE
# Module-level placeholder only: `webthink()` assigns its own local
# `question` from `env.reset()`.
question = "" # Will be defined in later blocks. Re-using ReAct's example for now.

# Helper functions 
def load_history(history: dict):
    """Render the step history as plain text for the prompt.

    ``history`` maps a step id to a dict with the keys ``thought``,
    ``action``, ``action_input`` and ``observation``. Each step becomes four
    lines labelled with its id; steps are joined with newlines in the
    dict's iteration order. An empty history yields an empty string.
    """
    rendered_steps = []
    for step_id, record in history.items():
        rendered_steps.append(
            f"thought {step_id}: {record['thought']}\n"
            f"action {step_id}: {record['action']}\n"
            f"action_input {step_id}: {record['action_input']}\n"
            f"Observation {step_id}: {record['observation']}"
        )
    return "\n".join(rendered_steps)

def log_step(response, observation, id_, to_print=True):
    """Print a framed summary of one step; do nothing when ``to_print`` is false.

    ``response`` must expose ``Step.thought``, ``Step.action`` and
    ``Step.action_input``; it is not touched at all when printing is off.
    """
    if not to_print:
        return

    divider = '--------------------------'
    current = response.Step
    print(divider)
    print(f'Step {id_}')
    labelled_values = (
        ('Thought:', current.thought),
        ('Action:', current.action),
        ('Action Input:', current.action_input),
        ('Observation:', observation),
    )
    for label, value in labelled_values:
        print(label, value)
    print(divider)

In [10]:
def _has_step_fields(candidate):
    """Return True if ``candidate.Step`` exposes thought, action and action_input."""
    step_fields = getattr(candidate, "Step", None)
    return all(
        hasattr(step_fields, name)
        for name in ("thought", "action", "action_input")
    )


def _placeholder_response(raw_text):
    """Wrap unparseable model output so it offers the same ``.Step`` fields as a parsed reply."""
    from types import SimpleNamespace

    return SimpleNamespace(
        Step=SimpleNamespace(thought=str(raw_text), action="", action_input="")
    )


def webthink(idx=None, env=env, to_print=False, max_steps=7):
    """Run one question through the step-by-step reasoning loop.

    For each step a fresh prompt is built, the model is called once, the
    reply is parsed into a ``Step`` and the resulting ``action[action_input]``
    is executed against ``env``. The loop stops when the environment reports
    ``done`` or after ``max_steps`` steps; in the latter case an empty
    ``finish[]`` action is issued to close the episode.

    Args:
        idx: Question index forwarded to ``env.reset``.
        env: Environment to run against (defaults to the module-level ``env``).
        to_print: When true, each step is printed via ``log_step``.
        max_steps: Maximum number of reasoning steps (default 7, the
            previously hard-coded limit).

    Returns:
        ``(r, info)`` from the last environment step, with ``info`` extended
        by ``n_calls`` (model calls made), ``n_badcalls`` (steps whose reply
        could not be parsed or executed) and ``steps`` (steps recorded).
    """
    history = {}
    n_calls, n_badcalls = 0, 0
    done = False

    question = env.reset(idx=idx)
    print(question)

    for id_ in range(1, max_steps + 1):
        # Re-render the history each step; None until a step has been recorded.
        history_ = load_history(history) if history else None

        # Prompts are stateful, so a new one is built for every Constrain block.
        with Constrain(make_prompt()) as manager:
            manager.set_config(
                format='toml'
            )
            manager.format_prompt(
                placeholders={
                    "question": question,
                    'example': example,
                    "history": history_,
                },
                grammars=[{
                    'description': 'Your thinking state:',
                    'model': Step,
                }],
                enable_on={
                    'id_': id_,
                },
            )

            raw_output = llm(manager.prompt, temperature=0.01)
            n_calls += 1

            parsed = None
            try:
                parsed = manager.parse(raw_output)
                observation, r, done, info = step(env, f"{parsed.Step.action}[{parsed.Step.action_input}]")
                observation = observation.replace("\\n", " ")
            except Exception:
                n_badcalls += 1
                observation, done = "Error in parsing. Follow the [Step] format and try again.", False

        # After a failed parse there is no `.Step` to read. Previously the raw
        # string was used as-is and the history update below raised
        # AttributeError, aborting the run on exactly the calls being counted.
        response = parsed if _has_step_fields(parsed) else _placeholder_response(raw_output)

        log_step(response, observation, id_, to_print=to_print)

        history[id_] = {
            "thought": response.Step.thought,
            "action": response.Step.action,
            "action_input": response.Step.action_input,
            "observation": observation,
        }

        print(id_)

        if done:
            break

    if not done:
        # Step budget exhausted without an answer: close the episode.
        observation, r, done, info = step(env, "finish[]")

    # `steps` is the number of steps actually recorded; the old value was the
    # next step id, i.e. one more than the steps taken.
    info.update({'n_calls': n_calls, 'n_badcalls': n_badcalls, 'steps': len(history)})

    return r, info

In [12]:
# Evaluation driver: run `webthink` on a small, reproducible sample of
# questions and report running statistics.
# NOTE(review): 7409 is hard-coded; presumably it should equal the number of
# questions in the dev split — confirm against the dataset, since an index
# past the end would presumably fail in `env.reset`.
idxs = list(range(7409))
# A fixed seed makes the shuffled order (and thus the sample) reproducible.
random.Random(221).shuffle(idxs)

rs = []  # per-question info['em'] values (presumably exact-match scores — confirm)
infos = [] 
bad_calls = 0  # total n_badcalls across all questions
old_time = time.time()
for i in idxs[:4]:
    r, info = webthink(i, to_print=False)
    rs.append(info['em'])
    bad_calls += info['n_badcalls']
    infos.append(info)
    # Running totals: sum of em, questions so far, mean em, mean seconds per question.
    print('RESULTS: ', sum(rs), len(rs), sum(rs) / len(rs), (time.time() - old_time) / len(rs))
    print('--------------------------')
    print()

print('Bad calls: ', bad_calls) 
print(infos)

# 👍