In [1]:
import requests
import textwrap
from IPython.display import display, Markdown

def long_print(msg: str) -> None:
    """Print ``msg`` wrapped to at most 140 columns, keeping its own line breaks.

    Each line of ``msg`` is wrapped on its own. Feeding text that contains
    newlines to a single ``textwrap.fill`` call makes the wrapper count the
    newline as an ordinary character, so lines following a hard break get cut
    earlier than the requested width.

    Args:
        msg: Text to print; may contain embedded newlines.
    """
    wrapped_lines = [textwrap.fill(line, width=140) for line in msg.splitlines()]
    print('\n'.join(wrapped_lines))

# Base address of the service; the other cells build endpoint URLs from it.
api = 'http://localhost:80'

# Call the root endpoint and show the HTTP status code and the raw body.
url = f'{api}/'
response = requests.get(url)
print(f'{response.status_code=}', f'{response.text=}', sep='\n')

response.status_code=200
response.text='{"statusCode":200,"body":"The app is up!"}'


In [2]:
# Ask the service for a session and keep the returned id for the following cells.
url = f'{api}/create_session'
response = requests.get(url)
session_payload = response.json()
session_id = session_payload['session_id']
print(f'Session Id: {session_id}')

Session Id: 94be4c13-5d21-4078-8bc1-7e7bea8d427b


In [3]:
import httpx
import os

# Remote PDFs to fetch. Each one is cached under ./data, so re-running this
# cell skips anything that is already on disk.
files = ['https://arxiv.org/pdf/2505.10543']

os.makedirs('./data', exist_ok=True)

for file_url in files:
    # The local name is the last URL path segment plus ".pdf". Single quotes
    # inside the f-string keep this valid on Python versions before 3.12.
    file_name = f"./data/{file_url.split('/')[-1]}.pdf"
    if os.path.exists(file_name):
        continue
    r = httpx.get(file_url, timeout=20, follow_redirects=True)
    # Fail loudly instead of saving an error body as a .pdf: the exists()
    # guard above would otherwise keep reusing the bad file on later runs.
    r.raise_for_status()
    with open(file_name, 'wb') as pdf_file:
        pdf_file.write(r.content)

In [4]:
# Upload the first downloaded PDF to the service under the current session.
url = api + '/upload'
file_path = './data/' + files[0].split("/")[-1] + ".pdf"

with open(file_path, 'rb') as image_file:
    data = {"session_id": session_id}
    # The multipart payload gets its own name. Rebinding `files` here would
    # replace the URL list indexed above, and re-running this cell would then
    # fail on `files[0]`.
    upload_files = {"file": (file_path, image_file)}
    response = requests.post(url, data=data, files=upload_files)

if response.status_code == 200:
    print("Image uploaded successfully!")
else:
    print("Error uploading image:", response.status_code)

print(response.json())

Image uploaded successfully!
{'statusCode': 201, 'body': 'File 2505.10543.pdf uploaded.'}


In [5]:
%%time
url = api + '/prepare'

data = {"session_id": session_id, "recreate": False}
response = requests.post(url, json=data)

print(response)

<Response [200]>
CPU times: user 3.88 ms, sys: 2.49 ms, total: 6.37 ms
Wall time: 5.35 s


In [6]:
# Send a summarisation request for the current session to /query.
url = f'{api}/query'
data = {"session_id": session_id, "query": "Summarize the document."}
response = requests.post(url, json=data)

# Swap each literal backslash-n sequence in the body for <br> before rendering
# the text as Markdown.
answer_markup = response.text.replace("\\n", "<br>")
display(Markdown(answer_markup))

"Here’s a concise summary of the document:<br><br>• Goal and scope  <br>  – Evaluates large language models (LLMs) in dynamic environments using prompting techniques.  <br>  – Studies failure modes and investigates enhancements for planning, reasoning, and decision-making tasks.<br><br>• Key findings  <br>  – Larger models generally outperform smaller ones, but strategic prompting (e.g. chain-of-thought) further boosts performance.  <br>  – Advanced reasoning prompts improve results but can introduce instability.  <br>  – LLMs still struggle with tasks requiring multi-step planning, hierarchical reasoning, and long-term strategy.<br><br>• Task modifications and benchmarks  <br>  – Adaptations of classic puzzles (Tower of Hanoi, Messenger) reveal specific failure patterns and areas for improvement.  <br>  – Emphasizes moving beyond static benchmarks to dynamic, interactive tasks that better reflect real-world challenges.<br><br>• Broader context and related work  <br>  – Reviews research on grounded language learning, reinforcement learning, interactive decision making, and symbolic reasoning with LLMs.  <br>  – Highlights the need for novel evaluation frameworks and approaches to address current limitations.<br><br>Overall, while prompting and larger scale help, there remains a significant gap in LLMs’ ability to plan and reason in dynamic, interactive settings."

In [7]:
# Second question for the same session; `url` is whatever an earlier cell left
# it as (the /query endpoint when cells run top to bottom).
data = {"session_id": session_id, "query": "Describe the test environment used for the experiments."}
response = requests.post(url, json=data)

# Swap each literal backslash-n sequence in the body for <br> before rendering
# the text as Markdown.
answer_markup = response.text.replace("\\n", "<br>")
display(Markdown(answer_markup))

"We evaluated our methods across four very different simulated tasks, each designed to probe a distinct aspect of sequential decision‐making and language understanding:<br><br>1. Bandit  <br>   • A classic multi‐armed bandit setup in which the agent repeatedly chooses among a set of “slot machines” (arms) with unknown reward probabilities.  <br>   • The core challenge is to balance exploration (trying out less familiar arms) versus exploitation (pulling the arm believed to yield the highest payoff).<br><br>2. Rock Paper Scissors  <br>   • A repeated zero‐sum game against an adaptive opponent.  <br>   • Success requires (a) following high‐level strategic instructions, (b) learning the opponent’s play patterns over time, and (c) handling long contextual sequences of past moves to plan your next throw.<br><br>3. Tower of Hanoi  <br>   • A three‐rod, three‐disk puzzle where the goal is to transfer the entire stack from one rod to another.  <br>   • The agent must plan a sequence of moves under the hard constraint that no larger disk may ever sit atop a smaller one.  <br>   • This tests spatial reasoning, hierarchical planning, and constraint‐aware decision making.<br><br>4. Messenger  <br>   • A navigation‐and‐delivery task on a grid with obstacles and (sometimes) moving enemies.  <br>   • Instructions are given in natural language and often involve synonyms (e.g., “exit” vs. “way out,” “foe” vs. “enemy”).  <br>   • The agent must parse these linguistic cues, map them onto actionable moves, avoid hazards, and deliver the message to its goal location.<br><br>Together, these four environments span pure exploration‐exploitation, strategic game play, hierarchical spatial planning, and grounded language understanding."