In [None]:
import requests
import textwrap
from IPython.display import display, Markdown

def long_print(msg: str, width: int = 140) -> None:
    """Print ``msg`` wrapped to at most ``width`` columns, keeping its line breaks.

    Args:
        msg: Text to print. It may contain newlines; each existing line is
            wrapped on its own so the original line breaks survive.
        width: Maximum length of an output line. Defaults to 140.
    """
    # With replace_whitespace=False, textwrap counts an embedded newline as an
    # ordinary character, so wrapping the whole text in one go breaks lines at
    # the wrong places. Wrapping line by line avoids that.
    wrapped_lines = [
        textwrap.fill(line, width=width, replace_whitespace=False)
        for line in msg.splitlines()
    ]
    print("\n".join(wrapped_lines))

# Base URL of the API. The commented-out line is an alternative (localhost)
# target; uncomment it and comment out the other one to switch.
#api = 'http://localhost:80'
api = 'https://3u0mxm98r9.execute-api.us-east-1.amazonaws.com'  # Your API endpoint here.

# Health check: request the root route and echo the status code and raw body.
# requests waits indefinitely by default, so an explicit timeout keeps the
# cell from hanging the kernel when the endpoint is unreachable.
url = api + '/'
response = requests.get(url, timeout=30)
print(f'{response.status_code=}')
print(f'{response.text=}')

response.status_code=200
response.text='{"statusCode":200,"body":"The app is up!"}'


In [16]:
# Create a session; the returned id is sent along by the upload, prepare and
# query cells below.
url = api + '/create_session'
response = requests.get(url, timeout=30)
# Surface HTTP failures as a clear error here rather than as a KeyError or
# JSON decoding error on the next line.
response.raise_for_status()
session_id = response.json()['session_id']
print(f'Session Id: {session_id}')

Session Id: 0cac2d73-f7e2-4c37-a884-4ded2278d7a2


In [17]:
import httpx
import os

# Documents to fetch; each is stored as ./data/<last URL segment>.pdf.
files = ['https://arxiv.org/pdf/2505.10543']

os.makedirs('./data', exist_ok=True)

for file_url in files:
    # Build the name outside the f-string: reusing the same quote character
    # inside an f-string expression is a SyntaxError before Python 3.12.
    doc_name = file_url.split('/')[-1]
    file_name = f"./data/{doc_name}.pdf"
    # Skip the download when the file is already cached locally.
    if not os.path.exists(file_name):
        r = httpx.get(file_url, timeout=20)
        # Do not cache an error or redirect body as if it were the PDF; the
        # exists() check above would otherwise keep the bad file forever.
        r.raise_for_status()
        # A separate name for the handle keeps the loop variable intact.
        with open(file_name, 'wb') as pdf_file:
            pdf_file.write(r.content)

In [None]:
# Upload the first downloaded document to the service for this session.
url = api + '/upload'
file_path = './data/' + files[0].split("/")[-1] + ".pdf"

with open(file_path, 'rb') as doc_file:
    data = {"session_id": session_id}
    # Use a distinct name for the multipart payload: rebinding `files` would
    # replace the URL list, and re-running this cell would then fail on
    # `files[0]`.
    upload_files = {"file": (file_path, doc_file)}
    response = requests.post(url, data=data, files=upload_files)

if response.status_code == 200:
    print("Document uploaded successfully!")
else:
    print("Error uploading document:", response.status_code)

print(response.json())

In [19]:
%%time
url = api + '/prepare'

data = {"session_id": session_id, "recreate": False}
response = requests.post(url, json=data)

print(response)

<Response [200]>
CPU times: total: 15.6 ms
Wall time: 1.99 s


In [20]:
# Endpoint shared by the query cells that follow.
url = f'{api}/query'

In [None]:
# Ask the service for a summary of the session's document.
question = "Summarize the document."
data = {"session_id": session_id, "query": question}
response = requests.post(url, json=data)

# Replace each literal backslash-n sequence in the raw body with <br> so the
# line breaks show up when the text is rendered as Markdown.
answer_markup = response.text.replace("\\n", "<br>")
display(Markdown(answer_markup))

"Here’s a concise summary of the document:<br><br>1. Purpose  <br>   Researchers evaluated large language models (LLMs) in dynamic, simulated environments to see how well they plan, reason, and execute tasks.<br><br>2. Key Findings  <br>   • Model size helps: Larger models generally outperform smaller ones.  <br>   • Prompting matters: Strategic prompt engineering (e.g., chain-of-thought, tool use, re-prompting) narrows the gap between small and large models.  <br>   • Advanced methods boost performance but can be unstable or brittle.  <br>   • LLMs still struggle with planning, spatial coordination, and multi-step reasoning.  <br><br>3. Research Directions  <br>   • Beyond static benchmarks: Dynamic tasks better reveal reasoning and planning abilities.  <br>   • Grounded language learning: Aligning language with actions and perceptions in environments.  <br>   • Interactive decision making: Agents learn from feedback and adapt.  <br>   • Efficiency vs. capability: Balancing model size, compute cost, and task performance.  <br><br>4. Benchmarks & Domains Covered  <br>   • Robotics and spatial tasks (e.g., navigation in 3D worlds)  <br>   • Reinforcement learning environments  <br>   • Text-based planning and coding tasks  <br>   • Instruction following with human feedback  <br><br>5. Challenges Identified  <br>   • Scalability of reasoning methods  <br>   • Robustness of planning under uncertainty  <br>   • Measuring real-world generalization in simulated settings  <br><br>Overall, while LLMs show promise for grounded decision-making, significant work remains to improve their planning stability, reasoning depth, and real-world applicability."

In [9]:
# Ask the service about the test environment described in the document.
question = "Describe the test environment used for the experiments."
data = {"session_id": session_id, "query": question}
response = requests.post(url, json=data)

# Replace each literal backslash-n sequence in the raw body with <br> so the
# line breaks show up when the text is rendered as Markdown.
answer_markup = response.text.replace("\\n", "<br>")
display(Markdown(answer_markup))

"The paper evaluates prompting strategies across four test environments:<br><br>1. Bandit  <br>   - Standard multi-armed bandit setup.  <br>   - Tasks are simple reward-maximization problems.  <br>   - Complex prompting hurts performance (especially for smaller models) due to “over-thinking.”  <br><br>2. Rock Paper Scissors  <br>   - Repeated plays against an (implicit) opponent.  <br>   - Larger models excel when given planning-style prompts, learning to adapt to opponent patterns.  <br>   - Smaller models succeed occasionally but fail to plan and adjust consistently.  <br><br>3. Tower of Hanoi  <br>   - Classic puzzle: move disks between rods without placing larger on smaller.  <br>   - Tests recursive planning and spatial reasoning.  <br>   - Agents sometimes propose invalid moves. Variants include:  <br>     • Two-disk simplification  <br>     • Providing valid-action hints  <br>     • Reward shaping for incremental progress  <br><br>4. Messenger  <br>   - Grid-world where an agent reads text instructions (with synonyms), avoids enemies, and delivers a message.  <br>   - Success measured by picking up the message, reaching the goal, and not colliding with enemies."