In [14]:
import requests
import textwrap
from IPython.display import display, Markdown

def long_print(msg: str):
    """Print ``msg`` wrapped to at most 140 columns.

    Whitespace characters already present in ``msg`` (such as newlines) are
    kept as-is rather than being converted to spaces before wrapping.
    """
    wrapper = textwrap.TextWrapper(width=140, replace_whitespace=False)
    print(wrapper.fill(msg))

# Base URL of the deployed service.
# For a locally running instance, use 'http://localhost:80' instead.
api = 'https://6mwjc1d8be.execute-api.us-east-1.amazonaws.com'

# Hit the root endpoint and show the raw status and body as a liveness check.
url = f'{api}/'
response = requests.get(url)
print(f'{response.status_code=}', f'{response.text=}', sep='\n')

response.status_code=200
response.text='{"statusCode":200,"body":"The app is up!"}'


In [15]:
# Ask the service for a new session; its id is reused by the cells that follow.
url = api + '/create_session'
response = requests.get(url)
# Surface HTTP-level failures here instead of as an opaque JSON decode error
# or KeyError on the next line.
response.raise_for_status()
session_id = response.json()['session_id']
print(f'Session Id: {session_id}')

Session Id: c8569d8f-8b0a-4865-85ec-5fa02183f2e8


In [16]:
import httpx
import os

# Documents to fetch. The last path segment of each URL is used as the local
# file stem, with ".pdf" appended.
files = ['https://arxiv.org/pdf/2505.10543']

os.makedirs('./data', exist_ok=True)

for file_url in files:
    # Single quotes inside the f-string keep this valid on Python < 3.12.
    file_name = f"./data/{file_url.split('/')[-1]}.pdf"
    # Skip files that were already downloaded on a previous run.
    if not os.path.exists(file_name):
        # httpx does not follow redirects unless asked to.
        r = httpx.get(file_url, timeout=20, follow_redirects=True)
        # Fail loudly rather than saving an error page under a .pdf name,
        # which the exists-check above would then treat as a valid cache hit.
        r.raise_for_status()
        with open(file_name, 'wb') as pdf_file:
            pdf_file.write(r.content)

In [17]:
# Upload the first downloaded document to the current session.
url = api + '/upload'
file_path = './data/' + files[0].split("/")[-1] + ".pdf"

with open(file_path, 'rb') as pdf_file:
    data = {"session_id": session_id}
    # Use a distinct name for the multipart payload: rebinding `files` here
    # would replace the URL list, and re-running this cell would then fail
    # at `files[0]` above.
    upload_files = {"file": (file_path, pdf_file)}
    response = requests.post(url, data=data, files=upload_files)

if response.status_code == 200:
    print("Image uploaded successfully!")
else:
    print("Error uploading image:", response.status_code)

print(response.json())

Image uploaded successfully!
{'statusCode': 201, 'body': 'File 2505.10543.pdf uploaded.'}


In [18]:
%%time
# Call the /prepare endpoint for this session, with "recreate" set to False.
url = f'{api}/prepare'

data = dict(session_id=session_id, recreate=False)
response = requests.post(url, json=data)

print(response)

<Response [200]>
CPU times: total: 0 ns
Wall time: 3.77 s


In [19]:
# Send a summarization query for the current session.
url = f'{api}/query'

data = dict(session_id=session_id, query="Summarize the document.")
response = requests.post(url, json=data)
# Replace each literal backslash-n sequence in the body with an HTML line
# break before rendering the text as Markdown.
answer_md = response.text.replace("\\n", "<br>")
display(Markdown(answer_md))

"Here’s a concise summary of the document:<br><br>1. Purpose  <br>   Researchers evaluated large language models (LLMs) in dynamic, simulated environments to see how well they plan, reason, and execute tasks.<br><br>2. Key Findings  <br>   • Model size helps: Larger models generally outperform smaller ones.  <br>   • Prompting matters: Strategic prompt engineering (e.g., chain-of-thought, tool use, re-prompting) narrows the gap between small and large models.  <br>   • Advanced methods boost performance but can be unstable or brittle.  <br>   • LLMs still struggle with planning, spatial coordination, and multi-step reasoning.  <br><br>3. Research Directions  <br>   • Beyond static benchmarks: Dynamic tasks better reveal reasoning and planning abilities.  <br>   • Grounded language learning: Aligning language with actions and perceptions in environments.  <br>   • Interactive decision making: Agents learn from feedback and adapt.  <br>   • Efficiency vs. capability: Balancing model size, compute cost, and task performance.  <br><br>4. Benchmarks & Domains Covered  <br>   • Robotics and spatial tasks (e.g., navigation in 3D worlds)  <br>   • Reinforcement learning environments  <br>   • Text-based planning and coding tasks  <br>   • Instruction following with human feedback  <br><br>5. Challenges Identified  <br>   • Scalability of reasoning methods  <br>   • Robustness of planning under uncertainty  <br>   • Measuring real-world generalization in simulated settings  <br><br>Overall, while LLMs show promise for grounded decision-making, significant work remains to improve their planning stability, reasoning depth, and real-world applicability."

In [20]:
# Second query, posted to the same `url` that the previous cell set.
data = dict(
    session_id=session_id,
    query="Describe the test environment used for the experiments.",
)
response = requests.post(url, json=data)
# Replace each literal backslash-n sequence in the body with an HTML line
# break before rendering the text as Markdown.
answer_md = response.text.replace("\\n", "<br>")
display(Markdown(answer_md))

"The experiments were run in four prototypical testbeds, each designed to probe different aspects of sequential decision-making and generalization:<br><br>1. Bandit  <br>   – A classic N-armed bandit problem in which at each round the agent selects one of several “slot‐machine” arms.  <br>   – Each arm yields stochastic payoffs drawn from an unknown distribution.  <br>   – The agent’s goal is to maximize cumulative reward by balancing exploration (trying less-pulled arms) against exploitation (choosing the empirically best arm).<br><br>2. Rock Paper Scissors  <br>   – A two-player zero-sum game where the agent repeatedly chooses Rock, Paper, or Scissors.  <br>   – Payoffs are determined by the standard win/tie/lose payoff matrix.  <br>   – Serves as a simple adversarial setting to study equilibrium learning and adaptation to an opponent’s strategy.<br><br>3. Tower of Hanoi  <br>   – A puzzle environment with three pegs and a stack of disks of varying sizes.  <br>   – The agent moves one disk at a time, never placing a larger disk atop a smaller one, with the objective of transferring the entire stack from the start peg to a target peg.  <br>   – Tests the ability to plan multi‐step, constraint‐satisfying sequences of moves.<br><br>4. Messenger  <br>   – A simulated communication/interaction environment in which the agent must send, receive, or route messages to accomplish specified tasks.  <br>   – Though details vary by experiment, it generally examines how agents learn protocols and handle information exchange under uncertainty."