# Learning Code as Policy for Metaworld


In [1]:
import llfbench
import autogen.trace as trace
from autogen.trace.optimizers import FunctionOptimizer
from llfbench.agents.utils import set_seed


# Config
# Upper bound on the number of iterations of the rollout loop further down.
horizon = 20
# Seed handed to both set_seed() and env.seed() below.
seed = 0
# Environment id passed to llfbench.make().
env_name = "llf-metaworld-pick-place-v2"

# Seed first, then build the environment and seed the environment itself too.
set_seed(seed)
env = llfbench.make(env_name)
env.seed(seed)


# Traced wrapper around env.reset(), declared with two outputs so the
# observation and the info can be unpacked separately by the caller.
# NOTE(review): the docstring is presumably what the trace library shows the
# optimizer as this op's documentation, so it is kept verbatim -- confirm.
@trace.trace_op(n_outputs=2)
def reset():
    """
    Reset the environment and return the initial observation and info.
    """
    initial_obs, initial_info = env.reset()
    return initial_obs, initial_info


# Traced wrapper around env.step(), declared with five outputs matching the
# five values returned below.
# NOTE(review): the docstring is presumably what the trace library shows the
# optimizer as this op's documentation, so it must describe the real return
# values (termination and truncation separately, not a single "done").
@trace.trace_op(n_outputs=5)
def step(action):
    """
    Take action in the environment and return the next observation, reward, termination, truncation, and info.
    """
    next_obs, reward, termination, truncation, info = env.step(action)
    return next_obs, reward, termination, truncation, info


# Trainable policy: maps an observation to an action. The initial
# implementation ignores `obs` and samples a random action from
# env.action_space.
# NOTE(review): because this op is trainable, its source (docstring included)
# is presumably the value the optimizer rewrites, so the body is left
# untouched here -- confirm against the trace library.
@trace.trace_op(trainable=True)
def controller(obs):
    """
    The controller takes in an observation and returns an action.
    """
    return env.action_space.sample()


# TODO controller parameters bug
# NOTE(review): the result of this call is discarded; it looks like a
# workaround for the bug flagged in the TODO above -- confirm before removing.
controller.parameters()
# Build the optimizer over the controller's single trainable parameter, using
# the FunctionOptimizer name already imported at the top of the file.
optimizer = FunctionOptimizer([controller.parameter])


# Remember the optimizer's stock objective so the task hint can be appended.
old_objective = optimizer.objective


# Running return of the episode and the initial (traced) observation.
sum_of_rewards = 0
obs, info = reset()

# Append the environment's instruction to the objective as a hint.
# `.data` is used so the hint holds the raw instruction rather than the node
# wrapper, matching how node values are read elsewhere in this file.
objective = f"{old_objective} Hint: {obs['instruction'].data}"
# Install the objective on the optimizer; previously the string was built but
# never used, so the hint never reached the optimizer.
optimizer.objective = objective


# Roll out the controller for at most `horizon` steps, running one optimizer
# update after every step.
for _ in range(horizon):
    obs = obs.detach()  # Need a new node; otherwise, it would be backward multiple times.
    stepped = False  # Set once this iteration's environment transition has completed.
    try:
        action = controller(obs)
        next_obs, reward, termination, truncation, info = step(action)
        assert next_obs["instruction"].data is None  # Make sure the instruction is not changed.
        feedback = f"Taking action {action.data} at observation {obs['observation'].data} gave reward {reward.data} and next observation {next_obs['observation'].data}. Recieved feedback {next_obs['feedback'].data}."
        obs = next_obs
        stepped = True
    except trace.TraceExecutionError as e:
        # A traced call failed; feed the error text back to the optimizer.
        feedback = str(e)

    optimizer.zero_feedback()

    # On success `obs` is the next observation; on failure it is still the
    # detached current observation.
    # NOTE(review): in the failure case `obs` may not depend on the controller,
    # so the feedback might not reach its parameter -- confirm with the
    # trace library's error-handling guidance.
    optimizer.backward(obs, feedback, retain_graph=True)
    optimizer.step(verbose=True)

    if not stepped:
        # No transition happened: reward/termination/truncation are either
        # undefined (first iteration) or left over from the previous step, so
        # do not accumulate the reward or evaluate the stop condition.
        continue

    done = termination or truncation
    sum_of_rewards = reward + sum_of_rewards

    if done:
        break

  logger.warn(
  logger.warn(
  logger.warn(f"{pre} is not within the observation space.")
  logger.warn(
  gym.logger.warn("Casting input x to numpy array.")
  logger.warn(f"{pre} is not within the observation space.")
  logger.warn(


Prompt
 
You're tasked debug and solve a coding/algorithm problem. You will see the code, the documentation of each function used in the code, and the feedback about the code's execution result.

Specifically, a problem will be composed of the following parts:
- #Code: the code whose results you need to improve.
- #Documentation: the documentation of each function used in the code.
- #Variables: the values of the variables that you need to change.
- #Inputs: the values of other inputs to the code
- #Others: the intermediate values created through the code execution.
- #Outputs: the result of the code.
- #Feedback: the feedback about the code's execution result.

In #Variables, #Outputs, and #Others, the format is:
<type> <variable_name> = <value>
You need to change the <value> of the variables in #Variables to improve the code's output in accordance to #Feedback and their data types specified in <type>. If <type> is (code), it means <value> is the source code of a python code, which ma



LLM response:
 {"reasoning": "The provided code and the action proposed by __code0 seem to be aligned with obtaining a positive feedback in the simulation—suggesting the robot to move towards a specific pose with the given action [0.08, 0.67, 0.12, 0.0]. However, the given feedback indicates that even though the robot has moved closer towards the goal, the movement wasn't sufficient as evidenced by the reward of 0.0028221216793436984. This suggests that the action directed the robot towards the goal, but possibly not in the most efficient manner. Since the feedback specifically repeats the advice to take action [0.08, 0.67, 0.12, 0.0], which matches the action already taken, it seems there's no need to alter the action proposed by __code0. The feedback does not indicate a problem with the action itself but rather seems to confirm the chosen direction as correct. Therefore, there appears to be no necessary change to the variables or the code, as the feedback, despite acknowledging progr

In [2]:
# Report the accumulated return, then the success flag from the last info.
episode_return = sum_of_rewards.data
print(f"Total reward: {episode_return}")
task_solved = info['success'].data
print(f"Success: {task_solved}")

Total reward: 146.6986957601801
Success: True
