# Learning Code as Policy for Metaworld


In [1]:
import llfbench
import autogen.trace as trace
from autogen.trace.optimizers import FunctionOptimizer
from llfbench.agents.utils import set_seed
from collections import defaultdict
import copy
import pickle

from autogen.trace.bundles import trace_class


class TracedEnv:
    """Wrap an llfbench environment so that `reset` and `step` are traced.

    Both methods are decorated with `trace.bundle`; `n_outputs` is the number
    of values each call returns (2 for `reset`, 5 for `step`).
    """

    def __init__(self, env_name: str, seed: int = 0) -> None:
        # Build the underlying environment by name and seed it once.
        self.env = llfbench.make(env_name)
        self.env.seed(seed)

    # NOTE(review): trace.bundle presumably uses the docstring as the
    # operator's documentation for the optimizer -- confirm before rewording.
    @trace.bundle(n_outputs=2)
    def reset(self):
        """
        Reset the environment and return the initial observation and info.
        """
        return self.env.reset()  # obs, info

    @trace.bundle(n_outputs=5)
    def step(self, action):
        """
        Take action in the environment and return the next observation, reward, termination, truncation, and info.
        """
        return self.env.step(action)


def user_feedback(obs, action, next_obs):
    """
    Return the feedback text carried by the next observation.

    `obs` and `action` are part of the call signature but are not used here;
    only `next_obs["feedback"].data` is returned.
    """
    # A more verbose message (action taken, previous observation, resulting
    # observation, then the feedback) is possible; the plain feedback text is
    # returned instead.
    feedback_node = next_obs["feedback"]
    return feedback_node.data

In [2]:
### Optimization for single step
def single_step(controller, env, user_feedback, horizon):
    """
    Run one episode, updating the controller after every attempted step.

    Each loop iteration calls the controller on the detached current
    observation and steps the environment. On success, the feedback from
    `user_feedback` is back-propagated from the next observation. On a
    `trace.TraceExecutionError`, the error message is back-propagated from
    the failing node instead (self debugging).

    Args:
        controller: Traced callable mapping an observation to an action;
            its `parameters()` are handed to the optimizer.
        env: Environment whose `reset()` returns `(obs, info)` and whose
            `step(action)` returns `(next_obs, reward, termination,
            truncation, info)`.
        user_feedback: Callable `(obs, action, next_obs) -> feedback`.
        horizon: Maximum number of successful environment steps.

    Returns:
        Tuple `(optimizer, sum_of_rewards)`.
    """
    optimizer = trace.optimizers.FunctionOptimizer(controller.parameters())

    # Initialize the environment
    obs, info = env.reset()
    optimizer.objective = f"{optimizer.default_objective} Hint: {obs['instruction']}"

    # Rollout
    sum_of_rewards = 0
    t = 0  # successful environment steps
    n_updates = 0  # optimizer updates (avoids shadowing the builtin `iter`)
    # NOTE: `t` only advances on success, so a controller that keeps raising
    # is retried (and re-optimized) without any bound.
    while t < horizon:
        try:
            # Detach; otherwise, it would be back-propagated across time.
            action = controller(obs["observation"].detach())
            next_obs, reward, termination, truncation, info = env.step(action)
        except trace.TraceExecutionError as error:
            # Self debugging: feed the error back through the failing node.
            feedback = str(error)
            target = error.exception_node
        else:
            feedback = user_feedback(obs, action, next_obs)  # not traced
            obs = next_obs
            target = next_obs["observation"]
            sum_of_rewards += reward.data  # not traced
            t += 1
            if termination or truncation:
                # The episode ended; no update is made for this final step.
                break

        # Optimization step
        optimizer.zero_feedback()
        optimizer.backward(target, feedback)
        optimizer.step(verbose=True)
        n_updates += 1

    print("Sum of rewards:", sum_of_rewards)
    print("Success:", info.data["success"])
    print("# of optimization iterations:", n_updates)
    print("# of time steps:", t)

    return optimizer, sum_of_rewards

In [3]:
# Run experiment


# Experiment configuration: step budget and the llfbench task to load.
horizon = 40
env_name = "llf-metaworld-pick-place-v2"
env = TracedEnv(env_name, seed=0)

# Kept at module level so the initial controller below can sample from it.
action_space = env.env.action_space


# Initial policy: ignores `obs` and samples a random action. It is declared
# trainable; the function's own source is left untouched here because it
# appears to be the value the optimizer rewrites (see the `__code0`
# suggestions in the logged LLM responses).
@trace.bundle(trainable=True)
def controller(obs):
    """
    The controller takes in an observation and returns an action.
    """
    return action_space.sample()


# Run one episode, optimizing the controller after each step.
single_step(controller, env, user_feedback, horizon)

  logger.warn(
  logger.warn(
  logger.warn(f"{pre} is not within the observation space.")
  logger.warn(
  gym.logger.warn("Casting input x to numpy array.")
  logger.warn(f"{pre} is not within the observation space.")
  logger.warn(


Prompt
 
You're tasked debug and solve a coding/algorithm problem. You will see the code, the documentation of each function used in the code, and the feedback about the code's execution result.

Specifically, a problem will be composed of the following parts:
- #Code: the code whose results you need to improve.
- #Documentation: the documentation of each function used in the code.
- #Variables: the values of the variables that you need to change.
- #Inputs: the values of other inputs to the code
- #Others: the intermediate values created through the code execution.
- #Outputs: the result of the code.
- #Feedback: the feedback about the code's execution result.

In #Variables, #Outputs, and #Others, the format is:
<type> <variable_name> = <value>
You need to change the <value> of the variables in #Variables to improve the code's output in accordance to #Feedback and their data types specified in <type>. If <type> is (code), it means <value> is the source code of a python code, which ma



LLM response:
 {"reasoning": "The feedback suggests that the action [-0.01 0.68 0.12 0. ] is promising, which indicates that the movement vector (the first three components of the action) and the gripper state (the last component of the action) were appropriately calculated for the task at hand. However, to potentially enhance the reward, it is advisable to dynamically compute the 'goal_pos' from the 'goal_pos' field of the input JSON instead of using a hardcoded value. This would allow the controller to adapt to varying goal positions as specified by the input observation. Therefore, the suggested change focuses on extracting the 'goal_pos' from the input JSON and converting it into a numpy array for the computation.", "suggestion": {"__code0": "def controller(obs):\n    import json\n    import numpy as np\n    obs_dict = json.loads(obs)\n    hand_pos = np.array(json.loads(obs_dict['hand_pos'].replace(' ', ',').replace(',,', ',')))\n    goal_pos = np.array(json.loads(obs_dict['goal_po



LLM response:
 {
"reasoning": "The JsonDecodeError is caused by improper formatting of the JSON array strings in the 'hand_pos' and 'goal_pos' fields of the input JSON string. The provided regex operation in the __code0 function definition is intended to transform these space-separated strings into valid, comma-separated JSON arrays, but it seems the execution fails due to a mistake in the regex application or other reasons. A more straightforward approach to replace spaces with commas could be to directly replace space characters with commas without relying on regex, which might not account for all edge cases or could have been misapplied. The suggested code modification directly replaces spaces with commas and ensures no double commas (,,) are created, which might occur in some cases with the original regex approach.",
 "suggestion": {
    "__code0": "def controller(obs):\n    import json\n    import numpy as np\n    obs_dict = json.loads(obs)\n    # Directly replace spaces with comm

(<autogen.trace.optimizers.function_optimizer.FunctionOptimizer at 0x7f542863d580>,
 146.72823307023253)