# Learning Code as Policy for Metaworld

This notebook shows a basic example of using a Trace optimizer to follow language feedback online, while interacting with an environment. We wrap the Gym environment so that its `reset` and `step` calls are traceable, and then run one optimizer update at every time step.

In [1]:
# Experiment configuration (read by the cells below)

env_name = "llf-metaworld-pick-place-v2"  # environment id handed to llfbench.make
horizon = 30  # maximum number of environment steps per rollout
seed = 0  # seed for Python/NumPy RNGs, the env reset, and the action space
stepsize = 1  # forwarded to the optimizer as `stepsize`

In [2]:
import llfbench
import autogen.trace as trace
from autogen.trace.optimizers import FunctionOptimizer
import random
import numpy as np


class TracedEnv:
    """
    Thin wrapper around an `llfbench` environment whose `reset` and `step`
    are decorated with `trace.bundle()` so their results can be traced.

    The wrapped environment stays reachable as `self.env` for untraced access.
    """

    def __init__(self, env_name, seed=0):
        """
        Create and seed the underlying environment.

        Args:
            env_name: environment id passed to `llfbench.make`.
            seed: seed applied to `random`, `numpy.random`, the environment
                reset, and the environment's action space.
        """
        # Seed the global RNGs before the environment is built.
        random.seed(seed)
        np.random.seed(seed)
        self.env = llfbench.make(env_name)
        # Seeded reset at construction time; later `reset()` calls pass no seed.
        self.env.reset(seed=seed)
        # Presumably makes `action_space.sample()` reproducible — confirm against the Gym API.
        self.env.action_space.seed(seed)

    # NOTE(review): the docstrings of the bundled methods below may be consumed by
    # `trace.bundle` at runtime (e.g. as operator descriptions) — left unchanged on purpose.
    @trace.bundle()
    def reset(self):
        """
        Reset the environment and return the initial observation and info.
        """
        return self.env.reset()  # obs, info

    @trace.bundle()
    def step(self, action):
        """
        Take action in the environment and return the next observation, reward, termination, truncation, and info.
        """
        return self.env.step(action)


def user_feedback(obs, action, next_obs):
    """
    Build the textual feedback for one transition.

    Args:
        obs: mapping whose "observation" entry exposes `.data` (state before the action).
        action: object exposing `.data` (the action that was taken).
        next_obs: mapping whose "observation" and "feedback" entries expose `.data`.

    Returns:
        A single sentence-style string describing the transition and the
        feedback attached to the next observation.
    """
    taken = action.data
    before = obs["observation"].data
    after = next_obs["observation"].data
    hint = next_obs["feedback"].data
    # NOTE: the wording (including the "Recieved" spelling) is kept verbatim so
    # the text handed to the optimizer does not change.
    return (
        f"Taking action {taken} at observation {before} "
        f"resulted in next observation {after}. "
        f"Recieved feedback {hint}."
    )

In [3]:
def expert_run(env, horizon):
    """
    Roll out the environment for at most `horizon` steps using its expert action.

    The first action is sampled from the action space; every later action is
    whatever `env.env.expert_action` held after the previous step.

    Args:
        env: traced environment wrapper exposing `reset()`, `step(action)` and
            the raw environment as `env.env`.
        horizon: maximum number of environment steps. With `horizon <= 0` no
            step is taken and the default status values are returned.

    Returns:
        Tuple `(sum_of_rewards, success, termination, truncation)` of plain
        (untraced) values taken from the last executed step.
    """

    # Initialize the environment
    obs, info = env.reset()

    # Episode status as plain values. Initialised up front so that the summary
    # below is well defined even when the loop body never runs.
    sum_of_rewards = 0
    success = False
    terminated = False
    truncated = False

    # Rollout
    t = 0
    expert_action = None
    while t < horizon:
        # No expert action is available before the first step: sample one.
        action = env.env.action_space.sample() if expert_action is None else expert_action
        next_obs, reward, termination, truncation, info = env.step(action)
        expert_action = env.env.expert_action

        sum_of_rewards += reward.data  # not traced
        success = info.data["success"]
        terminated = termination.data
        truncated = truncation.data
        t += 1
        if termination or truncation or success:
            break

    print("Sum of rewards:", sum_of_rewards)
    print("Success:", success)
    print("Termination:", terminated)
    print("Truncation:", truncated)
    print("# of time steps:", t)

    return sum_of_rewards, success, terminated, truncated


# Baseline: roll out the expert on a freshly seeded environment and keep the
# summary values as notebook-level variables.
env = TracedEnv(env_name, seed=seed)
sum_of_rewards, success, termination, truncation = expert_run(env, horizon=horizon)

  logger.warn(
  logger.warn(f"{pre} is not within the observation space.")
  logger.warn(
  gym.logger.warn("Casting input x to numpy array.")
  logger.warn(f"{pre} is not within the observation space.")
  logger.warn(
  logger.warn(


Sum of rewards: 16.13824992438523
Success: True
Termination: False
Truncation: False
# of time steps: 7


In [4]:
def single_step(env, horizon, user_feedback, controller, optimizer, max_iter=None):
    """
    Run optimizer step for every time step.

    Each iteration calls `controller` on the current observation and steps the
    environment. On success the transition is turned into feedback via
    `user_feedback`; if a `trace.TraceExecutionError` is raised, the error
    text becomes the feedback instead ("self debugging") and the time step is
    not advanced. The feedback is then back-propagated and the optimizer
    updates the controller.

    Args:
        env: traced environment wrapper exposing `reset()` and `step(action)`.
        horizon: maximum number of successful environment steps.
        user_feedback: callable `(obs, action, next_obs) -> str`.
        controller: trainable callable mapping an observation to an action;
            `controller.parameter.data` is printed after every update.
        optimizer: object providing `zero_feedback()`, `backward(target, feedback)`
            and `step(verbose=...)`.
        max_iter: maximum number of optimization iterations. Any falsy value
            (`None` or `0`) falls back to `horizon * 2`.

    Returns:
        Tuple `(sum_of_rewards, success, termination, truncation, optimizer)`;
        the status entries are plain values from the last successful step, or
        `False` if no step succeeded.
    """

    max_iter = max_iter or horizon * 2

    # Initialize the environment
    obs, info = env.reset()

    # Episode status as plain values. Initialised up front so the final report
    # is well defined when no environment step ever succeeds (horizon <= 0, or
    # every iteration ended in a TraceExecutionError).
    sum_of_rewards = 0
    success = False
    terminated = False
    truncated = False

    # Rollout
    t = 0
    i = 0
    while t < horizon and i < max_iter:
        error = None
        try:
            # Detach; otherwise, it would be back-propagated across time.
            action = controller(obs["observation"].detach())
            next_obs, reward, termination, truncation, info = env.step(action)
        except trace.TraceExecutionError as e:
            error = e

        if error is None:
            feedback = user_feedback(obs, action, next_obs)  # not traced
            obs = next_obs
            target = next_obs["observation"]
            # Log
            sum_of_rewards += reward.data  # not traced
            success = info.data["success"]
            terminated = termination.data
            truncated = truncation.data
            t += 1  # time step
            # Episode finished: leave before spending another optimizer update.
            if termination or truncation or success:
                break
        else:  # Self debugging
            feedback = str(error)
            target = error.exception_node

        # Optimization step
        optimizer.zero_feedback()
        optimizer.backward(target, feedback)  # obs = next obs
        optimizer.step(verbose="output")
        i += 1  # optimization iteration

        print(f"Time Step: {t} of {horizon}")
        print(f"Iteration: {i}")
        print(f"Feedback: {feedback}")
        print(f"Variable:\n {controller.parameter.data}")

    print("Sum of rewards:", sum_of_rewards)
    print("Termination:", terminated)
    print("Truncation:", truncated)
    print("Success:", success)
    print("# of optimization iterations:", i)
    print("# of time steps:", t)

    return sum_of_rewards, success, terminated, truncated, optimizer


# Fresh, identically seeded environment for the learning run.
env = TracedEnv(env_name, seed=seed)


# Notebook-level handle to the raw action space; the initial `controller`
# defined below samples from it.
action_space = env.env.action_space

# Declare the controller to be trainable. Its parameters are handed to the
# optimizer, which proposes replacement code for this function.
# NOTE(review): the function's source (docstring included) appears to be the
# trainable parameter itself, so it is intentionally left untouched — confirm
# against the Trace `bundle` documentation before editing it.


@trace.bundle(trainable=True)
def controller(obs):
    """
    The controller takes in an observation and returns an action.
    """
    return action_space.sample()


# Build the optimizer over the controller's trainable parameters, then run the
# online loop (one optimizer update per environment step).
optimizer = FunctionOptimizer(controller.parameters(), stepsize=stepsize)
sum_of_rewards, success, termination, truncation, optimizer = single_step(
    env,
    horizon=horizon,
    user_feedback=user_feedback,
    controller=controller,
    optimizer=optimizer,
)

LLM response:
 {
"reasoning": "The feedback indicates that the action taken by the controller function resulted in a negative outcome, i.e., moving the arm in the wrong direction which made achieving the goal harder. The eval function is executing the controller function, which currently only samples a random action. To improve the output, the controller function should be changed to take into account the feedback provided, suggesting a more goal-oriented action. Specifically, the function needs to be updated to generate actions that move closer to the suggested coordinates [-0.07  0.68  0.12  0.]. Therefore, instead of sampling a random action, we need to redefine the controller to return a specific action that aligns more with the feedback.",
"answer": "",
"suggestion": {
"__code0": "def controller(obs):\n    \"\"\"\n    The controller takes in an observation and returns an action.\n    Now adjusted to move towards a specified goal.\n    \"\"\"\n    # Instead of random sampling, retu