In [1]:
import datetime
from functools import partial
import os

from absl import app, flags, logging
import flax
from flax.traverse_util import flatten_dict
import jax
from jax.sharding import Mesh, NamedSharding, PartitionSpec
from ml_collections import config_flags, ConfigDict
import optax
import tensorflow as tf
import tqdm
import wandb
import sys
from pathlib import Path

sys.path.append("../..")
sys.path.append("/ubc/cs/research/nlp/grigorii/projects/openvla/")

from octo.data.dataset import make_single_dataset
from octo.model.octo_model import OctoModel
from octo.utils.jax_utils import initialize_compilation_cache
from octo.utils.spec import ModuleSpec
from octo.utils.train_callbacks import (
    RolloutVisualizationCallback,
    SaveCallback,
    ValidationCallback,
    VisualizationCallback,
)

from openvla.experiments.robot.calvin.calvin_utils import get_calvin_env



from octo.utils.train_utils import (
    check_config_diff,
    create_optimizer,
    format_name_with_config,
    merge_params,
    process_text,
    Timer,
    TrainState,
)

try:
    from jax_smi import initialise_tracking  # type: ignore

    initialise_tracking()
except ImportError:
    pass


2025-02-11 07:18:43.592019: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-02-11 07:18:43.592085: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-02-11 07:18:43.593875: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
  from .autonotebook import tqdm as notebook_tqdm
pybullet build time: Jan 29 2025 23:16:28


### Low-level actions one-by-one

In [2]:
from experiments.robot.calvin.multistep_sequences_low_level import get_low_level_sequences
from torchvision.transforms import Resize
from itertools import product
import numpy as np
from experiments.robot.calvin.calvin_utils import get_calvin_env, add_text
from experiments.robot.calvin.rollout_video import RolloutVideo
import hydra
from collections import defaultdict
from tqdm import tqdm
from experiments.robot.calvin.calvin_utils import get_calvin_env, resize_image, get_video_tag, count_success, get_env_state_for_initial_condition
import torch
import json
from experiments.robot.robot_utils import (
    DATE_TIME,
    get_action,
    get_image_resize_size,
    get_model,
    invert_gripper_action,
    normalize_gripper_action,
    set_seed_everywhere,
)
import logging
logger = logging.getLogger(__name__)


In [3]:
checkpoint_path = '/ubc/cs/research/ubc_ml/gguz/exp_data/octo_finetune/experiment_2_20250207_133938_small_2'
video_save_dir = Path("/ubc/cs/research/nlp/grigorii/projects/openvla/experiments/robot/calvin/debug_videos/") / Path(checkpoint_path).name
video_save_dir.mkdir(exist_ok=True)
model = OctoModel.load_pretrained(checkpoint_path)
processor = None

with open(Path(checkpoint_path) / 'finetune_config.json', 'r') as file:
    finetune_config = json.load(file)


In [4]:
config_path='/ubc/cs/research/nlp/grigorii/projects/openvla/experiments/robot/calvin/conf/med_tasks_config.yaml'
env, calvin_cfg = get_calvin_env(
    config_path,
    device_id=0,
)
calvin_cfg.image_obs_keys = finetune_config['dataset_kwargs']['image_obs_keys']


GETTING THE DEVICE ID!*****************************************
RESULT OF EGL OPTIONS EGL device choice: -1 of 10.

RESULT INSIDE LOOP ***********************, id is  0
Stdout:  Starting EGL query
Loaded EGL 1.5 after reload.
GL_VENDOR=NVIDIA Corporation
CUDA_DEVICE=0
GL_RENDERER=NVIDIA RTX A5000/PCIe/SSE2
GL_VERSION=3.3.0 NVIDIA 550.120
GL_SHADING_LANGUAGE_VERSION=3.30 NVIDIA via Cg compiler
Completeing EGL query

Stderr:  EGL device choice: 0 of 10 (from EGL_VISIBLE_DEVICE)

The match:  <re.Match object; span=(77, 90), match='CUDA_DEVICE=0'>


EGL device choice: 0 of 10 (from EGL_VISIBLE_DEVICES)


received depth0


In [5]:
from functools import partial
from typing import Dict, Set

import numpy as np
from omegaconf import ListConfig
from scipy.spatial.transform import Rotation as R

REL_EPS = 0.03
EPSILON_ROTATION_DEGREES = 10
EPSILON_SLIDER_DRAWER =  0.05
EPSILON_BLOCK_POS = 0.1

class LowLevelTasks:
    def __init__(self, tasks):
        """
        A task is defined as a specific change between the start_info and end_info dictionaries.
        Use config file in conf/tasks/ to define tasks using the base task functions defined in this class
        """

        # register task functions from config file
        self.tasks = {name: partial(getattr(self, args[0]), *args[1:]) for name, args in dict(tasks).items()}
        # dictionary mapping from task name to task id
        self.task_to_id = {name: i for i, name in enumerate(self.tasks.keys())}
        # dictionary mapping from task id to task name
        self.id_to_task = {i: name for i, name in enumerate(self.tasks.keys())}


    def get_task_info(self, start_info: Dict, end_info: Dict) -> Set:
        """
        start_info: dict with scene info and robot info
        end_info: dict with scene info and robot info
        returns set with achieved tasks
        """
        # call functions that are registered in self.tasks
        return {
            task_name
            for task_name, function in self.tasks.items()
            if function(start_info=start_info, end_info=end_info)
        }

    def get_task_info_for_set(self, start_info: Dict, end_info: Dict, task_filter: Set) -> Set:
        """
        start_info: dict with scene info and robot info
        end_info: dict with scene info and robot info
        task_filter: set with task names to check
        returns set with achieved tasks
        """
        # call functions that are registered in self.tasks
        return {
            task_name
            for task_name, function in self.tasks.items()
            if task_name in task_filter and function(start_info=start_info, end_info=end_info)
        }

    @staticmethod
    def stack_objects(max_vel=1, start_info=None, end_info=None):
        obj_uids = set(obj["uid"] for obj in start_info["scene_info"]["movable_objects"].values())

        for obj_name in start_info["scene_info"]["movable_objects"]:
            obj_start_info = start_info["scene_info"]["movable_objects"][obj_name]
            obj_end_info = end_info["scene_info"]["movable_objects"][obj_name]
            obj_start_contacts = set(c[2] for c in obj_start_info["contacts"])
            obj_end_contacts = set(c[2] for c in obj_end_info["contacts"])

            if (
                not len(obj_uids & obj_start_contacts)
                and len(obj_uids & obj_end_contacts)
                and not len(obj_end_contacts - obj_uids)
            ):
                # object velocity may not exceed max_vel for successful stack
                if np.all(np.abs(obj_end_info["current_lin_vel"]) < max_vel) and np.all(
                    np.abs(obj_end_info["current_ang_vel"]) < max_vel
                ):
                    return True
        return False


    @property
    def num_tasks(self):
        return len(self.tasks)

    @staticmethod
    def rotate_object(
        obj_names, z_degrees, x_y_threshold=30, z_threshold=180, movement_threshold=0.1, start_info=None, end_info=None
    ):
        """
        Returns True if the object with obj_name was rotated more than z_degrees degrees around the z-axis while not
        being rotated more than x_y_threshold degrees around the x or y axis.
        z_degrees is negative for clockwise rotations and positive for counter-clockwise rotations.
        """
        found_grasped = False
        for name in obj_names:
            if is_block_grasped(obj_name=name, state_info=start_info):
                obj_name = name
                found_grasped = True
                break
        
        if not found_grasped:
            return False
        
        obj_start_info = start_info["scene_info"]["movable_objects"][obj_name]
        obj_end_info = end_info["scene_info"]["movable_objects"][obj_name]
        start_orn = R.from_quat(obj_start_info["current_orn"])
        end_orn = R.from_quat(obj_end_info["current_orn"])
        rotation = end_orn * start_orn.inv()
        x, y, z = rotation.as_euler("xyz", degrees=True)

        start_pos = np.array(obj_start_info["current_pos"])
        end_pos = np.array(obj_end_info["current_pos"])
        pos_diff = end_pos - start_pos
        if np.linalg.norm(pos_diff) > movement_threshold:
            return False

        end_contacts = set(c[2] for c in obj_end_info["contacts"])
        robot_uid = {start_info["robot_info"]["uid"]}

        # object should be in contact with ground
        if len(end_contacts - robot_uid) > 0:
            return False

        if z_degrees > 0:
            return z_degrees < z < z_threshold and abs(x) < x_y_threshold and abs(y) < x_y_threshold
        else:
            return z_degrees > z > -z_threshold and abs(x) < x_y_threshold and abs(y) < x_y_threshold

    @staticmethod
    def push_object(x_direction, y_direction, start_info, end_info):
        """
        Returns True if the object with 'obj_name' was moved more than 'x_direction' meters in x direction
        (or 'y_direction' meters in y direction analogously).
        Note that currently x and y pushes are mutually exclusive, meaning that one of the arguments has to be 0.
        The sign matters, e.g. pushing an object to the right when facing the table coincides with a movement in
        positive x-direction.
        """
        assert x_direction * y_direction == 0 and x_direction + y_direction != 0
        # Find the block which is in contact with the gripper
        found_block = False
        robot_uid = start_info["robot_info"]["uid"]

        for block_name, block_info in start_info["scene_info"]["movable_objects"].items():
            for c in block_info['contacts']:
                if c[2] == robot_uid:
                    obj_name = block_name
                    found_block = True
                    break
        
        if not found_block:
            return False

        obj_start_info = start_info["scene_info"]["movable_objects"][obj_name]
        obj_end_info = end_info["scene_info"]["movable_objects"][obj_name]
        start_pos = np.array(obj_start_info["current_pos"])
        end_pos = np.array(obj_end_info["current_pos"])
        pos_diff = end_pos - start_pos

        # contacts excluding robot
        start_contacts = set((c[2], c[4]) for c in obj_start_info["contacts"] if c[2] != robot_uid)
        end_contacts = set((c[2], c[4]) for c in obj_end_info["contacts"] if c[2] != robot_uid)

        # computing set difference to check if object had surface contact (excluding robot) at both times
        surface_contact = len(start_contacts) > 0 and len(end_contacts) > 0 and start_contacts <= end_contacts
        if not surface_contact:
            return False

        if x_direction > 0:
            return pos_diff[0] > x_direction
        elif x_direction < 0:
            return pos_diff[0] < x_direction

        if y_direction > 0:
            return pos_diff[1] > y_direction
        elif y_direction < 0:
            return pos_diff[1] < y_direction

    @staticmethod
    def lift_grasped_object(surface_bodies=None, start_info=None, end_info=None):
        """
        Returns True if the object with 'obj_name' was grasped by the robot and lifted more than 'z_direction' meters.
        """
        found_grasped = False
        for name in ['block_red', 'block_blue', 'block_pink']:
            if is_block_grasped(obj_name=name, state_info=start_info):
                obj_name = name
                found_grasped = True
                break
        
        if not found_grasped:
            return False
        
        obj_start_info = start_info["scene_info"]["movable_objects"][obj_name]
        obj_end_info = end_info["scene_info"]["movable_objects"][obj_name]

        start_pos = np.array(obj_start_info["current_pos"])
        end_pos = np.array(obj_end_info["current_pos"])
        pos_diff = end_pos - start_pos
        z_diff = pos_diff[2]

        robot_uid = start_info["robot_info"]["uid"]
        end_contacts = set(c[2] for c in obj_end_info["contacts"])

        #print(start_info["scene_info"]["fixed_objects"]['table']["links"])
        #print(start_info["scene_info"]["movable_objects"]['block_red'])


        for surface_link in ['base_link', 'plank_link', 'drawer_link']:
            surface_uid = start_info["scene_info"]["fixed_objects"]['table']["uid"]
            surface_link_id = start_info["scene_info"]["fixed_objects"]['table']["links"][surface_link]
            start_contacts_links = set((c[2], c[4]) for c in obj_start_info["contacts"])
            end_contacts_links = set((c[2], c[4]) for c in obj_end_info["contacts"])
            surface_criterion = (surface_uid, surface_link_id) in start_contacts_links and (surface_uid, surface_link_id) not in end_contacts_links

            if not surface_criterion:
                continue

            start_pos = np.array(obj_start_info["current_pos"])
            end_pos = np.array(obj_end_info["current_pos"])
            pos_diff = np.abs(end_pos[:2] - start_pos[:2]).sum()

            start_orn = R.from_quat(obj_start_info["current_orn"])
            end_orn = R.from_quat(obj_end_info["current_orn"])
            rotation = end_orn * start_orn.inv()
            x, y, z = rotation.as_euler("xyz", degrees=True)

            #print(f"Pos diff {pos_diff} z_degrees {z} z_diff {z_diff}")

            if pos_diff > EPSILON_BLOCK_POS:
                return False

            z_degrees = EPSILON_ROTATION_DEGREES
            if z > z_degrees:
                return False

            z_degrees = -EPSILON_ROTATION_DEGREES
            if z < z_degrees:
                return False

            z_direction = 0.03
            if (
                # robot still holding the object
                z_diff > z_direction
                and robot_uid in end_contacts
                and len(end_contacts) == 1
                and surface_criterion):
                return True
        return False


    @staticmethod
    def push_object_into(obj_name, src_body, src_link, dest_body, dest_link, start_info=None, end_info=None):
        """
        obj_name is either a list of object names or a string
        Returns True if the object / any of the objects changes contact from src_body to dest_body.
        The robot may neither touch the object at start nor end.
        """
        if isinstance(obj_name, (list, ListConfig)):
            return any(
                LowLevelTasks.push_object_into(ob, src_body, src_link, dest_body, dest_link, start_info, end_info)
                for ob in obj_name
            )
        
        robot_uid = start_info["robot_info"]["uid"]

        src_uid = start_info["scene_info"]["fixed_objects"][src_body]["uid"]
        src_link_id = start_info["scene_info"]["fixed_objects"][src_body]["links"][src_link]
        dest_uid = end_info["scene_info"]["fixed_objects"][dest_body]["uid"]
        dest_link_id = end_info["scene_info"]["fixed_objects"][dest_body]["links"][dest_link]

        start_contacts = set((c[2], c[4]) for c in start_info["scene_info"]["movable_objects"][obj_name]["contacts"])
        end_contacts = set((c[2], c[4]) for c in end_info["scene_info"]["movable_objects"][obj_name]["contacts"])
        
        return (
            robot_uid not in start_contacts | end_contacts
            and len(start_contacts) == 1
            and (src_uid, src_link_id) in start_contacts
            and (dest_uid, dest_link_id) in end_contacts
        )

    @staticmethod
    def move_door_abs(joint_name, start_threshold, end_threshold, start_info, end_info):
        """
        Returns True if the joint specified by 'obj_name' and 'joint_name' (e.g. a door or drawer)
        is moved from at least 'start_threshold' to 'end_threshold'.
        """
        # TODO: ADD THAT OBJECT IS IN CONTACT WITH GRIPPER ALEADY

        start_joint_state = start_info["scene_info"]["doors"][joint_name]["current_state"][0]
        end_joint_state = end_info["scene_info"]["doors"][joint_name]["current_state"][0]

        if start_threshold < end_threshold:
            return start_joint_state < start_threshold < end_threshold < end_joint_state
        elif start_threshold > end_threshold:
            return start_joint_state > start_threshold > end_threshold > end_joint_state
        else:
            raise ValueError

    @staticmethod
    def move_door_rel(joint_name, threshold, start_info, end_info):
        """
        Returns True if the joint specified by 'obj_name' and 'joint_name' (e.g. a door or drawer)
        is moved from at least 'start_threshold' to 'end_threshold'.
        """
        robot_contacts_start = set(c[4] for c in start_info["robot_info"]["contacts"])

        if joint_name == 'base__drawer':
            drawer_link_id = start_info['scene_info']['fixed_objects']['table']['links']['drawer_link']
            if drawer_link_id not in robot_contacts_start:
                return False
        elif joint_name == 'base__slide':
            slider_link_id = start_info['scene_info']['fixed_objects']['table']['links']['slide_link']
            if slider_link_id not in robot_contacts_start:
                return False


        start_joint_state = start_info["scene_info"]["doors"][joint_name]["current_state"]
        end_joint_state = end_info["scene_info"]["doors"][joint_name]["current_state"]

        return (
            0 < threshold < end_joint_state - start_joint_state or 0 > threshold > end_joint_state - start_joint_state
        )

    @staticmethod
    def place_gripper_over_block(object_name, start_info=None, end_info=None):
        # Before grasping a block, check for whether the gripper is near the block.
        # TODO: Make sure the gripper isn't moving over a closed drawer with blocks inside.
        target_block_start = [v for k, v in start_info["scene_info"]["movable_objects"].items() if k == object_name]
        target_block_end = [v for k, v in end_info["scene_info"]["movable_objects"].items() if k == object_name]

        if len(target_block_start) != 1 or len(target_block_end) != 1:
            return False
        
        robot_contacts_start = set(c[2] for c in start_info["robot_info"]["contacts"])
        robot_contacts_end = set(c[2] for c in end_info["robot_info"]["contacts"])

        both_contacts_empty = len(robot_contacts_start) == 0 and len(robot_contacts_end) == 0
        only_1_contact = len(robot_contacts_start) == 1 and len(robot_contacts_end) == 1 and list(robot_contacts_start)[0] == list(robot_contacts_end)[0]
        # contacts should stay the same
        if not (both_contacts_empty or only_1_contact):
            return False

        start_gripper_pos = np.array(start_info['robot_info']['tcp_pos'])
        end_gripper_pos = np.array(end_info['robot_info']['tcp_pos'])

        target_block_start_pos = np.array(target_block_start[0]['current_pos'])
        # block in drawer and drawer is closed
        if block_in_drawer(target_block_start_pos) and is_drawer_closed(start_info['scene_info']['fixed_objects']['table']['links']['drawer_link']):
            return False
        
        target_block_end_pos = np.array(target_block_end[0]['current_pos'])

        # Make sure initial and final target block pos is approx equal
        block_movement = np.sum(np.abs(target_block_end_pos - target_block_start_pos))
        disp_eps = 0.001
        
        if block_movement > disp_eps:
            return False
        
        # Find x-y coord boundaries for where the gripper should reach (relative to the target block?)
        start_obj_gripper_diff = np.sqrt(np.sum(np.square(target_block_start_pos[:2] - start_gripper_pos[:2])))
        end_obj_gripper_diff = np.sqrt(np.sum(np.square(target_block_end_pos[:2] - end_gripper_pos[:2])))

        # Make sure gripper wasn't too close to the block in the first place.
        if start_obj_gripper_diff < REL_EPS:
            return False
        
        # Make sure gripper isn't too far from the block.
        if end_obj_gripper_diff > REL_EPS:
            return False
        
        return True
    
    @staticmethod
    def place_grasped_block_over_surface(dest_link, start_info=None, end_info=None):

        robot_uid = {start_info["robot_info"]["uid"]}

        for obj_name in start_info["scene_info"]["movable_objects"]:

            obj_start_info = start_info["scene_info"]["movable_objects"][obj_name]
            obj_end_info = end_info["scene_info"]["movable_objects"][obj_name]

            obj_start_pos = start_info["scene_info"]["movable_objects"][obj_name]['current_pos']
            obj_end_pos = end_info["scene_info"]["movable_objects"][obj_name]['current_pos']

            obj_start_contacts = set(c[2] for c in obj_start_info["contacts"])
            obj_end_contacts = set(c[2] for c in obj_end_info["contacts"])

            if dest_link == 'plank_link':
                if not in_slider_area(obj_end_pos) or in_slider_area(obj_start_pos):
                    continue

            elif dest_link == 'drawer_link':
                if not in_drawer_area(obj_end_pos) or in_drawer_area(obj_start_pos):
                    continue

            elif dest_link == 'table_link':
                if not in_table_area(obj_end_pos) or (is_block_over_other_block(obj_name, end_info)) or (in_table_area(obj_start_pos) and not is_block_over_other_block(obj_name, start_info)):
                    continue

            # Robot still holding the object
            if len(obj_start_contacts) == 1 and len(obj_end_contacts) == 1 and \
                len(obj_end_contacts - robot_uid) == 0 and len(obj_start_contacts - robot_uid) == 0:
                return True
        return False

    @staticmethod
    def grasp_block(obj_name, start_info=None, end_info=None):
        """
        Grasp a block nearby the gripper, specified by object_name. Typically called after 
        place_gripper_over_block.
        """
        # TODO: WORK IN PROGRESS, WHAT OTHER CRITERIA FOR GRASPING OTHER THAN CONTACT?
        #       HOW TO MAKE SURE BLOCK CAN BE LIFTED? GRIPPER WIDTH?
        #if not is_gripper_near_block(obj_name, state_info=start_info):
        #    return False
        robot_contacts_start = set(c[2] for c in start_info["robot_info"]["contacts"])
        robot_contacts_end = set(c[2] for c in end_info["robot_info"]["contacts"])

        if len(robot_contacts_start) > 0:
            #print("Robot is touching something in the beginning!")
            return False

        #print("Contact passed!")
        obj_start_info = start_info["scene_info"]["movable_objects"][obj_name]
        obj_end_info = end_info["scene_info"]["movable_objects"][obj_name]
        
        # check the block wasn't lifted from the surface it was on in the beginning
        for surface_link in ['drawer_link', 'plank_link', 'base_link']:
            surface_uid = start_info["scene_info"]["fixed_objects"]['table']["uid"]
            surface_link_id = start_info["scene_info"]["fixed_objects"]['table']["links"][surface_link]

            start_contacts_links = set((c[2], c[4]) for c in obj_start_info["contacts"])
            end_contacts_links = set((c[2], c[4]) for c in obj_end_info["contacts"])
            #print("start links:", start_contacts_links)
            #print("end links: ", end_contacts_links)
            if (surface_uid, surface_link_id) in start_contacts_links and (surface_uid, surface_link_id) not in end_contacts_links:
                return False
        #print("Surface criterion passed!")

        # the block wasn't moved/lifted too much
        start_pos = np.array(obj_start_info["current_pos"])
        end_pos = np.array(obj_end_info["current_pos"])
        pos_diff = np.abs(end_pos - start_pos).sum()
        if pos_diff > EPSILON_BLOCK_POS:
            return False
        
        end_gripper_width = end_info["robot_info"]["gripper_opening_width"]
        #print(end_gripper_width)
        if not (0.03 < end_gripper_width < 0.06):
            return False

        return is_block_grasped(obj_name, end_info) 


    @staticmethod   
    def grasp_slider(start_info=None, end_info=None):
        robot_contacts_start = set(c[4] for c in start_info["robot_info"]["contacts"])
        robot_contacts_end = set(c[4] for c in end_info["robot_info"]["contacts"])

        if len(robot_contacts_start) > 0:
            return False
        if len(robot_contacts_end) != 1:
            return False

        if LowLevelTasks.move_door_rel('base__slide', 0.15, start_info, end_info) or LowLevelTasks.move_door_rel('base__slide', -0.15, start_info, end_info):
            return False
        slider_link_id = start_info['scene_info']['fixed_objects']['table']['links']['slide_link']

        if list(robot_contacts_end)[0] == slider_link_id:
            return True
        return False

    @staticmethod   
    def grasp_drawer(start_info=None, end_info=None):
        robot_contacts_start = set(c[4] for c in start_info["robot_info"]["contacts"])
        robot_contacts_end = set(c[4] for c in end_info["robot_info"]["contacts"])

        if len(robot_contacts_start) > 0:
            return False
        if len(robot_contacts_end) != 1:
            return False

        if LowLevelTasks.move_door_rel('base__drawer', 0.12, start_info, end_info) or LowLevelTasks.move_door_rel('base__drawer', -0.12, start_info, end_info):
            return False

        drawer_link_id = start_info['scene_info']['fixed_objects']['table']['links']['drawer_link']

        if list(robot_contacts_end)[0] == drawer_link_id:
            return True

        return False

    @staticmethod   
    def ungrasp_drawer(start_info=None, end_info=None):
        robot_contacts_start = set(c[4] for c in start_info["robot_info"]["contacts"])
        robot_contacts_end = set(c[4] for c in end_info["robot_info"]["contacts"])
        drawer_link_id = start_info['scene_info']['fixed_objects']['table']['links']['drawer_link']

        if len(robot_contacts_start) != 1 or list(robot_contacts_start)[0] != drawer_link_id:
            return False
        
        if LowLevelTasks.move_door_rel('base__drawer', EPSILON_SLIDER_DRAWER, start_info, end_info) or LowLevelTasks.move_door_rel('base__drawer', -EPSILON_SLIDER_DRAWER, start_info, end_info):
            return False

        if len(robot_contacts_end) == 0:
            return True

        return False

    @staticmethod   
    def ungrasp_slider(start_info=None, end_info=None):
        robot_contacts_start = set(c[4] for c in start_info["robot_info"]["contacts"])
        robot_contacts_end = set(c[4] for c in end_info["robot_info"]["contacts"])
        slider_link_id = start_info['scene_info']['fixed_objects']['table']['links']['slide_link']

        if len(robot_contacts_start) != 1 or list(robot_contacts_start)[0] != slider_link_id:
            return False
        
        if LowLevelTasks.move_door_rel('base__slide', EPSILON_SLIDER_DRAWER, start_info, end_info) or LowLevelTasks.move_door_rel('base__slide', -EPSILON_SLIDER_DRAWER, start_info, end_info):
            return False

        if len(robot_contacts_end) == 0:
            return True
        
        return False


    @staticmethod   
    def ungrasp_block(object_names, start_info=None, end_info=None):
        """
        Grasp a block nearby the gripper, specified by object_name. Typically called after 
        place_gripper_over_block.
        """

        # TODO: WORK IN PROGRESS, WHAT OTHER CRITERIA FOR GRASPING OTHER THAN CONTACT?
        #       HOW TO MAKE SURE THE BLOCK CAN BE LIFTED? GRIPPER WIDTH?
        for object_name in object_names:
            if is_block_grasped(object_name, start_info) and not is_block_grasped(object_name, end_info):
                print("The grasped block is ", object_name)
                obj_start_info = start_info["scene_info"]["movable_objects"][object_name]
                obj_end_info = end_info["scene_info"]["movable_objects"][object_name]
                start_pos = np.array(obj_start_info["current_pos"])
                end_pos = np.array(obj_end_info["current_pos"])
                pos_xy_diff = np.abs(end_pos[:2] - start_pos[:2]).sum()
                if pos_xy_diff > REL_EPS:
                    print(f"{object_name} moved xy too much! by {pos_xy_diff}")
                    continue
                
                start_orn = R.from_quat(obj_start_info["current_orn"])
                end_orn = R.from_quat(obj_end_info["current_orn"])
                rotation = end_orn * start_orn.inv()
                x, y, z = rotation.as_euler("xyz", degrees=True)
                z_degrees = 45
                if z > z_degrees:
                    print(f"{object_name} rotated too much! by {z}")
                    continue

                z_degrees = -45
                if z < z_degrees:
                    print(f"{object_name} rotated too much! by {z}")
                    continue

                return True
        return False
    
    @staticmethod
    def contact_block(block_name, side, start_info, end_info):
        """
            1) The robot should not contact anything in the beginning.
            2) In the end, the robot should contact the correct block on the correct side (left or right)
                (relative to the static camera)
            3) Also, the block should not be lifted.
        """
        robot_contacts_start = [c[2] for c in start_info["robot_info"]["contacts"]]
        robot_contacts_end = [c[2] for c in end_info["robot_info"]["contacts"]]
        robot_contacts_links = [c[3] for c in end_info["robot_info"]["contacts"]]

        if len(robot_contacts_start) > 0:
            return False
        
        if len(robot_contacts_end) != 1:
            return False
        
        block_end_info = end_info["scene_info"]["movable_objects"][block_name]
        block_end_contacts = set([c[2] for c in block_end_info["contacts"]])

        robot_uid = {start_info["robot_info"]["uid"]}
        # Robot is touching the block and there are other contacts too (e.g. floor)
        if not (len(block_end_contacts) > 1 and len(block_end_contacts - robot_uid) < len(block_end_contacts)):
            return False

        
        block_start_info = start_info["scene_info"]["movable_objects"][block_name]

        start_pos = np.array(block_start_info["current_pos"])
        end_pos = np.array(block_end_info["current_pos"])
        pos_diff = np.abs(end_pos - start_pos).sum()

        if pos_diff > EPSILON_BLOCK_POS:
            return False

        left_gripper_outer = 11
        right_gripper_outer = 9
        # Right/left side of cube
        if side == 'right' and robot_contacts_links[0] == left_gripper_outer:
            return True
        elif side == 'left' and robot_contacts_links[0] == right_gripper_outer:
            return True
        else:
            return False
        
    @staticmethod
    def toggle_light(light_name, start_state, end_state, start_info, end_info):
        # TODO: Add that the robot needs to be over the switch already?
        return (
            start_info["scene_info"]["lights"][light_name]["logical_state"] == start_state
            and end_info["scene_info"]["lights"][light_name]["logical_state"] == end_state
        )

def is_block_grasped(obj_name, state_info=None):
        robot_contacts_end = np.array([c[2] for c in state_info["robot_info"]["contacts"]])

        if len(robot_contacts_end) == 0:
            return False
        
        #print("Robot contacts end: ", robot_contacts_end)
        target_obj_uid = state_info["scene_info"]["movable_objects"][obj_name]['uid']
        #print("Target uid: ", target_obj_uid)
        # At least 2 points of contact with the target block
        if np.sum(robot_contacts_end == target_obj_uid) < 2:
            return False
        return True

def is_gripper_near_block(obj_name, state_info=None):
        target_block_data = [v for k, v in state_info["scene_info"]["movable_objects"].items() if k == obj_name]

        if len(target_block_data) != 1:
            return False
        
        gripper_pos = np.array(state_info['robot_info']['tcp_pos'])
        target_block_pos = np.array(target_block_data[0]['current_pos'])
        start_obj_gripper_diff = np.sqrt(np.sum(np.square(target_block_pos[:2] - gripper_pos[:2])))
        return start_obj_gripper_diff < REL_EPS

def is_block_over_other_block(block_name, state_info=None):

    target_block_data = [v for k, v in state_info["scene_info"]["movable_objects"].items() if k == block_name]
    other_blocks_data = [v for k, v in state_info["scene_info"]["movable_objects"].items() if k != block_name]

    target_block_pos = np.array(target_block_data[0]['current_pos'])
    other_block_pos_1 = np.array(other_blocks_data[0]['current_pos'])
    other_block_pos_2 = np.array(other_blocks_data[0]['current_pos'])

    target_other_1_diff = np.sqrt(np.sum(np.square(target_block_pos[:2] - other_block_pos_1[:2])))
    target_other_2_diff = np.sqrt(np.sum(np.square(target_block_pos[:2] - other_block_pos_2[:2])))
    # Rectangle (pink) largest side is 0.1 in length 
    return target_other_1_diff < 0.1 or target_other_2_diff < 0.1


def block_in_drawer(block_coord):
    return 0.357 < block_coord[2] < 0.363

def is_drawer_closed(drawer_link_val):
    return drawer_link_val <= 0.03

def in_table_area(xyz):
    x, y, z = xyz[0], xyz[1], xyz[2]
    x_match = (x >= -0.38 and x <= 0.35)
    y_match = (y >=- 0.14 and y <= -0.03)
    return x_match and y_match


def in_slider_area(xyz):
    x, y, z = xyz[0], xyz[1], xyz[2]
    x_match = (x >= -0.38 and x <= 0.18)
    y_match = (y >= 0.05 and y <= 0.12)
    return x_match and y_match

def in_drawer_area(xyz):
    x, y, z = xyz[0], xyz[1], xyz[2]
    #x_match = (x >= -0.38 and x <= 0.18)
    y_match = y < -0.15
    return y_match


### Low-level tasks one-by-one

In [None]:
%matplotlib inline
from collections import Counter
import matplotlib.pyplot as plt
import time
import cv2
from IPython.display import clear_output



def evaluate_policy(model, env, cfg, processor, num_videos=0, save_dir=None):
    #task_oracle_low_level = hydra.utils.instantiate(cfg.low_level_tasks)
    task_oracle_high_level = hydra.utils.instantiate(cfg.tasks)

    task_oracle_low_level = LowLevelTasks(cfg.low_level_tasks.tasks)

    print("The oracle: ", task_oracle_low_level)
    val_annotations_low_level = cfg.low_level_annotations
    val_annotations_high_level = cfg.annotations

    # video stuff
    if num_videos > 0:
        rollout_video = RolloutVideo(
            logger=logger,
            empty_cache=False,
            log_to_file=True,
            save_dir=save_dir,
            resolution_scale=1,
        )
    else:
        rollout_video = None

    eval_sequences = get_low_level_sequences(cfg.num_sequences)
    #for seq in eval_sequences:
        #print(seq[0])
        #print(seq[1])
        #if seq[1][0] == 'stack_block':
        #    init_state = seq[0]
        #    if init_state['blue_block'] == 'slider_left' and init_state['slider'] == 'left' and seq[1][1][2] == 'place_grasped_block_over_blue_block':
        #        raise Exception('Invalid state combo!')

    results = []
    
    high_level_started = Counter()
    high_level_completed = Counter()
    low_level_started = Counter()
    low_level_completed = Counter()
    counters = {
        'high_level_started': high_level_started,
        'high_level_completed': high_level_completed,
        'low_level_started': low_level_started,
        'low_level_completed': low_level_completed,
    }

    if not cfg.debug:
        eval_sequences = tqdm(eval_sequences, position=0, leave=True)
    for i, (initial_state, eval_sequence) in enumerate(eval_sequences):
        record = i < num_videos
        #initial_state = {'led': 0, 'lightbulb': 0, 'slider': 'left', 'drawer': 'closed', 'red_block': 'slider_left', 'blue_block': 'table', 'pink_block': 'slider_right', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
        #initial_state = {'led': 0, 'lightbulb': 0, 'slider': 'left', 'drawer': 'closed', 'red_block': 'slider_right', 'blue_block': 'table', 'pink_block': 'table', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
        #eval_sequence = ('stack_block', ['grasp_blue_block', 'lift_grasped_block', 'place_grasped_block_over_pink_block', 'ungrasp_block'])

        high_level_task, eval_seq = eval_sequence
        high_level_started[high_level_task] += 1

        result = evaluate_sequence(
            env, model, task_oracle_low_level, initial_state, eval_seq, val_annotations_low_level, cfg, processor, record, rollout_video, i,
            low_level_started=low_level_started, low_level_completed=low_level_completed
        )

        results.append(result)
        if result == len(eval_seq):
            high_level_completed[high_level_task] += 1

        if record:
            rollout_video.write_to_tmp()
            #print("Terminating early!")
            #break
        if not cfg.debug:
            success_rates = count_success(results)
            average_rate = sum(success_rates) / len(success_rates) * 5
            description = " ".join([f"{i + 1}/5 : {v * 100:.1f}% |" for i, v in enumerate(success_rates)])
            description += f" Average: {average_rate:.1f} |"
            eval_sequences.set_description(description)

    if num_videos > 0:
        # log rollout videos
        rollout_video._log_videos_to_file(0, save_as_video=False)
    print("High_started: ", high_level_started)
    print("High_completed: ", high_level_completed)
    print("Low_started: ", low_level_started)
    print("Low_completed: ", low_level_completed)


    return results, average_rate, success_rates, counters

def join_vis_lang_jup(img, lang_text):
    """Takes as input an image and a language instruction and visualizes them with cv2"""
    img = img.copy()
    img = cv2.resize(img, (500, 500))
    print("visualizing!")
    #clear_output(wait=True)
    add_text(img, lang_text)
    plt.imshow(img)
    plt.show()
    time.sleep(0.1)


def evaluate_sequence(
    env, model, task_checker, initial_state, eval_sequence, val_annotations, cfg, processor, record, rollout_video, i, 
    low_level_started, low_level_completed
):
    robot_obs, scene_obs = get_env_state_for_initial_condition(initial_state)
    env.reset(robot_obs=robot_obs, scene_obs=scene_obs)
    if record:
        caption = " | ".join(eval_sequence)
        rollout_video.new_video(tag=get_video_tag(i), caption=caption)

    success_counter = 0
    if cfg.debug:
        time.sleep(1)
        print()
        print()
        print(f"Evaluating sequence: {' -> '.join(eval_sequence)}")
        print("Subtask: ", end="")
    print("Evaluating sequence: ", eval_sequence)
    print('Initial state: ', initial_state)
    for subtask in eval_sequence:
        low_level_started[subtask] += 1
        print("Evaluating task ", subtask)
        if record:
            rollout_video.new_subtask()
        success = rollout(env, model, task_checker, cfg, subtask, val_annotations, processor=processor, record=record, rollout_video=rollout_video)
        if record:
            rollout_video.draw_outcome(success)
        if success:
            low_level_completed[subtask] += 1
            success_counter += 1
        else:
            return success_counter
    return success_counter

def rollout(env, model, task_oracle, cfg, subtask, val_annotations, processor, record=False, rollout_video=None):
    if cfg.debug:
        print(f"{subtask} ", end="")
        time.sleep(0.5)
    obs = env.get_obs()
    # get lang annotation for subtask
    lang_annotation = val_annotations[subtask][0]
    #print("Instruction: ", lang_annotation)
    #print("Setting a different instruction!")
    #lang_annotation = 'put the blue block on top of red block'
    # get language goal embedding
    if processor == None:
        print(lang_annotation)
        tasks = model.create_tasks(texts=[lang_annotation])
        from octo.utils.train_callbacks import supply_rng
        policy_fn = supply_rng(
                partial(
                    model.sample_actions,
                    unnormalization_statistics=model.dataset_statistics["action"],
                ),
            )
        window_size = 4
        act_step = 4

    #model.reset()
    start_info = env.get_info()
    past_obs = None

    for step in range(cfg.ep_len):
        if processor == None:
            if act_step > 0 and act_step % window_size == 0:
                act_step = 0
                
                static_2 = resize_image(obs['rgb_obs']['rgb_static'], (256, 256), primary_octo=True)
                gripper_2 = resize_image(obs['rgb_obs']['rgb_gripper'], (128, 128))
                if past_obs:
                    static_1 = resize_image(past_obs['rgb_obs']['rgb_static'], (256, 256), primary_octo=True)
                    gripper_1 = resize_image(past_obs['rgb_obs']['rgb_gripper'], (128, 128))
                    image_primary = np.stack([static_1, static_2])
                    image_wrist = np.stack([gripper_1, gripper_2])
                    timestep_pad_mask = np.array([[True, True]])

                else:
                    image_primary = np.stack([np.zeros((256, 256, 3)), static_2])
                    image_wrist = np.stack([np.zeros((128, 128, 3)), gripper_2])
                    timestep_pad_mask = np.array([[False, True]])
                pad_mask_dict = {
                    "image_primary": np.array([[True, True]]),
                    #"image_wrist": np.array([[True, True]]),
                    "timestep": np.array([[False, False]]),
                }
                
                #image_primary = np.expand_dims(resize_image(obs['rgb_obs']['rgb_static'], (256, 256), primary_octo=True), 0)
                #image_wrist = np.expand_dims(resize_image(obs['rgb_obs']['rgb_gripper'], (128, 128)), 0)
                #timestep_pad_mask = np.array([[True]])
                #pad_mask_dict = {
                #    "image_primary": np.array([[True]]),
                    #"image_wrist": np.array([[True]]),
                #    "timestep": np.array([[True]]),
                #}
                observation = {
                        "image_primary": np.expand_dims(image_primary, 0),  # uint8
                        #"image_wrist": np.expand_dims(image_wrist, 0),      # uint8
                        "timestep_pad_mask": timestep_pad_mask,
                        "pad_mask_dict": pad_mask_dict,
                        "timestep": np.array([[step-1, step]]),
                }
                if 'wrist' in cfg.image_obs_keys:
                    observation['image_wrist'] = np.expand_dims(image_wrist, 0)
                    pad_mask_dict["image_wrist"] = np.array([[True, True]])

                act_buffer = policy_fn(observation, tasks)
                act_buffer = np.array(act_buffer[0])
                action = act_buffer[act_step]
            else:
                action = act_buffer[act_step]
            act_step += 1

        else:
            
            observation = {
                'full_image': resize_image(obs['rgb_obs']['rgb_static'], (224, 224))
            }

            action = get_action(
                cfg,
                model,
                observation,
                task_label=lang_annotation,
                processor=processor,
            )
        
        # Rescale gripper actions from [0, 1] to [-1, 1] and binarize
        action = normalize_gripper_action(action)
        past_obs = obs
        obs, _, _, current_info = env.step(action)

        #img = env.render(mode="rgb_array")
        #join_vis_lang_jup(img, lang_annotation)


        if record:
            # update video
            frame_aug = torch.zeros((3, 224, 448))
            resize = Resize(224, antialias=True)
            frame_aug[:, :, :224] = resize(torch.tensor(obs["rgb_obs"]["rgb_static"]).permute(2, 0, 1))
            closest_obs = 0
            if isinstance(closest_obs, int):
                closest_obs = torch.zeros((3, 224, 224))
            frame_aug[:, :, 224:] = closest_obs.squeeze()

            rollout_video.update(frame_aug.unsqueeze(0).unsqueeze(0), step=step)

        # check if current step solves a task
        current_task_info = task_oracle.get_task_info_for_set(start_info, current_info, {subtask})

        if len(current_task_info) > 0:
            if cfg.debug:
                print(colored("success", "green"), end=" ")
            if record:
                rollout_video.add_language_instruction(lang_annotation)
            return True
    if cfg.debug:
        print(colored("fail", "red"), end=" ")
    if record:
        rollout_video.add_language_instruction(lang_annotation)
    return False


In [7]:
calvin_cfg.ep_len = 200
calvin_cfg.num_sequences = 50
calvin_cfg.num_videos = 50

results, average_rate, success_rates, counters = evaluate_policy(model, env, calvin_cfg, 
                processor, 
                num_videos=calvin_cfg.num_videos, 
                save_dir=video_save_dir
                )


The oracle:  <__main__.LowLevelTasks object at 0x72fbf4fc9450>


  0%|          | 0/50 [00:00<?, ?it/s]

Evaluating sequence:  ['grasp_slider', 'move_slider_left', 'ungrasp_slider']
Initial state:  {'led': 0, 'lightbulb': 0, 'slider': 'right', 'drawer': 'closed', 'red_block': 'table', 'blue_block': 'table', 'pink_block': 'slider_right', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
Evaluating task  grasp_slider
grasp the slider handle


1/5 : 0.0% | 2/5 : 0.0% | 3/5 : 0.0% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 0.0 |:   2%|▏         | 1/50 [00:43<35:51, 43.91s/it]

Evaluating sequence:  ['grasp_blue_block', 'lift_grasped_block']
Initial state:  {'led': 0, 'lightbulb': 0, 'slider': 'right', 'drawer': 'closed', 'red_block': 'table', 'blue_block': 'table', 'pink_block': 'slider_left', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
Evaluating task  grasp_blue_block
grasp the blue block
Evaluating task  lift_grasped_block
lift the grasped block


1/5 : 50.0% | 2/5 : 50.0% | 3/5 : 0.0% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 1.0 |:   4%|▍         | 2/50 [00:51<18:07, 22.66s/it]

Evaluating sequence:  ['contact_pink_block_left', 'push_block_right']
Initial state:  {'led': 0, 'lightbulb': 0, 'slider': 'right', 'drawer': 'closed', 'red_block': 'table', 'blue_block': 'slider_right', 'pink_block': 'table', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
Evaluating task  contact_pink_block_left
touch the pink block on its left side


1/5 : 33.3% | 2/5 : 33.3% | 3/5 : 0.0% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 0.7 |:   6%|▌         | 3/50 [01:15<18:15, 23.31s/it]

Evaluating sequence:  ['grasp_red_block', 'lift_grasped_block']
Initial state:  {'led': 0, 'lightbulb': 0, 'slider': 'right', 'drawer': 'closed', 'red_block': 'table', 'blue_block': 'slider_right', 'pink_block': 'slider_left', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
Evaluating task  grasp_red_block
grasp the red block
Evaluating task  lift_grasped_block
lift the grasped block


1/5 : 50.0% | 2/5 : 25.0% | 3/5 : 0.0% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 0.8 |:   8%|▊         | 4/50 [01:50<21:24, 27.93s/it]

Evaluating sequence:  ['grasp_red_block', 'lift_grasped_block', 'place_grasped_block_over_slider', 'ungrasp_block']
Initial state:  {'led': 0, 'lightbulb': 0, 'slider': 'right', 'drawer': 'closed', 'red_block': 'table', 'blue_block': 'slider_left', 'pink_block': 'table', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
Evaluating task  grasp_red_block
grasp the red block
Evaluating task  lift_grasped_block
lift the grasped block
Evaluating task  place_grasped_block_over_slider
place the grasped block over the sliding cabinet
Evaluating task  ungrasp_block
ungrasp the block
The grasped block is  block_red
block_red moved xy too much! by 0.1831850009859753
The grasped block is  block_red
block_red moved xy too much! by 0.18586364002968456
The grasped block is  block_red
block_red moved xy too much! by 0.18667560895690316
The grasped block is  block_red
block_red moved xy too much! by 0.18661662903361378
The grasped block is  block_red
blo

1/5 : 60.0% | 2/5 : 40.0% | 3/5 : 20.0% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 1.2 |:  10%|█         | 5/50 [02:53<30:25, 40.56s/it]

Evaluating sequence:  ['grasp_red_block', 'rotate_grasped_block_right', 'ungrasp_block']
Initial state:  {'led': 0, 'lightbulb': 0, 'slider': 'right', 'drawer': 'closed', 'red_block': 'table', 'blue_block': 'slider_left', 'pink_block': 'slider_right', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
Evaluating task  grasp_red_block
grasp the red block


1/5 : 50.0% | 2/5 : 33.3% | 3/5 : 16.7% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 1.0 |:  12%|█▏        | 6/50 [03:18<25:54, 35.34s/it]

Evaluating sequence:  ['grasp_blue_block', 'lift_grasped_block']
Initial state:  {'led': 0, 'lightbulb': 0, 'slider': 'right', 'drawer': 'closed', 'red_block': 'slider_right', 'blue_block': 'table', 'pink_block': 'table', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
Evaluating task  grasp_blue_block
grasp the blue block
Evaluating task  lift_grasped_block
lift the grasped block


1/5 : 57.1% | 2/5 : 28.6% | 3/5 : 14.3% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 1.0 |:  14%|█▍        | 7/50 [03:44<23:01, 32.13s/it]

Evaluating sequence:  ['grasp_blue_block', 'lift_grasped_block', 'place_grasped_block_over_slider', 'ungrasp_block']
Initial state:  {'led': 0, 'lightbulb': 0, 'slider': 'right', 'drawer': 'closed', 'red_block': 'slider_right', 'blue_block': 'table', 'pink_block': 'slider_left', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
Evaluating task  grasp_blue_block
grasp the blue block


1/5 : 50.0% | 2/5 : 25.0% | 3/5 : 12.5% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 0.9 |:  16%|█▌        | 8/50 [04:03<19:34, 27.97s/it]

Evaluating sequence:  ['grasp_pink_block', 'lift_grasped_block', 'place_grasped_block_over_slider', 'ungrasp_block']
Initial state:  {'led': 0, 'lightbulb': 0, 'slider': 'right', 'drawer': 'closed', 'red_block': 'slider_right', 'blue_block': 'slider_left', 'pink_block': 'table', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
Evaluating task  grasp_pink_block
grasp the pink block


1/5 : 44.4% | 2/5 : 22.2% | 3/5 : 11.1% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 0.8 |:  18%|█▊        | 9/50 [04:17<16:12, 23.72s/it]

Evaluating sequence:  ['grasp_blue_block', 'rotate_grasped_block_left', 'ungrasp_block']
Initial state:  {'led': 0, 'lightbulb': 0, 'slider': 'right', 'drawer': 'closed', 'red_block': 'slider_left', 'blue_block': 'table', 'pink_block': 'table', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
Evaluating task  grasp_blue_block
grasp the blue block
Evaluating task  rotate_grasped_block_left
rotate the grasped block 90 degrees to the left
Evaluating task  ungrasp_block
ungrasp the block


1/5 : 50.0% | 2/5 : 30.0% | 3/5 : 20.0% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 1.0 |:  20%|██        | 10/50 [04:26<12:42, 19.07s/it]

The grasped block is  block_blue
Evaluating sequence:  ['grasp_slider', 'move_slider_left', 'ungrasp_slider']
Initial state:  {'led': 0, 'lightbulb': 0, 'slider': 'right', 'drawer': 'closed', 'red_block': 'slider_left', 'blue_block': 'table', 'pink_block': 'slider_right', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
Evaluating task  grasp_slider
grasp the slider handle


1/5 : 45.5% | 2/5 : 27.3% | 3/5 : 18.2% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 0.9 |:  22%|██▏       | 11/50 [04:43<12:00, 18.49s/it]

Evaluating sequence:  ['grasp_red_block', 'lift_grasped_block', 'place_grasped_block_over_pink_block', 'ungrasp_block']
Initial state:  {'led': 0, 'lightbulb': 0, 'slider': 'right', 'drawer': 'closed', 'red_block': 'slider_left', 'blue_block': 'slider_right', 'pink_block': 'table', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
Evaluating task  grasp_red_block
grasp the red block


1/5 : 41.7% | 2/5 : 25.0% | 3/5 : 16.7% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 0.8 |:  24%|██▍       | 12/50 [04:59<11:05, 17.52s/it]

Evaluating sequence:  ['grasp_blue_block', 'lift_grasped_block', 'place_grasped_block_over_drawer', 'ungrasp_block']
Initial state:  {'led': 0, 'lightbulb': 0, 'slider': 'right', 'drawer': 'open', 'red_block': 'table', 'blue_block': 'table', 'pink_block': 'slider_right', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
Evaluating task  grasp_blue_block
grasp the blue block
Evaluating task  lift_grasped_block
lift the grasped block


1/5 : 46.2% | 2/5 : 23.1% | 3/5 : 15.4% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 0.8 |:  26%|██▌       | 13/50 [05:32<13:46, 22.33s/it]

Evaluating sequence:  ['contact_blue_block_right', 'push_block_left']
Initial state:  {'led': 0, 'lightbulb': 0, 'slider': 'right', 'drawer': 'open', 'red_block': 'table', 'blue_block': 'table', 'pink_block': 'slider_left', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
Evaluating task  contact_blue_block_right
touch the blue block on its right side
Evaluating task  push_block_left
push the block towards the left


1/5 : 50.0% | 2/5 : 21.4% | 3/5 : 14.3% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 0.9 |:  28%|██▊       | 14/50 [06:02<14:44, 24.57s/it]

Evaluating sequence:  ['grasp_pink_block', 'lift_grasped_block']
Initial state:  {'led': 0, 'lightbulb': 0, 'slider': 'right', 'drawer': 'open', 'red_block': 'table', 'blue_block': 'slider_right', 'pink_block': 'table', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
Evaluating task  grasp_pink_block
grasp the pink block
Evaluating task  lift_grasped_block
lift the grasped block


1/5 : 53.3% | 2/5 : 20.0% | 3/5 : 13.3% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 0.9 |:  30%|███       | 15/50 [06:25<14:11, 24.32s/it]

Evaluating sequence:  ['grasp_slider', 'move_slider_left', 'ungrasp_slider']
Initial state:  {'led': 0, 'lightbulb': 0, 'slider': 'right', 'drawer': 'open', 'red_block': 'table', 'blue_block': 'slider_right', 'pink_block': 'slider_left', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
Evaluating task  grasp_slider
grasp the slider handle
Evaluating task  move_slider_left
move the handle to the left
Evaluating task  ungrasp_slider
ungrasp the slider handle


1/5 : 56.2% | 2/5 : 25.0% | 3/5 : 12.5% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 0.9 |:  32%|███▏      | 16/50 [07:05<16:20, 28.85s/it]

Evaluating sequence:  ['grasp_pink_block', 'lift_grasped_block']
Initial state:  {'led': 0, 'lightbulb': 0, 'slider': 'right', 'drawer': 'open', 'red_block': 'table', 'blue_block': 'slider_left', 'pink_block': 'table', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
Evaluating task  grasp_pink_block
grasp the pink block
Evaluating task  lift_grasped_block
lift the grasped block


1/5 : 58.8% | 2/5 : 23.5% | 3/5 : 11.8% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 0.9 |:  34%|███▍      | 17/50 [07:26<14:37, 26.60s/it]

Evaluating sequence:  ['contact_red_block_left', 'push_block_right']
Initial state:  {'led': 0, 'lightbulb': 0, 'slider': 'right', 'drawer': 'open', 'red_block': 'table', 'blue_block': 'slider_left', 'pink_block': 'slider_right', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
Evaluating task  contact_red_block_left
touch the red block on its left side


1/5 : 55.6% | 2/5 : 22.2% | 3/5 : 11.1% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 0.9 |:  36%|███▌      | 18/50 [07:50<13:49, 25.91s/it]

Evaluating sequence:  ['grasp_pink_block', 'lift_grasped_block', 'place_grasped_block_over_drawer', 'ungrasp_block']
Initial state:  {'led': 0, 'lightbulb': 0, 'slider': 'right', 'drawer': 'open', 'red_block': 'slider_right', 'blue_block': 'table', 'pink_block': 'table', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
Evaluating task  grasp_pink_block
grasp the pink block
Evaluating task  lift_grasped_block
lift the grasped block


1/5 : 57.9% | 2/5 : 21.1% | 3/5 : 10.5% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 0.9 |:  38%|███▊      | 19/50 [08:14<13:04, 25.30s/it]

Evaluating sequence:  ['contact_blue_block_left', 'push_block_right']
Initial state:  {'led': 0, 'lightbulb': 0, 'slider': 'right', 'drawer': 'open', 'red_block': 'slider_right', 'blue_block': 'table', 'pink_block': 'slider_left', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
Evaluating task  contact_blue_block_left
touch the blue block on its left side


1/5 : 55.0% | 2/5 : 20.0% | 3/5 : 10.0% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 0.8 |:  40%|████      | 20/50 [08:31<11:21, 22.73s/it]

Evaluating sequence:  ['contact_pink_block_left', 'push_block_right']
Initial state:  {'led': 0, 'lightbulb': 0, 'slider': 'right', 'drawer': 'open', 'red_block': 'slider_right', 'blue_block': 'slider_left', 'pink_block': 'table', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
Evaluating task  contact_pink_block_left
touch the pink block on its left side


1/5 : 52.4% | 2/5 : 19.0% | 3/5 : 9.5% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 0.8 |:  42%|████▏     | 21/50 [08:46<09:52, 20.43s/it] 

Evaluating sequence:  ['contact_blue_block_left', 'push_block_right']
Initial state:  {'led': 0, 'lightbulb': 0, 'slider': 'right', 'drawer': 'open', 'red_block': 'slider_left', 'blue_block': 'table', 'pink_block': 'table', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
Evaluating task  contact_blue_block_left
touch the blue block on its left side


1/5 : 50.0% | 2/5 : 18.2% | 3/5 : 9.1% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 0.8 |:  44%|████▍     | 22/50 [09:08<09:42, 20.80s/it]

Evaluating sequence:  ['grasp_blue_block', 'lift_grasped_block', 'place_grasped_block_over_slider', 'ungrasp_block']
Initial state:  {'led': 0, 'lightbulb': 0, 'slider': 'right', 'drawer': 'open', 'red_block': 'slider_left', 'blue_block': 'table', 'pink_block': 'slider_right', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
Evaluating task  grasp_blue_block
grasp the blue block
Evaluating task  lift_grasped_block
lift the grasped block


1/5 : 52.2% | 2/5 : 17.4% | 3/5 : 8.7% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 0.8 |:  46%|████▌     | 23/50 [09:34<10:01, 22.27s/it]

Evaluating sequence:  ['grasp_pink_block', 'lift_grasped_block']
Initial state:  {'led': 0, 'lightbulb': 0, 'slider': 'right', 'drawer': 'open', 'red_block': 'slider_left', 'blue_block': 'slider_right', 'pink_block': 'table', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
Evaluating task  grasp_pink_block
grasp the pink block
Evaluating task  lift_grasped_block
lift the grasped block


1/5 : 54.2% | 2/5 : 16.7% | 3/5 : 8.3% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 0.8 |:  48%|████▊     | 24/50 [10:02<10:27, 24.15s/it]

Evaluating sequence:  ['grasp_blue_block', 'lift_grasped_block', 'place_grasped_block_over_red_block', 'ungrasp_block']
Initial state:  {'led': 0, 'lightbulb': 0, 'slider': 'left', 'drawer': 'closed', 'red_block': 'table', 'blue_block': 'table', 'pink_block': 'slider_right', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
Evaluating task  grasp_blue_block
grasp the blue block
Evaluating task  lift_grasped_block
lift the grasped block


1/5 : 56.0% | 2/5 : 16.0% | 3/5 : 8.0% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 0.8 |:  50%|█████     | 25/50 [10:35<11:10, 26.83s/it]

Evaluating sequence:  ['grasp_blue_block', 'lift_grasped_block']
Initial state:  {'led': 0, 'lightbulb': 0, 'slider': 'left', 'drawer': 'closed', 'red_block': 'table', 'blue_block': 'table', 'pink_block': 'slider_left', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
Evaluating task  grasp_blue_block
grasp the blue block
Evaluating task  lift_grasped_block
lift the grasped block


1/5 : 57.7% | 2/5 : 19.2% | 3/5 : 7.7% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 0.8 |:  52%|█████▏    | 26/50 [10:42<08:16, 20.69s/it]

Evaluating sequence:  ['contact_pink_block_right', 'push_block_left']
Initial state:  {'led': 0, 'lightbulb': 0, 'slider': 'left', 'drawer': 'closed', 'red_block': 'table', 'blue_block': 'slider_right', 'pink_block': 'table', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
Evaluating task  contact_pink_block_right
touch the pink block on its right side


1/5 : 55.6% | 2/5 : 18.5% | 3/5 : 7.4% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 0.8 |:  54%|█████▍    | 27/50 [11:11<08:54, 23.26s/it]

Evaluating sequence:  ['contact_red_block_left', 'push_block_right']
Initial state:  {'led': 0, 'lightbulb': 0, 'slider': 'left', 'drawer': 'closed', 'red_block': 'table', 'blue_block': 'slider_right', 'pink_block': 'slider_left', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
Evaluating task  contact_red_block_left
touch the red block on its left side


1/5 : 53.6% | 2/5 : 17.9% | 3/5 : 7.1% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 0.8 |:  56%|█████▌    | 28/50 [11:34<08:33, 23.35s/it]

Evaluating sequence:  ['grasp_pink_block', 'lift_grasped_block']
Initial state:  {'led': 0, 'lightbulb': 0, 'slider': 'left', 'drawer': 'closed', 'red_block': 'table', 'blue_block': 'slider_left', 'pink_block': 'table', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
Evaluating task  grasp_pink_block
grasp the pink block
Evaluating task  lift_grasped_block
lift the grasped block


1/5 : 55.2% | 2/5 : 20.7% | 3/5 : 6.9% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 0.8 |:  58%|█████▊    | 29/50 [11:38<06:09, 17.60s/it]

Evaluating sequence:  ['grasp_red_block', 'lift_grasped_block']
Initial state:  {'led': 0, 'lightbulb': 0, 'slider': 'left', 'drawer': 'closed', 'red_block': 'table', 'blue_block': 'slider_left', 'pink_block': 'slider_right', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
Evaluating task  grasp_red_block
grasp the red block
Evaluating task  lift_grasped_block
lift the grasped block


1/5 : 56.7% | 2/5 : 23.3% | 3/5 : 6.7% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 0.9 |:  60%|██████    | 30/50 [11:43<04:35, 13.75s/it]

Evaluating sequence:  ['grasp_red_block', 'lift_grasped_block']
Initial state:  {'led': 0, 'lightbulb': 0, 'slider': 'left', 'drawer': 'closed', 'red_block': 'slider_right', 'blue_block': 'table', 'pink_block': 'table', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
Evaluating task  grasp_red_block
grasp the red block
Evaluating task  lift_grasped_block
lift the grasped block


1/5 : 58.1% | 2/5 : 22.6% | 3/5 : 6.5% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 0.9 |:  62%|██████▏   | 31/50 [12:48<09:10, 28.96s/it]

Evaluating sequence:  ['grasp_slider', 'move_slider_right', 'ungrasp_slider']
Initial state:  {'led': 0, 'lightbulb': 0, 'slider': 'left', 'drawer': 'closed', 'red_block': 'slider_right', 'blue_block': 'table', 'pink_block': 'slider_left', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
Evaluating task  grasp_slider
grasp the slider handle


1/5 : 56.2% | 2/5 : 21.9% | 3/5 : 6.2% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 0.8 |:  64%|██████▍   | 32/50 [13:09<07:58, 26.56s/it]

Evaluating sequence:  ['grasp_slider', 'move_slider_right', 'ungrasp_slider']
Initial state:  {'led': 0, 'lightbulb': 0, 'slider': 'left', 'drawer': 'closed', 'red_block': 'slider_right', 'blue_block': 'slider_left', 'pink_block': 'table', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
Evaluating task  grasp_slider
grasp the slider handle


1/5 : 54.5% | 2/5 : 21.2% | 3/5 : 6.1% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 0.8 |:  66%|██████▌   | 33/50 [13:28<06:56, 24.52s/it]

Evaluating sequence:  ['grasp_drawer', 'open_drawer', 'ungrasp_drawer']
Initial state:  {'led': 0, 'lightbulb': 0, 'slider': 'left', 'drawer': 'closed', 'red_block': 'slider_left', 'blue_block': 'table', 'pink_block': 'table', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
Evaluating task  grasp_drawer
grasp the drawer handle
Evaluating task  open_drawer
pull the handle


1/5 : 55.9% | 2/5 : 20.6% | 3/5 : 5.9% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 0.8 |:  68%|██████▊   | 34/50 [13:54<06:39, 24.95s/it]

Evaluating sequence:  ['grasp_blue_block', 'rotate_grasped_block_right', 'ungrasp_block']
Initial state:  {'led': 0, 'lightbulb': 0, 'slider': 'left', 'drawer': 'closed', 'red_block': 'slider_left', 'blue_block': 'table', 'pink_block': 'slider_right', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
Evaluating task  grasp_blue_block
grasp the blue block
Evaluating task  rotate_grasped_block_right
rotate the grasped block 90 degrees to the right
Evaluating task  ungrasp_block
ungrasp the block


1/5 : 57.1% | 2/5 : 22.9% | 3/5 : 8.6% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 0.9 |:  70%|███████   | 35/50 [14:11<05:38, 22.57s/it]

The grasped block is  block_blue
Evaluating sequence:  ['grasp_pink_block', 'lift_grasped_block']
Initial state:  {'led': 0, 'lightbulb': 0, 'slider': 'left', 'drawer': 'closed', 'red_block': 'slider_left', 'blue_block': 'slider_right', 'pink_block': 'table', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
Evaluating task  grasp_pink_block
grasp the pink block
Evaluating task  lift_grasped_block
lift the grasped block


1/5 : 58.3% | 2/5 : 25.0% | 3/5 : 8.3% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 0.9 |:  72%|███████▏  | 36/50 [14:14<03:54, 16.72s/it]

Evaluating sequence:  ['contact_blue_block_right', 'push_block_left']
Initial state:  {'led': 0, 'lightbulb': 0, 'slider': 'left', 'drawer': 'open', 'red_block': 'table', 'blue_block': 'table', 'pink_block': 'slider_right', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
Evaluating task  contact_blue_block_right
touch the blue block on its right side
Evaluating task  push_block_left
push the block towards the left


1/5 : 59.5% | 2/5 : 24.3% | 3/5 : 8.1% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 0.9 |:  74%|███████▍  | 37/50 [15:20<06:45, 31.21s/it]

Evaluating sequence:  ['contact_red_block_right', 'push_into_drawer']
Initial state:  {'led': 0, 'lightbulb': 0, 'slider': 'left', 'drawer': 'open', 'red_block': 'table', 'blue_block': 'table', 'pink_block': 'slider_left', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
Evaluating task  contact_red_block_right
touch the red block on its right side


1/5 : 57.9% | 2/5 : 23.7% | 3/5 : 7.9% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 0.9 |:  76%|███████▌  | 38/50 [15:43<05:47, 28.99s/it]

Evaluating sequence:  ['grasp_blue_block', 'lift_grasped_block', 'place_grasped_block_over_drawer', 'ungrasp_block']
Initial state:  {'led': 0, 'lightbulb': 0, 'slider': 'left', 'drawer': 'open', 'red_block': 'table', 'blue_block': 'slider_right', 'pink_block': 'table', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
Evaluating task  grasp_blue_block
grasp the blue block
Evaluating task  lift_grasped_block
lift the grasped block
Evaluating task  place_grasped_block_over_drawer
place the grasped block over the drawer
Evaluating task  ungrasp_block
ungrasp the block
The grasped block is  block_blue
block_blue moved xy too much! by 0.24641285849208233
The grasped block is  block_blue
block_blue moved xy too much! by 0.24851508198298983
The grasped block is  block_blue
block_blue moved xy too much! by 0.24984228688560967
The grasped block is  block_blue
block_blue moved xy too much! by 0.2503087179639859
The grasped block is  block_blue
bl

1/5 : 59.0% | 2/5 : 25.6% | 3/5 : 10.3% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 0.9 |:  78%|███████▊  | 39/50 [17:03<08:06, 44.23s/it]

Evaluating sequence:  ['grasp_blue_block', 'lift_grasped_block']
Initial state:  {'led': 0, 'lightbulb': 0, 'slider': 'left', 'drawer': 'open', 'red_block': 'table', 'blue_block': 'slider_right', 'pink_block': 'slider_left', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
Evaluating task  grasp_blue_block
grasp the blue block


1/5 : 57.5% | 2/5 : 25.0% | 3/5 : 10.0% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 0.9 |:  80%|████████  | 40/50 [17:28<06:25, 38.53s/it]

Evaluating sequence:  ['grasp_red_block', 'rotate_grasped_block_left', 'ungrasp_block']
Initial state:  {'led': 0, 'lightbulb': 0, 'slider': 'left', 'drawer': 'open', 'red_block': 'table', 'blue_block': 'slider_left', 'pink_block': 'table', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
Evaluating task  grasp_red_block
grasp the red block
Evaluating task  rotate_grasped_block_left
rotate the grasped block 90 degrees to the left


1/5 : 58.5% | 2/5 : 24.4% | 3/5 : 9.8% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 0.9 |:  82%|████████▏ | 41/50 [17:56<05:16, 35.21s/it] 

Evaluating sequence:  ['contact_red_block_left', 'push_block_right']
Initial state:  {'led': 0, 'lightbulb': 0, 'slider': 'left', 'drawer': 'open', 'red_block': 'table', 'blue_block': 'slider_left', 'pink_block': 'slider_right', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
Evaluating task  contact_red_block_left
touch the red block on its left side


1/5 : 57.1% | 2/5 : 23.8% | 3/5 : 9.5% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 0.9 |:  84%|████████▍ | 42/50 [18:26<04:28, 33.62s/it]

Evaluating sequence:  ['grasp_drawer', 'close_drawer', 'ungrasp_drawer']
Initial state:  {'led': 0, 'lightbulb': 0, 'slider': 'left', 'drawer': 'open', 'red_block': 'slider_right', 'blue_block': 'table', 'pink_block': 'table', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
Evaluating task  grasp_drawer
grasp the drawer handle
Evaluating task  close_drawer
push the handle 


1/5 : 58.1% | 2/5 : 23.3% | 3/5 : 9.3% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 0.9 |:  86%|████████▌ | 43/50 [19:08<04:13, 36.23s/it]

Evaluating sequence:  ['grasp_blue_block', 'lift_grasped_block', 'place_grasped_block_over_slider', 'ungrasp_block']
Initial state:  {'led': 0, 'lightbulb': 0, 'slider': 'left', 'drawer': 'open', 'red_block': 'slider_right', 'blue_block': 'table', 'pink_block': 'slider_left', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
Evaluating task  grasp_blue_block
grasp the blue block
Evaluating task  lift_grasped_block
lift the grasped block


1/5 : 59.1% | 2/5 : 22.7% | 3/5 : 9.1% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 0.9 |:  88%|████████▊ | 44/50 [19:37<03:23, 33.92s/it]

Evaluating sequence:  ['grasp_red_block', 'lift_grasped_block', 'place_grasped_block_over_pink_block', 'ungrasp_block']
Initial state:  {'led': 0, 'lightbulb': 0, 'slider': 'left', 'drawer': 'open', 'red_block': 'slider_right', 'blue_block': 'slider_left', 'pink_block': 'table', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
Evaluating task  grasp_red_block
grasp the red block
Evaluating task  lift_grasped_block
lift the grasped block
Evaluating task  place_grasped_block_over_pink_block
place the grasped block over the pink block


1/5 : 60.0% | 2/5 : 24.4% | 3/5 : 8.9% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 0.9 |:  90%|█████████ | 45/50 [20:24<03:10, 38.02s/it]

Evaluating sequence:  ['grasp_pink_block', 'lift_grasped_block']
Initial state:  {'led': 0, 'lightbulb': 0, 'slider': 'left', 'drawer': 'open', 'red_block': 'slider_left', 'blue_block': 'table', 'pink_block': 'table', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
Evaluating task  grasp_pink_block
grasp the pink block
Evaluating task  lift_grasped_block
lift the grasped block


1/5 : 60.9% | 2/5 : 26.1% | 3/5 : 8.7% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 1.0 |:  92%|█████████▏| 46/50 [20:33<01:56, 29.24s/it]

Evaluating sequence:  ['grasp_blue_block', 'lift_grasped_block']
Initial state:  {'led': 0, 'lightbulb': 0, 'slider': 'left', 'drawer': 'open', 'red_block': 'slider_left', 'blue_block': 'table', 'pink_block': 'slider_right', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
Evaluating task  grasp_blue_block
grasp the blue block
Evaluating task  lift_grasped_block
lift the grasped block


1/5 : 61.7% | 2/5 : 25.5% | 3/5 : 8.5% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 1.0 |:  94%|█████████▍| 47/50 [20:58<01:24, 28.10s/it]

Evaluating sequence:  ['contact_pink_block_left', 'push_block_right']
Initial state:  {'led': 0, 'lightbulb': 0, 'slider': 'left', 'drawer': 'open', 'red_block': 'slider_left', 'blue_block': 'slider_right', 'pink_block': 'table', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
Evaluating task  contact_pink_block_left
touch the pink block on its left side


1/5 : 60.4% | 2/5 : 25.0% | 3/5 : 8.3% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 0.9 |:  96%|█████████▌| 48/50 [21:26<00:56, 28.12s/it]

Evaluating sequence:  ['grasp_slider', 'move_slider_left', 'ungrasp_slider']
Initial state:  {'led': 0, 'lightbulb': 1, 'slider': 'right', 'drawer': 'closed', 'red_block': 'table', 'blue_block': 'table', 'pink_block': 'slider_right', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
Evaluating task  grasp_slider
grasp the slider handle


1/5 : 59.2% | 2/5 : 24.5% | 3/5 : 8.2% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 0.9 |:  98%|█████████▊| 49/50 [21:51<00:26, 26.98s/it]

Evaluating sequence:  ['grasp_pink_block', 'lift_grasped_block', 'place_grasped_block_over_red_block', 'ungrasp_block']
Initial state:  {'led': 0, 'lightbulb': 1, 'slider': 'right', 'drawer': 'closed', 'red_block': 'table', 'blue_block': 'table', 'pink_block': 'slider_left', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
Evaluating task  grasp_pink_block
grasp the pink block
Evaluating task  lift_grasped_block
lift the grasped block
Evaluating task  place_grasped_block_over_red_block
place the grasped block over the red block


1/5 : 60.0% | 2/5 : 26.0% | 3/5 : 8.0% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 0.9 |: 100%|██████████| 50/50 [22:39<00:00, 27.19s/it]


THE SAVE LOCATION /ubc/cs/research/nlp/grigorii/projects/openvla/experiments/robot/calvin/debug_videos/experiment_2_20250207_133938_small_2/_long_horizon_sequence_0_0.gif
THE SAVE LOCATION /ubc/cs/research/nlp/grigorii/projects/openvla/experiments/robot/calvin/debug_videos/experiment_2_20250207_133938_small_2/_long_horizon_sequence_1_0.gif
THE SAVE LOCATION /ubc/cs/research/nlp/grigorii/projects/openvla/experiments/robot/calvin/debug_videos/experiment_2_20250207_133938_small_2/_long_horizon_sequence_2_0.gif
THE SAVE LOCATION /ubc/cs/research/nlp/grigorii/projects/openvla/experiments/robot/calvin/debug_videos/experiment_2_20250207_133938_small_2/_long_horizon_sequence_3_0.gif
THE SAVE LOCATION /ubc/cs/research/nlp/grigorii/projects/openvla/experiments/robot/calvin/debug_videos/experiment_2_20250207_133938_small_2/_long_horizon_sequence_4_0.gif
THE SAVE LOCATION /ubc/cs/research/nlp/grigorii/projects/openvla/experiments/robot/calvin/debug_videos/experiment_2_20250207_133938_small_2/_long

2


In [6]:
infos = env.get_info()
print(infos['scene_info']['movable_objects']['block_red']['uid'])

2


### Low-level tasks conjunctions

In [None]:
from collections import Counter


def evaluate_policy(model, env, cfg, processor, num_videos=0, save_dir=None):
    task_oracle_high_level = hydra.utils.instantiate(cfg.tasks)

    val_annotations_low_level = cfg.low_level_annotations

    # video stuff
    if num_videos > 0:
        rollout_video = RolloutVideo(
            logger=logger,
            empty_cache=False,
            log_to_file=True,
            save_dir=save_dir,
            resolution_scale=1,
        )
    else:
        rollout_video = None

    eval_sequences = get_low_level_sequences(cfg.num_sequences)
    
    results = []
    plans = defaultdict(list)
    
    high_level_started = Counter()
    high_level_completed = Counter()
    counters = {
        'high_level_started': high_level_started,
        'high_level_completed': high_level_completed,
    }
    
    if not cfg.debug:
        eval_sequences = tqdm(eval_sequences, position=0, leave=True)
    for i, (initial_state, eval_sequence) in enumerate(eval_sequences):
        record = i < num_videos
        #eval_sequence = [['contact_pink_block_left', 'push_pink_block_right']]
        #initial_state = {'led': 0, 'lightbulb': 0, 'slider': 'right', 'drawer': 'closed', 'red_block': 'table', 'blue_block': 'slider_left', 'pink_block': 'table', 'grasped': 0, 'contact': 0}
        high_level_task, eval_seq = eval_sequence
        high_level_started[high_level_task] += 1
        result = evaluate_sequence(
            env, model, task_oracle_high_level, initial_state, high_level_task, eval_seq, val_annotations_low_level, cfg, processor, record, rollout_video, i
        )

        results.append(result)
        high_level_completed[high_level_task] += result

        if record:
            rollout_video.write_to_tmp()
        if not cfg.debug:
            success_rates = count_success(results)
            average_rate = sum(success_rates) / len(success_rates) * 5
            description = " ".join([f"{i + 1}/5 : {v * 100:.1f}% |" for i, v in enumerate(success_rates)])
            description += f" Average: {average_rate:.1f} |"
            eval_sequences.set_description(description)

    if num_videos > 0:
        # log rollout videos
        rollout_video._log_videos_to_file(0, save_as_video=False)
    return results, average_rate, success_rates, counters


def evaluate_sequence(
    env, model, task_checker, initial_state, high_level_task, eval_sequence, val_annotations, cfg, processor, record, rollout_video, i
):
    robot_obs, scene_obs = get_env_state_for_initial_condition(initial_state)
    env.reset(robot_obs=robot_obs, scene_obs=scene_obs)
    if record:
        caption = " | ".join(eval_sequence)
        rollout_video.new_video(tag=get_video_tag(i), caption=caption)

    if cfg.debug:
        time.sleep(1)
        print()
        print()
        print(f"Evaluating sequence: {' -> '.join(eval_sequence)}")
        print("Subtask: ", end="")

    print("Evaluating sequence: ", eval_sequence)
    print('Initial state: ', initial_state)

    full_instruction = ", then ".join([val_annotations[subtask][0] for subtask in eval_sequence])
    print("The instruction ", full_instruction)

    if record:
        rollout_video.new_subtask()

    success = rollout(env, model, task_checker, cfg, high_level_task, full_instruction, processor=processor, record=record, rollout_video=rollout_video)

    if record:
        rollout_video.draw_outcome(success)

    return int(success)


def rollout(env, model, task_oracle, cfg, high_level_task, lang_annotation, processor, record=False, rollout_video=None):
    if cfg.debug:
        print(f"{subtask} ", end="")
        time.sleep(0.5)
    obs = env.get_obs()
    # get lang annotation for subtask
    #print("Instruction: ", lang_annotation)
    #print("Setting a different instruction!")
    #lang_annotation = 'put the blue block on top of red block'
    # get language goal embedding
    if processor == None:
        tasks = model.create_tasks(texts=[lang_annotation])
        from octo.utils.train_callbacks import supply_rng
        policy_fn = supply_rng(
                partial(
                    model.sample_actions,
                    unnormalization_statistics=model.dataset_statistics["action"],
                ),
            )
        window_size = 4
        act_step = 4

    #model.reset()
    start_info = env.get_info()
    past_obs = None

    for step in range(cfg.ep_len):
        if processor == None:
            if act_step > 0 and act_step % window_size == 0:
                act_step = 0
                
                static_2 = resize_image(obs['rgb_obs']['rgb_static'], (256, 256), primary_octo=True)
                #gripper_2 = resize_image(obs['rgb_obs']['rgb_gripper'], (128, 128))
                if past_obs:
                    static_1 = resize_image(past_obs['rgb_obs']['rgb_static'], (256, 256), primary_octo=True)
                    #gripper_1 = resize_image(past_obs['rgb_obs']['rgb_gripper'], (128, 128))
                    image_primary = np.stack([static_1, static_2])
                    #image_wrist = np.stack([gripper_1, gripper_2])
                    timestep_pad_mask = np.array([[True, True]])

                else:
                    image_primary = np.stack([np.zeros((256, 256, 3)), static_2])
                    #image_wrist = np.stack([np.zeros((128, 128, 3)), gripper_2])
                    timestep_pad_mask = np.array([[False, True]])
                pad_mask_dict = {
                    "image_primary": np.array([[True, True]]),
                    #"image_wrist": np.array([[True, True]]),
                    "timestep": np.array([[False, False]]),
                }
                
                #image_primary = np.expand_dims(resize_image(obs['rgb_obs']['rgb_static'], (256, 256), primary_octo=True), 0)
                #image_wrist = np.expand_dims(resize_image(obs['rgb_obs']['rgb_gripper'], (128, 128)), 0)
                #timestep_pad_mask = np.array([[True]])
                #pad_mask_dict = {
                #    "image_primary": np.array([[True]]),
                    #"image_wrist": np.array([[True]]),
                #    "timestep": np.array([[True]]),
                #}
                observation = {
                        "image_primary": np.expand_dims(image_primary, 0),  # uint8
                        #"image_wrist": np.expand_dims(image_wrist, 0),      # uint8
                        "timestep_pad_mask": timestep_pad_mask,
                        "pad_mask_dict": pad_mask_dict,
                        "timestep": np.array([[step-1, step]]),
                }
                act_buffer = policy_fn(observation, tasks)
                act_buffer = np.array(act_buffer[0])
                action = act_buffer[act_step]
            else:
                action = act_buffer[act_step]
            act_step += 1

        else:
            
            observation = {
                'full_image': resize_image(obs['rgb_obs']['rgb_static'], (224, 224))
            }

            action = get_action(
                cfg,
                model,
                observation,
                task_label=lang_annotation,
                processor=processor,
            )
        
        # Rescale actions from [0, 1] to [-1, 1] and binarize
        action = normalize_gripper_action(action)
        past_obs = obs
        obs, _, _, current_info = env.step(action)
        if record:
            # update video
            frame_aug = torch.zeros((3, 224, 448))
            resize = Resize(224, antialias=True)
            frame_aug[:, :, :224] = resize(torch.tensor(obs["rgb_obs"]["rgb_static"]).permute(2, 0, 1))
            closest_obs = 0
            if isinstance(closest_obs, int):
                closest_obs = torch.zeros((3, 224, 224))
            frame_aug[:, :, 224:] = closest_obs.squeeze()
            rollout_video.update(frame_aug.unsqueeze(0).unsqueeze(0))

        # check if current step solves a task
        current_task_info = task_oracle.get_task_info_for_set(start_info, current_info, {high_level_task})

        if len(current_task_info) > 0:
            if cfg.debug:
                print(colored("success", "green"), end=" ")
            if record:
                rollout_video.add_language_instruction(lang_annotation)
            return True
    if cfg.debug:
        print(colored("fail", "red"), end=" ")
    if record:
        rollout_video.add_language_instruction(lang_annotation)
    return False


In [19]:
calvin_cfg.ep_len = 200
calvin_cfg.num_sequences = 20
calvin_cfg.num_videos = 20

results, average_rate, success_rates, counters = evaluate_policy(model, env, calvin_cfg, 
                processor, 
                num_videos=calvin_cfg.num_videos, 
                save_dir=video_save_dir
                )


  0%|          | 0/20 [00:00<?, ?it/s]

Evaluating sequence:  ['grasp_slider', 'move_slider_left', 'ungrasp_slider']
Initial state:  {'led': 0, 'lightbulb': 0, 'slider': 'right', 'drawer': 'closed', 'red_block': 'table', 'blue_block': 'table', 'pink_block': 'slider_right', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
The instruction  grasp the slider handle, then move the handle to the left, then ungrasp the slider handle


1/5 : 0.0% | 2/5 : 0.0% | 3/5 : 0.0% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 0.0 |:   5%|▌         | 1/20 [00:26<08:29, 26.80s/it]

Evaluating sequence:  ['grasp_blue_block', 'lift_grasped_block']
Initial state:  {'led': 0, 'lightbulb': 0, 'slider': 'right', 'drawer': 'closed', 'red_block': 'table', 'blue_block': 'table', 'pink_block': 'slider_left', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
The instruction  grasp the blue block, then lift the grasped block


1/5 : 50.0% | 2/5 : 0.0% | 3/5 : 0.0% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 0.5 |:  10%|█         | 2/20 [00:30<03:53, 12.98s/it]

Evaluating sequence:  ['contact_pink_block_left', 'push_block_right']
Initial state:  {'led': 0, 'lightbulb': 0, 'slider': 'right', 'drawer': 'closed', 'red_block': 'table', 'blue_block': 'slider_right', 'pink_block': 'table', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
The instruction  touch the pink block on its left side, then push the block towards the right


1/5 : 33.3% | 2/5 : 0.0% | 3/5 : 0.0% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 0.3 |:  15%|█▌        | 3/20 [00:48<04:26, 15.65s/it]

Evaluating sequence:  ['grasp_red_block', 'lift_grasped_block']
Initial state:  {'led': 0, 'lightbulb': 0, 'slider': 'right', 'drawer': 'closed', 'red_block': 'table', 'blue_block': 'slider_right', 'pink_block': 'slider_left', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
The instruction  grasp the red block, then lift the grasped block


1/5 : 25.0% | 2/5 : 0.0% | 3/5 : 0.0% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 0.2 |:  20%|██        | 4/20 [01:08<04:35, 17.20s/it]

Evaluating sequence:  ['grasp_red_block', 'lift_grasped_block', 'place_grasped_block_over_slider', 'ungrasp_block']
Initial state:  {'led': 0, 'lightbulb': 0, 'slider': 'right', 'drawer': 'closed', 'red_block': 'table', 'blue_block': 'slider_left', 'pink_block': 'table', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
The instruction  grasp the red block, then lift the grasped block, then place the grasped block over the sliding cabinet, then ungrasp the block


1/5 : 20.0% | 2/5 : 0.0% | 3/5 : 0.0% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 0.2 |:  25%|██▌       | 5/20 [01:23<04:04, 16.33s/it]

Evaluating sequence:  ['grasp_red_block', 'rotate_grasped_block_right', 'ungrasp_block']
Initial state:  {'led': 0, 'lightbulb': 0, 'slider': 'right', 'drawer': 'closed', 'red_block': 'table', 'blue_block': 'slider_left', 'pink_block': 'slider_right', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
The instruction  grasp the red block, then rotate the grasped block 90 degrees to the right, then ungrasp the block


1/5 : 16.7% | 2/5 : 0.0% | 3/5 : 0.0% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 0.2 |:  30%|███       | 6/20 [01:37<03:40, 15.78s/it]

Evaluating sequence:  ['grasp_blue_block', 'lift_grasped_block']
Initial state:  {'led': 0, 'lightbulb': 0, 'slider': 'right', 'drawer': 'closed', 'red_block': 'slider_right', 'blue_block': 'table', 'pink_block': 'table', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
The instruction  grasp the blue block, then lift the grasped block


1/5 : 14.3% | 2/5 : 0.0% | 3/5 : 0.0% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 0.1 |:  35%|███▌      | 7/20 [01:52<03:19, 15.33s/it]

Evaluating sequence:  ['grasp_blue_block', 'lift_grasped_block', 'place_grasped_block_over_slider', 'ungrasp_block']
Initial state:  {'led': 0, 'lightbulb': 0, 'slider': 'right', 'drawer': 'closed', 'red_block': 'slider_right', 'blue_block': 'table', 'pink_block': 'slider_left', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
The instruction  grasp the blue block, then lift the grasped block, then place the grasped block over the sliding cabinet, then ungrasp the block


1/5 : 12.5% | 2/5 : 0.0% | 3/5 : 0.0% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 0.1 |:  40%|████      | 8/20 [02:09<03:10, 15.84s/it]

Evaluating sequence:  ['grasp_pink_block', 'lift_grasped_block', 'place_grasped_block_over_slider', 'ungrasp_block']
Initial state:  {'led': 0, 'lightbulb': 0, 'slider': 'right', 'drawer': 'closed', 'red_block': 'slider_right', 'blue_block': 'slider_left', 'pink_block': 'table', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
The instruction  grasp the pink block, then lift the grasped block, then place the grasped block over the sliding cabinet, then ungrasp the block


1/5 : 11.1% | 2/5 : 0.0% | 3/5 : 0.0% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 0.1 |:  45%|████▌     | 9/20 [02:26<03:00, 16.36s/it]

Evaluating sequence:  ['grasp_blue_block', 'rotate_grasped_block_left', 'ungrasp_block']
Initial state:  {'led': 0, 'lightbulb': 0, 'slider': 'right', 'drawer': 'closed', 'red_block': 'slider_left', 'blue_block': 'table', 'pink_block': 'table', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
The instruction  grasp the blue block, then rotate the grasped block 90 degrees to the left, then ungrasp the block


1/5 : 10.0% | 2/5 : 0.0% | 3/5 : 0.0% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 0.1 |:  50%|█████     | 10/20 [02:46<02:53, 17.31s/it]

Evaluating sequence:  ['grasp_slider', 'move_slider_left', 'ungrasp_slider']
Initial state:  {'led': 0, 'lightbulb': 0, 'slider': 'right', 'drawer': 'closed', 'red_block': 'slider_left', 'blue_block': 'table', 'pink_block': 'slider_right', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
The instruction  grasp the slider handle, then move the handle to the left, then ungrasp the slider handle


1/5 : 9.1% | 2/5 : 0.0% | 3/5 : 0.0% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 0.1 |:  55%|█████▌    | 11/20 [03:02<02:32, 16.97s/it] 

Evaluating sequence:  ['grasp_red_block', 'lift_grasped_block', 'place_grasped_block_over_pink_block', 'ungrasp_block']
Initial state:  {'led': 0, 'lightbulb': 0, 'slider': 'right', 'drawer': 'closed', 'red_block': 'slider_left', 'blue_block': 'slider_right', 'pink_block': 'table', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
The instruction  grasp the red block, then lift the grasped block, then place the grasped block over the pink block, then ungrasp the block


1/5 : 8.3% | 2/5 : 0.0% | 3/5 : 0.0% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 0.1 |:  60%|██████    | 12/20 [03:22<02:22, 17.85s/it]

Evaluating sequence:  ['grasp_blue_block', 'lift_grasped_block', 'place_grasped_block_over_drawer', 'ungrasp_block']
Initial state:  {'led': 0, 'lightbulb': 0, 'slider': 'right', 'drawer': 'open', 'red_block': 'table', 'blue_block': 'table', 'pink_block': 'slider_right', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
The instruction  grasp the blue block, then lift the grasped block, then place the grasped block over the drawer, then ungrasp the block


1/5 : 7.7% | 2/5 : 0.0% | 3/5 : 0.0% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 0.1 |:  65%|██████▌   | 13/20 [03:35<01:55, 16.57s/it]

Evaluating sequence:  ['contact_blue_block_right', 'push_block_left']
Initial state:  {'led': 0, 'lightbulb': 0, 'slider': 'right', 'drawer': 'open', 'red_block': 'table', 'blue_block': 'table', 'pink_block': 'slider_left', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
The instruction  touch the blue block on its right side, then push the block towards the left


1/5 : 7.1% | 2/5 : 0.0% | 3/5 : 0.0% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 0.1 |:  70%|███████   | 14/20 [03:50<01:35, 15.96s/it]

Evaluating sequence:  ['grasp_pink_block', 'lift_grasped_block']
Initial state:  {'led': 0, 'lightbulb': 0, 'slider': 'right', 'drawer': 'open', 'red_block': 'table', 'blue_block': 'slider_right', 'pink_block': 'table', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
The instruction  grasp the pink block, then lift the grasped block


1/5 : 13.3% | 2/5 : 0.0% | 3/5 : 0.0% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 0.1 |:  75%|███████▌  | 15/20 [03:53<01:00, 12.06s/it]

Evaluating sequence:  ['grasp_slider', 'move_slider_left', 'ungrasp_slider']
Initial state:  {'led': 0, 'lightbulb': 0, 'slider': 'right', 'drawer': 'open', 'red_block': 'table', 'blue_block': 'slider_right', 'pink_block': 'slider_left', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
The instruction  grasp the slider handle, then move the handle to the left, then ungrasp the slider handle


1/5 : 18.8% | 2/5 : 0.0% | 3/5 : 0.0% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 0.2 |:  80%|████████  | 16/20 [03:58<00:39,  9.90s/it]

Evaluating sequence:  ['grasp_pink_block', 'lift_grasped_block']
Initial state:  {'led': 0, 'lightbulb': 0, 'slider': 'right', 'drawer': 'open', 'red_block': 'table', 'blue_block': 'slider_left', 'pink_block': 'table', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
The instruction  grasp the pink block, then lift the grasped block


1/5 : 23.5% | 2/5 : 0.0% | 3/5 : 0.0% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 0.2 |:  85%|████████▌ | 17/20 [04:12<00:33, 11.19s/it]

Evaluating sequence:  ['contact_red_block_left', 'push_block_right']
Initial state:  {'led': 0, 'lightbulb': 0, 'slider': 'right', 'drawer': 'open', 'red_block': 'table', 'blue_block': 'slider_left', 'pink_block': 'slider_right', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
The instruction  touch the red block on its left side, then push the block towards the right


1/5 : 22.2% | 2/5 : 0.0% | 3/5 : 0.0% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 0.2 |:  90%|█████████ | 18/20 [04:33<00:28, 14.10s/it]

Evaluating sequence:  ['grasp_pink_block', 'lift_grasped_block', 'place_grasped_block_over_drawer', 'ungrasp_block']
Initial state:  {'led': 0, 'lightbulb': 0, 'slider': 'right', 'drawer': 'open', 'red_block': 'slider_right', 'blue_block': 'table', 'pink_block': 'table', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
The instruction  grasp the pink block, then lift the grasped block, then place the grasped block over the drawer, then ungrasp the block


1/5 : 21.1% | 2/5 : 0.0% | 3/5 : 0.0% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 0.2 |:  95%|█████████▌| 19/20 [04:53<00:15, 15.82s/it]

Evaluating sequence:  ['contact_blue_block_left', 'push_block_right']
Initial state:  {'led': 0, 'lightbulb': 0, 'slider': 'right', 'drawer': 'open', 'red_block': 'slider_right', 'blue_block': 'table', 'pink_block': 'slider_left', 'grasped': 0, 'contact': 0, 'red_block_lifted': 0, 'blue_block_lifted': 0, 'pink_block_lifted': 0}
The instruction  touch the blue block on its left side, then push the block towards the right


1/5 : 20.0% | 2/5 : 0.0% | 3/5 : 0.0% | 4/5 : 0.0% | 5/5 : 0.0% | Average: 0.2 |: 100%|██████████| 20/20 [05:13<00:00, 15.66s/it]


THE SAVE LOCATION /ubc/cs/research/nlp/grigorii/projects/openvla/experiments/robot/calvin/debug_videos/experiment_2_20250207_133938_small_2/_long_horizon_sequence_0_0.gif
THE SAVE LOCATION /ubc/cs/research/nlp/grigorii/projects/openvla/experiments/robot/calvin/debug_videos/experiment_2_20250207_133938_small_2/_long_horizon_sequence_1_0.gif
THE SAVE LOCATION /ubc/cs/research/nlp/grigorii/projects/openvla/experiments/robot/calvin/debug_videos/experiment_2_20250207_133938_small_2/_long_horizon_sequence_2_0.gif
THE SAVE LOCATION /ubc/cs/research/nlp/grigorii/projects/openvla/experiments/robot/calvin/debug_videos/experiment_2_20250207_133938_small_2/_long_horizon_sequence_3_0.gif
THE SAVE LOCATION /ubc/cs/research/nlp/grigorii/projects/openvla/experiments/robot/calvin/debug_videos/experiment_2_20250207_133938_small_2/_long_horizon_sequence_4_0.gif
THE SAVE LOCATION /ubc/cs/research/nlp/grigorii/projects/openvla/experiments/robot/calvin/debug_videos/experiment_2_20250207_133938_small_2/_long

ValueError: not enough values to unpack (expected 4, got 3)

In [None]:

default_config_file = "/ubc/cs/research/nlp/grigorii/projects/octo_original/octo/scripts/configs/finetune_config.py"
config_flags.DEFINE_config_file(
    "config",
    default_config_file,
    "File path to the training hyperparameter configuration.",
    lock_config=False,
)


# create a 1D mesh with a single axis named "batch"
mesh = Mesh(jax.devices(), axis_names="batch")
# Our batches will be data-parallel sharded -- each device will get a slice of the batch
dp_sharding = NamedSharding(mesh, PartitionSpec("batch"))
# Our model will be replicated across devices (we are only doing data parallelism, not model parallelism)
replicated_sharding = NamedSharding(mesh, PartitionSpec())

# prevent tensorflow from using GPU memory since it's only used for data loading
tf.config.set_visible_devices([], "GPU")

from scripts.configs.finetune_config import get_config
flags_config = get_config()
flags_config.pretrained_path = "hf://rail-berkeley/octo-small-1.5"
pretrained_model = OctoModel.load_pretrained(
    flags_config.pretrained_path,
    step=flags_config.pretrained_step,
)

flat_config = flax.traverse_util.flatten_dict(
    pretrained_model.config, keep_empty_nodes=True
)
for d_key in flax.traverse_util.flatten_dict(
    flags_config.get("config_delete_keys", ConfigDict()).to_dict()
):
    for c_key in list(flat_config.keys()):
        if ".".join(c_key).startswith(".".join(d_key)):
            del flat_config[c_key]

config = ConfigDict(flax.traverse_util.unflatten_dict(flat_config))
config.update(flags_config.get("update_config", ConfigDict()))
config = config.to_dict()
check_config_diff(config, pretrained_model.config)
flags_config.dataset_kwargs.proprio_obs_key = "proprio"
if config["text_processor"] is None:
    text_processor = None
else:
    text_processor = ModuleSpec.instantiate(config["text_processor"])()

def process_batch(batch):
    batch = process_text(batch, text_processor)
    del batch["dataset_name"]
    return batch

dataset = make_single_dataset(
    flags_config.dataset_kwargs,
    traj_transform_kwargs=flags_config.traj_transform_kwargs,
    frame_transform_kwargs=flags_config.frame_transform_kwargs,
    train=True,
)

train_data_iter = (
    dataset.repeat()
    .unbatch()
    .shuffle(flags_config.shuffle_buffer_size)
    .batch(flags_config.batch_size)
    .iterator()
)

train_data_iter = map(process_batch, train_data_iter)
example_batch = next(train_data_iter)

#########
#
# Load Pretrained Model
#
#########

rng = jax.random.PRNGKey(flags_config.seed)
rng, init_rng = jax.random.split(rng)
model = OctoModel.from_config(
    config,
    example_batch,
    text_processor,
    rng=init_rng,
    dataset_statistics=dataset.dataset_statistics,
)
merged_params = merge_params(model.params, pretrained_model.params)
model = model.replace(params=merged_params)
del pretrained_model

from experiments.robot.calvin.calvin_utils import get_calvin_env, add_text

env, calvin_cfg, lang_embeddings = get_calvin_env(
    "/ubc/cs/research/nlp/grigorii/projects/openvla/experiments/robot/calvin/conf/med_tasks_config.yaml",
    device_id=0,
)

import jax.numpy as jnp

import pyhash
hasher = pyhash.fnv1_32()
import numpy as  np
import torch
from torchvision.transforms import Resize


def normalize_gripper_action(action, binarize=True):
    """
    Changes gripper action (last dimension of action vector) from [0,1] to [-1,+1].
    Necessary for some environments (not Bridge) because the dataset wrapper standardizes gripper actions to [0,1].
    Note that unlike the other action dimensions, the gripper action is not normalized to [-1,+1] by default by
    the dataset wrapper.

    Normalization formula: y = 2 * (x - orig_low) / (orig_high - orig_low) - 1
    """
    # Just normalize the last action to [-1,+1].
    orig_low, orig_high = 0.0, 1.0
    action[..., -1] = 2 * (action[..., -1] - orig_low) / (orig_high - orig_low) - 1

    if binarize:
        # Binarize to -1 or +1.
        action[..., -1] = np.sign(action[..., -1])

    return action


def decode_actions(normalized_actions, model, unnorm_key, from_tokens=False):
    unnormalization_statistics = model.dataset_statistics["action"]
    mask = unnormalization_statistics.get(
                    "mask",
                    jnp.ones_like(unnormalization_statistics["mean"], dtype=bool),
                )
    action = normalized_actions[..., : len(mask)]
    action = jnp.where(
        mask,
        (action * unnormalization_statistics["std"])
        + unnormalization_statistics["mean"],
        action,
    )

    return action


def resize_image(img, resize_size):
    """
    Takes numpy array corresponding to a single image and returns resized image as numpy array.

    NOTE (Moo Jin): To make input images in distribution with respect to the inputs seen at training time, we follow
                    the same resizing scheme used in the Octo dataloader, which OpenVLA uses for training.
    """
    assert isinstance(resize_size, tuple)
    # Resize to image size expected by model
    img = tf.image.encode_jpeg(img)  # Encode as JPEG, as done in RLDS dataset builder
    img = tf.io.decode_image(img, expand_animations=False, dtype=tf.uint8)  # Immediately decode back
    img = tf.image.resize(img, resize_size, method="lanczos3", antialias=True)
    img = tf.cast(tf.clip_by_value(tf.round(img), 0, 255), tf.uint8)
    img = img.numpy()
    return img


def join_vis_lang(img, lang_text):
    """Takes as input an image and a language instruction and visualizes them with cv2"""
    img = img[:, :, ::-1].copy()
    img = cv2.resize(img, (500, 500))
    add_text(img, lang_text)
    cv2.imshow("simulation cam", img)
    cv2.waitKey(1)


def get_video_tag(i):
    return f"_long_horizon/sequence_{i}"


def get_env_state_for_initial_condition(initial_condition):
    robot_obs = np.array(
        [
            0.02586889,
            -0.2313129,
            0.5712808,
            3.09045411,
            -0.02908596,
            1.50013585,
            0.07999963,
            -1.21779124,
            1.03987629,
            2.11978254,
            -2.34205014,
            -0.87015899,
            1.64119093,
            0.55344928,
            1.0,
        ]
    )
    block_rot_z_range = (np.pi / 2 - np.pi / 8, np.pi / 2 + np.pi / 8)
    block_slider_left = np.array([-2.40851662e-01, 9.24044687e-02, 4.60990009e-01])
    block_slider_right = np.array([7.03416330e-02, 9.24044687e-02, 4.60990009e-01])
    block_table = [
        np.array([5.00000896e-02, -1.20000177e-01, 4.59990009e-01]),
        np.array([2.29995412e-01, -1.19995140e-01, 4.59990010e-01]),
    ]
    # we want to have a "deterministic" random seed for each initial condition
    # TODO: Figure this out
    #seed = hash(str(initial_condition.values()))
    seed = hasher(str(initial_condition.values()))
    #print("the seed? ", seed)
    with temp_seed(seed):
        np.random.shuffle(block_table)

        scene_obs = np.zeros(24)
        if initial_condition["slider"] == "left":
            scene_obs[0] = 0.28
        if initial_condition["drawer"] == "open":
            scene_obs[1] = 0.22
        if initial_condition["lightbulb"] == 1:
            scene_obs[3] = 0.088
        scene_obs[4] = initial_condition["lightbulb"]
        scene_obs[5] = initial_condition["led"]
        # red block
        if initial_condition["red_block"] == "slider_right":
            scene_obs[6:9] = block_slider_right
        elif initial_condition["red_block"] == "slider_left":
            scene_obs[6:9] = block_slider_left
        else:
            scene_obs[6:9] = block_table[0]
        scene_obs[11] = np.random.uniform(*block_rot_z_range)
        # blue block
        if initial_condition["blue_block"] == "slider_right":
            scene_obs[12:15] = block_slider_right
        elif initial_condition["blue_block"] == "slider_left":
            scene_obs[12:15] = block_slider_left
        elif initial_condition["red_block"] == "table":
            scene_obs[12:15] = block_table[1]
        else:
            scene_obs[12:15] = block_table[0]
        scene_obs[17] = np.random.uniform(*block_rot_z_range)
        # pink block
        if initial_condition["pink_block"] == "slider_right":
            scene_obs[18:21] = block_slider_right
        elif initial_condition["pink_block"] == "slider_left":
            scene_obs[18:21] = block_slider_left
        else:
            scene_obs[18:21] = block_table[1]
        scene_obs[23] = np.random.uniform(*block_rot_z_range)

    return robot_obs, scene_obs


def count_success(results):
    count = Counter(results)
    step_success = []
    for i in range(1, 6):
        n_success = sum(count[j] for j in reversed(range(i, 6)))
        sr = n_success / len(results)
        step_success.append(sr)
    return step_success


def evaluate_policy(model, env, traj_dataset, batch_transform, lang_embeddings, cfg, processor, num_videos=0, checkp_path=None, save_dir=None):
    task_oracle = hydra.utils.instantiate(cfg.tasks)
    val_annotations = cfg.annotations
    # video stuff
    if num_videos > 0:
        rollout_video = RolloutVideo(
            logger=logger,
            empty_cache=False,
            log_to_file=True,
            save_dir=cfg.video_save_dir,
            resolution_scale=1,
        )
    else:
        rollout_video = None

    evaluate_trajectories(
        env, model, traj_dataset, batch_transform, task_oracle, val_annotations, cfg, processor, rollout_video, num_videos
    )

    if num_videos > 0:
        # log rollout videos
        rollout_video._log_videos_to_file(0, save_as_video=False)
    return results, average_rate, plans

def transform_traj(traj):
    new_traj = []
    for tstep in range(traj['observation']['image_primary'].shape[0]):
        new_step = {}
        new_step['observation'] = {}
        new_step['observation']['image_primary'] = traj['observation']['image_primary'][tstep]
        new_step['observation']['image_wrist'] = traj['observation']['image_wrist'][tstep]
        new_step['observation']['timestep'] = traj['observation']['timestep'][tstep]
        new_step['observation']['proprio'] = traj['observation']['proprio'][tstep]

        new_step['observation']['pad_mask_dict'] = {}
        new_step['observation']['pad_mask_dict']['image_primary'] = traj['observation']['pad_mask_dict']['image_primary'][tstep]
        new_step['observation']['pad_mask_dict']['image_wrist'] = traj['observation']['pad_mask_dict']['image_wrist'][tstep]
        new_step['observation']['pad_mask_dict']['timestep'] = traj['observation']['pad_mask_dict']['timestep'][tstep]

        
        new_step['observation']['timestep_pad_mask'] = traj['observation']['timestep_pad_mask'][tstep]
        new_step['observation']['task_completed'] = traj['observation']['task_completed'][tstep]

        new_step['task'] = {}
        new_step['task']['language_instruction'] = traj['task']['language_instruction'][tstep]

        new_step['task']['pad_mask_dict'] = {}
        new_step['task']['pad_mask_dict']['language_instruction'] = traj['task']['pad_mask_dict']['language_instruction'][tstep]

        new_step['action'] = traj['action'][tstep]
        new_step['dataset_name'] = traj['dataset_name'][tstep]
        new_step['action_pad_mask'] = traj['action_pad_mask'][tstep]
        new_traj.append(new_step)
    return new_traj


def evaluate_trajectories(
    env, model, traj_dataset, batch_transform, task_checker, val_annotations, cfg, processor, rollout_video, num_videos
):

    for i, traj in enumerate(traj_dataset):
        if i > num_videos:
            break
        traj = transform_traj(traj)
        annotation = traj[0]['task']['language_instruction']
        annotation = annotation.numpy().decode("utf-8")
        rollout_video.new_video(tag=get_video_tag(i), caption=annotation)
        rollout_video.new_subtask()

        proprio = traj[0]['observation']['proprio']
        
        robot_obs, scene_obs = np.split(proprio.numpy()[0], indices_or_sections=[15])
        env.reset(robot_obs=robot_obs, scene_obs=scene_obs)

        rollout(env, traj, batch_transform, model, task_checker, cfg, annotation, processor=processor, record=True, rollout_video=rollout_video)

        rollout_video.write_to_tmp()


def rollout(env, traj, batch_transform, model, task_oracle, cfg, lang_annotation, processor, record=False, rollout_video=None):
    if cfg.debug:
        print(f"{subtask} ", end="")
        time.sleep(0.5)

    obs = env.get_obs()


    start_info = env.get_info()

    for step in range(len(traj)):
        print("Step: ", step)
        traj_step = traj[step]
        norm_action = traj_step['action'].numpy()
        unnorm_action = decode_actions(norm_action, model, unnorm_key="medium_level_tasks_dataset")
        unnorm_action = np.array(unnorm_action)
        # Rescale actions from [0, 1] to [-1, 1] and binarize
        action = normalize_gripper_action(unnorm_action)
        action = action[0][0]
        obs, _, _, current_info = env.step(action)

        if cfg.debug:
            img = env.render(mode="rgb_array")
            join_vis_lang(img, lang_annotation)
            # time.sleep(0.1)
        if record:
            # update video
            frame_aug = torch.zeros((3, 224, 448))
            resize_transform = Resize(224, antialias=True)

            frame_aug[:, :, :224] = resize_transform(torch.tensor(obs["rgb_obs"]["rgb_static"]).permute(2, 0, 1))
            closest_obs = 0
            if isinstance(closest_obs, int):
                closest_obs = torch.zeros((3, 224, 224))
            frame_aug[:, :, 224:] = closest_obs.squeeze()

            rollout_video.update(frame_aug.unsqueeze(0).unsqueeze(0))
        """
        # check if current step solves a task
        current_task_info = task_oracle.get_task_info_for_set(start_info, current_info, {subtask})
        if len(current_task_info) > 0:
            if cfg.debug:
                print(colored("success", "green"), end=" ")
            if record:
                rollout_video.add_language_instruction(lang_annotation)
            return True
        """
    if cfg.debug:
        print(colored("fail", "red"), end=" ")
    if record:
        rollout_video.add_language_instruction(lang_annotation)
    return False

import hydra 
from experiments.robot.calvin.rollout_video import RolloutVideo
import logging
logger = logging.getLogger(__name__)

calvin_cfg.video_save_dir = "/ubc/cs/research/nlp/grigorii/projects/octo_original/octo/experiments/replay_video_saves"


_, avg_rate, _ = evaluate_policy(model=model, env=env, traj_dataset=dataset, batch_transform=process_batch, lang_embeddings=None, cfg=calvin_cfg, 
                processor=None, num_videos=50)



def evaluate_policy(model, env, lang_embeddings, cfg, processor, num_videos=0, save_dir=None):
    task_oracle = hydra.utils.instantiate(cfg.tasks)
    val_annotations = cfg.annotations
    # video stuff
    if num_videos > 0:
        rollout_video = RolloutVideo(
            logger=logger,
            empty_cache=False,
            log_to_file=True,
            save_dir=save_dir,
            resolution_scale=1,
        )
    else:
        rollout_video = None

    eval_sequences = get_sequences(cfg.num_sequences)
    results = []
    plans = defaultdict(list)
    print(eval_sequences)

    if not cfg.debug:
        eval_sequences = tqdm(eval_sequences, position=0, leave=True)
    for i, (initial_state, eval_sequence) in enumerate(eval_sequences):
        record = i < num_videos
        result = evaluate_sequence(
            env, model, task_oracle, initial_state, eval_sequence, lang_embeddings, val_annotations, cfg, processor, record, rollout_video, i
        )

        results.append(result)
        if record:
            rollout_video.write_to_tmp()
        if not cfg.debug:
            success_rates = count_success(results)
            average_rate = sum(success_rates) / len(success_rates) * 5
            description = " ".join([f"{i + 1}/5 : {v * 100:.1f}% |" for i, v in enumerate(success_rates)])
            description += f" Average: {average_rate:.1f} |"
            eval_sequences.set_description(description)

    if num_videos > 0:
        # log rollout videos
        rollout_video._log_videos_to_file(0, save_as_video=False)
    return results, average_rate, plans


def evaluate_sequence(
    env, model, task_checker, initial_state, eval_sequence, lang_embeddings, val_annotations, cfg, processor, record, rollout_video, i
):
    robot_obs, scene_obs = get_env_state_for_initial_condition(initial_state)
    env.reset(robot_obs=robot_obs, scene_obs=scene_obs)
    if record:
        caption = " | ".join(eval_sequence)
        rollout_video.new_video(tag=get_video_tag(i), caption=caption)
    success_counter = 0
    if cfg.debug:
        time.sleep(1)
        print()
        print()
        print(f"Evaluating sequence: {' -> '.join(eval_sequence)}")
        print("Subtask: ", end="")
    for subtask in eval_sequence:
        print(eval_sequence)
        if record:
            rollout_video.new_subtask()
        success = rollout(env, model, task_checker, cfg, subtask, lang_embeddings, val_annotations, processor=processor, record=record, rollout_video=rollout_video)
        if record:
            rollout_video.draw_outcome(success)
        if success:
            success_counter += 1
        else:
            return success_counter
    return success_counter

def rollout(env, model, task_oracle, cfg, subtask, lang_embeddings, val_annotations, processor, record=False, rollout_video=None):
    if cfg.debug:
        print(f"{subtask} ", end="")
        time.sleep(0.5)
    obs = env.get_obs()
    # get lang annotation for subtask
    lang_annotation = val_annotations[subtask][0]
    # get language goal embedding
    goal = lang_embeddings.get_lang_goal(lang_annotation)
    goal['lang_text'] = lang_annotation
    if processor == None:
        tasks = model.create_tasks(texts=[lang_annotation])
        from octo.utils.train_callbacks import supply_rng
        policy_fn = supply_rng(
                partial(
                    model.sample_actions,
                    unnormalization_statistics=model.dataset_statistics["action"],
                ),
            )
        window_size = 4
        act_step = 4

    #model.reset()
    start_info = env.get_info()
    first_step = True
    past_obs = None


    for step in range(cfg.ep_len):
        if processor == None:
            if act_step > 0 and act_step % window_size == 0:
                act_step = 0
                """
                static_2 = resize_image(obs['rgb_obs']['rgb_static'], (256, 256), primary_octo=True)
                gripper_2 = resize_image(obs['rgb_obs']['rgb_gripper'], (128, 128))

                if past_obs:
                    static_1 = resize_image(past_obs['rgb_obs']['rgb_static'], (256, 256), primary_octo=True)
                    gripper_1 = resize_image(past_obs['rgb_obs']['rgb_gripper'], (128, 128))
                    image_primary = np.stack([static_1, static_2])
                    image_wrist = np.stack([gripper_1, gripper_2])
                    timestep_pad_mask = np.array([[False, True]])
                else:
                    image_primary = np.stack([np.zeros((256, 256, 3)), static_2])
                    image_wrist = np.stack([np.zeros((128, 128, 3)), gripper_2])
                    timestep_pad_mask = np.array([[False, True]])
                pad_mask_dict = {
                    "image_primary": np.array([[True, True]]),
                    "image_wrist": np.array([[True, True]]),
                    "timestep": np.array([[False, False]]),
                }
                """
                image_primary = np.expand_dims(resize_image(obs['rgb_obs']['rgb_static'], (256, 256), primary_octo=True), 0)
                image_wrist = np.expand_dims(resize_image(obs['rgb_obs']['rgb_gripper'], (128, 128)), 0)
                timestep_pad_mask = np.array([[True]])
                pad_mask_dict = {
                    "image_primary": np.array([[True]]),
                    "image_wrist": np.array([[True]]),
                    "timestep": np.array([[True]]),
                }
                observation = {
                        "image_primary": np.expand_dims(image_primary, 0),  # uint8
                        "image_wrist": np.expand_dims(image_wrist, 0),      # uint8
                        "timestep_pad_mask": timestep_pad_mask,
                        "pad_mask_dict": pad_mask_dict,
                        "timestep": np.array([[step]]),
                }
                act_buffer = policy_fn(observation, tasks)
                # Perform simple receding-horizon control (select the first action)
                act_buffer = np.array(act_buffer[0])
                action = act_buffer[act_step]
            else:
                action = act_buffer[act_step]
            act_step += 1

        else:
            observation = {
                'full_image': resize_image(obs['rgb_obs']['rgb_static'], (224, 224))
            }
            action = get_action(
                cfg,
                model,
                observation,
                task_label=lang_annotation,
                processor=processor,
            )
        
        # Rescale actions from [0, 1] to [-1, 1] and binarize
        action = normalize_gripper_action(action)
        past_obs = obs
        obs, _, _, current_info = env.step(action)
        if record:
            # update video
            frame_aug = torch.zeros((3, 224, 448))
            resize = Resize(224, antialias=True)
            frame_aug[:, :, :224] = resize(torch.tensor(obs["rgb_obs"]["rgb_static"]).permute(2, 0, 1))
            closest_obs = 0
            if isinstance(closest_obs, int):
                closest_obs = torch.zeros((3, 224, 224))
            frame_aug[:, :, 224:] = closest_obs.squeeze()

            rollout_video.update(frame_aug.unsqueeze(0).unsqueeze(0))
        # check if current step solves a task
        current_task_info = task_oracle.get_task_info_for_set(start_info, current_info, {subtask})
        if len(current_task_info) > 0:
            if cfg.debug:
                print(colored("success", "green"), end=" ")
            if record:
                rollout_video.add_language_instruction(lang_annotation)
            return True
    if cfg.debug:
        print(colored("fail", "red"), end=" ")
    if record:
        rollout_video.add_language_instruction(lang_annotation)
    return False

### Parsing the results

In [None]:
import os
import pickle
from collections import defaultdict
import numpy as np

ll_scores_path = '/ubc/cs/research/nlp/grigorii/projects/openvla/low_level_scores_sockeye/'

onebyone_res = {
    'avg_rates': defaultdict(list),
    'high_level_started': defaultdict(list),
    'high_level_completed': defaultdict(list),
    'low_level_started': defaultdict(list),
    'low_level_completed': defaultdict(list)
}
conjunction_res = {
    'avg_rates': defaultdict(list),
    'high_level_started': defaultdict(list),
    'high_level_completed': defaultdict(list)
}
random_res = {
    'avg_rates': defaultdict(list),
    'low_level_started': defaultdict(list),
    'low_level_completed': defaultdict(list),
    'chain_results': defaultdict(list)
}

medium_res = {
    'avg_rates': defaultdict(list),
    'chain_results': defaultdict(list)
}

medium_single_res = {
    'avg_rates': defaultdict(list),
    'low_level_started': defaultdict(list),
    'low_level_completed': defaultdict(list),
}


def get_model_size_key(res_path):
    if 'small' in res_path:
        return 'small'
    elif 'resnet' in res_path:
        return 'resnet'
    elif 'voltron' in res_path:
        return 'voltron'
    elif 'base' in res_path:
        return 'base'
    else:
        raise Exception("Unknown model")

for res_dict_name in os.listdir(ll_scores_path):
    if "_conj_" in res_dict_name:
        continue
    with open(ll_scores_path + res_dict_name, 'rb') as res_dict_file:
        res_dict = pickle.load(res_dict_file)
        if 'low_onebyone' in res_dict_name:
            results = res_dict[list(res_dict.keys())[0]]
            print(results)
            succ_rate, _, counters  = results
            res_key = get_model_size_key(res_dict_name)
            onebyone_res['avg_rates'][res_key].append(succ_rate)
            onebyone_res['high_level_started'][res_key].append(counters['high_level_started'])
            onebyone_res['high_level_completed'][res_key].append(counters['high_level_completed'])    
            onebyone_res['low_level_started'][res_key].append(counters['low_level_started'])
            onebyone_res['low_level_completed'][res_key].append(counters['low_level_completed'])    
        elif 'conjunction' in res_dict_name:
            results = res_dict[list(res_dict.keys())[0]]
            succ_rate, _, counters  = results
            res_key = get_model_size_key(res_dict_name)
            conjunction_res['avg_rates'][res_key].append(succ_rate)
            conjunction_res['high_level_started'][res_key].append(counters['high_level_started'])
            conjunction_res['high_level_completed'][res_key].append(counters['high_level_completed'])     
        elif 'low_random' in res_dict_name:
            results = res_dict[list(res_dict.keys())[0]]
            succ_rate, chain, counters  = results
            res_key = get_model_size_key(res_dict_name)
            random_res['avg_rates'][res_key].append(succ_rate)
            random_res['chain_results'][res_key].append(chain)
            random_res['low_level_started'][res_key].append(counters['low_level_started'])
            random_res['low_level_completed'][res_key].append(counters['low_level_completed'])    
        elif 'medium_random' in res_dict_name:
            results = res_dict[list(res_dict.keys())[0]]
            succ_rate, chain, counters  = results
            res_key = get_model_size_key(res_dict_name)
            medium_res['avg_rates'][res_key].append(succ_rate)
            medium_res['chain_results'][res_key].append(chain)
            medium_res['low_level_started'][res_key].append(counters['low_level_started'])
            medium_res['low_level_completed'][res_key].append(counters['low_level_completed'])   
        elif 'medium_single' in res_dict_name:
            results = res_dict[list(res_dict.keys())[0]]
            succ_rate, chain, counters  = results
            res_key = get_model_size_key(res_dict_name)
            medium_single_res['avg_rates'][res_key].append(succ_rate)
            medium_single_res['high_level_started'][res_key].append(counters['high_level_started'])
            medium_single_res['high_level_completed'][res_key].append(counters['high_level_completed'])   
        else:
            raise Exception('Unknown checkpoint type')


high_task_categories = {
    "rotate_red_block_right": "rotate",
    "rotate_red_block_left": "rotate",
    "rotate_blue_block_right": "rotate",
    "rotate_blue_block_left": "rotate",
    "rotate_pink_block_right": "rotate",
    "rotate_pink_block_left": "rotate",
    "push_red_block_right": "push",
    "push_red_block_left": "push",
    "push_blue_block_right": "push",
    "push_blue_block_left": "push",
    "push_pink_block_right": "push",
    "push_pink_block_left": "push",
    "move_slider_left": "slider/drawer",
    "move_slider_right": "slider/drawer",
    "open_drawer": "slider/drawer",
    "close_drawer": "slider/drawer",
    "lift_red_block_table": "lift",
    "lift_red_block_slider": "lift",
    "lift_red_block_drawer": "lift",
    "lift_blue_block_table": "lift",
    "lift_blue_block_slider": "lift",
    "lift_blue_block_drawer": "lift",
    "lift_pink_block_table": "lift",
    "lift_pink_block_slider": "lift",
    "lift_pink_block_drawer": "lift",
    "place_in_slider": "place",
    "place_red_block_in_slider": "place",
    "place_blue_block_in_slider": "place",
    "place_pink_block_in_slider": "place",
    "place_in_drawer": "place",
    "place_red_block_in_drawer": "place",
    "place_blue_block_in_drawer": "place",
    "place_pink_block_in_drawer": "place",
    "push_into_drawer": "push",
    "stack_block": "stacking",
}


low_task_categories = {
    "grasp_red_block": 'grasp_block',
    "grasp_blue_block": 'grasp_block',
    "grasp_pink_block": 'grasp_block',
    "grasp_slider": 'grasp_door',
    "grasp_drawer": 'grasp_door',
    "ungrasp_block": 'ungrasp',
    "ungrasp_slider": 'ungrasp',
    "ungrasp_drawer": 'ungrasp',
    "contact_red_block_left": 'contact',
    "contact_blue_block_left": 'contact',
    "contact_pink_block_left": 'contact',
    "contact_red_block_right":'contact',
    "contact_blue_block_right": 'contact',
    "contact_pink_block_right": 'contact',
    "place_grasped_block_over_red_block": 'place',
    "place_grasped_block_over_blue_block": 'place',
    "place_grasped_block_over_pink_block": 'place',
    "push_block_right": 'push',
    "push_block_left": 'push',
    "move_slider_left": 'drawer',
    "move_slider_right": 'drawer',
    "open_drawer": 'drawer',
    "close_drawer": 'drawer',
    "push_into_drawer": 'push',
    "lift_grasped_block": 'lift',
    "rotate_grasped_block_right": 'rotate',
    "rotate_grasped_block_left": 'rotate',
    "place_grasped_block_over_drawer": 'place',
    "place_grasped_block_over_slider": 'place',
    "place_grasped_block_over_table": 'place',
}
ordered_high_categories = ['slider/drawer', 'push', 'lift', 'rotate', 'place', 'stacking']
ordered_low_categories = ['grasp_door', 'drawer', 'grasp_block', 'contact', 'ungrasp', 'lift', 'push', 'rotate', 'place']

Exception: Unknown checkpoint type

In [13]:
onebyone_res['low_level_started']

defaultdict(list,
            {'voltron': [Counter({'lift_grasped_block': 142,
                       'grasp_red_block': 110,
                       'grasp_pink_block': 93,
                       'grasp_blue_block': 92,
                       'grasp_slider': 50,
                       'grasp_drawer': 49,
                       'ungrasp_block': 45,
                       'close_drawer': 36,
                       'rotate_grasped_block_left': 30,
                       'rotate_grasped_block_right': 27,
                       'contact_blue_block_left': 23,
                       'contact_red_block_left': 22,
                       'contact_pink_block_right': 17,
                       'contact_red_block_right': 15,
                       'contact_pink_block_left': 15,
                       'push_block_right': 14,
                       'contact_blue_block_right': 14,
                       'open_drawer': 13,
                       'move_slider_left': 11,
                       'push_bloc

In [14]:
print(set(low_task_categories.values()))

{'ungrasp', 'place', 'grasp_block', 'push', 'grasp_door', 'rotate', 'contact', 'lift', 'drawer'}


In [51]:
def parse_results(models, 
                  avg_rates, 
                  chain_results=None,
                  low_level_started=None, 
                  low_level_completed=None, 
                  high_level_started=None, 
                  high_level_completed=None, 
                  high_categories=None, 
                  low_categories=None,
                  ordered_high=None,
                  ordered_low=None):
    
    for model in models:
        mean, sd = np.mean(avg_rates[model]), np.std(avg_rates[model])
        print(model)
        total_mean_sd = '${}'.format(round(mean, 2)) + " \pm " + '{}$'.format(round(sd, 2))
        
        if chain_results is not None:
            chain_all = chain_results[model]
            chain_means = np.mean(chain_all, axis=0)
            chain_sd = np.std(chain_all, axis=0)
            chain_mean_sd = ["${} \pm {}$".format(round(chain_means[i], 2), round(chain_sd[i], 2)) for i in range(5)]
            chain_mean_sd_str = " & " + " & ".join(chain_mean_sd) + " & " + total_mean_sd + " \\\\"
            print("Chain values:")
            print(chain_mean_sd_str)


        high_started_per_category = {task_category: np.array([0, 0, 0]) for task_category in set(high_categories.values())}
        high_completed_per_category = {task_category: np.array([0, 0, 0]) for task_category in set(high_categories.values())}
        low_started_per_category = {task_category: np.array([0, 0, 0]) for task_category in set(low_categories.values())}
        low_completed_per_category = {task_category: np.array([0, 0, 0]) for task_category in set(low_categories.values())}

        high_mean_sd_per_category = {}
        low_mean_sd_per_category = {}

        # For each task in each category, get the number of times it was started and completed.
        for i in range(3):
            if high_level_started != None:
                for task in high_categories.keys():
                    high_started_counter = high_level_started[model][i]
                    if task in high_started_counter:
                        high_started_per_category[high_categories[task]][i] += high_started_counter[task]
                    high_finished_counter = high_level_completed[model][i]
                    high_completed_per_category[high_categories[task]][i] += high_finished_counter[task]

            if low_level_started != None:
                for task in low_categories.keys():
                    low_started_counter = low_level_started[model][i]
                    if task in low_started_counter:
                        low_started_per_category[low_categories[task]][i] += low_started_counter[task]

                    low_finished_counter = low_level_completed[model][i]
                    low_completed_per_category[low_categories[task]][i] += low_finished_counter[task]

        if high_level_started != None:
            avg_per_category = {}
            high_mean_across_categories = np.array([0., 0., 0.])

            for task_category in ordered_high:
                avg_per_category[task_category] = high_completed_per_category[task_category] / high_started_per_category[task_category] 
                high_mean_across_categories += avg_per_category[task_category]
                high_mean_sd_per_category[task_category] = '${} \pm {}$'.format(round(np.mean(avg_per_category[task_category]), 2), 
                                                                        round(np.std(avg_per_category[task_category]), 2))
            high_mean_across_categories /= len(ordered_high)
            high_mean_sd_per_category['avg'] = '${} \pm {}$'.format(round(np.mean(high_mean_across_categories), 2), round(np.std(high_mean_across_categories), 2))
            print("High:")
            print(" & " + " & ".join([high_mean_sd_per_category[category] for category in high_mean_sd_per_category.keys()]) + '\\')

        if low_level_started != None:

            avg_per_category = {}
            low_mean_across_categories =  np.array([0., 0., 0.])
            for task_category in ordered_low:
                print(task_category, low_completed_per_category[task_category], low_started_per_category[task_category] )
                avg_per_category[task_category] = low_completed_per_category[task_category] / low_started_per_category[task_category] 
                low_mean_across_categories += avg_per_category[task_category]
                low_mean_sd_per_category[task_category] = '${} \pm {}$'.format(round(np.mean(avg_per_category[task_category]), 2), 
                                                                        round(np.std(avg_per_category[task_category]), 2))
            low_mean_across_categories /= len(ordered_low)
            low_mean_sd_per_category['avg'] = '${} \pm {}$'.format(round(np.mean(low_mean_across_categories), 2), round(np.std(low_mean_across_categories), 2))
            print("Low:")
            print(" & " + " & ".join([low_mean_sd_per_category[category] for category in low_mean_sd_per_category.keys()]) + '\\')




In [None]:
print()
print()
print("====== ONE BY ONE ======")
parse_results(
    models=["resnet", "voltron"],
    avg_rates=onebyone_res['avg_rates'],
    low_categories=low_task_categories,
    high_categories=high_task_categories,
    low_level_started=onebyone_res['low_level_started'],
    low_level_completed=onebyone_res['low_level_completed'],
    high_level_started=onebyone_res['high_level_started'],
    high_level_completed=onebyone_res['high_level_completed'],
    ordered_high=ordered_high_categories,
    ordered_low=ordered_low_categories
)
print()
print()
print("====== CONJUNCTION ======")
parse_results(
    models=["resnet", "voltron"],
    avg_rates=conjunction_res['avg_rates'],
    high_level_started=conjunction_res['high_level_started'],
    high_level_completed=conjunction_res['high_level_completed'],
    low_categories=low_task_categories,
    high_categories=high_task_categories,
    ordered_high=ordered_high_categories,
    ordered_low=ordered_low_categories
)

print()
print()
print("====== RANDOM ======")
parse_results(
    models=["resnet", "voltron"],
    avg_rates=random_res['avg_rates'],
    chain_results=random_res['chain_results'],
    low_categories=low_task_categories,
    high_categories=high_task_categories,
    low_level_started=random_res['low_level_started'],
    low_level_completed=random_res['low_level_completed'],
    ordered_high=ordered_high_categories,
    ordered_low=ordered_low_categories
)

print()
print()
print("====== MEDIUM CHAIN ======")
parse_results(
    models=["resnet", "voltron"],
    avg_rates=medium_res['avg_rates'],
    chain_results=medium_res['chain_results'],
)

print()
print()
print("====== MEDIUM SINGLE ======")
parse_results(
    models=["resnet", "voltron"],
    avg_rates=medium_single_res['avg_rates'],
    low_categories=high_task_categories,
    low_level_started=medium_single_res['low_level_started'],
    low_level_completed=medium_single_res['low_level_completed'],
    ordered_low=ordered_high_categories
)





resnet
High:
 & $0.07 \pm 0.04$ & $0.02 \pm 0.0$ & $0.0 \pm 0.0$ & $0.49 \pm 0.07$ & $0.01 \pm 0.01$ & $0.0 \pm 0.0$ & $0.1 \pm 0.02$\
grasp_door [50 59 49] [99 99 99]
drawer [10 19  1] [50 59 49]
grasp_block [186 171 175] [295 295 295]
contact [2 3 4] [106 106 106]
ungrasp [42 46 25] [60 64 42]
lift [2 0 1] [133 118 122]
push [2 3 2] [2 3 4]
rotate [49 45 41] [53 53 53]
place [1 0 0] [1 0 0]
Low:
 & $0.53 \pm 0.05$ & $0.18 \pm 0.12$ & $0.6 \pm 0.02$ & $0.03 \pm 0.01$ & $0.67 \pm 0.05$ & $0.01 \pm 0.01$ & $0.83 \pm 0.24$ & $0.85 \pm 0.06$ & $nan \pm nan$ & $nan \pm nan$\
voltron
High:
 & $0.16 \pm 0.15$ & $0.08 \pm 0.06$ & $0.11 \pm 0.08$ & $0.33 \pm 0.07$ & $0.06 \pm 0.05$ & $0.0 \pm 0.0$ & $0.12 \pm 0.02$\
grasp_door [60 60 88] [99 99 99]
drawer [ 9 11 80] [60 60 88]
grasp_block [199 193 206] [295 295 295]
contact [24  9 27] [106 106 106]
ungrasp [36 25 58] [ 54  56 113]
lift [20 47  8] [142 138 163]
push [ 6  2 18] [24  9 27]
rotate [42 38 32] [57 55 43]
place [3 7 1] [10 17  3]
L

  avg_per_category[task_category] = low_completed_per_category[task_category] / low_started_per_category[task_category]


#### One-by-one results

In [None]:


#for model in ['small', 'base']:
for model in ['resnet', 'voltron']:
    mean, sd = np.mean(onebyone_res['avg_rates'][model]), np.std(onebyone_res['avg_rates'][model])
    print(model, '${:.2}'.format(mean), "\pm", '{:.1}$'.format(sd))
    
    high_started_per_category = {task_category: np.array([0, 0, 0]) for task_category in set(high_task_categories.values())}
    high_completed_per_category = {task_category: np.array([0, 0, 0]) for task_category in set(high_task_categories.values())}
    low_started_per_category = {task_category: np.array([0, 0, 0]) for task_category in set(low_task_categories.values())}
    low_completed_per_category = {task_category: np.array([0, 0, 0]) for task_category in set(low_task_categories.values())}

    high_mean_sd_per_category = {}
    low_mean_sd_per_category = {}

    # 1) Get the counts for each category, within a seed.
    for i in range(3):
        for task in high_task_categories.keys():
            high_started_counter = onebyone_res['high_level_started'][model][i]
            if task in high_started_counter:
                high_started_per_category[high_task_categories[task]][i] += high_started_counter[task]
            high_finished_counter = onebyone_res['high_level_completed'][model][i]
            high_completed_per_category[high_task_categories[task]][i] += high_finished_counter[task]

        for task in low_task_categories.keys():
            low_started_counter = onebyone_res['low_level_started'][model][i]
            if task in low_started_counter:
                low_started_per_category[low_task_categories[task]][i] += low_started_counter[task]

            low_finished_counter = onebyone_res['low_level_completed'][model][i]
            low_completed_per_category[low_task_categories[task]][i] += low_finished_counter[task]


    high_categories = ['slider/drawer', 'push', 'lift', 'rotate', 'place', 'stacking']
    avg_per_category = {}
    high_mean_across_categories = np.array([0., 0., 0.])

    for task_category in high_categories:
        avg_per_category[task_category] = high_completed_per_category[task_category] / high_started_per_category[task_category] 
        high_mean_across_categories += avg_per_category[task_category]
        high_mean_sd_per_category[task_category] = '${} \pm {}$'.format(round(np.mean(avg_per_category[task_category]), 2), 
                                                                   round(np.std(avg_per_category[task_category]), 2))
    high_mean_across_categories /= len(high_categories)
    high_mean_sd_per_category['avg'] = '${} \pm {}$'.format(round(np.mean(high_mean_across_categories), 2), round(np.std(high_mean_across_categories), 2))
    print("High:")
    print(" & " + " & ".join([high_mean_sd_per_category[category] for category in high_mean_sd_per_category.keys()]) + '\\')

    low_categories = ['grasp_door', 'drawer', 'grasp_block', 'contact', 'ungrasp', 'lift', 'push', 'rotate', 'place']
    avg_per_category = {}
    low_mean_across_categories =  np.array([0., 0., 0.])
    for task_category in low_categories:
        print(task_category, low_completed_per_category[task_category], low_started_per_category[task_category] )
        avg_per_category[task_category] = low_completed_per_category[task_category] / low_started_per_category[task_category] 
        low_mean_across_categories += avg_per_category[task_category]
        low_mean_sd_per_category[task_category] = '${} \pm {}$'.format(round(np.mean(avg_per_category[task_category]), 2), 
                                                                   round(np.std(avg_per_category[task_category]), 2))
    low_mean_across_categories /= len(low_categories)
    low_mean_sd_per_category['avg'] = '${} \pm {}$'.format(round(np.mean(low_mean_across_categories), 2), round(np.std(low_mean_across_categories), 2))
    print("Low:")
    print(" & " + " & ".join([low_mean_sd_per_category[category] for category in low_mean_sd_per_category.keys()]) + '\\')


    

resnet $0.66 \pm 0.04$
High:
 & $0.07 \pm 0.04$ & $0.02 \pm 0.0$ & $0.0 \pm 0.0$ & $0.49 \pm 0.07$ & $0.01 \pm 0.01$ & $0.0 \pm 0.0$ & $0.1 \pm 0.02$\
grasp_door [50 59 49] [99 99 99]
drawer [10 19  1] [50 59 49]
grasp_block [186 171 175] [295 295 295]
contact [2 3 4] [106 106 106]
ungrasp [42 46 25] [60 64 42]
lift [2 0 1] [133 118 122]
push [2 3 2] [2 3 4]
rotate [49 45 41] [53 53 53]
place [1 0 0] [1 0 0]
Low:
 & $0.53 \pm 0.05$ & $0.18 \pm 0.12$ & $0.6 \pm 0.02$ & $0.03 \pm 0.01$ & $0.67 \pm 0.05$ & $0.01 \pm 0.01$ & $0.83 \pm 0.24$ & $0.85 \pm 0.06$ & $nan \pm nan$ & $nan \pm nan$\
voltron $0.87 \pm 0.1$
High:
 & $0.16 \pm 0.15$ & $0.08 \pm 0.06$ & $0.11 \pm 0.08$ & $0.33 \pm 0.07$ & $0.06 \pm 0.05$ & $0.0 \pm 0.0$ & $0.12 \pm 0.02$\
grasp_door [60 60 88] [99 99 99]
drawer [ 9 11 80] [60 60 88]
grasp_block [199 193 206] [295 295 295]
contact [24  9 27] [106 106 106]
ungrasp [36 25 58] [ 54  56 113]
lift [20 47  8] [142 138 163]
push [ 6  2 18] [24  9 27]
rotate [42 38 32] [57 55 4

  avg_per_category[task_category] = low_completed_per_category[task_category] / low_started_per_category[task_category]


#### Conjunction results

In [13]:
#for model in ['small', 'base']:
for model in ['resnet', 'voltron']:
    mean, sd = np.mean(conjunction_res['avg_rates'][model]), np.std(conjunction_res['avg_rates'][model])
    print(model, '${:.2}'.format(mean), "\pm", '{:.1}$'.format(sd))
    
    started_per_category = {task_category: np.array([0, 0, 0]) for task_category in set(high_task_categories.values())}
    completed_per_category = {task_category: np.array([0, 0, 0]) for task_category in set(high_task_categories.values())}
    fraction_per_category = {}
    mean_sd_per_category = {}

    # 1) Get the counts for each category, within a seed.
    for i in range(3):

        for task in high_task_categories.keys():

            started_counter = conjunction_res['high_level_started'][model][i]
            if task in started_counter:
                started_per_category[high_task_categories[task]][i] += started_counter[task]
            finished_counter = conjunction_res['high_level_completed'][model][i]
            completed_per_category[high_task_categories[task]][i] += finished_counter[task]
        #rate_counter = started_counter / finished_counter



    high_categories = ['slider/drawer', 'push', 'lift', 'rotate', 'place', 'stacking']
    mean_across_categories = np.array([0., 0., 0.])

    for task_category in high_categories:
        print(task_category, completed_per_category[task_category], started_per_category[task_category])
        fraction_per_category[task_category] = completed_per_category[task_category] / started_per_category[task_category] 
        mean_across_categories += fraction_per_category[task_category]
        mean_sd_per_category[task_category] = '${} \pm {}$'.format(round(np.mean(fraction_per_category[task_category]), 2), 
                                                                   round(np.std(fraction_per_category[task_category]), 2))
        
    mean_across_categories = mean_across_categories / len(high_categories)

    mean_sd_per_category['avg'] = '${} \pm {}$'.format(round(np.mean(mean_across_categories), 2), round(np.std(mean_across_categories), 2))

    print(" & " + " & ".join([mean_sd_per_category[category] for category in mean_sd_per_category.keys()]) + '\\')
    

resnet $0.18 \pm 0.03$
slider/drawer [38 67 34] [99 99 99]
push [23 22 19] [106 106 106]
lift [ 7 13 10] [139 139 139]
rotate [16 12 12] [62 62 62]
place [0 0 0] [54 54 54]
stacking [0 0 0] [40 40 40]
 & $0.47 \pm 0.15$ & $0.2 \pm 0.02$ & $0.07 \pm 0.02$ & $0.22 \pm 0.03$ & $0.0 \pm 0.0$ & $0.0 \pm 0.0$ & $0.16 \pm 0.03$\
voltron $0.26 \pm 0.09$
slider/drawer [46 93 61] [99 99 99]
push [10 58 11] [106 106 106]
lift [26 26 18] [139 139 139]
rotate [12 19 13] [62 62 62]
place [0 0 0] [54 54 54]
stacking [0 0 2] [40 40 40]
 & $0.67 \pm 0.2$ & $0.25 \pm 0.21$ & $0.17 \pm 0.03$ & $0.24 \pm 0.05$ & $0.0 \pm 0.0$ & $0.02 \pm 0.02$ & $0.22 \pm 0.08$\


#### Random sequences results

In [None]:
#for model in ['small', 'base']:
for model in ['resnet', 'voltron']:   
    mean, sd = np.mean(random_res['avg_rates'][model]), np.std(random_res['avg_rates'][model])
    print(model)
    total_mean_sd = '${}'.format(round(mean, 2)) + " \pm " + '{}$'.format(round(sd, 2))
    chain_all = random_res['chain_results'][model]
    chain_means = np.mean(chain_all, axis=0)
    chain_sd = np.std(chain_all, axis=0)
    chain_mean_sd = ["${} \pm {}$".format(round(chain_means[i], 2), round(chain_sd[i], 2)) for i in range(5)]
    chain_mean_sd_str = " & ".join(chain_mean_sd) + " & " + total_mean_sd + " \\\\"
    print("Chain values:")
    print(chain_mean_sd_str)

    started_per_category = {task_category: np.array([0, 0, 0]) for task_category in set(low_task_categories.values())}
    completed_per_category = {task_category: np.array([0, 0, 0]) for task_category in set(low_task_categories.values())}
    fraction_per_category = {}
    mean_sd_per_category = {}

    # 1) Get the counts for each category, within a seed.
    for i in range(3):
        for task in low_task_categories.keys():
            started_counter = random_res['low_level_started'][model][i]
            if task in low_started_counter:
                started_per_category[low_task_categories[task]][i] += started_counter[task]

            finished_counter = random_res['low_level_completed'][model][i]
            completed_per_category[low_task_categories[task]][i] += low_finished_counter[task]


    low_categories = ['grasp_door', 'drawer', 'grasp_block', 'contact', 'ungrasp', 'lift', 'push', 'rotate', 'place']
    avg_per_category = {}
    low_mean_across_categories =  np.array([0., 0., 0.])
    low_mean_sd_per_category = {}
    for task_category in low_categories:
        print(task_category, completed_per_category[task_category],  started_per_category[task_category] )
        avg_per_category[task_category] = completed_per_category[task_category] / started_per_category[task_category] 
        low_mean_across_categories += avg_per_category[task_category]
        low_mean_sd_per_category[task_category] = '${} \pm {}$'.format(round(np.mean(avg_per_category[task_category]), 2), 
                                                                   round(np.std(avg_per_category[task_category]), 2))
    low_mean_across_categories /= len(low_categories)
    low_mean_sd_per_category['avg'] = '${} \pm {}$'.format(round(np.mean(low_mean_across_categories), 2), round(np.std(low_mean_across_categories), 2))
    print("Low:")
    print(" & " + " & ".join([low_mean_sd_per_category[category] for category in low_mean_sd_per_category.keys()]) + '\\')



resnet
Chain values:
$0.38 \pm 0.02$ & $0.22 \pm 0.02$ & $0.1 \pm 0.01$ & $0.04 \pm 0.0$ & $0.02 \pm 0.01$ & $0.75 \pm 0.04$ \\
grasp_door [110 119 137] [198 198 198]
drawer [19 30 81] [110 119 137]
grasp_block [385 364 381] [590 590 590]
contact [26 12 31] [212 212 212]
ungrasp [78 71 83] [114 120 155]
lift [22 47  9] [275 256 285]
push [ 8  5 20] [26 12 31]
rotate [91 83 73] [110 108  96]
place [4 7 1] [11 17  3]
Low:
 & $0.62 \pm 0.06$ & $0.34 \pm 0.18$ & $0.64 \pm 0.02$ & $0.11 \pm 0.04$ & $0.6 \pm 0.06$ & $0.1 \pm 0.06$ & $0.46 \pm 0.14$ & $0.79 \pm 0.03$ & $0.37 \pm 0.03$ & $0.45 \pm 0.03$\
voltron
Chain values:
$0.51 \pm 0.05$ & $0.26 \pm 0.05$ & $0.1 \pm 0.01$ & $0.05 \pm 0.01$ & $0.02 \pm 0.01$ & $0.94 \pm 0.11$ \\
grasp_door [170 179 225] [297 297 297]
drawer [ 28  41 161] [170 179 225]
grasp_block [584 557 587] [885 885 885]
contact [50 21 58] [318 318 318]
ungrasp [114  96 141] [168 176 268]
lift [42 94 17] [417 394 448]
push [14  7 38] [50 21 58]
rotate [133 121 105] [167 

In [None]:
#### Random sequences results

#### Medium results

In [None]:
#for model in ['small', 'base']:
for model in ['resnet', 'voltron']:
    mean, sd = np.mean(medium_res['avg_rates'][model]), np.std(medium_res['avg_rates'][model])
    print(model, '${:.2}'.format(mean), "\pm", '{:.1}$'.format(sd))
    
    started_per_category = {task_category: np.array([0, 0, 0]) for task_category in set(high_task_categories.values())}
    completed_per_category = {task_category: np.array([0, 0, 0]) for task_category in set(high_task_categories.values())}
    fraction_per_category = {}
    mean_sd_per_category = {}

    # 1) Get the counts for each category, within a seed.
    for i in range(3):

        for task in high_task_categories.keys():

            started_counter = medium_res['low_level_started'][model][i]
            if task in started_counter:
                started_per_category[high_task_categories[task]][i] += started_counter[task]
            finished_counter = medium_res['low_level_completed'][model][i]
            completed_per_category[high_task_categories[task]][i] += finished_counter[task]
        #rate_counter = started_counter / finished_counter



    high_categories = ['slider/drawer', 'push', 'lift', 'rotate', 'place', 'stacking']
    mean_across_categories = np.array([0., 0., 0.])

    for task_category in high_categories:
        print(task_category, completed_per_category[task_category], started_per_category[task_category])
        fraction_per_category[task_category] = completed_per_category[task_category] / started_per_category[task_category] 
        mean_across_categories += fraction_per_category[task_category]
        mean_sd_per_category[task_category] = '${} \pm {}$'.format(round(np.mean(fraction_per_category[task_category]), 2), 
                                                                   round(np.std(fraction_per_category[task_category]), 2))
        
    mean_across_categories = mean_across_categories / len(high_categories)

    mean_sd_per_category['avg'] = '${} \pm {}$'.format(round(np.mean(mean_across_categories), 2), round(np.std(mean_across_categories), 2))

    print(" & " + " & ".join([mean_sd_per_category[category] for category in mean_sd_per_category.keys()]) + '\\')


resnet $1.1 \pm 0.06$
slider/drawer [368 343 318] [402 382 391]
push [107 102  96] [186 183 178]
lift [61 57 57] [290 279 257]
rotate [28 24 33] [157 151 142]
place [0 0 0] [0 0 0]
stacking [0 0 0] [5 2 6]
 & $0.88 \pm 0.04$ & $0.56 \pm 0.01$ & $0.21 \pm 0.01$ & $0.19 \pm 0.03$ & $nan \pm nan$ & $0.0 \pm 0.0$ & $nan \pm nan$\
voltron $1.1 \pm 0.4$
slider/drawer [124 374 375] [299 399 394]
push [ 32 100  90] [135 201 193]
lift [ 46 178 114] [146 309 294]
rotate [11 35 31] [118 158 165]
place [0 0 0] [0 0 0]
stacking [0 0 0] [ 0 12  5]
 & $0.77 \pm 0.25$ & $0.4 \pm 0.12$ & $0.43 \pm 0.11$ & $0.17 \pm 0.05$ & $nan \pm nan$ & $nan \pm nan$ & $nan \pm nan$\


  fraction_per_category[task_category] = completed_per_category[task_category] / started_per_category[task_category]
