In [None]:
import os
import random
import numpy as np
import torch
import torchvision.transforms.functional as VF
import hydra
from omegaconf import OmegaConf
import h5py
import cv2
import matplotlib.pyplot as plt
from hydra import initialize, compose

from gazebot.utils import array2image, pcd2image
from gazebot.train import state_action_metrics, load_agent
from gazebot.agent import create_manipulation_agent

## Load Config

In [None]:
# Compose the Hydra config through the compose API (no @hydra.main entry point in a notebook).
with initialize(version_base=None, config_path="../config"):
    args = compose(config_name="config.yaml")
    args.hydra_base_dir = os.getcwd()
    # Fall back to CPU when no CUDA device is available.
    args.device = args.cuda_device if torch.cuda.is_available() else "cpu"


# Set seeds
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)

# Set device
device = torch.device(args.device)
if device.type == "cuda":
    if torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True
    # Change default device (e.g. .cuda() or torch.cuda.IntTensor(x)).
    # Only valid for CUDA devices: torch.cuda.set_device raises when handed a CPU device,
    # which would break the CPU fallback selected above.
    torch.cuda.set_device(device)

print(OmegaConf.to_yaml(args))

## Create Action Prediction Model (LocalBC)

In [None]:
# Normalization statistics: state_mean / state_std standardize the observed state before it is
# fed to the model, and action_pose_mean / action_pose_std are used by denormalize() to map
# predicted actions back to their original scale.
state_mean, state_std, action_pose_mean, action_pose_std = state_action_metrics(args)

In [None]:
# NOTE: Use small transformer for localbc
# These overrides are applied to `args` before create_manipulation_agent(args) is called.
# NOTE(review): presumably they must match the settings the LocalBC checkpoint was trained
# with, otherwise loading the weights will fail — confirm.
args.agent.enc_layers = 1
args.agent.dec_layers = 1
args.agent.dim_feedforward = 1024

In [None]:
# Build the manipulation agent from the (overridden) config; `agent.local_bc` is the model
# queried for action predictions in the visualization section below.
agent = create_manipulation_agent(args)

In [None]:
# The path below is a placeholder and must be replaced with the trained LocalBC model directory.
model_path = hydra.utils.to_absolute_path("path/to/localbc/model/dir") # NOTE Specify trained localbc path
# NOTE(review): ignore_error=False presumably makes loading fail loudly on a missing or
# mismatched checkpoint instead of continuing with untrained weights — confirm in load_agent.
load_agent(agent, model_path, args, ignore_error=False)

## Load Demo Dataset

In [None]:
# Collect every demonstration file (any file whose name contains "h5") from the training dirs.
data_dir = [os.path.expanduser(d) for d in args.expert.train_path]
data_files = sorted([os.path.join(d, f) for d in data_dir for f in os.listdir(d) if "h5" in f])

# One entry per (episode, sub-task) segment: the file plus the [init_step, change_step) range.
data_idxs = []
for eps_idx, episode_file in enumerate(data_files):
    with h5py.File(episode_file, "r") as e:
        # Skip unusable episodes instead of aborting the scan: a `break` here would silently
        # drop every remaining file as soon as one malformed episode is encountered.
        if "right_state" not in e:
            continue
        eps_steps = len(e["right_state"])
        if eps_steps < 2:
            continue

        # change_steps: steps in which gaze transition is occurred
        if "change_steps" in e:
            # Store plain Python ints so the values can be used directly as indices / slices.
            change_steps = [int(s) for s in e["change_steps"][1:]]
            # NOTE Remove gaze transition after the task is completed
            if len(change_steps) == args.expert.num_sub_task + 1:
                change_steps = change_steps[:-1]
        else:
            continue

        # Split the episode into segments at each change_step
        init_step = 0
        for sub_task_idx, change_step in enumerate(change_steps):
            # Add data to dataset
            data_idx = {"file": episode_file, "init_step": init_step, "change_step": change_step, "sub_task_idx": sub_task_idx}
            data_idxs.append(data_idx)
            init_step = change_step

In [None]:
def denormalize(action):
    """Map a normalized action tensor back to its original scale.

    Computes ``action * action_pose_std + action_pose_mean`` using the module-level
    statistics, which are converted to tensors with the dtype and device of ``action``.

    Args:
        action: torch tensor whose last dimension has the same length as
            ``action_pose_mean`` and ``action_pose_std``.

    Returns:
        A tensor of the same shape as ``action``.

    Raises:
        AssertionError: if the last dimension does not match the statistics' length.
    """
    n_dims = action.shape[-1]
    assert n_dims == len(action_pose_mean) == len(action_pose_std)

    # Build both statistics with the input's dtype/device so the arithmetic needs no casting.
    tensor_opts = {"dtype": action.dtype, "device": action.device}
    scale = torch.tensor(action_pose_std, **tensor_opts)
    offset = torch.tensor(action_pose_mean, **tensor_opts)
    return action * scale + offset

## (1) Visualize bottleneck step & localbc loss
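- Use this section to visualize bottleneck steps before the LocalBC loss has been saved with `save_localbc_loss.py`; the loss is computed on the fly by running the LocalBC model on every step.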

In [None]:
# For every (episode, sub-task) segment: run LocalBC on each step, compare the first predicted
# action with the demonstrated one, smooth/normalize the per-arm loss curves, pick a bottleneck
# step per arm, and display the curves, the camera image and the point cloud at that step.

# NOTE(review): presumably (W, H) derived from a (C, H, W) image_dim — confirm against the config.
image_size = args.env.image_dim[::-1][:-1]

LOSS_WEIGHTS = [5.0, 5.0, 5.0, 1.0, 1.0, 1.0, 0]  # 0.005]
# Squared per-dimension weights for the weighted mean-squared action error (loop invariant).
loss_weights_sq = np.array(LOSS_WEIGHTS) ** 2

measure_action_length = 10  # Step

# LR for each subtask: arm whose curves are displayed (0 -> left, anything else -> right)
LR = [0, 0]  # NOTE Specify this

# Detect bottleneck
THRESH = 1.0

data_localbc_losses = [[] for _ in range(args.expert.num_sub_task)]
for i, data_idx in enumerate(data_idxs):
    print(f"\n\nLoad episode {i}: {data_idx['file']}\n")

    with h5py.File(data_idx["file"], "r") as e:
        eps_steps = len(e["left_f_state"])

        init_step = int(data_idx["init_step"])
        change_step = int(data_idx["change_step"])
        sub_task_idx = int(data_idx["sub_task_idx"])
        print("init_step, change_step:", init_step, change_step)

        # Action
        action_left_arm = np.array(e["left_f_state"][init_step + 1 : change_step, :6])  # (N, ARM_DOF)
        action_left_gripper = np.array(
            e["left_f_hstate"][init_step : change_step - 1, [6]]
        )  # (N, GRIPPER_DOF), Use hstate as action for pseudo force control of the gripper
        action_right_arm = np.array(e["right_f_state"][init_step + 1 : change_step, :6])  # (N, ARM_DOF)
        action_right_gripper = np.array(
            e["right_f_hstate"][init_step : change_step - 1, [6]]
        )  # (N, GRIPPER_DOF), Use hstate as action for pseudo force control of the gripper
        action_seq = np.concatenate((action_left_arm, action_left_gripper, action_right_arm, action_right_gripper), axis=1)  # (N, state_dim)

        localbc_losses = []
        action_norms = []
        pcds = []
        # `step` is always the ABSOLUTE index into the episode's HDF5 datasets.
        for step in range(init_step, change_step - 1):
            ### Measure bottleneck score of sampled bottleneck step ###
            # Obs
            obs = {}
            obs["state"] = np.concatenate([e["left_f_state"][step], e["right_f_state"][step]], axis=0)  # (state_dim,)
            obs["state"] = (obs["state"] - state_mean) / state_std
            obs["image"] = np.transpose(np.array(e["left_img"][step]), (2, 0, 1)) / 255.0  # [0, 1], (C, H, W)
            obs["depth"] = np.transpose(e["depth_img"][step], (2, 0, 1)).astype(np.float32)  # mm, [0, inf), (1, H, W)

            if args.expert.bgr:
                obs["image"] = np.ascontiguousarray(obs["image"][[2, 1, 0]])  # BGR2RGB

            obs["state"] = torch.as_tensor(obs["state"], dtype=torch.float, device=device)
            obs["image"] = torch.as_tensor(obs["image"], dtype=torch.float, device=device)
            obs["image"] = VF.normalize(obs["image"], mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
            if obs["image"].shape[1:] != torch.Size(image_size[::-1]):
                obs["image"] = VF.resize(obs["image"], size=image_size[::-1])
            obs["depth"] = torch.as_tensor(obs["depth"], dtype=torch.float, device=device)
            if obs["depth"].shape[1:] != torch.Size(image_size[::-1]):
                obs["depth"] = VF.resize(obs["depth"], size=image_size[::-1])

            # Gaze
            gaze = np.array(e["gaze"][step])  # (N, 4), [W, H]
            gaze = np.clip(gaze, [0, 0, 0, 0], [image_size[0] - 1, image_size[1] - 1, image_size[0] - 1, image_size[1] - 1])

            gaze = torch.as_tensor(gaze, dtype=torch.long, device=device)

            # Batch
            obs["state"] = obs["state"].unsqueeze(0)
            obs["image"] = obs["image"].unsqueeze(0)
            obs["depth"] = obs["depth"].unsqueeze(0)
            gaze = gaze.unsqueeze(0)

            # Segment-relative index, used only for `action_seq` (which starts at init_step).
            # Kept separate from `step` so the HDF5 lookups below keep using absolute indices.
            rel_step = step - init_step

            # LocalBC
            with torch.no_grad():
                action_hat, pcd = agent.local_bc(obs, gaze, is_inference=True)

            action = action_seq[rel_step : rel_step + measure_action_length]  # (Ntraj, state_dim)
            action_hat = denormalize(action_hat)[0].detach().cpu().numpy()[: len(action)]  # (Ntraj, state_dim)

            # action prediction loss (first predicted action only; 7 dims per arm)
            left_loss_action = np.average((action_hat[0, :7] - action[0, :7]) ** 2, weights=loss_weights_sq)
            right_loss_action = np.average((action_hat[0, 7:] - action[0, 7:]) ** 2, weights=loss_weights_sq)

            # action norm: state change between this step and the next one (absolute indices)
            left_action_norm = np.linalg.norm(np.array(e["left_f_state"][step + 1]) - np.array(e["left_f_state"][step]))
            right_action_norm = np.linalg.norm(np.array(e["right_f_state"][step + 1]) - np.array(e["right_f_state"][step]))

            # Log history for visualize
            localbc_losses.append([left_loss_action, right_loss_action])
            action_norms.append([left_action_norm, right_action_norm])
            pcds.append(pcd[0])

        # A segment needs at least two evaluated steps: the default bottleneck step is
        # init_step + 1, and the arrays below must be 2-D for the per-arm column indexing.
        if len(localbc_losses) < 2:
            print(f"Skip: segment too short ({len(localbc_losses)} evaluated steps)")
            continue

        localbc_losses = np.array(localbc_losses)
        action_norms = np.array(action_norms)

        # Filter: moving average over a window of up to 9 steps (4 before, 4 after)
        filtered_localbc_losses = np.array(
            [np.mean(localbc_losses[max(0, k - 4) : min(k + 5, len(localbc_losses))], axis=0) for k in range(len(localbc_losses))]
        )

        # normalize scale (per arm, by the median of the filtered loss)
        normalized_localbc_losses = localbc_losses * 1 / np.median(filtered_localbc_losses, axis=0)
        normalized_filtered_localbc_losses = filtered_localbc_losses * 1 / np.median(filtered_localbc_losses, axis=0)

        # Detect bottleneck: choose the split that maximizes the number of steps that are
        # above THRESH before it plus the number of steps below THRESH from it onwards.
        MINIMUM_STEPS = int((change_step - init_step) * 0.35)
        bottleneck_step_lr = []
        for _lr in range(2):
            bottleneck_step = init_step + 1
            max_score = -np.inf
            for _step in range(init_step + 1, change_step - 2):
                score = np.sum(normalized_filtered_localbc_losses[: _step - init_step, _lr] > THRESH) + np.sum(
                    normalized_filtered_localbc_losses[_step - init_step :, _lr] < THRESH
                )
                if score > max_score:
                    max_score = score
                    bottleneck_step = _step
            # Reject bottlenecks that leave fewer than MINIMUM_STEPS before the gaze transition.
            if change_step - bottleneck_step < MINIMUM_STEPS:
                bottleneck_step = init_step + 1
            bottleneck_step_lr.append(bottleneck_step)

        # Arm to visualize for this sub-task
        side = 0 if LR[sub_task_idx] == 0 else 1
        shown_step = bottleneck_step_lr[side]

        # Normalized loss curves with the threshold (black) and the bottleneck step (red)
        plt.plot(normalized_localbc_losses[:, side], label="localbc")
        plt.plot(normalized_filtered_localbc_losses[:, side], label="filtered")
        plt.legend()
        plt.hlines([THRESH], xmin=0, xmax=len(localbc_losses), colors="black")
        plt.vlines([shown_step - init_step], ymin=-10, ymax=10, colors="red")
        plt.ylim(0, 3)
        plt.show()

        if side == 0:
            # The action-norm curve is only drawn for the left arm.
            plt.plot(action_norms[:, 0], label="action norm")
            plt.legend()
            # plt.hlines([THRESH], xmin=0, xmax=len(action_norms), colors="black")
            plt.vlines([shown_step - init_step], ymin=-10, ymax=10, colors="red")
            plt.ylim(0, 3)
            plt.show()

        # Camera image at the bottleneck step with the gaze position marked
        image = np.array(e["left_img"][shown_step])
        if args.expert.bgr:
            image = np.ascontiguousarray(image[:, :, [2, 1, 0]])  # BGR2RGB
        predict_gaze = np.array(e["gaze"][shown_step])[:2]  # np.array(e["predict_gaze"][shown_step])[:2]  # (2,)
        image = cv2.circle(image, (int(predict_gaze[0]), int(predict_gaze[1])), 20, (255, 255, 255), 5)
        display(array2image(image).resize((320, 180)))

        # Point cloud returned by LocalBC at the bottleneck step
        display(pcd2image(pcds[shown_step - init_step]))

        print("bottleneck_step_lr:", bottleneck_step_lr)

        plt.clf()
        plt.close()

        data_localbc_losses[sub_task_idx].append(filtered_localbc_losses)

## (2) Visualize bottleneck step & localbc loss
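- This is faster because it reads the precomputed `localbc_loss` from each episode file instead of running the model; it requires that the loss has already been saved with `save_localbc_loss.py`.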

In [None]:
# For every (episode, sub-task) segment that already contains a saved "localbc_loss" dataset:
# smooth/normalize the per-arm loss, pick a bottleneck step, and display the loss curve and
# the camera image at that step for the arm selected in LR.

# NOTE(review): presumably (W, H) derived from a (C, H, W) image_dim — confirm against the config.
image_size = args.env.image_dim[::-1][:-1]

# NOTE Specify this: arm to visualize for each sub-task (compared against the arm index _lr)
LR = [0, 0]

# Detect bottleneck
THRESH = 1.0

for i, data_idx in enumerate(data_idxs):
    # Only the first 51 segments are visualized.
    if i > 50:
        break

    print(f"\n\nLoad episode {i}: {data_idx['file']}\n")
    with h5py.File(data_idx["file"], "r") as e:
        eps_steps = len(e["left_f_state"])

        # Cast to plain ints so they are safe to use as dataset indices and slice bounds.
        init_step = int(data_idx["init_step"])
        change_step = int(data_idx["change_step"])
        sub_task_idx = int(data_idx["sub_task_idx"])
        print("sub_task_idx:", sub_task_idx, ", init_step, change_step:", init_step, change_step)

        if "localbc_loss" not in e:
            continue

        MINIMUM_STEPS = int((change_step - init_step) * 0.35)  # 25
        bottleneck_step_lr = []
        for _lr in range(2):
            localbc_loss = np.array(e["localbc_loss"][init_step:change_step, _lr])
            # Filter: moving average over a window of up to 9 steps (4 before, 4 after)
            filtered_localbc_loss = np.array([np.mean(localbc_loss[max(0, k - 4) : min(k + 5, len(localbc_loss))]) for k in range(len(localbc_loss))])
            # normalize scale (by the median of the filtered loss)
            normalized_localbc_loss = localbc_loss * 1 / np.median(filtered_localbc_loss)
            normalized_filtered_localbc_loss = filtered_localbc_loss * 1 / np.median(filtered_localbc_loss)

            # Bottleneck Detection: choose the split that maximizes the number of steps above
            # THRESH before it plus the number of steps below THRESH from it onwards.
            bottleneck_step = init_step + 1
            max_score = -np.inf
            for _step in range(init_step + 1, change_step - 2):
                score = np.sum(normalized_filtered_localbc_loss[: _step - init_step] > THRESH) + np.sum(
                    normalized_filtered_localbc_loss[_step - init_step :] < THRESH
                )
                if score > max_score:
                    max_score = score
                    bottleneck_step = _step
            # Reject bottlenecks that leave fewer than MINIMUM_STEPS before the gaze transition.
            if change_step - bottleneck_step < MINIMUM_STEPS:
                bottleneck_step = init_step + 1
            bottleneck_step_lr.append(bottleneck_step)

            if _lr == LR[sub_task_idx]:
                # Pcd
                # NOTE(review): obs / state / depth / gaze are not used further in this cell;
                # kept in case later cells rely on them — confirm and remove if not.
                obs = {}
                state = np.concatenate([e["left_f_state"][bottleneck_step], e["right_f_state"][bottleneck_step]], axis=0)  # (state_dim,)
                image = np.array(e["left_img"][bottleneck_step])
                depth = np.array(e["depth_img"][bottleneck_step][:, :, 0])  # mm, [0, inf)

                if args.expert.bgr:
                    image = np.ascontiguousarray(image[:, :, [2, 1, 0]])  # BGR2RGB

                gaze = np.array(e["gaze"][bottleneck_step])  # np.array(e["predict_gaze"][bottleneck_step])  # (4,), [W, H]
                gaze = np.clip(gaze, [0, 0, 0, 0], [image_size[0] - 1, image_size[1] - 1, image_size[0] - 1, image_size[1] - 1])

                # Visualize: normalized loss curves with the threshold (black) and bottleneck (red)
                plt.plot(normalized_localbc_loss, label="raw")
                plt.plot(normalized_filtered_localbc_loss, label="filtered")
                plt.hlines([THRESH], xmin=0, xmax=len(localbc_loss), colors="black")
                # plt.hlines([np.median(localbc_loss) * 1 / np.median(filtered_localbc_loss)], xmin=0, xmax=len(localbc_loss), colors="green")
                plt.ylim(0, 3)
                plt.vlines([bottleneck_step - init_step], ymin=-10, ymax=10, colors="red")
                plt.legend()
                plt.show()

                print("bottleneck_step:", bottleneck_step - init_step)

                # Mark the gaze position on the image loaded above. Prefer the predicted gaze
                # when the file provides it; otherwise fall back to the recorded gaze.
                gaze_key = "predict_gaze" if "predict_gaze" in e else "gaze"
                predict_gaze = np.array(e[gaze_key][bottleneck_step])[:2]  # (2,)
                # cv2.circle needs integer pixel coordinates for the center.
                image = cv2.circle(image, (int(predict_gaze[0]), int(predict_gaze[1])), 20, (255, 255, 255), 5)
                display(array2image(image).resize((320, 180)))

                plt.clf()
                plt.close()