<h1><center><u>IU-WeightedSAC -- Centauro Tray Environment</u></center></h1> 

In [1]:
import os
import joblib
import importlib
import numpy as np
import matplotlib.pyplot as plt

%matplotlib nbagg
# %matplotlib notebook

# from robolearn.envs.simple_envs.goal_composition import GoalCompositionEnv
# from robolearn_gym_envs.pybullet.manipulator2d3dof import Pusher2D3DofGoalCompoEnv
from robolearn_gym_envs.pybullet.centauro import CentauroTrayEnv
from robolearn.envs.normalized_box_env import NormalizedBoxEnv

from robolearn.torch.models import NNQFunction, NNVFunction
from robolearn.torch.models import NNMultiQFunction, NNMultiVFunction

from robolearn.torch.policies import TanhGaussianWeightedMultiPolicy

from robolearn.torch.rl_algos.sac.iu_weightedmultisac import IUWeightedMultiSAC

from robolearn.utils.data_management import MultiGoalReplayBuffer

from robolearn.utils.launchers.launcher_util import setup_logger
import robolearn.torch.pytorch_util as ptu
from robolearn.core import logger
from robolearn.utils.plots import get_csv_data
from robolearn.utils.plots import subplots

In [2]:


def create_environment(goal, tgt_pose, sim_timestep=0.01, frame_skip=1, seed=10):
    """Instantiate the Centauro tray environment inside a NormalizedBoxEnv wrapper.

    Args:
        goal: Indexable with at least two entries. It is used whole as the
            first goal pose; two more goal poses are derived from it by
            replacing one of its first two entries with the string 'any'.
        tgt_pose: Forwarded unchanged as the environment's ``tgt_pose``.
        sim_timestep: Forwarded as ``sim_timestep``.
        frame_skip: Forwarded as ``frame_skip``.
        seed: Forwarded as ``seed``.

    Returns:
        Tuple ``(env, env_params)``: the wrapped environment and the keyword
        dictionary that was used to construct the inner CentauroTrayEnv.
    """
    # Full goal first, then one goal pose per individual coordinate.
    goal_poses = [
        goal,
        (goal[0], 'any'),
        ('any', goal[1]),
    ]

    # Key order is kept stable because the caller logs this dictionary.
    env_params = {
        'is_render': False,
        'obs_with_img': False,
        'goal_poses': goal_poses,
        'rdn_goal_pose': True,
        'tgt_pose': tgt_pose,
        'rdn_tgt_object_pose': True,
        'sim_timestep': sim_timestep,
        'frame_skip': frame_skip,
        'obs_distances': True,
        'tgt_cost_weight': 1.0,
        'goal_cost_weight': 1.5,
        'ctrl_cost_weight': 1.0e-4,
        'use_log_distances': True,
        'log_alpha': 1.e-1,
        'max_time': None,
        'subtask': None,
        'seed': seed,
    }

    tray_env = CentauroTrayEnv(**env_params)

    # Wrap the raw environment; observation normalization stays disabled.
    wrapped_env = NormalizedBoxEnv(
        tray_env,
        normalize_obs=False,
        online_normalization=False,
        obs_mean=None,
        obs_var=None,
        obs_alpha=0.001,
    )

    # To inspect the scene manually, call reset(), render() and close() on
    # the returned environment.
    return wrapped_env, env_params

In [3]:
def create_value_fcns(net_size=32, n_hidden=2, n_shared_hidden=0):
    """Build the six value networks handed to the algorithm.

    ``obs_dim``, ``action_dim`` and ``n_unintentions`` are read from the
    notebook's global namespace, so they must be defined before this is called.

    Args:
        net_size: Size used for every hidden layer.
        n_hidden: Length of ``hidden_sizes`` (single networks) and of
            ``unshared_hidden_sizes`` (multi networks).
        n_shared_hidden: Length of ``shared_hidden_sizes`` (multi networks).

    Returns:
        Tuple ``(u_qf, u_qf2, u_vf, i_qf, i_qf2, i_vf)``: the unintentional
        (sub-task) Q, second Q and V functions, followed by the intentional
        (full task) Q, second Q and V functions.
    """
    def layer_sizes(n_layers):
        # A fresh list per call, so no two networks share one list object.
        return [net_size] * n_layers

    def build_unintentional_q():
        return NNMultiQFunction(
            obs_dim=obs_dim,
            action_dim=action_dim,
            n_qs=n_unintentions,
            shared_hidden_sizes=layer_sizes(n_shared_hidden),
            unshared_hidden_sizes=layer_sizes(n_hidden),
        )

    def build_intentional_q():
        return NNQFunction(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_sizes=layer_sizes(n_hidden),
        )

    # Unintentional value functions --> environment sub-goals (sub-tasks).
    # The networks are created in the same order as they are returned.
    u_qf = build_unintentional_q()
    u_qf2 = build_unintentional_q()
    u_vf = NNMultiVFunction(
        obs_dim=obs_dim,
        n_vs=n_unintentions,
        shared_hidden_sizes=layer_sizes(n_shared_hidden),
        unshared_hidden_sizes=layer_sizes(n_hidden),
    )

    # Intentional value functions --> environment goal (full task).
    i_qf = build_intentional_q()
    i_qf2 = build_intentional_q()
    i_vf = NNVFunction(
        obs_dim=obs_dim,
        hidden_sizes=layer_sizes(n_hidden),
    )

    return u_qf, u_qf2, u_vf, i_qf, i_qf2, i_vf

In [4]:
def create_policy(
    net_size=32,
    n_shared_layers=0,
    n_unshared_layers=2,
    n_mix_layers=2,
    shared_norm=False,
    unshared_norm=False,
    mix_norm=False,
):
    """Build the weighted multi-policy with one sub-policy per sub-task.

    ``obs_dim``, ``action_dim`` and ``n_unintentions`` are read from the
    notebook's global namespace.

    Args:
        net_size: Size used for every hidden layer.
        n_shared_layers: Length of ``shared_hidden_sizes``.
        n_unshared_layers: Length of ``unshared_hidden_sizes``.
        n_mix_layers: Length of ``unshared_mix_hidden_sizes``.
        shared_norm: Forwarded as ``shared_layer_norm``.
        unshared_norm: Forwarded as ``unshared_layer_norm``.
        mix_norm: Forwarded as ``mixture_layer_norm``.

    Returns:
        The constructed TanhGaussianWeightedMultiPolicy.
    """
    # One independent list per network section.
    shared_sizes = [net_size] * n_shared_layers
    unshared_sizes = [net_size] * n_unshared_layers
    mixing_sizes = [net_size] * n_mix_layers

    return TanhGaussianWeightedMultiPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        n_policies=n_unintentions,
        shared_hidden_sizes=shared_sizes,
        unshared_hidden_sizes=unshared_sizes,
        unshared_mix_hidden_sizes=mixing_sizes,
        stds=None,
        shared_layer_norm=shared_norm,
        unshared_layer_norm=unshared_norm,
        mixture_layer_norm=mix_norm,
        mixing_temperature=1.,
        reparameterize=True,
    )

In [5]:
def create_replay_buffer(env, replay_buffer_size):
    """Create a MultiGoalReplayBuffer dimensioned for ``env``.

    Args:
        env: Environment exposing ``n_subgoals``, ``observation_space.shape``
            and ``action_space.shape``.
        replay_buffer_size: Forwarded as ``max_replay_buffer_size``.

    Returns:
        The constructed MultiGoalReplayBuffer.
    """
    # One reward entry per sub-goal; spaces are flattened to a single size.
    subgoal_count = env.n_subgoals
    flat_obs_dim = np.prod(env.observation_space.shape)
    flat_action_dim = np.prod(env.action_space.shape)

    return MultiGoalReplayBuffer(
        max_replay_buffer_size=replay_buffer_size,
        obs_dim=flat_obs_dim,
        action_dim=flat_action_dim,
        reward_vector_size=subgoal_count,
    )

In [6]:
def create_algorithm(script_params, algo_hyperparams, seed=10, use_gpu=False):
    """Seed everything, select the device mode and build IUWeightedMultiSAC.

    Args:
        script_params: Keyword arguments for the algorithm; must contain the
            environment under the key ``'env'``.
        algo_hyperparams: Further keyword arguments for the algorithm.
        seed: Seed given to both the environment and ``ptu``.
        use_gpu: Forwarded to ``ptu.set_gpu_mode``.

    Returns:
        The constructed algorithm; ``cuda()`` has been called on it when
        ``ptu.gpu_enabled()`` reports true.
    """
    # Seed the environment first, then the torch utilities.
    script_params['env'].seed(seed)
    ptu.seed(seed)

    # The device mode is set before the algorithm is instantiated.
    ptu.set_gpu_mode(use_gpu)

    algo = IUWeightedMultiSAC(**script_params, **algo_hyperparams)

    if not ptu.gpu_enabled():
        return algo

    algo.cuda()
    return algo

# IU-SAC

In [7]:
# Experiment-level settings
seed = 10
Tend = 3.0  # Episode duration, in seconds
sim_timestep = 0.01
frame_skip = 1

# Task definition forwarded to create_environment
goal = (0.65, 0.65)
tgt_pose = (0.5, 0.25, 1.466)

env, env_params = create_environment(
    goal=goal,
    tgt_pose=tgt_pose,
    sim_timestep=sim_timestep,
    frame_skip=frame_skip,
    seed=seed,
)

In [8]:
# Common parameters: globals read by create_value_fcns and create_policy
obs_dim = np.prod(env.observation_space.shape)
action_dim = np.prod(env.action_space.shape)
# One unintentional (sub-task) head per environment sub-goal
n_unintentions = env.n_subgoals

In [9]:
# Value-function hyperparameters: size of every hidden layer
net_size = 32
(u_qf, u_qf2, u_vf,
 i_qf, i_qf2, i_vf) = create_value_fcns(net_size=net_size)

In [10]:
# Policy hyperparameters; every key is a keyword argument of create_policy
policy_hyperparams = {
    'net_size': 32,
    'n_shared_layers': 0,
    'n_unshared_layers': 2,
    'n_mix_layers': 2,
    'shared_norm': False,
    'unshared_norm': False,
    'mix_norm': False,
}
policy = create_policy(**policy_hyperparams)

In [11]:
# Replay buffer; the size below is forwarded as max_replay_buffer_size
replay_buffer_size = 1e6
replay_buffer = create_replay_buffer(
    env=env,
    replay_buffer_size=replay_buffer_size,
)

In [12]:
# --------- #
# Algorithm #
# --------- #

# Notebook hyperparameters
render = False
n_epochs = 1500
batch_size = 256

reward_scale = 0.7
u_reward_scales = [0.7, 0.7]

i_entropy_scale = 1.0
u_entropy_scale = [1.0, 1.0]

paths_per_epoch = 5
paths_per_eval = 3
# Duration of one environment step and the resulting steps per path
dt = sim_timestep * frame_skip
path_length = int(np.ceil(Tend / dt))

log_tensorboard = False

# ALGORITHM HYPERPARAMETERS
# (insertion order is kept because this dictionary is logged as the variant)
algo_hyperparams = {
    # Common RL algorithm params
    'num_steps_per_epoch': paths_per_epoch * path_length,
    'num_epochs': n_epochs,
    'num_updates_per_train_call': 1,
    'num_steps_per_eval': paths_per_eval * path_length,
    # EnvSampler params
    'max_path_length': path_length,
    'render': render,
    # SAC params
    'min_steps_start_train': batch_size,
    'min_start_eval': paths_per_epoch * path_length,
    'reparameterize': True,
    'action_prior': 'uniform',
    'i_entropy_scale': i_entropy_scale,
    'u_entropy_scale': u_entropy_scale,
    # Returns
    'discount': 0.99,
    'reward_scale': reward_scale,
    'u_reward_scales': u_reward_scales,
    # Logging
    'log_tensorboard': log_tensorboard,
}

# SCRIPT HYPERPARAMETERS
script_params = {
    'env': env,
    'policy': policy,
    'u_qf': u_qf,
    'u_vf': u_vf,
    'replay_buffer': replay_buffer,
    'batch_size': batch_size,
    'i_qf': i_qf,
    'i_vf': i_vf,
    'u_qf2': u_qf2,
    'i_qf2': i_qf2,
    # The training environment is reused for evaluation.
    'eval_env': env,
    'save_environment': False,
}

# Add env_params temporarily so that it is logged together with the variant
algo_hyperparams['env_params'] = env_params

# Logger
# NOTE(review): the '2d3dofpusher' prefix looks inherited from another
# notebook; confirm whether the log directory of this Centauro tray
# experiment should use a different name.
log_dir = setup_logger(
    'notebook_2d3dofpusher_' + str(IUWeightedMultiSAC.__name__),
    variant=algo_hyperparams,
    snapshot_mode='gap_and_last',
    snapshot_gap=25,
    log_dir=None,
    log_stdout=False,
)
print('Log directory is:', log_dir)

# Remove env_params again: create_algorithm unpacks this dictionary into the
# algorithm constructor. The popped value is what this cell displays.
algo_hyperparams.pop('env_params')


2018-09-24 16:40:43.330147 CEST | Variant:
2018-09-24 16:40:43.330730 CEST | {
  "num_steps_per_epoch": 1500,
  "num_epochs": 1500,
  "num_updates_per_train_call": 1,
  "num_steps_per_eval": 900,
  "max_path_length": 300,
  "render": false,
  "min_steps_start_train": 256,
  "min_start_eval": 1500,
  "reparameterize": true,
  "action_prior": "uniform",
  "i_entropy_scale": 1.0,
  "u_entropy_scale": [
    1.0,
    1.0
  ],
  "discount": 0.99,
  "reward_scale": 0.7,
  "u_reward_scales": [
    0.7,
    0.7
  ],
  "log_tensorboard": false,
  "env_params": {
    "is_render": false,
    "obs_with_img": false,
    "goal_poses": "[(0.65, 0.65), (0.65, 'any'), ('any', 0.65)]",
    "rdn_goal_pose": true,
    "tgt_pose": [
      0.5,
      0.25,
      1.466
    ],
    "rdn_tgt_object_pose": true,
    "sim_timestep": 0.01,
    "frame_skip": 1,
    "obs_distances": true,
    "tgt_cost_weight": 1.0,
    "goal_cost_weight": 1.5,
    "ctrl_cost_weight": 0.0001,
    "use_log_distances": true,
    "log_a

{'is_render': False,
 'obs_with_img': False,
 'goal_poses': [(0.65, 0.65), (0.65, 'any'), ('any', 0.65)],
 'rdn_goal_pose': True,
 'tgt_pose': (0.5, 0.25, 1.466),
 'rdn_tgt_object_pose': True,
 'sim_timestep': 0.01,
 'frame_skip': 1,
 'obs_distances': True,
 'tgt_cost_weight': 1.0,
 'goal_cost_weight': 1.5,
 'ctrl_cost_weight': 0.0001,
 'use_log_distances': True,
 'log_alpha': 0.1,
 'max_time': None,
 'subtask': None,
 'seed': 10}

In [None]:
# Epoch index handed to algorithm.train
start_epoch = 0
# Forwarded to ptu.set_gpu_mode inside create_algorithm
use_gpu = True

algorithm = create_algorithm(
    script_params=script_params,
    algo_hyperparams=algo_hyperparams,
    seed=seed,
    use_gpu=use_gpu,
)

algorithm.train(start_epoch=start_epoch, train_bar=True)

HBox(children=(IntProgress(value=0, max=1500), HTML(value='')))

In [None]:
# Test: roll out one episode with the learned policy and render it
deterministic = True
subpolicy = 1  # None or int: index of the sub-policy passed as pol_idx
print('Max path length:', path_length)

env.close()
obs = env.reset()
for t in range(path_length):
    env.render()
    # The `deterministic` flag was previously defined but never forwarded,
    # so the rollout silently used the stochastic policy.
    # NOTE(review): assumes get_action forwards a `deterministic` keyword to
    # the policy's forward pass -- confirm against the policy class.
    action, pol_info = policy.get_action(
        obs,
        deterministic=deterministic,
        pol_idx=subpolicy,
    )
    obs, reward, done, env_info = env.step(action)
    if done:
        print('Environment done!')
        break

In [None]:
def plot_v_fcn():
    """Draw contour maps of the intentional and unintentional V-functions.

    A regular grid of two-dimensional points is evaluated with the notebook
    globals ``i_vf`` (one figure) and ``u_vf`` (one subplot for each of the
    ``n_unintentions`` sub-tasks).

    NOTE(review): the grid is hard-coded to 2-D inputs in [-7.7, 7.7]^2;
    confirm that this matches the observation space of the environment in use.
    """
    xlim = (-7, 7)
    ylim = (-7, 7)
    delta = 0.01
    # Evaluate on a region 10% larger than the displayed limits.
    x_min, x_max = tuple(1.1 * np.array(xlim))
    y_min, y_max = tuple(1.1 * np.array(ylim))
    all_x = np.arange(x_min, x_max, delta)
    all_y = np.arange(y_min, y_max, delta)
    # Default 'xy' indexing: both arrays have shape (len(all_y), len(all_x)).
    grid_x, grid_y = np.meshgrid(all_x, all_y)
    # Flatten in row-major order so that the values can be reshaped straight
    # back onto the mesh. The previous x-major ordering made every plotted
    # map the transpose of the real one.
    all_obs = np.column_stack((grid_x.ravel(), grid_y.ravel()))

    def plot_v_contours(ax, values):
        """Draw labelled contours and a translucent image of ``values``."""
        values = values.reshape(grid_x.shape)

        contours = ax.contour(grid_x, grid_y, values, 20,
                              colors='dimgray')
        ax.clabel(contours, inline=1, fontsize=10, fmt='%.0f')
        ax.imshow(values, extent=(x_min, x_max, y_min, y_max), origin='lower',
                  alpha=0.5)

        ax.set_xlim(xlim)
        ax.set_ylim(ylim)

        ax.set_xlabel('X')
        ax.set_ylabel('Y')
        ax.axis('equal')
        ax.set_aspect('equal', 'box')

    # Compute and plot Main Task Value-fcn
    values, _ = i_vf.get_values(all_obs)

    fig = plt.figure()
    ax = fig.add_subplot(111)

    ax.set_title('V-function Main Task')
    plot_v_contours(ax, values)

    # Compute and plot Sub-tasks Value-fcn
    n_cols = 2
    n_rows = int(np.ceil(n_unintentions/n_cols))
    subgoals_fig, subgoals_axs = plt.subplots(n_rows, n_cols)

    # A single row comes back as a 1-D array; make indexing uniform.
    subgoals_axs = np.atleast_2d(subgoals_axs)

    for aa in range(n_unintentions):
        row = aa // n_cols
        col = aa % n_cols
        subgo_ax = subgoals_axs[row, col]
        values, _ = u_vf.get_values(all_obs, val_idxs=[aa])
        # Only one index was requested, so take the first returned entry.
        values = values[0]

        subgo_ax.set_title('V-function Sub-Task %02d' % aa)
        plot_v_contours(subgo_ax, values)
        
    
# Draw the V-function figures using the current i_vf / u_vf networks
plot_v_fcn()

In [None]:
def plot_q_fcn(obs):
    """Plot Q-value contours over a 2-D action grid for one observation.

    One subplot is drawn for the main task (``i_qf``) and one for each of the
    ``n_unintentions`` sub-tasks (``u_qf``). Actions sampled from ``policy``
    are scattered on top of every map.

    Args:
        obs: Observation broadcast against every action of the grid. The code
            broadcasts it to two columns, so a two-element observation is
            expected.

    NOTE(review): both the action space and the observation are treated as
    two-dimensional; confirm that this matches the environment in use.
    """
    delta = 0.01
    x_min, y_min = env.action_space.low
    x_max, y_max = env.action_space.high
    xlim = (x_min, x_max)
    ylim = (y_min, y_max)
    all_x = np.arange(x_min, x_max, delta)
    all_y = np.arange(y_min, y_max, delta)
    # Default 'xy' indexing: both arrays have shape (len(all_y), len(all_x)).
    xy_mesh = np.meshgrid(all_x, all_y)

    all_acts = np.zeros((len(all_x)*len(all_y), 2))
    all_acts[:, 0] = xy_mesh[0].ravel()
    all_acts[:, 1] = xy_mesh[1].ravel()

    all_obs = np.broadcast_to(obs, (all_acts.shape[0], 2))

    def plot_q_contours(ax, q_values):
        """Draw labelled contours and a translucent image of ``q_values``."""
        # The actions were flattened from the mesh in row-major order, so the
        # values go back onto the mesh shape (len(all_y), len(all_x)). The
        # previous (len(all_x), len(all_y)) shape only worked for square grids.
        q_values = q_values.reshape(xy_mesh[0].shape)

        contours = ax.contour(xy_mesh[0], xy_mesh[1], q_values, 20,
                              colors='dimgray')
        ax.clabel(contours, inline=1, fontsize=10, fmt='%.0f')
        ax.imshow(q_values, extent=(x_min, x_max, y_min, y_max), origin='lower',
                  alpha=0.5)

        ax.set_xlim(xlim)
        ax.set_ylim(ylim)

        ax.set_xlabel('X')
        ax.set_ylabel('Y')
        ax.axis('equal')
        ax.set_aspect('equal', 'box')

    def plot_action_samples(ax, actions):
        """Scatter sampled actions over the current map."""
        x, y = actions[:, 0], actions[:, 1]
        ax.scatter(x, y, c='b', marker='*')
        ax.set_xlim(xlim)
        ax.set_ylim(ylim)

    # squeeze=False always returns a 2-D array of axes, so indexing also
    # works when there is a single subplot (n_unintentions == 0).
    fig, axs_grid = plt.subplots(1, n_unintentions + 1, squeeze=False)
    all_axs = axs_grid[0]
    fig.suptitle('Observation: ' +  str(obs))

    # Compute and plot Main Task State-Action Value-fcn
    all_axs[0].set_title('Main Task')
    q_values, _ = i_qf.get_values(all_obs, all_acts)

    plot_q_contours(all_axs[0], q_values)

    # Sample and plot actions of the Main Task (mixture) policy
    action_samples, _ = policy.get_actions(all_obs[:50, :], pol_idx=None)
    plot_action_samples(all_axs[0], action_samples)

    for aa in range(n_unintentions):
        subgo_ax = all_axs[aa + 1]
        subgo_ax.set_title('Sub-Task %02d' % aa)

        q_values, _ = u_qf.get_values(all_obs, all_acts, val_idxs=[aa])
        # Only one index was requested, so take the first returned entry.
        q_values = q_values[0]
        plot_q_contours(subgo_ax, q_values)

        action_samples, _ = policy.get_actions(all_obs[:20, :], pol_idx=aa)
        plot_action_samples(subgo_ax, action_samples)

# Observations at which the Q-functions are visualised, one figure each
all_obs = [
    [5., 5.], [-4., 5.],
    [5., -4.], [-4., -4.],
    [0., 0.],
]

for obs in all_obs:
    plot_q_fcn(obs)
    # Text separator between consecutive figures
    print('---')

In [None]:
# Path of the progress.csv expected inside the logger's snapshot directory
snapshot_dir = logger.get_snapshot_dir()
data_file = os.path.join(snapshot_dir, 'progress.csv')

def plot_data(csv_file, label='Policy Entropy', plot_intentional=True):
    """Plot one logged quantity for every unintentional and intentional task.

    The column names looked up in the CSV are ``'[U-XX] <label>'`` for each of
    the ``n_unintentions`` sub-tasks (read from the notebook globals) and,
    optionally, ``'[I] <label>'`` for the main task.

    Args:
        csv_file: Path of the progress CSV handed to ``get_csv_data``.
        label: Name of the logged quantity, without the task prefix.
        plot_intentional: When true, add a last subplot for the main task.
    """
    labels_to_plot = [('[U-%02d] ' % uu) + label
                      for uu in range(n_unintentions)]

    if plot_intentional:
        n_subplots = n_unintentions + 1
        labels_to_plot.append('[I] ' + label)
    else:
        n_subplots = n_unintentions

    data = get_csv_data(csv_file, labels_to_plot)

    fig, axs = subplots(n_subplots)
    # A single subplot may come back as a bare axes object.
    if not isinstance(axs, np.ndarray):
        axs = np.array([axs])
    fig.subplots_adjust(hspace=0)
    fig.suptitle(label,
                 fontweight='bold')

    for aa, ax in enumerate(axs):
        ax.plot(data[aa])
        if aa < n_unintentions:
            # Label with this subplot's own index. The stale index of the
            # label-building loop was used before, which gave every
            # unintentional subplot the same (last) number.
            ylabel = 'Un-%02d' % aa
        else:
            ylabel = 'In'
        ax.set_ylabel(ylabel)
        # The subplots are stacked, so only the bottom one keeps tick labels.
        plt.setp(ax.get_xticklabels(), visible=False)

    axs[-1].set_xlabel('Episodes')
    plt.setp(axs[-1].get_xticklabels(), visible=True)

    
# Logged quantities that exist for the main task and for every sub-task
plot_data(csv_file=data_file, label='Test Returns Mean', plot_intentional=True)
print('--')
plot_data(csv_file=data_file, label='Test Rewards Mean', plot_intentional=True)
print('--')
plot_data(csv_file=data_file, label='Policy Entropy', plot_intentional=True)
print('--')
# This one is plotted for the unintentional (sub-task) entries only
plot_data(csv_file=data_file, label='Mixing Weights', plot_intentional=False)
