In [1]:
import sys

# Put two parent-relative directories on the import path in one call; the
# project imports further down (`sac_ae.*`, `utils.*`) presumably resolve
# through them. Paths are relative to the kernel's working directory.
sys.path.extend(["../..", "../../sac_ae"])

In [2]:
# Third-party dependencies used by the cells below.
import torch
import numpy as np
from IPython import display
from ipywidgets import Output, GridspecLayout
import seaborn as sns
import matplotlib.pyplot as plt
# Activate seaborn's default theme for figures created after this point.
sns.set_theme()
import pandas as pd

In [3]:
from sac_ae.env import make_envs
from sac_ae.model import make_model
from sac_ae.agent import make_agent
from utils.misc import eval_mode, VideoRecorder
from utils.argument import Arguments

In [4]:
def load_agent_and_args(path, device=torch.device('cpu'), model_name='model/best_model.pt', cost_samples=None, cost_allowed_threshold=None):
    """Rebuild a trained agent from the files stored in a run directory.

    Reads ``args.json`` and the checkpoint ``model_name`` located under
    ``path``, builds the model and agent via ``make_model`` / ``make_agent``
    and loads the checkpoint weights into the agent.

    Args:
        path: Run directory holding ``args.json`` and the checkpoint. A
            trailing path separator is optional.
        device: Torch device the checkpoint is mapped to and that is handed to
            ``make_model`` / ``make_agent``.
        model_name: Checkpoint location relative to ``path``.
        cost_samples: When not ``None``, overrides ``args.cost_samples``.
        cost_allowed_threshold: When not ``None``, overrides
            ``args.cost_allowed_threshold``.

    Returns:
        Tuple ``(agent, args)``: the agent with loaded weights and the
        (possibly overridden) run arguments.
    """
    import os  # function-scope import keeps this cell self-contained

    # os.path.join tolerates a `path` with or without a trailing separator.
    args = Arguments(os.path.join(path, 'args.json'))
    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    weights = torch.load(os.path.join(path, model_name), map_location=device)

    # Optional overrides of the stored run configuration.
    if cost_samples is not None:
        args.cost_samples = cost_samples
    if cost_allowed_threshold is not None:
        args.cost_allowed_threshold = cost_allowed_threshold

    if args.agent == 'sac_state':
        # State-based agent: recover the observation shape from the input
        # dimension of the first projection layer stored in the checkpoint.
        agent_obs_shape = weights['actor.encoder.projection.projection.0.weight'].shape[1:]
        args.agent_image_size = agent_obs_shape[0]
    else:
        # Otherwise the observation is a stack of 3-channel square frames.
        agent_obs_shape = (3 * args.frame_stack, args.agent_image_size, args.agent_image_size)

    # The action dimensionality is fixed to 4 in this notebook.
    action_shape = np.array([4])

    model = make_model(agent_obs_shape, action_shape, args, device)
    agent = make_agent(model, device, action_shape, args)
    agent.load_model_from_dict(weights)
    return agent, args

In [5]:
def one_episode(agent, args, seed, video_name=None, compute_log_std=False, device=None):
    """Roll out one fixed-length evaluation episode and return per-step data.

    The environment is created with ``use_state=True`` and stepped for exactly
    ``env._max_episode_steps`` steps; the ``done`` flag returned by the
    environment is not used to stop early. The accumulated reward and cost are
    printed at the end.

    Args:
        agent: Agent providing ``select_action`` and, when ``compute_log_std``
            is set, ``model.actor``.
        args: Run arguments; ``args.cost`` decides whether ``info['cost']`` is
            accumulated.
        seed: Seed passed to ``env.seed``.
        video_name: When not ``None``, every frame is recorded and saved under
            this name by a ``VideoRecorder`` rooted at ``./ensemble_analysis``.
        compute_log_std: When true, the actor's ``log_std`` output is collected
            at every step and returned instead of the actions.
        device: Device for the observation tensor fed to the actor. Defaults to
            the device of the actor's parameters (CPU if that cannot be
            determined). Only used when ``compute_log_std`` is true.

    Returns:
        ``numpy.ndarray`` of shape ``(env._max_episode_steps, 4)`` holding the
        selected actions, or the actor's ``log_std`` values when
        ``compute_log_std`` is true.
    """
    record_video = video_name is not None
    if record_video:
        video = VideoRecorder('./ensemble_analysis')

    env = make_envs(args, is_eval=True, use_state=True)
    env.seed(seed)
    obs = env.reset()

    if record_video:
        video.init()
        video.record(env)

    if compute_log_std and device is None:
        # No device supplied: feed the actor on the device that holds its
        # parameters, falling back to the CPU if it exposes none.
        try:
            device = next(agent.model.actor.parameters()).device
        except (AttributeError, StopIteration, TypeError):
            device = torch.device('cpu')

    num_steps = env._max_episode_steps
    episode_reward, episode_cost = 0, 0
    # Both buffers assume a 4-dimensional action space.
    actions = np.zeros(shape=(num_steps, 4))
    log_std_list = np.zeros(shape=(num_steps, 4))

    for step in range(num_steps):
        with eval_mode(agent):
            action = agent.select_action(obs)
            actions[step] = action
            if compute_log_std:
                # Add a batch dimension before querying the actor directly.
                obs_torch = torch.FloatTensor(obs).to(device).unsqueeze(0)
                mu, pi, log_pi, log_std = agent.model.actor(obs_torch, compute_log_pi=True)
                log_std_list[step] = log_std.detach().cpu().numpy()

        next_obs, reward, done, info = env.step(action)
        if record_video:
            video.record(env)
        episode_reward += reward
        if args.cost != 'no_cost':
            episode_cost += info['cost']

        obs = next_obs

    if record_video:
        video.save(video_name)
    print(f'Reward: {np.round(episode_reward, decimals=2)}, Cost: {np.round(episode_cost, decimals=2)}')
    if compute_log_std:
        return log_std_list
    return actions

In [6]:
def calc_success_rate(agent, args, seed, num_episodes=100, stochastic=False, low_cost_action=False):
    """Evaluate an agent over several episodes and average the outcomes.

    A single evaluation environment is created (state observations are used
    when ``args.agent == 'sac_state'``), seeded once, and reset before every
    episode. Each episode runs until the environment reports ``done``.

    Args:
        agent: Agent under evaluation.
        args: Run arguments; ``args.agent`` selects the observation type and
            ``args.cost`` decides whether ``info['cost']`` is accumulated.
        seed: Seed passed to ``env.seed``.
        num_episodes: Number of episodes to run; must be positive.
        stochastic: Use ``agent.sample_action`` instead of
            ``agent.select_action``.
        low_cost_action: Use ``agent.select_low_cost_action``; takes
            precedence over ``stochastic``.

    Returns:
        Tuple ``(success_rate, mean_reward, mean_cost)`` averaged over
        ``num_episodes``.

    Raises:
        ValueError: If ``num_episodes`` is not positive.
    """
    if num_episodes <= 0:
        # Averages over zero (or a negative number of) episodes are undefined.
        raise ValueError(f'num_episodes must be positive, got {num_episodes}')

    env = make_envs(args, is_eval=True, use_state=args.agent == 'sac_state')
    env.seed(seed)

    track_cost = args.cost != 'no_cost'
    num_successes, reward_sum, cost_sum = 0, 0, 0

    for _ in range(num_episodes):
        obs = env.reset()
        episode_reward, episode_cost = 0, 0
        done = False

        while not done:
            with eval_mode(agent):
                if low_cost_action:
                    action = agent.select_low_cost_action(obs)
                elif stochastic:
                    action = agent.sample_action(obs)
                else:
                    action = agent.select_action(obs)

            obs, reward, done, info = env.step(action)
            episode_reward += reward
            if track_cost:
                episode_cost += info['cost']

        reward_sum += episode_reward
        cost_sum += episode_cost
        # Success is read from the info dict of the episode's final step.
        if info.get('is_success'):
            num_successes += 1

    return num_successes / num_episodes, reward_sum / num_episodes, cost_sum / num_episodes

In [7]:
# Draw a random seed below 1,000,000 and display it; the following cell
# hard-codes the displayed value so that later runs reuse the same seed.
seed = np.random.randint(1000000)
seed

652508

In [7]:
# Pinned evaluation seed (the value shown by the random draw in the previous
# cell); it is passed to calc_success_rate in the evaluation loop.
seed = 652508

In [None]:
# Evaluate the latest checkpoint of every (cost variant, alpha) run for 1000
# deterministic episodes each and collect the averaged metrics in one table.
base_path = '../../../output/final/fetch-push-barrier-drq-final/'

# Use the GPU when one is available; otherwise fall back to the CPU instead of
# failing while mapping the checkpoint to a CUDA device.
eval_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Directory-name suffix used for each alpha value.
alpha_dir_suffix = {0.5: '05', 1.0: '1'}

# Rows are gathered in a list and turned into a DataFrame once, so the columns
# get proper numeric dtypes instead of being grown from an empty object frame.
result_rows = []
for cost in ['reward', 'critic-train']:
    for alpha in [0.5, 1.0]:
        alpha_str = alpha_dir_suffix[alpha]
        path = base_path + f'drq_{cost}_alpha-{alpha_str}/'
        agent, args = load_agent_and_args(path, device=eval_device, model_name='model/latest_model.pt')
        success_rate, mean_reward, mean_cost = calc_success_rate(agent, args, seed, 1000, False)
        result_rows.append([cost, alpha, success_rate, mean_reward, mean_cost])

barrier_results = pd.DataFrame(
    result_rows,
    columns=['algo', 'alpha', 'success_rate', 'mean_reward', 'mean_cost'],
)

Creating window glfw
Creating window glfw


In [13]:
# Cache the evaluation table on disk so the expensive evaluation cell does not
# have to be repeated.
from pathlib import Path

results_file = 'results/push-barrier-drq-final/eval_data.pkl'
# to_pickle does not create missing folders, so make sure the target exists.
Path(results_file).parent.mkdir(parents=True, exist_ok=True)
barrier_results.to_pickle(results_file)

In [9]:
# Reload the cached evaluation table (same path the to_pickle cell writes to)
# so the cells below can run without repeating the evaluation.
# NOTE: read_pickle can execute code embedded in the file; only load pickles
# that you produced yourself.
barrier_results = pd.read_pickle('results/push-barrier-drq-final/eval_data.pkl')

In [14]:
# Display the evaluation table with its original identifiers and column names.
barrier_results

Unnamed: 0,algo,alpha,success_rate,mean_reward,mean_cost
0,reward,0.5,0.003,-45.531786,2.424679
3,reward,1.0,0.003,-52.985418,6.809077
1,critic-train,0.5,0.003,-40.041319,3.080384
2,critic-train,1.0,0.001,-46.976916,2.219633


In [15]:
# Swap the internal cost-variant identifiers for the labels used in the report.
# One non-inplace replace with a value mapping returns a new frame, so the
# frame displayed by earlier cells is not mutated behind the reader's back.
barrier_results = barrier_results.replace(
    {'critic-train': 'Safety Training', 'reward': 'Reward Based'}
)

In [16]:
# Give the algorithm column its presentation name; it becomes an index level
# of the grouped table and of the LaTeX export below.
barrier_results = barrier_results.rename({'algo': 'Algorithm'}, axis='columns')

In [17]:
# Display the table again to check the new labels and the renamed column.
barrier_results

Unnamed: 0,Algorithm,alpha,success_rate,mean_reward,mean_cost
0,Reward Based,0.5,0.003,-45.531786,2.424679
3,Reward Based,1.0,0.003,-52.985418,6.809077
1,Safety Training,0.5,0.003,-40.041319,3.080384
2,Safety Training,1.0,0.001,-46.976916,2.219633


In [19]:
# Average the metric columns for each (Algorithm, alpha) pair; the two grouping
# keys become the row index of the result.
grouped = barrier_results.groupby(by=['Algorithm', 'alpha']).mean()
grouped

Unnamed: 0_level_0,Unnamed: 1_level_0,success_rate,mean_reward,mean_cost
Algorithm,alpha,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Reward Based,0.5,0.003,-45.531786,2.424679
Reward Based,1.0,0.003,-52.985418,6.809077
Safety Training,0.5,0.003,-40.041319,3.080384
Safety Training,1.0,0.001,-46.976916,2.219633


In [20]:
# Print the grouped metrics, rounded to three decimals, as a LaTeX table with
# readable column headers and merged cells for the repeated algorithm label.
print(
    grouped[['success_rate', 'mean_reward', 'mean_cost']]
    .round(3)
    .to_latex(
        header=['Mean Success Rate', 'Mean Reward', 'Mean Cost'],
        multirow=True,
    )
)

\begin{tabular}{llrrr}
\toprule
                &     & Mean Success Rate & Mean Reward & Mean Cost \\
Algorithm & alpha &                   &             &           \\
\midrule
\multirow{2}{*}{Reward Based} & 0.5 &             0.003 &     -45.532 &     2.425 \\
                & 1.0 &             0.003 &     -52.985 &     6.809 \\
\cline{1-5}
\multirow{2}{*}{Safety Training} & 0.5 &             0.003 &     -40.041 &     3.080 \\
                & 1.0 &             0.001 &     -46.977 &     2.220 \\
\bottomrule
\end{tabular}

