In [1]:
import sys
sys.path.append("../..")
sys.path.append("../../sac_ae")

In [2]:
import torch
import numpy as np
from IPython import display
from ipywidgets import Output, GridspecLayout
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_theme()
import pandas as pd

In [3]:
from sac_ae.env import make_envs
from sac_ae.model import make_model
from sac_ae.agent import make_agent
from utils.misc import eval_mode, VideoRecorder
from utils.argument import Arguments

In [4]:
def load_agent_and_args(path, device=torch.device('cpu'), model_name='model/best_model.pt'):
    """Rebuild a trained agent from an experiment directory.

    Args:
        path: Experiment directory. It is concatenated directly with the file
            names below, so it must end with a path separator.
        device: Torch device the checkpoint is mapped to and the model is
            built on.
        model_name: Checkpoint file, relative to ``path``.

    Returns:
        Tuple ``(agent, args)``: the agent with the checkpoint loaded and the
        ``Arguments`` object read from ``args.json``.
    """
    run_args = Arguments(path + 'args.json')
    state_dict = torch.load(path + model_name, map_location=device)

    if run_args.agent != 'sac_state':
        # Image agents: stacked frames with 3 channels each.
        obs_shape = (3 * run_args.frame_stack, run_args.agent_image_size, run_args.agent_image_size)
    else:
        # State agents: read the observation shape off the first projection
        # layer stored in the checkpoint.
        obs_shape = state_dict['actor.encoder.projection.projection.0.weight'].shape[1:]
        run_args.agent_image_size = obs_shape[0]

    # The action dimensionality is hard-coded to 4.
    act_shape = np.array([4])

    loaded_agent = make_agent(
        make_model(obs_shape, act_shape, run_args, device),
        device,
        act_shape,
        run_args,
    )
    loaded_agent.load_model_from_dict(state_dict)
    return loaded_agent, run_args

In [5]:
def one_episode(agent, args, seed, video_name=None, compute_log_std=False):
    """Roll out one evaluation episode and return per-step actions or log-stds.

    The episode always runs for ``env._max_episode_steps`` steps; the ``done``
    flag returned by the environment is not used to stop early.

    Args:
        agent: Agent exposing ``select_action`` and ``model.actor``.
        args: Experiment arguments passed to ``make_envs``; ``args.cost`` decides
            whether ``info['cost']`` is accumulated.
        seed: Seed passed to ``env.seed``.
        video_name: If not ``None``, frames are recorded and saved under this
            name by a ``VideoRecorder`` rooted at ``./ensemble_analysis``.
        compute_log_std: If true, additionally query the actor for its log-std
            at every step and return those values instead of the actions.

    Returns:
        Array of shape ``(env._max_episode_steps, 4)`` holding the log-stds when
        ``compute_log_std`` is true, otherwise the selected actions.
    """
    record_video = video_name is not None
    if record_video:
        video = VideoRecorder('./ensemble_analysis')

    env = make_envs(args, is_eval=True, use_state=True)
    env.seed(seed)
    obs = env.reset()

    if record_video:
        video.init()
        video.record(env)

    episode_reward, episode_cost = 0, 0
    actions = np.zeros(shape=(env._max_episode_steps, 4))
    log_std_list = np.zeros(shape=(env._max_episode_steps, 4))

    if compute_log_std:
        # The original code read an undefined global `device` here. Derive the
        # device from the actor's own parameters instead.
        # NOTE(review): assumes `agent.model.actor` is a torch.nn.Module - confirm.
        actor_device = next(agent.model.actor.parameters()).device

    for step in range(env._max_episode_steps):
        with eval_mode(agent):
            action = agent.select_action(obs)
            actions[step] = action
            if compute_log_std:
                obs_torch = torch.FloatTensor(obs).to(actor_device).unsqueeze(0)
                # Only the values are needed, so skip building the autograd graph.
                with torch.no_grad():
                    mu, pi, log_pi, log_std = agent.model.actor(obs_torch, compute_log_pi=True)
                log_std_list[step] = log_std.detach().cpu().numpy()

        next_obs, reward, done, info = env.step(action)
        if record_video:
            video.record(env)
        episode_reward += reward
        if args.cost != 'no_cost':
            episode_cost += info['cost']

        obs = next_obs

    if record_video:
        video.save(video_name)
    print(f'Reward: {np.round(episode_reward, decimals=2)}, Cost: {np.round(episode_cost, decimals=2)}')
    if compute_log_std:
        return log_std_list
    return actions

In [6]:
def calc_success_rate(agent, args, seed, num_episodes=100, stochastic=False):
    """Evaluate an agent over several episodes and average the outcomes.

    Args:
        agent: Agent exposing ``select_action`` and ``sample_action``.
        args: Experiment arguments; ``args.agent == 'sac_state'`` selects the
            state-based environment and ``args.cost`` decides whether
            ``info['cost']`` is accumulated.
        seed: Seed passed once to ``env.seed`` before the first episode.
        num_episodes: Number of episodes to run; must be positive.
        stochastic: If true use ``agent.sample_action``, otherwise
            ``agent.select_action``.

    Returns:
        Tuple ``(success_rate, mean_reward, mean_cost)`` averaged over
        ``num_episodes``. An episode counts as a success when the ``info`` dict
        of its final step has a truthy ``'is_success'`` entry.

    Raises:
        ValueError: If ``num_episodes`` is not positive.
    """
    # Fail fast, before the environment is built, instead of dividing by zero
    # at the very end.
    if num_episodes <= 0:
        raise ValueError(f'num_episodes must be a positive integer, got {num_episodes}')

    env = make_envs(args, is_eval=True, use_state=args.agent == 'sac_state')
    env.seed(seed)

    track_cost = args.cost != 'no_cost'
    num_successes, reward_sum, cost_sum = 0, 0, 0

    for _ in range(num_episodes):
        obs = env.reset()

        episode_reward, episode_cost = 0, 0
        done = False

        while not done:
            with eval_mode(agent):
                if stochastic:
                    action = agent.sample_action(obs)
                else:
                    action = agent.select_action(obs)

            obs, reward, done, info = env.step(action)
            episode_reward += reward
            if track_cost:
                episode_cost += info['cost']

        reward_sum += episode_reward
        cost_sum += episode_cost
        # `info` is the dict returned by the last step of the episode.
        if info.get('is_success'):
            num_successes += 1

    return num_successes / num_episodes, reward_sum / num_episodes, cost_sum / num_episodes

In [16]:
# Fixed evaluation seed so that Restart & Run All reproduces the same episodes.
# 865110 is the value recorded in this cell's saved output, where it was drawn
# with np.random.randint(1000000).
seed = 865110
seed

865110

In [18]:
# Names of all experiment runs: 3 agents x robot flag x pr flag x 3 ensemble
# members. A comprehension keeps the loop variables (notably `agent`) out of the
# notebook namespace, so a loaded agent object is not overwritten by a string.
experiments = [
    f'{agent_name}_robot-{robot_flag}_pr-{pr_flag}_{member}'
    for agent_name in ['sac', 'sacae', 'drq']
    for robot_flag in [True, False]
    for pr_flag in [True, False]
    for member in range(3)
]

In [27]:
# Evaluate every trained run (3 agents x robot flag x pr flag x 3 ensemble
# members) for 1000 deterministic episodes each and collect one row per run.
base_path = '../../../output/final/fetch-reach-final/'

# Use the GPU when one is visible, otherwise fall back to the CPU instead of
# failing in torch.load.
eval_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Collect plain records and build the frame once. This gives each column a
# proper dtype (bool / int / float) rather than appending rows to an empty
# object-dtype frame.
records = []
for agent_name in ['sac', 'sacae', 'drq']:
    for robot in [True, False]:
        for pr in [True, False]:
            for i in range(3):
                path = base_path + f'{agent_name}_robot-{robot}_pr-{pr}_{i}/'
                agent, args = load_agent_and_args(path, device=eval_device, model_name='model/latest_model.pt')
                success_rate, mean_reward, mean_cost = calc_success_rate(agent, args, seed, 1000, False)
                records.append({
                    'agent': agent_name,
                    'robot': robot,
                    'teacher': pr,  # the `pr` flag of the run name is stored as `teacher`
                    'ensemble_id': i,
                    'success_rate': success_rate,
                    'mean_reward': mean_reward,
                })

reach_results = pd.DataFrame(
    records,
    columns=['agent', 'robot', 'teacher', 'ensemble_id', 'success_rate', 'mean_reward'],
)

Creating window glfw
Creating window glfw
Creating window glfw
Creating window glfw
Creating window glfw
Creating window glfw
Creating window glfw
Creating window glfw
Creating window glfw
Creating window glfw
Creating window glfw
Creating window glfw
Creating window glfw
Creating window glfw
Creating window glfw
Creating window glfw
Creating window glfw
Creating window glfw
Creating window glfw
Creating window glfw
Creating window glfw
Creating window glfw
Creating window glfw
Creating window glfw
Creating window glfw
Creating window glfw
Creating window glfw
Creating window glfw
Creating window glfw
Creating window glfw
Creating window glfw
Creating window glfw
Creating window glfw
Creating window glfw
Creating window glfw
Creating window glfw


In [29]:
import os

# to_pickle does not create missing parent directories. Make sure the target
# folder exists so the evaluation results are not lost on a fresh checkout.
os.makedirs('results/reach', exist_ok=True)
reach_results.to_pickle('results/reach/eval_data.pkl')

In [8]:
# Reload the evaluation table from the path the to_pickle cell writes to, so the
# analysis below can run without repeating the evaluation.
# NOTE: unpickling can execute arbitrary code; only load files you produced.
reach_results = pd.read_pickle('results/reach/eval_data.pkl')

In [9]:
# Display the evaluation table loaded above.
reach_results

Unnamed: 0,agent,robot,teacher,ensemble_id,success_rate,mean_reward
0,sac,True,True,0,0.641,-2.852175
1,sac,True,True,1,0.644,-2.864894
2,sac,True,True,2,0.599,-3.019325
3,sac,True,False,0,0.407,-3.601424
4,sac,True,False,1,0.369,-3.692724
5,sac,True,False,2,0.402,-3.57039
6,sac,False,True,0,0.677,-2.642866
7,sac,False,True,1,0.555,-3.090759
8,sac,False,True,2,0.591,-3.101455
9,sac,False,False,0,0.446,-3.235643


In [17]:
# Average all remaining columns over the rows that share an
# (agent, robot, teacher) combination. `ensemble_id` is averaged too.
grouped = (
    reach_results
    .groupby(['agent', 'robot', 'teacher'])
    .mean()
)
grouped

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,ensemble_id,success_rate,mean_reward
agent,robot,teacher,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
drq,False,False,1.0,0.994,-0.851949
drq,False,True,1.0,0.994333,-0.71995
drq,True,False,1.0,0.992667,-0.762509
drq,True,True,1.0,0.995667,-0.697756
sac,False,False,1.0,0.424667,-3.359879
sac,False,True,1.0,0.607667,-2.945027
sac,True,False,1.0,0.392667,-3.621513
sac,True,True,1.0,0.628,-2.912131
sacae,False,False,1.0,0.387667,-3.484158
sacae,False,True,1.0,0.657,-2.786676


In [27]:
# Capitalise the index level names for presentation in the exported table.
grouped.index.names = ['Agent', 'Robot', 'Teacher']

In [30]:
# Emit the two metric columns, rounded to 3 decimals, as LaTeX source.
# print() is used so the raw LaTeX text can be copied; multirow=True merges
# repeated index labels into \multirow cells.
print(grouped[['success_rate', 'mean_reward']].round(3).to_latex(header=['Mean Success Rate', 'Mean Reward'], multirow=True))

\begin{tabular}{lllrr}
\toprule
      &      &      & Mean Success Rate & Mean Reward \\
Agent & Robot & Teacher &                   &             \\
\midrule
\multirow{4}{*}{drq} & \multirow{2}{*}{False} & False &             0.994 &      -0.852 \\
      &      & True &             0.994 &      -0.720 \\
\cline{2-5}
      & \multirow{2}{*}{True} & False &             0.993 &      -0.763 \\
      &      & True &             0.996 &      -0.698 \\
\cline{1-5}
\cline{2-5}
\multirow{4}{*}{sac} & \multirow{2}{*}{False} & False &             0.425 &      -3.360 \\
      &      & True &             0.608 &      -2.945 \\
\cline{2-5}
      & \multirow{2}{*}{True} & False &             0.393 &      -3.622 \\
      &      & True &             0.628 &      -2.912 \\
\cline{1-5}
\cline{2-5}
\multirow{4}{*}{sacae} & \multirow{2}{*}{False} & False &             0.388 &      -3.484 \\
      &      & True &             0.657 &      -2.787 \\
\cline{2-5}
      & \multirow{2}{*}{True} & False &        