In [1]:
import sys
sys.path.append("../..")
sys.path.append("../../sac_ae")

In [2]:
import torch
import numpy as np
from IPython import display
from ipywidgets import Output, GridspecLayout
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_theme()
import pandas as pd

In [3]:
from sac_ae.env import make_envs
from sac_ae.model import make_model
from sac_ae.agent import make_agent
from utils.misc import eval_mode, VideoRecorder
from utils.argument import Arguments

In [4]:
def load_agent_and_args(path, device=torch.device('cpu'), model_name='model/best_model.pt', add_latest_args=False):
    """Rebuild a trained agent from a run directory.

    Reads ``args.json`` and the checkpoint ``model_name`` from ``path``,
    constructs the model and agent with ``make_model`` / ``make_agent`` and
    loads the checkpoint weights into the agent.

    Args:
        path: Run directory containing ``args.json`` and the checkpoint.
            A trailing separator is optional.
        device: Device the weights are mapped to and the model is built on.
        model_name: Checkpoint file, relative to ``path``.
        add_latest_args: If true, set ``args.cost_samples = 10`` and
            ``args.cost_allowed_threshold = 0.05`` on the loaded arguments
            (presumably for runs saved before these options existed -- confirm).

    Returns:
        Tuple ``(agent, args)``.
    """
    # Local import keeps the cell self-contained; os.path.join makes the
    # function work whether or not `path` ends with a separator.
    import os

    args = Arguments(os.path.join(path, 'args.json'))
    if add_latest_args:
        args.cost_samples = 10
        args.cost_allowed_threshold = 0.05
    weights = torch.load(os.path.join(path, model_name), map_location=device)

    if args.agent == 'sac_state':
        # State-based agent: recover the observation shape from the trailing
        # dimensions of the actor encoder's first projection weight.
        agent_obs_shape = weights['actor.encoder.projection.projection.0.weight'].shape[1:]
        args.agent_image_size = agent_obs_shape[0]
    else:
        # Image-based agent: 3 channels per stacked frame, square images.
        agent_obs_shape = (3*args.frame_stack, args.agent_image_size, args.agent_image_size)
    # Action dimensionality is hard-coded to 4 in this notebook.
    action_shape = np.array([4])

    model = make_model(agent_obs_shape, action_shape, args, device)
    agent = make_agent(model, device, action_shape, args)
    agent.load_model_from_dict(weights)
    return agent, args

In [5]:
def one_episode(agent, args, seed, video_name=None, compute_log_std=False, device=None):
    """Roll out one evaluation episode and return per-step actions or log-stds.

    The environment is created with ``make_envs(args, is_eval=True,
    use_state=True)``, seeded with ``seed`` and stepped for exactly
    ``env._max_episode_steps`` steps (the environment's ``done`` flag is not
    consulted). The accumulated reward and cost are printed at the end.

    Args:
        agent: Agent exposing ``select_action(obs)``; ``agent.model.actor`` is
            additionally evaluated when ``compute_log_std`` is true.
        args: Run arguments; ``info['cost']`` is accumulated unless
            ``args.cost == 'no_cost'``.
        seed: Seed passed to ``env.seed``.
        video_name: If not ``None``, frames are recorded with
            ``VideoRecorder('./ensemble_analysis')`` and saved under this name.
        compute_log_std: If true, return the actor's ``log_std`` for every
            step instead of the selected actions.
        device: Device for the observation tensor passed to the actor. Only
            used when ``compute_log_std`` is true. If ``None`` it is taken
            from the actor's parameters.

    Returns:
        Array of shape ``(env._max_episode_steps, 4)`` holding the log-stds
        when ``compute_log_std`` is true, otherwise the selected actions.
    """
    record_video = video_name is not None
    if record_video:
        video = VideoRecorder('./ensemble_analysis')

    env = make_envs(args, is_eval=True, use_state=True)
    env.seed(seed)
    obs = env.reset()

    if record_video:
        video.init()
        video.record(env)

    if compute_log_std and device is None:
        # The original code read an undefined global `device` here, which
        # raised NameError. Fall back to the device the actor lives on.
        # Assumes agent.model.actor is a torch.nn.Module -- TODO confirm.
        device = next(agent.model.actor.parameters()).device

    episode_reward, episode_cost = 0, 0
    # The action dimensionality (4) is hard-coded in this notebook.
    actions = np.zeros(shape=(env._max_episode_steps, 4))
    log_std_list = np.zeros(shape=(env._max_episode_steps, 4))

    for step in range(env._max_episode_steps):
        with eval_mode(agent):
            action = agent.select_action(obs)
            actions[step] = action
            if compute_log_std:
                obs_torch = torch.FloatTensor(obs).to(device).unsqueeze(0)
                mu, pi, log_pi, log_std = agent.model.actor(obs_torch, compute_log_pi=True)
                log_std_list[step] = log_std.detach().cpu().numpy()

        obs, reward, _, info = env.step(action)
        if record_video:
            video.record(env)
        episode_reward += reward
        if args.cost != 'no_cost':
            episode_cost += info['cost']

    if record_video:
        video.save(video_name)
    print(f'Reward: {np.round(episode_reward, decimals=2)}, Cost: {np.round(episode_cost, decimals=2)}')
    if compute_log_std:
        return log_std_list
    return actions

In [6]:
def calc_success_rate(agent, args, seed, num_episodes=100, stochastic=False):
    """Evaluate an agent over several episodes and return averaged metrics.

    A single environment is created (state-based when
    ``args.agent == 'sac_state'``), seeded once with ``seed`` and reset at the
    start of every episode. Each episode runs until the environment reports
    ``done``.

    Args:
        agent: Agent exposing ``select_action(obs)`` and ``sample_action(obs)``.
        args: Run arguments; ``info['cost']`` is accumulated unless
            ``args.cost == 'no_cost'``.
        seed: Seed passed to ``env.seed``.
        num_episodes: Number of episodes to run; must be at least 1.
        stochastic: If true, use ``agent.sample_action`` instead of
            ``agent.select_action``.

    Returns:
        Tuple ``(success_rate, mean_reward, mean_cost)``, each averaged over
        ``num_episodes``. An episode counts as a success when the ``info`` dict
        of its final step has a truthy ``'is_success'`` entry.

    Raises:
        ValueError: If ``num_episodes`` is smaller than 1.
    """
    if num_episodes < 1:
        # Fail early with a clear message instead of dividing by zero below.
        raise ValueError(f'num_episodes must be at least 1, got {num_episodes}')

    env = make_envs(args, is_eval=True, use_state=args.agent == 'sac_state')
    env.seed(seed)

    choose_action = agent.sample_action if stochastic else agent.select_action
    track_cost = args.cost != 'no_cost'

    num_successes, reward_sum, cost_sum = 0, 0, 0

    for _episode in range(num_episodes):
        obs = env.reset()

        episode_reward, episode_cost = 0, 0
        done = False

        while not done:
            with eval_mode(agent):
                action = choose_action(obs)

            obs, reward, done, info = env.step(action)
            episode_reward += reward
            if track_cost:
                episode_cost += info['cost']

        reward_sum += episode_reward
        cost_sum += episode_cost
        # `info` is the dict returned by the last step of the episode.
        if info.get('is_success'):
            num_successes += 1

    return num_successes / num_episodes, reward_sum / num_episodes, cost_sum / num_episodes

In [7]:
# Draw a random evaluation seed and display it.
seed = np.random.randint(1000000)
seed

121320

In [7]:
# Pin the seed to a fixed value so the evaluation below is repeatable.
seed = 121320

In [8]:
def _experiment_label(exp):
    """Map an experiment directory name to the label used in the results table.

    The name is split on ``'_'``; field 2 carries the ``pr-*`` flag, field 3
    the ``aa-*`` mode and an optional field 4 the alpha value. The ``aa-*``
    mode takes precedence over ``pr-False``, matching the order in which the
    original checks overwrote each other.

    Raises:
        ValueError: If the name matches none of the known patterns.
    """
    split = exp.split('_')
    if len(split) >= 4:
        if split[3] == 'aa-fixed':
            if len(split) >= 5 and split[4] == 'a-05':
                return 'alpha=0.5'
            return 'alpha=1.0'
        # Membership test: the previous `== 'aa-reward' or 'aa-reward-fast'`
        # was always truthy and relabelled every run as 'adaptive alpha'.
        if split[3] in ('aa-reward', 'aa-reward-fast'):
            return 'adaptive alpha'
        if split[2] == 'pr-False':
            return 'no teacher'
    raise ValueError(f'Unrecognised experiment name: {exp!r}')


push_results = pd.DataFrame(columns=['Experiment', 'ensemble_id', 'success_rate', 'mean_reward'])
push_results['ensemble_id'] = pd.to_numeric(push_results['ensemble_id'], downcast='integer')
base_path = '../../../output/final/fetch-push-final/'
experiments = ['drq_robot-True_pr-True_aa-fixed_a-05', 'drq_robot-True_pr-False_aa-False', 'drq_robot-True_pr-True_aa-fixed', 'drq_robot-True_pr-True_aa-reward-fast']

# Use the GPU when one is available, otherwise fall back to the CPU.
eval_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

for exp in experiments:
    name = _experiment_label(exp)

    # Three ensemble members are evaluated per experiment, except for the
    # 'no teacher' baseline, for which only run 0 is evaluated.
    n = 1 if exp == 'drq_robot-True_pr-False_aa-False' else 3
    for i in range(n):
        path = base_path + f'{exp}_{i}/'
        agent, args = load_agent_and_args(path, device=eval_device, model_name='model/2000000.pt', add_latest_args=True)
        # 1000 deterministic evaluation episodes per model; the mean cost is
        # returned but not stored in the table.
        success_rate, mean_reward, mean_cost = calc_success_rate(agent, args, seed, 1000, False)
        # Append row by row so partial results survive an interrupted run.
        push_results.loc[len(push_results)] = [name, i, success_rate, mean_reward]

Creating window glfw
Creating window glfw
Creating window glfw


In [27]:
# Display the per-model evaluation table.
push_results

Unnamed: 0,Experiment,ensemble_id,success_rate,mean_reward
7,adaptive alpha,0,0.632,-7.357959
8,adaptive alpha,1,0.604,-7.757679
9,adaptive alpha,2,0.614,-7.80693
0,alpha=0.5,0,0.608,-8.421662
1,alpha=0.5,1,0.587,-8.236872
2,alpha=0.5,2,0.649,-8.356911
4,alpha=1.0,0,0.423,-11.081695
5,alpha=1.0,1,0.403,-11.361926
6,alpha=1.0,2,0.382,-11.362824
3,no teacher,0,0.071,-17.316001


In [19]:
# Persist the evaluation table so it can be reloaded without re-running the evaluation loop.
push_results.to_pickle('results/push/eval_data_2m.pkl')

In [25]:
# Reload the evaluation table from the pickle written by the to_pickle cell.
# NOTE(review): unpickling can execute arbitrary code -- only load files produced by this notebook.
push_results = pd.read_pickle('results/push/eval_data_2m.pkl')

In [17]:
# Order the rows by experiment label and display the sorted table.
push_results = push_results.sort_values('Experiment')
push_results

Unnamed: 0,Experiment,ensemble_id,success_rate,mean_reward
7,adaptive alpha,0,0.632,-7.357959
8,adaptive alpha,1,0.604,-7.757679
9,adaptive alpha,2,0.614,-7.80693
0,alpha=0.5,0,0.608,-8.421662
1,alpha=0.5,1,0.587,-8.236872
2,alpha=0.5,2,0.649,-8.356911
4,alpha=1.0,0,0.423,-11.081695
5,alpha=1.0,1,0.403,-11.361926
6,alpha=1.0,2,0.382,-11.362824
3,no teacher,0,0.071,-17.316001


In [18]:
# Average every column over the ensemble members of each experiment.
per_experiment = push_results.groupby(['Experiment'])
grouped = per_experiment.mean()
grouped

Unnamed: 0_level_0,ensemble_id,success_rate,mean_reward
Experiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
adaptive alpha,1.0,0.616667,-7.640856
alpha=0.5,1.0,0.614667,-8.338482
alpha=1.0,1.0,0.402667,-11.268815
no teacher,0.0,0.071,-17.316001


In [20]:
# Label the index 'Algorithm' so the exported table gets that heading.
grouped.index = grouped.index.rename('Algorithm')

In [21]:
# Export the two reported metrics, rounded to three decimals, as a LaTeX table.
table_columns = ['success_rate', 'mean_reward']
table_header = ['Mean Success Rate', 'Mean Episode Reward']
latex_table = grouped[table_columns].round(3).to_latex(header=table_header)
print(latex_table)

\begin{tabular}{lrr}
\toprule
{} & Mean Success Rate & Mean Episode Reward \\
Algorithm      &                   &                     \\
\midrule
adaptive alpha &             0.617 &              -7.641 \\
alpha=0.5      &             0.615 &              -8.338 \\
alpha=1.0      &             0.403 &             -11.269 \\
no teacher     &             0.071 &             -17.316 \\
\bottomrule
\end{tabular}

