### 1. Importing Dependencies

In [1]:
import gym
import numpy as np

from stable_baselines3.common.vec_env import DummyVecEnv, VecVideoRecorder
from stable_baselines3 import HerReplayBuffer, SAC
from stable_baselines3.common.noise import NormalActionNoise
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold, CallbackList
from stable_baselines3.her.her_replay_buffer import DictReplayBuffer
from stable_baselines3.common.monitor import Monitor
from typing import Any, Callable

import os

### 2. Testing Environment

In [38]:
# Gym environment id; used to build the environment and to name the recorded video.
environment_name = "FetchPickAndPlace-v1"

In [39]:
# Directory handed to EvalCallback for both its evaluation logs and the
# best-model checkpoint (reloaded later as `best_model.zip`).
log_path = os.path.join('Training', 'Logs-2')

In [40]:
# Create the environment and wrap it in Stable-Baselines3's Monitor wrapper.
env = Monitor(gym.make(environment_name))

In [41]:
# Early-stopping callback; it is attached to the evaluation callback below as
# `callback_on_new_best`, with a reward threshold of -2.
callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=-2, verbose=1)

In [42]:
# Evaluate on a dedicated environment instance. Passing the training `env`
# here would let every periodic evaluation reset and step the very environment
# the agent is collecting experience from, in the middle of training.
eval_env = Monitor(gym.make(environment_name))

eval_callback = EvalCallback(
    eval_env,
    best_model_save_path=log_path,  # `best_model.zip` is written here
    log_path=log_path,
    eval_freq=12000,  # run an evaluation every 12000 training steps
    callback_on_new_best=callback_on_best,
)

In [43]:
# Draw one random action to inspect the action space's shape and dtype.
env.action_space.sample()

array([ 0.11762951, -0.34023935, -0.44053447, -0.18788737], dtype=float32)

In [44]:
# Draw one random observation to inspect the dict layout
# (achieved_goal / desired_goal / observation).
env.observation_space.sample()

OrderedDict([('achieved_goal',
              array([ 0.7213227, -0.9876719,  1.1325431], dtype=float32)),
             ('desired_goal',
              array([0.46884996, 0.13244456, 1.2508638 ], dtype=float32)),
             ('observation',
              array([-0.3212221 , -0.45945266,  0.67389446,  0.70562613,  0.6267838 ,
                     -0.8263457 , -0.27775422, -0.59260035, -0.38706642, -0.04525652,
                      1.8288016 ,  1.503531  ,  1.3220016 , -3.5423703 , -0.42946556,
                      0.98484045,  0.61054313,  0.9122507 ,  0.64001834, -0.5057297 ,
                      0.52541274,  0.44773683,  0.418074  ,  1.2150637 ,  0.43818712],
                    dtype=float32))])

### 3. Training Model

In [45]:
# Build the vectorized training environment from a fresh, monitored instance
# rather than wrapping the current value of `env`. A self-referential
# `env = wrapper(env)` nests another wrapper every time the cell is re-run;
# constructing the environment inside the factory keeps this cell idempotent
# and gives training its own environment instance.
env = DummyVecEnv([lambda: Monitor(gym.make(environment_name))])

In [46]:
# Forwarded to HerReplayBuffer as `n_sampled_goal` when the model is built.
n_sampled_goal = 4

In [47]:
# Directory passed to the model as its TensorBoard log location.
tensorboard_log = 'tensor_board'

In [48]:
def linear_schedule(initial_value: float) -> Callable[[float], float]:
    """Build a stepwise learning-rate schedule.

    Despite the name, the returned schedule is piecewise-constant rather than
    linear: the rate is halved each time a further quarter of the training
    progress has been consumed.

    Args:
        initial_value: Learning rate used while at least 3/4 of the progress
            remains.

    Returns:
        A function mapping ``progress_remaining`` to a learning rate:
        ``initial_value`` for values >= 3/4, ``initial_value / 2`` for >= 2/4,
        ``initial_value / 4`` for >= 1/4 and ``initial_value / 8`` otherwise.
    """
    # (lowest progress_remaining of the stage, divisor applied to initial_value)
    later_stages = ((2 / 4, 2), (1 / 4, 4))

    def schedule(progress_remaining: float) -> float:
        # First quarter of training: the rate is returned untouched.
        if progress_remaining >= 3 / 4:
            return initial_value
        for lower_bound, divisor in later_stages:
            if progress_remaining >= lower_bound:
                return initial_value / divisor
        # Final quarter (or any value below every bound).
        return initial_value / 8

    return schedule

In [49]:
# Hindsight Experience Replay settings handed to the replay buffer.
her_kwargs = dict(
    n_sampled_goal=n_sampled_goal,
    goal_selection_strategy="future",
    online_sampling=True,
)

# SAC agent with a HER replay buffer. "MultiInputPolicy" is used because the
# environment's observations are dicts (achieved_goal / desired_goal / observation).
model = SAC(
    policy="MultiInputPolicy",
    env=env,
    learning_rate=linear_schedule(1e-4),  # stepwise decay starting from 1e-4
    buffer_size=1_000_000,
    batch_size=256,
    gamma=0.95,
    replay_buffer_class=HerReplayBuffer,
    replay_buffer_kwargs=her_kwargs,
    policy_kwargs={"net_arch": [256, 256, 256]},
    tensorboard_log=tensorboard_log,
    verbose=0,
)

In [17]:
# Train for up to 4 million steps; `eval_callback` evaluates periodically and
# carries the reward-threshold callback that can end training early.
model.learn(total_timesteps=4_000_000, callback=eval_callback)

2022-04-05 09:47:36.092253: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /home/etni/dev/pos/lib/python3.8/site-packages/cv2/../../lib64::/home/etni/.mujoco/mujoco210/bin
2022-04-05 09:47:36.092313: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


Eval num_timesteps=12000, episode_reward=-50.00 +/- 0.00
Episode length: 50.00 +/- 0.00
Success rate: 0.00%
New best mean reward!
Eval num_timesteps=24000, episode_reward=-50.00 +/- 0.00
Episode length: 50.00 +/- 0.00
Success rate: 0.00%
Eval num_timesteps=36000, episode_reward=-40.00 +/- 20.00
Episode length: 50.00 +/- 0.00
Success rate: 20.00%
New best mean reward!
Eval num_timesteps=48000, episode_reward=-50.00 +/- 0.00
Episode length: 50.00 +/- 0.00
Success rate: 0.00%
Eval num_timesteps=60000, episode_reward=-50.00 +/- 0.00
Episode length: 50.00 +/- 0.00
Success rate: 0.00%
Eval num_timesteps=72000, episode_reward=-30.00 +/- 24.49
Episode length: 50.00 +/- 0.00
Success rate: 40.00%
New best mean reward!
Eval num_timesteps=84000, episode_reward=-41.00 +/- 18.00
Episode length: 50.00 +/- 0.00
Success rate: 20.00%
Eval num_timesteps=96000, episode_reward=-50.00 +/- 0.00
Episode length: 50.00 +/- 0.00
Success rate: 0.00%
Eval num_timesteps=108000, episode_reward=-50.00 +/- 0.00
Episod

<stable_baselines3.sac.sac.SAC at 0x7ffce1adcc10>

### 4. Evaluating Last Model

In [18]:
# Evaluate the model as it stands at the end of training, rendering every
# episode. Keep the returned (mean reward, reward std) pair instead of
# discarding it, so the cell actually reports how the last model performs.
last_mean_reward, last_std_reward = evaluate_policy(model, env, n_eval_episodes=10, render=True)

# Close the environment once the rendered evaluation is over.
# NOTE(review): `env` is used again by later cells after this close();
# confirm the environment stays usable once closed.
env.close()

print('Last model mean reward: {:.2f} +/- {:.2f}'.format(last_mean_reward, last_std_reward))

Creating window glfw


### 5. Reloading Best Model and Recording Video

In [50]:
# Drop the in-memory agent so that `model` in the following cells can only be
# the checkpoint reloaded from disk.
del model

In [51]:
# Restore the best checkpoint, which the evaluation callback saved under
# `log_path`, and attach the current environment to it.
best_model_path = os.path.join(log_path, 'best_model.zip')
model = SAC.load(best_model_path, env=env)

In [52]:
# Output folder handed to the video recorder.
video_path = 'video'

In [54]:
# Value passed to the recorder as `video_length`.
video_length = 12000

# Keep the recorder under its own name. Rebinding `env` to the recorder would
# stack one more recorder on every re-run of this cell and would hand the
# following cells an already-closed recorder instead of the plain environment.
video_env = VecVideoRecorder(
    env,
    video_path,
    record_video_trigger=lambda step: step == 0,  # start recording at step 0 only
    video_length=video_length,
    name_prefix=f"final-agent-{environment_name}",
)

try:
    result = evaluate_policy(model, video_env, n_eval_episodes=10, render=False)
finally:
    # Closing the recorder is what saves the video, so do it even if the
    # evaluation raises.
    video_env.close()

In [23]:
# Final rendered evaluation of the reloaded model over 50 episodes.
result = evaluate_policy(model, env, render=True, n_eval_episodes=50)
env.close()

# The first element of the result is reported as the score.
mean_reward = result[0]
print('Score: {}'.format(mean_reward))

Creating window glfw
Score: -8.2
