In [1]:
import os
import time
from typing import Any, Dict

import gymnasium
import pybullet as p
import pybullet_data
import torch as th
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv

# Project-local package; presumably registers 'pushBox-v0' with gymnasium on import.
import push_box

pybullet build time: Nov 28 2023 23:48:36


In [None]:
# Close the default PyBullet physics-client connection, if one is open.
# p.disconnect() raises pybullet.error when no client is connected, so the
# isConnected() guard keeps this cell safe on a fresh kernel
# (Restart Kernel -> Run All) as well as on a re-run.
if p.isConnected():
    p.disconnect()

In [2]:
# Directory handed to the model as its TensorBoard log location.
log_path = os.path.join(
    'Training',
    'Logs',
)

In [3]:
def make_push_box_env():
    """Build one 'pushBox-v0' environment wrapped in an SB3 ``Monitor``.

    The ``Monitor`` wrapper records per-episode reward and length, which
    Stable-Baselines3 uses for its ``rollout/`` log entries and which
    ``evaluate_policy`` expects (it warns when the wrapper is missing).

    Returns:
        The Monitor-wrapped gymnasium environment.
    """
    return Monitor(gymnasium.make('pushBox-v0'))


# A named factory replaces the previous ``lambda: env``, which closed over the
# very name being reassigned and only worked because DummyVecEnv calls the
# factory immediately. Exactly one underlying environment is still created.
env = DummyVecEnv([make_push_box_env])

In [4]:
# Early stopping for the training stage: every time a periodic evaluation
# yields a new best mean reward, EvalCallback triggers stop_callback, which
# ends training once that mean reward reaches the threshold.
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=500, verbose=1)

# The best model found during evaluation is saved under this directory.
save_path = os.path.join('Training', 'SavedModels', 'PPO_20_best')

# NOTE(review): evaluation runs on the same `env` object that is passed to the
# model for training. The SB3 docs recommend a separate evaluation
# environment -- confirm that sharing one is intended here.
eval_callback = EvalCallback(
    env,
    best_model_save_path=save_path,
    callback_on_new_best=stop_callback,
    eval_freq=10000,
    verbose=1,
)


In [5]:
# Fixed seed so that network initialisation and action sampling are repeatable
# between runs of this notebook. (Bit-for-bit determinism is not guaranteed
# when training runs on a CUDA device.)
SEED = 0

# PPO agent with the default MLP policy; training metrics are written to
# TensorBoard under `log_path`.
model = PPO(
    'MlpPolicy',
    env,
    verbose=1,
    tensorboard_log=log_path,
    seed=SEED,
)

Using cuda device


In [6]:
# Train for up to ten million environment steps; eval_callback evaluates the
# policy periodically and may end training early.
model.learn(
    total_timesteps=10_000_000,
    callback=eval_callback,
)

Logging to Training/Logs/PPO_20
-----------------------------
| time/              |      |
|    fps             | 294  |
|    iterations      | 1    |
|    time_elapsed    | 6    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 261         |
|    iterations           | 2           |
|    time_elapsed         | 15          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.015123434 |
|    clip_fraction        | 0.192       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.6        |
|    explained_variance   | 0.114       |
|    learning_rate        | 0.0003      |
|    loss                 | -0.0188     |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0213     |
|    value_loss           | 0.000382    |
-----------------------------------------
--



Eval num_timesteps=10000, episode_reward=-0.50 +/- 0.00
Episode length: 50000.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 5e+04       |
|    mean_reward          | -0.5        |
| time/                   |             |
|    total_timesteps      | 10000       |
| train/                  |             |
|    approx_kl            | 0.008886537 |
|    clip_fraction        | 0.105       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.48       |
|    explained_variance   | -0.443      |
|    learning_rate        | 0.0003      |
|    loss                 | 0.00449     |
|    n_updates            | 40          |
|    policy_gradient_loss | -0.00466    |
|    value_loss           | 6.4e-05     |
-----------------------------------------
New best mean reward!
------------------------------
| time/              |       |
|    fps             | 26    |
|    iterations      | 5     |
|    tim

<stable_baselines3.ppo.ppo.PPO at 0x7f0212a56790>

In [7]:
# Location used below for saving the trained model and loading it back.
PPO_Path = os.path.join(
    'Training',
    'SavedModels',
    'PPO_20_10M_exist',
)

In [8]:
# Persist the trained model to disk at PPO_Path.
model.save(path=PPO_Path)

In [None]:
# Remove the in-memory model binding so that the next cell's PPO.load call
# shows the model being restored from the file saved at PPO_Path.
del model

In [9]:
# Restore the saved model from PPO_Path and attach it to the existing env.
model = PPO.load(
    path=PPO_Path,
    env=env,
)

In [11]:
# Run the loaded model for five evaluation episodes; the displayed tuple is
# (mean episode reward, standard deviation of episode reward).
evaluate_policy(
    model=model,
    env=env,
    n_eval_episodes=5,
)

(-1.3295725179434783, 4.240511800613193)