# Test the training procedure of CatBot
Based on: 
https://github.com/RussTedrake/manipulation/blob/master/rl/box_flipup.ipynb

Some insight into tensorboard plots:
https://medium.com/aureliantactics/understanding-ppo-plots-in-tensorboard-cbc3199b9ba2

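To view the training curves, run `tensorboard --logdir <LOGDIR>`, where `<LOGDIR>` is the `tensorboard_log` directory passed to PPO (`./ppo_cat_bot_logs` in this notebook).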

In [1]:
import os
import gym
import numpy as np
import datetime
import torch
from pydrake.all import StartMeshcat

from catbot.RL.catbot_rl_env import CatBotEnv

from psutil import cpu_count

# Half of the CPU count reported by psutil, but never fewer than one:
# cpu_count() may return None when the count cannot be determined, and
# int(1 / 2) would yield 0 on a single-core machine.
num_cpu = max(1, (cpu_count() or 2) // 2)

# stable_baselines3 is treated as optional (it is a heavy dependency for just
# this one notebook): a missing install is reported instead of raising, and
# sb3_available records whether the imports succeeded.
sb3_available = False
try:
    from stable_baselines3 import PPO
    from stable_baselines3.common.vec_env import SubprocVecEnv
    from stable_baselines3.common.env_util import make_vec_env
    from stable_baselines3.common.monitor import Monitor
    from stable_baselines3.common.callbacks import CheckpointCallback
except ImportError:
    print("stable_baselines3 not found")
    print("Consider 'pip3 install stable_baselines3'.")
else:
    # Only reached when every import above succeeded.
    sb3_available = True


In [2]:
# Register the environment under the id "CatBot-v0" so that gym.make() can
# build it from the entry-point string.
gym.envs.register(
    id="CatBot-v0",
    entry_point="catbot.RL.catbot_rl_env:CatBotEnv",
)

# Start the Meshcat visualizer; the handle is passed to the playback
# environments created further down.
meshcat = StartMeshcat()

INFO:drake:Meshcat listening for connections at http://localhost:7000


In [5]:
# --- Training configuration ---------------------------------------------------
observations = "state"
time_limit = 4
total_timesteps = 10000
n_envs = 8  # Was num_cpu

# Vectorized training environment: n_envs copies of CatBotEnv, each running in
# its own subprocess (SubprocVecEnv).
env = make_vec_env(
    CatBotEnv,
    n_envs=n_envs,
    seed=0,
    vec_env_cls=SubprocVecEnv,
    env_kwargs={
        "observations": observations,
        "time_limit": time_limit,
    },
)

# CheckpointCallback counts save_freq in callback calls, and each call advances
# all n_envs environments by one step. Dividing by n_envs writes a checkpoint
# roughly every 1000 environment timesteps instead of every 1000 * n_envs.
checkpoint_callback = CheckpointCallback(
    save_freq=max(1000 // n_envs, 1),
    save_path='./checkpoints/',
    name_prefix='PPO_CHECKPOINT_TEST',
    save_replay_buffer=True,
    save_vecnormalize=True,
)

# Either continue training from a saved model or start from scratch.
use_pretrained_model = False
model_zip_fname = "./models/PPO_C6_0254.zip"
if use_pretrained_model:
    model = PPO.load(model_zip_fname, env)
else:
    # n_steps * n_envs = 32 samples per rollout, matching batch_size; keep the
    # product a multiple of batch_size when changing n_envs or n_steps.
    model = PPO(
        "MlpPolicy",
        env,
        verbose=0,
        n_steps=4,
        n_epochs=2,
        batch_size=32,
        tensorboard_log="./ppo_cat_bot_logs")

# Both branches train with the same settings, so the call is made once here.
model.learn(total_timesteps=total_timesteps, progress_bar=True, callback=checkpoint_callback)

# Shut down the SubprocVecEnv worker processes now that training is finished;
# the playback cells below create their own single environment.
env.close()


Output()

## Save Model

In [None]:
save_dir = './models'
# Create the output directory up front so the cell also works when ./models
# does not exist yet (the previous os.listdir() call raised in that case and
# its result was never used).
os.makedirs(save_dir, exist_ok=True)

# Timestamped file name (yymmdd_HHMMSS) so successive runs get distinct names
# at one-second resolution.
datetime_val = datetime.datetime.now().strftime("%y%m%d_%H%M%S")
model_fname = f'{datetime_val}_PPO.zip'

# Build the path from save_dir rather than repeating the directory literal.
save_fname = os.path.join(save_dir, model_fname)
model.save(save_fname)
print(f'Saved model to: {model_fname}')

Saved model to: 230510_030745_PPO.zip


### Show the just-trained model

In [None]:
# Build a single CatBot environment attached to the Meshcat visualizer and
# play the model held in `model` back at a target realtime rate of 1.0.
env = gym.make(
    "CatBot-v0",
    meshcat=meshcat,
    observations=observations,
    time_limit=time_limit,
)
env.simulator.set_target_realtime_rate(1.0)

obs = env.reset()
for step_idx in range(500):
    # Deterministic actions: no exploration noise during playback.
    action, _state = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    env.render()
    # Start a new episode as soon as the current one ends.
    obs = env.reset() if done else obs

## Show a pretrained model

In [12]:
# Checkpoint to replay and the environment settings to replay it with.
model_zip_fname = "./checkpoints/230510_125444_PPO_P0_C10_60000_steps.zip"
observations = "state"
time_limit = 6

env = gym.make("CatBot-v0", meshcat=meshcat, observations=observations, time_limit=time_limit)
env.simulator.set_target_realtime_rate(8)
model = PPO.load(model_zip_fname, env)

obs = env.reset()
cum_reward = 0      # reward accumulated over the episode in progress
reward_list = []    # total reward of every finished episode
for i in range(2000):
    action, _state = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    cum_reward += reward
    env.render()
    if done:
        # Episode finished: record its return and start a new one.
        print('Reward: ', cum_reward)
        reward_list.append(cum_reward)
        cum_reward = 0
        obs = env.reset()

# np.mean of an empty list is nan (with a RuntimeWarning), so only report the
# mean when at least one episode finished within the step budget. The return of
# an episode still in progress when the loop ends is not included.
if reward_list:
    print('Mean reward: ', np.mean(reward_list))
else:
    print('Mean reward: n/a (no episode finished)')

Reward:  2473.541793057298
Reward:  1128.546436296576
Reward:  1308.4025816694825
Reward:  2504.9883903866034
Reward:  5083.105965891114
Reward:  1053.4317289536757
Reward:  6354.960053919422
Reward:  6983.356253091172
Reward:  2688.6914615323262
Reward:  656.2204658008205
Reward:  1257.5298387726427
Reward:  1697.7264754314945
Reward:  5831.272018129459
Reward:  1031.3276983618139
Reward:  1853.2552431528152
Reward:  1615.8085806632755
Reward:  2192.0874906606527
Reward:  1110.7882093386124
Reward:  671.9308893641193
Reward:  1740.117018252297
Reward:  1381.3176535785192
Reward:  828.7740888344905
Reward:  1392.7337290301205
Reward:  756.5635911654873
Reward:  714.2167019932075
Reward:  1223.7850320616578
Reward:  1156.8521859882708
Reward:  5855.059925707302
Reward:  479.8191112622319
Reward:  6264.899020945886
Reward:  2808.441348286521
Reward:  939.3983666814305
Mean reward:  2282.46716713315
