# Training PPO with Stable-Baselines3 on MuJoCo `HumanoidStandup-v4` (Gymnasium / Shimmy DM Control stack)

In [None]:
# System packages for the MuJoCo / OpenGL stack used by this notebook.
# NOTE(review): presumably these are the build/runtime requirements of
# free-mujoco-py (GL, GLEW, OSMesa, patchelf, compiler toolchain) - confirm.
#
# Refresh the package index first: on a fresh VM/container the index is empty
# or stale and `apt-get install` fails with "Unable to locate package".
!apt-get update

# One invocation instead of four: a single dependency resolution pass.
!apt-get install -y \
    libgl1-mesa-dev \
    libgl1-mesa-glx \
    libglew-dev \
    libosmesa6-dev \
    software-properties-common \
    patchelf \
    libx11-dev \
    build-essential

In [None]:
# Python dependencies for this notebook.
# Extras are quoted so the shell never interprets the square brackets as a
# glob pattern (this fails outright under zsh and can mis-expand under bash).
#
# gymnasium extras:
#   mujoco - the `mujoco` bindings required by the "-v4" MuJoCo environments
#   other  - moviepy, required by the RecordVideo wrapper used during training
%pip install "gymnasium[mujoco,other]"
%pip install "shimmy[dm-control]"
%pip install comet_ml
%pip install free-mujoco-py
%pip install "stable-baselines3[extra]"

In [1]:
from __future__ import annotations

import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
from torch.distributions.normal import Normal

import gymnasium as gym

from comet_ml import Experiment
from comet_ml.integration.gymnasium import CometLogger

from stable_baselines3 import PPO
from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
from stable_baselines3.common.evaluation import evaluate_policy

# Default size (width, height in inches) for every matplotlib figure in this notebook.
plt.rc("figure", figsize=(10, 5))

  from distutils.dep_util import newer, newer_group
  from distutils.dep_util import newer, newer_group
  from pkg_resources import resource_stream, resource_exists


pygame 2.5.2 (SDL 2.28.2, Python 3.10.12)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [2]:
# PPO hyperparameters, read by the training cell that builds the model.
#
# NOTE(review): the output stored with this notebook reports n_steps=4000 and
# learning_rate=0.002, i.e. it was produced with different values than the
# ones below - re-run the notebook to refresh the recorded output.
# NOTE(review): Stable-Baselines3 warns when batch_size is not a factor of
# n_steps * n_envs; 5000 is not a multiple of 512.

# Step budget handed to model.learn().
total_timesteps = 10_000_000

# Rollout length collected before each policy update.
n_steps = 5_000

# Optimizer step size.
learning_rate = 2e-2

# Mini-batch size used for each gradient step.
batch_size = 512

# Discount factor.
gamma = 0.99

In [3]:
# --- Experiment tracking ------------------------------------------------------
# The Comet API key is intentionally NOT passed here. comet_ml resolves it from
# the COMET_API_KEY environment variable or the user's Comet config file, so no
# secret is stored in (or committed with) the notebook.
# SECURITY: the key that used to be hardcoded in this cell has been exposed and
# must be revoked / rotated in the Comet account settings.
experiment = Experiment(
    project_name="deeprl",
    workspace="clandolt",
)

# --- Environment --------------------------------------------------------------
# render_mode="rgb_array" makes the environment return frames, which is what
# RecordVideo needs in order to write videos into the 'test' directory.
env = gym.make("HumanoidStandup-v4", render_mode="rgb_array")
env = gym.wrappers.RecordVideo(env, 'test')
# NOTE(review): CometLogger presumably forwards episode statistics to the
# experiment - confirm against the comet_ml gymnasium integration docs.
env = CometLogger(env, experiment)
# Seed the environment once, up front, for reproducibility.
observation, info = env.reset(seed=42)

# --- Training -----------------------------------------------------------------
# Hyperparameters come from the configuration cell above.
model = PPO(
    "MlpPolicy",
    env,
    verbose=1,
    n_steps=n_steps,
    learning_rate=learning_rate,
    batch_size=batch_size,
    gamma=gamma,
)
model.learn(total_timesteps=total_timesteps, reset_num_timesteps=False)

# Save the agent
model.save("ppo_humanoid")

del model  # delete trained model to demonstrate loading

# --- Reload -------------------------------------------------------------------
# NOTE: if you have loading issues, you can pass `print_system_info=True`
# to compare the system on which the model was trained vs the current one:
# model = PPO.load("ppo_humanoid", env=env, print_system_info=True)
model = PPO.load("ppo_humanoid", env=env)

# --- Evaluation ---------------------------------------------------------------
# NOTE: If you use wrappers with your environment that modify rewards,
#       this will be reflected here. To evaluate with original rewards,
#       wrap environment in a "Monitor" wrapper before other wrappers.
mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)
# Report the result; previously it was computed and silently discarded.
print(f"Evaluation over 10 episodes: mean_reward={mean_reward:.2f} +/- {std_reward:.2f}")

# --- Enjoy trained agent ------------------------------------------------------
vec_env = model.get_env()
obs = vec_env.reset()
for _step in range(100000):
    # Deterministic actions: no exploration noise while showcasing the policy.
    action, _states = model.predict(obs, deterministic=True)
    obs, rewards, dones, info = vec_env.step(action)
    vec_env.render("human")

[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/clandolt/deeprl/dda87f3b617b4330917116315dace27c

  logger.warn(


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=4000 and n_envs=1)


Moviepy - Building video /home/landolt/DeepRL/test/rl-video-episode-0.mp4.
Moviepy - Writing video /home/landolt/DeepRL/test/rl-video-episode-0.mp4



                                                                

Moviepy - Done !
Moviepy - video ready /home/landolt/DeepRL/test/rl-video-episode-0.mp4
Moviepy - Building video /home/landolt/DeepRL/test/rl-video-episode-1.mp4.
Moviepy - Writing video /home/landolt/DeepRL/test/rl-video-episode-1.mp4



                                                                

Moviepy - Done !
Moviepy - video ready /home/landolt/DeepRL/test/rl-video-episode-1.mp4
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | 3.63e+04 |
| time/              |          |
|    fps             | 307      |
|    iterations      | 1        |
|    time_elapsed    | 13       |
|    total_timesteps | 4000     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1e+03       |
|    ep_rew_mean          | 3.79e+04    |
| time/                   |             |
|    fps                  | 458         |
|    iterations           | 2           |
|    time_elapsed         | 17          |
|    total_timesteps      | 8000        |
| train/                  |             |
|    approx_kl            | 0.024064675 |
|    clip_fraction        | 0.281       |
|    clip_range           | 0.2         |
|    entropy_loss         

                                                                 

Moviepy - Done !
Moviepy - video ready /home/landolt/DeepRL/test/rl-video-episode-8.mp4
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1e+03       |
|    ep_rew_mean          | 3.84e+04    |
| time/                   |             |
|    fps                  | 459         |
|    iterations           | 3           |
|    time_elapsed         | 26          |
|    total_timesteps      | 12000       |
| train/                  |             |
|    approx_kl            | 0.019676127 |
|    clip_fraction        | 0.197       |
|    clip_range           | 0.2         |
|    entropy_loss         | -24.1       |
|    explained_variance   | -0.0063     |
|    learning_rate        | 0.002       |
|    loss                 | 2e+05       |
|    n_updates            | 20          |
|    policy_gradient_loss | -0.0546     |
|    std                  | 0.999       |
|    value_loss           | 4.28e+05    |
------------------------------

                                                                

Moviepy - Done !
Moviepy - video ready /home/landolt/DeepRL/test/rl-video-episode-27.mp4
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 1e+03      |
|    ep_rew_mean          | 4.05e+04   |
| time/                   |            |
|    fps                  | 584        |
|    iterations           | 7          |
|    time_elapsed         | 47         |
|    total_timesteps      | 28000      |
| train/                  |            |
|    approx_kl            | 0.02964586 |
|    clip_fraction        | 0.306      |
|    clip_range           | 0.2        |
|    entropy_loss         | -24        |
|    explained_variance   | -0.00261   |
|    learning_rate        | 0.002      |
|    loss                 | 2.07e+05   |
|    n_updates            | 60         |
|    policy_gradient_loss | -0.0717    |
|    std                  | 0.991      |
|    value_loss           | 4.65e+05   |
----------------------------------------
---------

                                                                

Moviepy - Done !
Moviepy - video ready /home/landolt/DeepRL/test/rl-video-episode-64.mp4
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1e+03       |
|    ep_rew_mean          | 4.7e+04     |
| time/                   |             |
|    fps                  | 710         |
|    iterations           | 17          |
|    time_elapsed         | 95          |
|    total_timesteps      | 68000       |
| train/                  |             |
|    approx_kl            | 0.044175055 |
|    clip_fraction        | 0.373       |
|    clip_range           | 0.2         |
|    entropy_loss         | -23.6       |
|    explained_variance   | -8.39e-05   |
|    learning_rate        | 0.002       |
|    loss                 | 4.29e+05    |
|    n_updates            | 160         |
|    policy_gradient_loss | -0.0681     |
|    std                  | 0.968       |
|    value_loss           | 9.21e+05    |
-----------------------------

                                                                

Moviepy - Done !
Moviepy - video ready /home/landolt/DeepRL/test/rl-video-episode-125.mp4
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 1e+03      |
|    ep_rew_mean          | 6.09e+04   |
| time/                   |            |
|    fps                  | 774        |
|    iterations           | 32         |
|    time_elapsed         | 165        |
|    total_timesteps      | 128000     |
| train/                  |            |
|    approx_kl            | 0.07193029 |
|    clip_fraction        | 0.443      |
|    clip_range           | 0.2        |
|    entropy_loss         | -23.1      |
|    explained_variance   | 0.000191   |
|    learning_rate        | 0.002      |
|    loss                 | 7e+05      |
|    n_updates            | 310        |
|    policy_gradient_loss | -0.0717    |
|    std                  | 0.944      |
|    value_loss           | 1.3e+06    |
----------------------------------------
--------

KeyboardInterrupt: 

In [None]:
# Close out the Comet experiment created in the training cell.
# NOTE(review): presumably this flushes any pending uploads - confirm against the comet_ml docs.
experiment.end()

In [None]:
# Show the Comet experiment created in the training cell inside the notebook.
# NOTE(review): presumably renders the experiment's web page inline - confirm against the comet_ml docs.
experiment.display()