In [2]:
import gymnasium as gym
from stable_baselines3 import A2C
from stable_baselines3.a2c import MlpPolicy

# Quick start: build the CartPole environment and train an A2C agent on it.
env = gym.make('CartPole-v1', render_mode='rgb_array')
model = A2C(policy='MlpPolicy', env=env, verbose=1)
model.learn(total_timesteps=10_000)

# The model wraps `env` (see the "Wrapping the env ..." log lines), so fetch
# the wrapped environment back from the model and reset it for a rollout.
vec_env = model.get_env()
obs = vec_env.reset()

# Uncomment to watch the trained agent
# for i in range(1000):
#     action, _state = model.predict(obs, deterministic=True)
#     obs, reward, done, info = vec_env.step(action)
#     vec_env.render()

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 23.4     |
|    ep_rew_mean        | 23.4     |
| time/                 |          |
|    fps                | 4672     |
|    iterations         | 100      |
|    time_elapsed       | 0        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -0.693   |
|    explained_variance | -0.0468  |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | -1.07    |
|    value_loss         | 20.5     |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 25.4     |
|    ep_rew_mean        | 25.4     |
| time/                 |          |
|    fps                | 4592     |
|    iterations         | 200      |
|    time_elapsed 

This getting-started example is adapted from the following notebook: https://github.com/araffin/rl-tutorial-jnrr19/blob/sb3/1_getting_started.ipynb

In [5]:
from stable_baselines3 import PPO
# Fresh CartPole environment plus a PPO agent that has not been trained yet.
# verbose=0: this model's later learn() call prints no training tables.
env = gym.make('CartPole-v1', render_mode='rgb_array')
model = PPO(policy='MlpPolicy', env=env, verbose=0)

> Evaluate the agent using Stable-Baselines3's built-in `evaluate_policy` function

In [6]:
from stable_baselines3.common.evaluation import evaluate_policy
# Baseline: score the policy over 100 episodes before any training has run.
mean_reward, std_reward = evaluate_policy(
    model=model,
    env=env,
    n_eval_episodes=100,
    deterministic=True,
)
print(f'Mean reward: {mean_reward} +/- {std_reward}')

Mean reward: 10.19 +/- 2.9247051133404893




> Train the agent and evaluate it

In [7]:
# Train the PPO agent for 10,000 environment steps.
# learn() returns the model itself, which is why the cell displays its repr.
model.learn(
    total_timesteps=10_000,
)

<stable_baselines3.ppo.ppo.PPO at 0x12e0c3b60>

In [8]:
# Re-score the policy after training, using the same settings as the baseline
# evaluation so the two mean rewards can be compared directly.
mean_reward, std_reward = evaluate_policy(
    model=model,
    env=env,
    n_eval_episodes=100,
    deterministic=True,
)
print(f'Mean reward: {mean_reward} +/- {std_reward}')

Mean reward: 429.77 +/- 96.50076217315592


> Training went well if the mean reward increased a lot

## Prepare the video recording

In [11]:
from stable_baselines3.common.vec_env import VecVideoRecorder, DummyVecEnv
def record_video(env_id, model, video_length=500, prefix="", video_folder="videos/"):
    """
    Roll out ``model`` in a freshly created ``env_id`` environment and record it.

    The environment is built with ``render_mode="rgb_array"``, wrapped in a
    ``DummyVecEnv`` and then in a ``VecVideoRecorder`` that starts recording at
    step 0. The environment is always closed before returning, including when
    the rollout raises part-way through.

    :param env_id: (str) environment id passed to ``gym.make``
    :param model: (RL model) object whose ``predict(obs)`` returns an
        ``(action, state)`` pair
    :param video_length: (int) number of environment steps to run and record
    :param prefix: (str) name prefix handed to the recorder for the video file
    :param video_folder: (str) output directory handed to the recorder
    """
    eval_env = DummyVecEnv([lambda: gym.make(env_id, render_mode="rgb_array")])
    try:
        # Start the video at step=0 and record `video_length` steps
        eval_env = VecVideoRecorder(
            eval_env,
            video_folder=video_folder,
            record_video_trigger=lambda step: step == 0,
            video_length=video_length,
            name_prefix=prefix,
        )

        obs = eval_env.reset()
        for _ in range(video_length):
            action, _ = model.predict(obs)
            obs, _, _, _ = eval_env.step(action)
    finally:
        # Close in `finally` so a failure in reset/predict/step (or in building
        # the recorder itself) cannot leave the environment open. At this point
        # `eval_env` is the recorder if it was created, else the bare vec env.
        eval_env.close()

## Visualize the trained agent

In [13]:
# Record 500 steps of the trained PPO agent on CartPole into the default
# video folder ("videos/").
record_video(
    env_id='CartPole-v1',
    model=model,
    video_length=500,
    prefix="ppo-cartpole",
)

Saving video to /Users/geraldamasi/PycharmProjects/RL/stable-baseline-tutorials/videos/ppo-cartpole-step-0-to-step-500.mp4
MoviePy - Building video /Users/geraldamasi/PycharmProjects/RL/stable-baseline-tutorials/videos/ppo-cartpole-step-0-to-step-500.mp4.
MoviePy - Writing video /Users/geraldamasi/PycharmProjects/RL/stable-baseline-tutorials/videos/ppo-cartpole-step-0-to-step-500.mp4



                                                                          

MoviePy - Done !
MoviePy - video ready /Users/geraldamasi/PycharmProjects/RL/stable-baseline-tutorials/videos/ppo-cartpole-step-0-to-step-500.mp4


![](videos/ppo-cartpole-step-0-to-step-500.mp4)

## Train a model with only 1 line of code

In [14]:
# PPO creates the environment from its registered name, and learn() returns the model, so setup and training chain.
model = PPO(policy='MlpPolicy', env='CartPole-v1', verbose=1).learn(total_timesteps=10_000)

Using cpu device
Creating environment from the given name 'CartPole-v1'
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 22.9     |
|    ep_rew_mean     | 22.9     |
| time/              |          |
|    fps             | 10858    |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 27.9        |
|    ep_rew_mean          | 27.9        |
| time/                   |             |
|    fps                  | 7242        |
|    iterations           | 2           |
|    time_elapsed         | 0           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.007652224 |
|    clip_fraction        | 0.0738      |
|    cl