# Training Atari Breakout with Stable-Baselines3

<font color="red"> If a GPU cannot be allocated because of high usage, switch the runtime to CPU.

Although the CPU is slower than the GPU (about 1~1.5 times), training still works well. </font>

**Stable Baselines3** is a set of reliable implementations of reinforcement learning algorithms in PyTorch, which makes it easy to build and test RL algorithms! The package contains several RL algorithms, including DQN, A2C, DDPG, PPO, SAC, and TD3, along with useful functions and wrappers for RL experiments.

1. Start with installing required packages and importing libraries

In [1]:
!pip install -q gym
!pip install -q stable-baselines3[extra]

import gymnasium as gym
from stable_baselines3 import PPO, A2C, DQN
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack
from stable_baselines3.common.env_util import make_atari_env, make_vec_env

import os
# Start an Xvfb virtual display as display :1 (one 1024x768 screen, 24-bit depth).
# The trailing "&" backgrounds the process so this cell returns immediately.
os.system("Xvfb :1 -screen 0 1024x768x24 &")
# Point this process at the virtual display started above.
os.environ['DISPLAY'] = ':1'

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/178.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━[0m [32m92.2/178.7 kB[0m [31m2.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m178.7/178.7 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m28.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m434.7/434.7 kB[0m [31m46.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m67.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for AutoROM.accept-rom-license (pyproject.toml) ... [?25l

# Algorithms
1. DQN: Deep Q-Network aims to find the best action to take in a given state by estimating the Q-values. (https://arxiv.org/abs/1312.5602)
2. A2C: Advantage Actor Critic (A2C) is a reinforcement learning algorithm that combines actor and critic networks. It operates synchronously and deterministically, making it easier to understand and implement. Unlike DQN, it doesn't use a replay buffer but employs multiple workers to gather experience. (https://arxiv.org/abs/1602.01783)
3. PPO: Proximal Policy Optimization (PPO) is an algorithm that blends ideas from A2C (using multiple workers) and TRPO (trust region policy optimization). It seeks to optimize policy parameters to improve the actor's performance. Instead of enforcing TRPO's explicit trust-region constraint, PPO clips its objective to limit how much the policy can change in each update, which makes training more stable. (https://arxiv.org/abs/1707.06347)

2. Select RL algorithm and set hyperparameters.

In [2]:
import os
# Which Stable-Baselines3 algorithm to train: "DQN", "A2C" or "PPO".
algorithm = "DQN" # change this to change the algorithm

# Reject unknown names up front, before any directory or environment is created.
if algorithm not in ("DQN", "A2C", "PPO"):
    raise ValueError(f"not supported algorithm: {algorithm!r}")

# Name the output directory after the selected algorithm so that A2C/PPO runs
# do not end up in a folder labelled "DQN".
# NOTE(review): this cell only creates the directory; nothing here writes to it.
log_dir = f"./{algorithm}_CNN_ALE"
os.makedirs(log_dir, exist_ok=True)

# Four copies of the Atari environment wrapped as a single vectorized env ...
vec_env = make_atari_env("BreakoutDeterministic-v4", n_envs=4)
# ... wrapped in a frame stack of 4 (n_stack=4).
vec_env = VecFrameStack(vec_env, n_stack=4)

if algorithm == "DQN":
    # NOTE(review): buffer_size (1,000,000) is larger than the 200,000 timesteps
    # trained later in this notebook; consider lowering it if memory is tight.
    model = DQN(
        "CnnPolicy",
        vec_env,
        verbose=1,
        learning_rate=1e-5,
        buffer_size=1000000,
        learning_starts=10000,
        batch_size=32,
        gamma=0.99,
        train_freq=4,
        target_update_interval=2500,
        exploration_fraction=0.2,
        exploration_initial_eps=1.0,
        exploration_final_eps=0.05,
    )
elif algorithm == "A2C":
    # A2C and PPO use the library's default hyperparameters.
    model = A2C("CnnPolicy", vec_env, verbose=1)
else:  # "PPO" (the only remaining validated option)
    model = PPO("CnnPolicy", vec_env, verbose=1)

  and should_run_async(code)


Using cuda device
Wrapping the env in a VecTransposeImage.




4. Let's begin training!

In [None]:
# Train the model built in the previous cell for 200,000 timesteps.
model.learn(total_timesteps=200000)
# Name the checkpoint after the model's class (e.g. "DQN_CNN_ALE_last") so that
# A2C/PPO runs are not saved under a DQN filename.
model.save(f"{type(model).__name__}_CNN_ALE_last")

  and should_run_async(code)


[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
|    episodes         | 15596    |
|    fps              | 175      |
|    time_elapsed     | 801      |
|    total_timesteps  | 141072   |
| train/              |          |
|    learning_rate    | 1e-05    |
|    loss             | 0.0223   |
|    n_updates        | 8191     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 259      |
|    ep_rew_mean      | 2.84     |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 15600    |
|    fps              | 175      |
|    time_elapsed     | 801      |
|    total_timesteps  | 141124   |
| train/              |          |
|    learning_rate    | 1e-05    |
|    loss             | 0.0377   |
|    n_updates        | 8195     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 259      |
|    

5. After training, use the following functions to record and display videos of the trained agent.

In [None]:
import base64
from pathlib import Path

from IPython import display as ipythondisplay
from stable_baselines3.common.vec_env import VecVideoRecorder, DummyVecEnv


def record_video(env_id, model, video_length=500, prefix="", video_folder="videos/"):
    """
    Roll out ``model`` in a fresh vectorized Atari environment and record a video.

    The environment is rebuilt with ``make_atari_env`` (4 envs, seed 0) and wrapped
    in ``VecFrameStack`` (n_stack=4), the same two wrappers used for the training env.

    :param env_id: (str) Id of the Atari environment to create
    :param model: (RL model) Model whose ``predict`` method chooses the actions
    :param video_length: (int) Number of environment steps to run and record
    :param prefix: (str) Name prefix passed to the video recorder
    :param video_folder: (str) Folder passed to the video recorder for its output
    """

    vec_env = make_atari_env(env_id, n_envs=4, seed=0)
    vec_env = VecFrameStack(vec_env, n_stack=4)

    # Start the video at step=0 and record `video_length` steps
    eval_env = VecVideoRecorder(
        vec_env,
        video_folder=video_folder,
        record_video_trigger=lambda step: step == 0,
        video_length=video_length,
        name_prefix=prefix,
    )

    try:
        obs = eval_env.reset()
        for _ in range(video_length):
            # predict() returns a pair; only the action is needed here.
            action, _ = model.predict(obs)
            obs, _, _, _ = eval_env.step(action)
    finally:
        # Always close the video recorder, even if the rollout raised, so the
        # wrapped environments are released.
        eval_env.close()

def show_videos(video_path="", prefix=""):
    """
    Display every matching ``.mp4`` file inline in the notebook.

    Each video is embedded as a base64 data URI inside a ``<video>`` tag, and the
    tags are displayed together, separated by ``<br>``, in sorted path order.

    Taken from https://github.com/eleurent/highway-env

    :param video_path: (str) Path to the folder containing videos
    :param prefix: (str) Filter the videos, showing only those starting with this prefix
    """
    # Local import: the original cell has a local variable called `html`, so only
    # the `escape` function is pulled in to avoid any name confusion.
    from html import escape

    video_tags = []
    # Sort the matches so the display order is deterministic rather than
    # depending on filesystem enumeration order.
    for mp4 in sorted(Path(video_path).glob("{}*.mp4".format(prefix))):
        video_b64 = base64.b64encode(mp4.read_bytes())
        video_tags.append(
            """<video alt="{}" autoplay
                    loop controls style="height: 400px;">
                    <source src="data:video/mp4;base64,{}" type="video/mp4" />
                </video>""".format(
                # Escape the path so quotes or angle brackets in a file name
                # cannot break out of the alt attribute.
                escape(str(mp4)), video_b64.decode("ascii")
            )
        )
    ipythondisplay.display(ipythondisplay.HTML(data="<br>".join(video_tags)))

In [None]:
# Record a 500-step rollout of the trained model (files are prefixed "bov4") ...
record_video(
    env_id="BreakoutDeterministic-v4",
    model=model,
    video_length=500,
    prefix="bov4",
)

# ... then embed every "bov4*" video from the "videos" folder in the notebook.
show_videos(video_path="videos", prefix="bov4")