## Update and install packages

In [2]:
# Update and install display packages and stable baseline 3
# uncomment if running notebook for the first time
!apt-get update && apt-get install swig cmake -y
!apt-get update && apt-get install ffmpeg freeglut3-dev xvfb -y
!pip install box2d-py
!pip install moviepy
!pip install "stable-baselines3[extra]>=2.0.0a4"
!pip install tensorboard


Get:1 http://security.ubuntu.com/ubuntu focal-security InRelease [128 kB]      
Hit:2 http://archive.ubuntu.com/ubuntu focal InRelease   
Get:3 http://archive.ubuntu.com/ubuntu focal-updates InRelease [128 kB]
Get:4 http://archive.ubuntu.com/ubuntu focal-backports InRelease [128 kB]
Fetched 383 kB in 3s (110 kB/s)  
Reading package lists... Done
Reading package lists... Done
Building dependency tree       
Reading state information... Done
swig is already the newest version (4.0.1-5build1).
cmake is already the newest version (3.16.3-1ubuntu1.20.04.1).
0 upgraded, 0 newly installed, 0 to remove and 145 not upgraded.
Hit:1 http://archive.ubuntu.com/ubuntu focal InRelease
Hit:2 http://archive.ubuntu.com/ubuntu focal-updates InRelease
Hit:3 http://archive.ubuntu.com/ubuntu focal-backports InRelease
Hit:4 http://security.ubuntu.com/ubuntu focal-security InRelease
Reading package lists... Done
Reading package lists... Done
Building dependency tree       
Reading state information... Done
fr

## Import Libraries

In [3]:
# Import rquired libraries and modules
import os
import signal
import subprocess
import gymnasium as gym
import stable_baselines3
from stable_baselines3 import PPO
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.results_plotter import load_results, ts2xy
from stable_baselines3.common.vec_env import VecVideoRecorder, DummyVecEnv
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
import torch.nn as nn
import matplotlib.pyplot as plt
import numpy as np
import torch
from pathlib import Path
import base64
from IPython import display as ipythondisplay
from stable_baselines3.common.env_util import make_atari_env
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.logger import configure
import tensorboard


In [4]:
# Check gym and stable baseline 3 versions
print(f"{gym.__version__=}")
print(f"{stable_baselines3.__version__=}")


gym.__version__='0.29.1'
stable_baselines3.__version__='2.4.0a7'


## Settings

### Tunning parameters

In [5]:
# parameters for tunning the model
param_n_stack = 8                
param_clip_range = 0.07
param_learning_rate = 2.5e-4
param_batch_size = 512
param_neurons = 64
param_total_timesteps = 15000000
save_interval = param_total_timesteps/10

### Other settings

In [6]:
# name of the environment to run
env_name = 'BreakoutNoFrameskip-v4'
env_prefix = 'breakout'

# output directoies
base_dir = '.'
output_dir = os.path.join(base_dir, 'output')
env_dir =  os.path.join(output_dir, env_prefix)
logs_dir = os.path.join(env_dir, 'logs')
models_dir = os.path.join(env_dir, 'models')
videos_dir = os.path.join(env_dir, 'videos')

os.makedirs(logs_dir, exist_ok=True)
os.makedirs(models_dir, exist_ok=True)
os.makedirs(videos_dir, exist_ok=True)

# tensorboard name for algorithm logs
tb_log_name = 'PPO'
model_prefix = 'ppo'
model_name_final = f"{model_prefix}_model_final"

print(logs_dir)
print(models_dir)
print(videos_dir)

./output/breakout/logs
./output/breakout/models
./output/breakout/videos


## Callbacks and Directory Setup

In [7]:
# class to manage display
class Display:
    def __init__(self, command: str):
        self.command = command
        
    def start(self):
        self.process = subprocess.Popen(self.command.split())  
        os.environ['DISPLAY'] = ':1'
        
    def terminate(self):
        self.process.terminate()
        
# callback for saving model at regular intervals
class SaveOnIntervalCallback(BaseCallback):
    def __init__(self, save_interval: int, save_path: str, verbose=1):
        super().__init__(verbose)
        self.save_interval = save_interval
        self.save_path = save_path

    def _on_step(self) -> bool:
        # Save the model every 'save_interval' steps
        if self.num_timesteps % self.save_interval == 0:
            save_file = os.path.join(self.save_path, f'{model_prefix}_model_{self.num_timesteps}')
            self.model.save(save_file)
            if self.verbose > 0:
                print(f'Saving model to {save_file}.zip')
        return True


## Custom Feature Extractor

In [8]:
# Feature extraction from frames as observations / states
class CustomANN(BaseFeaturesExtractor):
    def __init__(self, observation_space, features_dim=128):
        super(CustomANN, self).__init__(observation_space, features_dim)
        
        # Define your neural network layers
        self.net = nn.Sequential(
            nn.Linear(observation_space.shape[0], 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, features_dim)  # Output dimension should match features_dim
        )
    
    def forward(self, observations):
        return self.net(observations)
    

## Initialise model with atari environemnt

In [9]:
# Initialize the Atari environment with the specified game and configurations
env = make_atari_env(env_name, n_envs=4, seed=0)
# Stack 4 consecutive frames together to provide temporal information
env = VecFrameStack(env, n_stack=param_n_stack) # can fine tunenumber of frame images to learn from. Always more than 1. 4 frames in each state

A.L.E: Arcade Learning Environment (version 0.8.1+53f58b7)
[Powered by Stella]


In [10]:
# Initialize the PPO agent with specified parameters
model = PPO(
    env=env,
    policy='CnnPolicy',
    verbose=1,
    clip_range=param_clip_range,
    ent_coef=0.01,
    learning_rate=param_learning_rate,
    n_epochs=4,
    n_steps=128,
    vf_coef=0.5,
    batch_size=param_batch_size,
    policy_kwargs={'features_extractor_class': CustomCNN},
    tensorboard_log=logs_dir
)

Using cuda device
Wrapping the env in a VecTransposeImage.


## Model training

In [11]:
display = Display("Xvfb :1 -screen 0 1024x768x24")
display.start()

In [None]:
# Define the interval at which models are saved during training
save_callback = SaveOnIntervalCallback(save_interval, models_dir)
model.learn(total_timesteps=param_total_timesteps, progress_bar=True, callback=save_callback)

# Save the final model after training completes
final_model_path = os.path.join(models_dir, model_name_final)
model.save(final_model_path)

The XKEYBOARD keymap compiler (xkbcomp) reports:
> Internal error:   Could not resolve keysym XF86AudioPreset
> Internal error:   Could not resolve keysym XF86MonBrightnessCycle
> Internal error:   Could not resolve keysym XF86WWAN
> Internal error:   Could not resolve keysym XF86RFKill
> Internal error:   Could not resolve keysym XF86Keyboard
> Internal error:   Could not resolve keysym XF86RotationLockToggle
> Internal error:   Could not resolve keysym XF86FullScreen
Errors from xkbcomp are not fatal to the X server


Logging to ./output/breakout/logs/PPO_5


----------------------------
| time/              |     |
|    fps             | 232 |
|    iterations      | 1   |
|    time_elapsed    | 2   |
|    total_timesteps | 512 |
----------------------------
-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 758           |
|    ep_rew_mean          | 1.33          |
| time/                   |               |
|    fps                  | 263           |
|    iterations           | 2             |
|    time_elapsed         | 3             |
|    total_timesteps      | 1024          |
| train/                  |               |
|    approx_kl            | 5.7660625e-05 |
|    clip_fraction        | 0             |
|    clip_range           | 0.07          |
|    entropy_loss         | -1.39         |
|    explained_variance   | -0.0331       |
|    learning_rate        | 0.00025       |
|    loss                 | 0.231         |
|    n_updates            | 4             |
|    

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 3.06e+03     |
|    ep_rew_mean          | 22.4         |
| time/                   |              |
|    fps                  | 326          |
|    iterations           | 2930         |
|    time_elapsed         | 4590         |
|    total_timesteps      | 1500160      |
| train/                  |              |
|    approx_kl            | 0.0016116644 |
|    clip_fraction        | 0.0322       |
|    clip_range           | 0.07         |
|    entropy_loss         | -0.686       |
|    explained_variance   | 0.897        |
|    learning_rate        | 0.00025      |
|    loss                 | 0.00778      |
|    n_updates            | 11716        |
|    policy_gradient_loss | -0.00437     |
|    value_loss           | 0.0491       |
------------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 4.7e+03      |
|    ep_rew_mean          | 48.2         |
| time/                   |              |
|    fps                  | 327          |
|    iterations           | 5860         |
|    time_elapsed         | 9149         |
|    total_timesteps      | 3000320      |
| train/                  |              |
|    approx_kl            | 0.0040520923 |
|    clip_fraction        | 0.0259       |
|    clip_range           | 0.07         |
|    entropy_loss         | -0.779       |
|    explained_variance   | 0.549        |
|    learning_rate        | 0.00025      |
|    loss                 | 0.0284       |
|    n_updates            | 23436        |
|    policy_gradient_loss | -0.00237     |
|    value_loss           | 0.0852       |
------------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 5.08e+03     |
|    ep_rew_mean          | 60.8         |
| time/                   |              |
|    fps                  | 329          |
|    iterations           | 8790         |
|    time_elapsed         | 13673        |
|    total_timesteps      | 4500480      |
| train/                  |              |
|    approx_kl            | 0.0025059548 |
|    clip_fraction        | 0.0239       |
|    clip_range           | 0.07         |
|    entropy_loss         | -0.863       |
|    explained_variance   | 0.892        |
|    learning_rate        | 0.00025      |
|    loss                 | 0.0131       |
|    n_updates            | 35156        |
|    policy_gradient_loss | -0.00417     |
|    value_loss           | 0.0615       |
------------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_m

## Results rendering

### Video recording functions

In [None]:
# Functions to record videos of the agent playing and display the videos

def show_videos(video_path="", prefix=""):
    """Displays videos from a specified directory."""
    html = []
    for mp4 in Path(video_path).glob("{}*.mp4".format(prefix)):
        video_b64 = base64.b64encode(mp4.read_bytes())
        html.append(
            '''<video alt="{0}" autoplay
                      loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{1}" type="video/mp4" />
            </video>'''.format(mp4, video_b64.decode('ascii'))
        )
    ipythondisplay.display(ipythondisplay.HTML(data="<br>".join(html)))


def record_video(env_id, model, video_length=500, prefix="", video_folder=videos_dir):
    eval_env = DummyVecEnv([lambda: gym.make(env_id, render_mode='rgb_array')])
    eval_env = VecVideoRecorder(
        eval_env,
        video_folder=video_folder,
        record_video_trigger=lambda step: step == 0,
        video_length=video_length,
        name_prefix=prefix,
    )
    obs = eval_env.reset()
    for _ in range(video_length):
        action, _ = model.predict(obs)
        obs, _, _, _ = eval_env.step(action)
    eval_env.close()


In [None]:
# Display the video
import os

def get_model_identifiers(models_dir):
    files = os.listdir(models_dir)
    model_files = [f for f in files if f.startswith(f'{model_prefix}_model_')]
    identifiers = [f.split('_')[2] for f in model_files]
    return identifiers

def find_key_identifiers(identifiers):
    identifiers.sort()  # Ensure identifiers are sorted
    earliest = identifiers[0]
    final = identifiers[-1]
    middle = identifiers[len(identifiers) // 2]
    print("earliest, middle, final", earliest, middle, final)
    return earliest, middle, final

def view(models_dir):
    print("MODEL DIR", models_dir)
    identifiers = get_model_identifiers(models_dir)
    earliest, middle, final = find_key_identifiers(identifiers)

    # Record videos at the beginning, middle, and end of training
    for stage, identifier in zip(["beginning", "middle", "end"], [earliest, middle, final]):
        model_path = os.path.join(models_dir, f'{model_prefix}_model_{identifier}')
        print("MODEL PATH", model_path)
        model = PPO.load(model_path)
        record_video(env_name, model, video_length=1000, prefix=f'{model_prefix}-{env_prefix}-{stage}')

    # Display the videos
    for stage in ["beginning", "middle", "end"]:
        show_videos("videos", prefix=f'{model_prefix}-{env_prefix}-{stage}')



### Recording videos

In [None]:
# record video using background display
display = Display("Xvfb :1 -screen 0 1024x768x24")
display.start()
view(models_dir)
display.terminate()

In [None]:
# !tensorboard --logdir {logs_dir} --host 0.0.0.0 --port 6006

In [None]:
# # Function for plotting the learning curve of the agent
# def plot_results(log_folder, title="Learning Curve"):
#     x, y = ts2xy(load_results(log_folder), "timesteps")
#     y = np.convolve(y, np.ones((50,))/50, mode='valid')
#     x = x[len(x) - len(y):]
#     plt.figure(figsize=(10,5))
#     plt.plot(x, y)
#     plt.xlabel("Number of Timesteps")
#     plt.ylabel("Rewards")
#     plt.title(title + " Smoothed")
#     plt.show()

# plot_results(logs_dir)
