In [None]:
import os
import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.common.atari_wrappers import MaxAndSkipEnv
from stable_baselines3.common.vec_env import SubprocVecEnv, VecMonitor
from stable_baselines3.common.callbacks import CheckpointCallback
from dino_game_env import DinoGameEnv

## 1. Set up the training environment

In [None]:
# Number of game environments run in parallel during training.
N_ENVS = 5

# Directory used for training logs; created on first run.
LOGS_DIR = "./logs/"
os.makedirs(LOGS_DIR, exist_ok=True)

# Directory where model checkpoints are written; created on first run.
MODELS_DIR = "./models/"
os.makedirs(MODELS_DIR, exist_ok=True)


def make_env(vec_env_id):
  """Build a single Dino game environment wrapped with frame skipping.

  Args:
    vec_env_id: Identifier forwarded as the first argument of DinoGameEnv.

  Returns:
    The DinoGameEnv instance wrapped in MaxAndSkipEnv(skip=2).
  """
  # NOTE(review): headless=False presumably runs the game with a visible window -- confirm in dino_game_env.
  base_env = DinoGameEnv(vec_env_id, headless=False)
  # Frame skipping limits the number of actions per second, which makes training faster.
  return MaxAndSkipEnv(base_env, skip=2)

def make_vec_env(num_envs):
  """Create a SubprocVecEnv running `num_envs` Dino game environments.

  Each environment receives its own id in [0, 10000). The ids are drawn once,
  up front, without replacement, so two environments can never share an id.
  (Previously every worker drew its own id independently, which allowed
  collisions.)

  Args:
    num_envs: Number of environments to create (at most 10000).

  Returns:
    A SubprocVecEnv with one worker per environment.

  Raises:
    ValueError: If `num_envs` exceeds the 10000 available ids.
  """
  # Sampling without replacement guarantees distinct ids.
  env_ids = np.random.choice(10000, size=num_envs, replace=False)

  def _make_init(env_id):
    # A dedicated factory per id avoids the late-binding closure pitfall,
    # where every thunk would otherwise see the last loop value.
    def _init_env():
      return make_env(vec_env_id=env_id)
    return _init_env

  # int(...) keeps the id a plain Python int, as np.random.randint returned before.
  return SubprocVecEnv([_make_init(int(env_id)) for env_id in env_ids])

# Wrap the vectorized environment in VecMonitor, pointing its output at LOGS_DIR.
training_env = VecMonitor(
  venv=make_vec_env(N_ENVS),
  filename=LOGS_DIR,
)

## 2. Create a callback to save periodic model checkpoints

In [None]:
# from stable_baselines3.common.callbacks import BaseCallback
# from stable_baselines3.common.results_plotter import load_results, ts2xy

# class SaveBestModelsCallback(BaseCallback):
#   def __init__(self, check_freq: int, log_dir: str, models_dir: str, verbose: int = 1):
#     super(SaveBestModelsCallback, self).__init__(verbose)
#     self.check_freq = check_freq
#     self.log_dir = log_dir
#     self.models_dir = models_dir
#     self.best_mean_reward = -np.inf

#   def _on_step(self) -> bool:
#     if self.n_calls % self.check_freq == 0:
#       # Retrieve training reward
#       x, y = ts2xy(load_results(self.log_dir), 'timesteps')
#       if len(x) > 0:
#         # Mean training reward over the last 500 episodes
#         mean_reward = np.mean(y[-500:])
#         print(f"Num timesteps: {self.num_timesteps}")
#         print(f"Best mean reward: {self.best_mean_reward:.2f} - Last mean reward per episode: {mean_reward:.2f}")

#         # New best model, you could save the agent here
#         if mean_reward > self.best_mean_reward:
#           self.best_mean_reward = mean_reward
#           # Example for saving best model
#           best_model = f"best_model_{self.num_timesteps}"
#           print(f"Saving new best model to {self.models_dir}{best_model}")
#           self.model.save(self.models_dir + best_model)
#     return True
  
# callback = SaveBestModelsCallback(check_freq=5000, log_dir=LOGS_DIR, models_dir=MODELS_DIR)

# Save a checkpoint of the agent to MODELS_DIR roughly every 20000 timesteps.
# The callback is invoked once per vectorized step, and each such step advances
# N_ENVS timesteps, so the timestep target is divided by N_ENVS.
# Integer division keeps save_freq an int: with true division, an N_ENVS that
# does not divide 20000 gives a fractional frequency that the callback's
# `n_calls % save_freq == 0` check never hits, so nothing would be saved.
# max(..., 1) guards against a frequency of 0 when N_ENVS > 20000.
callback = CheckpointCallback(
  save_freq=max(20000 // N_ENVS, 1),
  save_path=MODELS_DIR,
  # Files are periodic checkpoints named best_model_<timesteps>_steps.zip;
  # the prefix is kept because existing checkpoints are loaded by that name.
  name_prefix="best_model"
)

## 3. Create (or load) the PPO agent and start training

In [None]:
# To train from scratch instead of resuming, use the line below in place of PPO.load:
# ppo_agent = PPO("MlpPolicy", env=training_env, learning_rate=0.00003,verbose=1, n_steps=512, tensorboard_log=LOGS_DIR, device="cuda")

# Resume training from a previously saved checkpoint.
ppo_agent = PPO.load(path="./models/best_model_640000_steps.zip", env=training_env)

try:
  # reset_num_timesteps=False continues the timestep counter of the loaded
  # agent instead of restarting it at 0.
  # total_timesteps is passed as an int (previously the float 2e6).
  ppo_agent.learn(total_timesteps=2_000_000, callback=callback, tb_log_name="PPO_0-00003", reset_num_timesteps=False)
finally:
  # Always shut down the environment workers, even when training fails or
  # the cell is interrupted; otherwise the subprocesses are left running.
  training_env.close()

In [None]:
%tensorboard --logdir logs/