In [1]:
from nes_py.wrappers import JoypadSpace
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT
import time
from matplotlib import pyplot as plt
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack
from stable_baselines3 import PPO

from gym.wrappers import GrayScaleObservation

from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.results_plotter import load_results, ts2xy
import numpy as np
import os
from stable_baselines3.common.callbacks import BaseCallback

# Folder handed to the Monitor wrapper below; created up front so the
# wrapper never sees a missing directory.
log_dir = './monitor_log/'
os.makedirs(log_dir, exist_ok=True)

# Base emulator environment restricted to the SIMPLE_MOVEMENT action set.
env = JoypadSpace(gym_super_mario_bros.make('SuperMarioBros-v0'), SIMPLE_MOVEMENT)

# Monitor wraps the environment first, then observations are converted to
# grayscale with the channel axis kept (keep_dim=True).
env = GrayScaleObservation(Monitor(env, log_dir), keep_dim=True)

# Vectorise the single environment and stack the last 4 frames along the
# trailing (channel) axis. The lambda is invoked while DummyVecEnv is being
# constructed, i.e. before `env` is rebound to the stacked wrapper.
env = VecFrameStack(DummyVecEnv([lambda: env]), 4, channels_order='last')

In [2]:
class SaveOnStepCallback(BaseCallback):
    """
    Callback that checkpoints the model every ``check_freq`` calls to ``_on_step``.

    The save is unconditional: no reward is inspected and no comparison with a
    previous checkpoint is made. Each checkpoint is written inside the
    ``best_model`` subfolder of ``save_path`` under the name ``_<n_calls>``.

    :param check_freq: (int) Number of callback calls between two checkpoints.
    :param save_path: (str) Parent folder. Checkpoints go into its
      ``best_model`` subfolder, which is created when the callback is initialised.
    :param verbose: (int) Verbosity level forwarded to ``BaseCallback``.
    """
    def __init__(self, check_freq, save_path, verbose=1):
        super(SaveOnStepCallback, self).__init__(verbose)
        self.check_freq = check_freq
        # Directory that will hold every checkpoint written by this callback.
        self.save_path = os.path.join(save_path, 'best_model')

    def _init_callback(self):
        # self.save_path is always a path string at this point: os.path.join in
        # __init__ either produced one or raised, so no None guard is needed.
        os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        # n_calls is maintained by BaseCallback; save on every multiple of check_freq.
        if self.n_calls % self.check_freq == 0:
            self.model.save(os.path.join(self.save_path, '_{}'.format(self.n_calls)))

        # Returning True lets training continue.
        return True

In [3]:
# Hyper-parameters forwarded to PPO below.
learning_rate = 1e-6
n_steps = 2048

# TensorBoard event files are written under this folder.
tensorboard_log = r'./tensorboard_logs/'
model = PPO(
    "CnnPolicy",
    env,
    verbose=1,
    learning_rate=learning_rate,
    n_steps=n_steps,
    tensorboard_log=tensorboard_log,
)

# Plain (non-raw) literal: each "\\" is one backslash, giving F:\RL_Mario\ .
# The previous raw string kept both backslashes of every pair, producing
# doubled separators in the path.
save_path = "F:\\RL_Mario\\"

# Checkpoint every 20000 callback calls while training for 5,000,000 timesteps.
callback1 = SaveOnStepCallback(check_freq=20000, save_path=save_path)
model.learn(total_timesteps=5000000, callback=callback1)

Using cuda device
Wrapping the env in a VecTransposeImage.
Logging to ./tensorboard_logs/PPO_1


  return (self.ram[0x86] - self.ram[0x071c]) % 256


-----------------------------
| time/              |      |
|    fps             | 93   |
|    iterations      | 1    |
|    time_elapsed    | 21   |
|    total_timesteps | 2048 |
-----------------------------
-------------------------------------------
| time/                   |               |
|    fps                  | 72            |
|    iterations           | 2             |
|    time_elapsed         | 56            |
|    total_timesteps      | 4096          |
| train/                  |               |
|    approx_kl            | 0.00010179842 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -1.95         |
|    explained_variance   | -0.00362      |
|    learning_rate        | 1e-06         |
|    loss                 | 79.4          |
|    n_updates            | 10            |
|    policy_gradient_loss | -0.00037      |
|    value_loss           | 144           |
------------------------------------------

KeyboardInterrupt: 