In [1]:


from nes_py.wrappers import JoypadSpace
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT
import time
from matplotlib import pyplot as plt
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack
from stable_baselines3 import PPO

from gym.wrappers import GrayScaleObservation

from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.results_plotter import load_results, ts2xy
import numpy as np
import os
from stable_baselines3.common.callbacks import BaseCallback

import optuna

from stable_baselines3.common.evaluation import evaluate_policy

import os


env = gym_super_mario_bros.make('SuperMarioBros-v0')
env = JoypadSpace(env, SIMPLE_MOVEMENT)



log_dir = './log_dir2/'
os.makedirs(log_dir, exist_ok=True)

env = Monitor(env, log_dir)

env = GrayScaleObservation(env,keep_dim=True)
env = DummyVecEnv([lambda: env])
env = VecFrameStack(env,4,channels_order='last')



# PPO主要超参数

In [2]:
def optimize_ppo(trial): 
    return {
        'n_steps':trial.suggest_int('n_steps', 2048, 8192),
        'gamma':trial.suggest_loguniform('gamma', 0.8, 0.9999),
        'learning_rate':trial.suggest_loguniform('learning_rate', 1e-5, 1e-4),
        'clip_range':trial.suggest_uniform('clip_range', 0.1, 0.4),
        'gae_lambda':trial.suggest_uniform('gae_lambda', 0.8, 0.99)
    }

# 超参数调优

In [3]:
def optimize_agent(trial):
    
    try:
        env = gym_super_mario_bros.make('SuperMarioBros-v0')
        env = JoypadSpace(env, SIMPLE_MOVEMENT)

        log_dir = './log_dir2/'
        os.makedirs(log_dir, exist_ok=True)

        env = Monitor(env, log_dir)

        env = GrayScaleObservation(env,keep_dim=True)
        env = DummyVecEnv([lambda: env])
        env = VecFrameStack(env,4,channels_order='last')

    
        model_params = optimize_ppo(trial) 
    

        tensorboard_log = r'./logs/'
        model = PPO("CnnPolicy", env, verbose=0,tensorboard_log=tensorboard_log,**model_params)
        # model.learn(total_timesteps=1000)
        model.learn(total_timesteps=200000)
    
        mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=5)
    
    
        env.close()
    
        OPT_DIR  = r'F:\\300_RL_DEMO\\220_Super-Mario-2\\'
        SAVE_PATH = os.path.join(OPT_DIR, 'trial_{}_best_model'.format(trial.number))
        model.save(SAVE_PATH)
    
        return mean_reward    

    except Exception as e:
        return -1000
    
    



In [4]:
study = optuna.create_study(direction='maximize')
study.optimize(optimize_agent, n_trials=100)

[32m[I 2022-03-05 18:34:28,442][0m A new study created in memory with name: no-name-f86b12cb-1ae9-42ed-99ea-686b25b058bb[0m
We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=4613 and n_envs=1)
  return (self.ram[0x86] - self.ram[0x071c]) % 256
[32m[I 2022-03-05 19:29:59,963][0m Trial 0 finished with value: 737.0 and parameters: {'n_steps': 4613, 'gamma': 0.897637642087022, 'learning_rate': 1.132279703505345e-05, 'clip_range': 0.24905728408479416, 'gae_lambda': 0.9265479662259783}. Best is trial 0 with value: 737.0.[0m
We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=6897 and n_envs=1)
  return (self.ram[0x86] - self.ram[0x071c]) % 256
[32m[I 2022-03-05 20:26:14,200][0m Trial 1 finished with value: 741.0 and parameters: {'n_steps': 6897, 'gamma': 0.8827508549180133, 'learning_rate': 1.662967178786175e-05, 'clip_range': 0.21117729231503277, 'gae_lambda': 0.9468593404751742}. Best is trial 1 with value:

In [5]:
dir(study)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_ask',
 '_is_multi_objective',
 '_log_completed_trial',
 '_optimize_lock',
 '_pop_waiting_trial_id',
 '_stop_flag',
 '_storage',
 '_study_id',
 '_tell',
 'add_trial',
 'add_trials',
 'ask',
 'best_params',
 'best_trial',
 'best_trials',
 'best_value',
 'direction',
 'directions',
 'enqueue_trial',
 'get_trials',
 'optimize',
 'pruner',
 'sampler',
 'set_system_attr',
 'set_user_attr',
 'stop',
 'study_name',
 'system_attrs',
 'tell',
 'trials',
 'trials_dataframe',
 'user_attrs']

In [6]:
study.best_params

{'n_steps': 7149,
 'gamma': 0.8692871366327747,
 'learning_rate': 6.442559213980066e-05,
 'clip_range': 0.31688308594665404,
 'gae_lambda': 0.8710254680014865}

In [7]:
study.best_trial

FrozenTrial(number=14, values=[2567.0], datetime_start=datetime.datetime(2022, 3, 6, 3, 8, 43, 981845), datetime_complete=datetime.datetime(2022, 3, 6, 4, 4, 57, 152303), params={'n_steps': 7149, 'gamma': 0.8692871366327747, 'learning_rate': 6.442559213980066e-05, 'clip_range': 0.31688308594665404, 'gae_lambda': 0.8710254680014865}, distributions={'n_steps': IntUniformDistribution(high=8192, low=2048, step=1), 'gamma': LogUniformDistribution(high=0.9999, low=0.8), 'learning_rate': LogUniformDistribution(high=0.0001, low=1e-05), 'clip_range': UniformDistribution(high=0.4, low=0.1), 'gae_lambda': UniformDistribution(high=0.99, low=0.8)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=14, state=TrialState.COMPLETE, value=None)