In [1]:
import utils
import rl
from stable_baselines3.common.env_util import make_atari_env
from stable_baselines3.common.utils import get_linear_fn
from stable_baselines3 import PPO

In [2]:
# Vectorised Pong training environment: 16 parallel copies created through
# SB3's Atari helper, all seeded from seed=1.
# Note: no frame-stacking wrapper is applied in this cell.
vec_env = make_atari_env(
    "PongNoFrameskip-v4",
    n_envs=16,
    seed=1,
)

In [4]:
# Learning rate is annealed linearly from 2.5e-4 down to 1.25e-4; with
# end_fraction=1.0 the decay is spread over the entire training run.
lr_schedule = get_linear_fn(start=2.5e-4, end=1.25e-4, end_fraction=1.0)

# PPO agent with a CNN policy, trained on the vectorised Pong environment.
# learning_rate, n_steps, batch_size, n_epochs and ent_coef are the values
# tuned away from SB3's PPO defaults; everything else restates the default
# explicitly so the full configuration is visible in one place.
model = PPO(
    policy="CnnPolicy",
    env=vec_env,
    # --- reproducibility / hardware ---
    seed=1,
    device="cuda",
    # --- rollout collection ---
    n_steps=128,  # steps gathered per environment before each update
    gamma=0.99,
    gae_lambda=0.95,
    # --- optimisation ---
    learning_rate=lr_schedule,
    batch_size=256,  # minibatch size used within each epoch
    n_epochs=3,  # passes over each collected rollout
    max_grad_norm=0.5,
    normalize_advantage=True,
    # --- loss terms ---
    clip_range=0.2,
    clip_range_vf=None,  # no clipping of the value function
    ent_coef=0.01,  # entropy bonus weight
    vf_coef=0.5,
    # --- network ---
    policy_kwargs={"features_extractor_kwargs": {"features_dim": 512}},
    # --- logging ---
    tensorboard_log="C:/Users/cgoet/PycharmProjects/Pong-RL/logs/ppo/",
)

In [5]:
# Print per-component parameter counts of the policy via the project helper.
# NOTE(review): shared_extractor=True presumably makes the helper count the
# shared features extractor only once in the total -- confirm in utils.
utils.print_model_parameters(
    model,
    shared_extractor=True,
)

features_extractor: 1,677,984
pi_features_extractor: 1,677,984
vf_features_extractor: 1,677,984
mlp_extractor: 0
action_net: 3,078
value_net: 513
Total number of parameters: 1,681,575


In [6]:
# Train for 5M environment steps, logging to TensorBoard under the run name
# "ppo_nature_cnn_v1" with the timestep counter started from zero.
# To monitor progress, run from a shell:
#   tensorboard --logdir="C:/Users/cgoet/PycharmProjects/Pong-RL/logs"
model.learn(
    total_timesteps=5_000_000,
    reset_num_timesteps=True,
    tb_log_name="ppo_nature_cnn_v1",
)

<stable_baselines3.ppo.ppo.PPO at 0x22e17896ad0>

In [10]:
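# Disabled: optional continuation of the same PPO run for 1M more steps
# (reset_num_timesteps=False keeps the existing timestep counter).
# model.learn(total_timesteps=1_000_000, tb_log_name="ppo_nature_cnn_v1", reset_num_timesteps=False)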

<stable_baselines3.dqn.dqn.DQN at 0x26073b4d360>

In [7]:
# Persist the agent in its current (post-training) state. Despite the "_best"
# suffix, no best-checkpoint selection happens in this notebook.
model.save(
    path="C:/Users/cgoet/PycharmProjects/Pong-RL/models/ppo/ppo_nature_cnn_v1_best",
)

In [8]:
# Reload the saved agent from disk, replacing the in-memory `model`.
# No environment is attached here; one is passed explicitly when evaluating.
model = PPO.load(
    path="C:/Users/cgoet/PycharmProjects/Pong-RL/models/ppo/ppo_nature_cnn_v1_best",
)

In [9]:
# PPO evaluation over 16 episodes with deterministic=True.
# NOTE(review): presumably this selects the policy's most likely action at
# each step -- confirm against rl.evaluate.
mean_reward, std_reward = rl.evaluate(
    model,
    vec_env,
    episodes=16,
    deterministic=True,
)

Mean reward: 20.69 +/- 0.46


In [10]:
# PPO evaluation over 16 episodes with deterministic=False.
# NOTE(review): presumably actions are sampled from the policy distribution
# -- confirm against rl.evaluate. Overwrites mean_reward / std_reward from
# the preceding evaluation cell.
mean_reward, std_reward = rl.evaluate(
    model,
    vec_env,
    episodes=16,
    deterministic=False,
)

Mean reward: 20.50 +/- 0.50
