In [None]:
import os
import gym
import numpy as np
import matplotlib.pyplot as plt

from stable_baselines.common.policies import CnnPolicy
from stable_baselines import PPO2
from stable_baselines.common.cmd_util import make_atari_env
from stable_baselines.common.vec_env import VecFrameStack
from stable_baselines.common.evaluation import evaluate_policy

In [None]:
# make_atari_env is a ready-made environment generator that creates the
# Atari environment and applies the usual wrappers to it.
# num_env=1 builds a single environment copy; pass a larger value
# (e.g. num_env=4) to train on several copies at once.
# frame_stack=True is forwarded to the Atari wrapper via wrapper_kwargs.
env = make_atari_env('PongNoFrameskip-v4', num_env=1, seed=0,
                     wrapper_kwargs={"frame_stack": True})

# Alternative: build the environment without frame_stack and stack
# 4 frames on the vectorized environment instead:
#env = make_atari_env('PongNoFrameskip-v4', num_env=1, seed=0)
#env = VecFrameStack(env, n_stack=4)

# PPO2 agent with a CNN policy; tensorboard_log points at "ppo2_atari".
model = PPO2(CnnPolicy, env, verbose=1, tensorboard_log="ppo2_atari")

In [None]:
# Score the current policy over 10 evaluation episodes. When the notebook is
# run top to bottom this happens before model.learn(), so it gives a
# pre-training baseline.
mean_reward, std_reward = evaluate_policy(
    model,
    env,
    n_eval_episodes=10,
)

In [None]:
# Show the (mean, std) reward pair produced by the evaluation cell.
(mean_reward, std_reward)

In [None]:
# Train the agent for ten million timesteps.
# NOTE(review): no cell visible here saves the trained model afterwards --
# consider persisting it so a kernel restart does not discard the training.
time_steps = 10 ** 7
model.learn(total_timesteps=time_steps)

In [None]:
# Enjoy trained agent
# A separate environment, configured like the training one, is used for
# rendering.
eval_env = make_atari_env('PongNoFrameskip-v4', num_env=1, seed=0,
                          wrapper_kwargs={"frame_stack": True})
try:
    obs = eval_env.reset()
    for i in range(1000):
        # deterministic=False requests non-deterministic actions from the policy.
        action, _states = model.predict(obs, deterministic=False)
        # The vectorized environment resets itself when an episode ends and
        # returns the first observation of the next episode, so no manual
        # reset is needed here. (Testing `if done:` on the returned array
        # would also fail as soon as num_env > 1.)
        obs, reward, done, info = eval_env.step(action)
        eval_env.render()
finally:
    # Always release the environment, even if the loop is interrupted or
    # raises part-way through.
    eval_env.close()