In [None]:
import os

import gym
import numpy as np
import matplotlib.pyplot as plt

from stable_baselines.common.policies import MlpPolicy
from stable_baselines import PPO2
from stable_baselines.common.vec_env import SubprocVecEnv
from stable_baselines.common import set_global_seeds
from stable_baselines.bench import Monitor

In [None]:
def make_env(env_id, rank, seed=0):
    """
    Build a factory that creates one monitored environment for a worker process.

    The global seed is set when this function is called; the environment itself
    is only created later, when the returned callable is invoked.

    :param env_id: (str) the environment ID
    :param rank: (int) index of the subprocess
    :param seed: (int) the initial seed for RNG
    :return: (callable) zero-argument function that creates, wraps and seeds the env
    """
    set_global_seeds(seed)

    def _init():
        # Each worker writes its Monitor output to its own directory, keyed by rank
        monitor_dir = "tmp" + str(rank) + "/"
        os.makedirs(monitor_dir, exist_ok=True)

        monitored_env = Monitor(gym.make(env_id), monitor_dir)
        # Offset the seed by the rank so every worker gets a different seed
        monitored_env.seed(seed + rank)
        return monitored_env

    return _init

In [None]:
env_id = "CartPole-v1"
num_cpu = 4  # Number of processes to use

# Create the vectorized environment:
# one environment factory per worker, each identified by its rank
env_fns = [make_env(env_id, rank) for rank in range(num_cpu)]
env = SubprocVecEnv(env_fns)

# PPO2 agent with an MLP policy, training on all workers at once
model = PPO2(MlpPolicy, env, verbose=1)

model.learn(total_timesteps=25000)

In [None]:
# Enjoy trained agent
# A separate, single environment is created here for rendering.
eval_env = gym.make(env_id)
try:
    obs = eval_env.reset()
    for i in range(1000):
        # deterministic=False requests stochastic (non-deterministic) actions
        action, _states = model.predict(obs, deterministic=False)
        obs, reward, done, info = eval_env.step(action)
        eval_env.render(mode="human")
        if done:
            # Episode finished: reset so the rollout keeps going
            obs = eval_env.reset()
finally:
    # Always release the environment, even if stepping or rendering raises
    # (for example when no display is available for mode="human").
    eval_env.close()