# Imports

In [11]:
import os
import gym
from gym.wrappers import Monitor
from stable_baselines3 import PPO, DQN
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

# Load Environment

In [22]:
# Gym environment ID; kept in a variable so later cells can re-create the same environment.
env_name = 'CartPole-v0'
# Instantiate the environment used by the random-action rollout further down.
env = gym.make(env_name)

# Basic Rendering

Before training anything, we run a few episodes with randomly sampled actions to see what scores an untrained agent achieves.

In [4]:
# Random-policy baseline: play a few episodes with uniformly sampled actions
# and print the total reward collected in each one.
episodes = 5
try:
    for episode in range(1, episodes + 1):
        # reset() puts the environment back into an initial state
        state = env.reset()
        done = False
        score = 0

        while not done:
            # draw the current frame of the environment
            env.render()
            # pick a random action from the action space
            action = env.action_space.sample()
            # step() returns four values: next observation, reward, done flag, info
            n_state, reward, done, info = env.step(action)
            score += reward
        print('Episode:{} Score:{}'.format(episode, score))
finally:
    # Always release the environment, even when rendering fails or the
    # kernel is interrupted mid-episode; otherwise it is left open.
    env.close()

Episode:1 Score:37.0
Episode:2 Score:21.0
Episode:3 Score:13.0
Episode:4 Score:14.0
Episode:5 Score:12.0


We inspect the action space because its type (discrete vs. continuous) determines which algorithms can be used.

In [5]:
# Display the environment's action space (echoed as the cell output).
env.action_space

Discrete(2)

In [6]:
# Directory handed to PPO as its TensorBoard log location.
log_path = os.path.join('Training', 'Logs')

# Create the environment inside the factory instead of closing over the global
# `env`, which this very statement rebinds to the vectorized wrapper. The old
# `lambda: env` only worked because DummyVecEnv calls the factory before the
# assignment completes.
env = DummyVecEnv([lambda: gym.make(env_name)])
model = PPO('MlpPolicy', env, verbose=0, tensorboard_log=log_path)

In [7]:
# Train the PPO model for 100,000 timesteps.
# learn() is the last expression in the cell, so its return value (the model
# object) is echoed as the cell output.
model.learn(total_timesteps=100_000)

<stable_baselines3.ppo.ppo.PPO at 0x23bad1c0a60>

In [8]:
# Run the trained policy for 10 rendered evaluation episodes. The call is the
# cell's final expression, so the returned tuple is shown as the output.
evaluate_policy(
    model,
    env,
    n_eval_episodes=10,
    render=True,
)



(200.0, 0.0)

In [9]:
# Evaluation is finished; close the environment.
env.close()

In [10]:
# Build the save location in two steps: the models directory, then the file stem.
model_dir = os.path.join('Training', 'Saved Models')
path = os.path.join(model_dir, 'ppo_cartpole')

# Persist the trained PPO model at that location.
model.save(path)