# Imports

In [15]:
import os
import gym
from stable_baselines3 import PPO, DQN, A2C, DDPG
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

# Load Environment

In [2]:
# Gym environment id; reused further down when the environment is re-created.
env_name = 'Pendulum-v0'
# Instantiate the environment registered under that id.
env = gym.make(env_name)

We need to examine the action space in order to determine what algorithm to use.

In [3]:
# Display the action space (the cell's last expression is rendered as its output).
env.action_space

Box([-2.], [2.], (1,), float32)

# Simple Rendering

In [4]:
# Roll out a purely random policy for a few episodes to get a visual feel for
# the task and a baseline score to compare trained agents against.
episodes = 5
try:
    for episode in range(1, episodes + 1):
        # Start every episode from a freshly reset environment.
        state = env.reset()
        done = False
        score = 0

        while not done:
            # Draw the current frame so the rollout can be watched live.
            env.render()
            # Random policy: sample an action from the action space.
            action = env.action_space.sample()
            # env.step returns the 4-tuple unpacked here:
            # next observation, reward, episode-finished flag, info.
            n_state, reward, done, info = env.step(action)
            score += reward
        print('Episode:{} Score:{}'.format(episode, score))
finally:
    # Always close the environment, even if the loop raises or the kernel is
    # interrupted, so the render window is not left dangling.
    env.close()



Episode:1 Score:-964.7152960205018
Episode:2 Score:-1314.31514106845
Episode:3 Score:-960.2553032412387
Episode:4 Score:-1284.6480820828795
Episode:5 Score:-920.8666316571449


# Modeling

## Model 1: PPO

In [6]:
# Relative directory handed to each model below as its `tensorboard_log` target.
log_path = os.path.join('Training', 'Logs')

In [7]:
# Build the environment inside the factory passed to DummyVecEnv. The factory
# must not close over the global name `env`, because this statement rebinds
# `env` to the vectorized wrapper itself.
env = DummyVecEnv([lambda: gym.make(env_name)])
# PPO with SB3's 'MlpPolicy'; verbose=1 prints training progress and metrics
# are written for TensorBoard under log_path.
model = PPO('MlpPolicy', env, verbose = 1, tensorboard_log = log_path)

Using cuda device


In [8]:
# Train PPO for 100k timesteps; with verbose=1 on the model, progress tables are printed below.
model.learn(total_timesteps=100_000)

Logging to Training\Logs\PPO_1
-----------------------------
| time/              |      |
|    fps             | 194  |
|    iterations      | 1    |
|    time_elapsed    | 10   |
|    total_timesteps | 2048 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 268          |
|    iterations           | 2            |
|    time_elapsed         | 15           |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0031244028 |
|    clip_fraction        | 0.0174       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.42        |
|    explained_variance   | -0.0046      |
|    learning_rate        | 0.0003       |
|    loss                 | 3.15e+03     |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.00148     |
|    std                  | 0.997        |
|    value_loss           | 

------------------------------------------
| time/                   |              |
|    fps                  | 398          |
|    iterations           | 12           |
|    time_elapsed         | 61           |
|    total_timesteps      | 24576        |
| train/                  |              |
|    approx_kl            | 0.0023760684 |
|    clip_fraction        | 0.01         |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.36        |
|    explained_variance   | 0.000269     |
|    learning_rate        | 0.0003       |
|    loss                 | 2.64e+03     |
|    n_updates            | 110          |
|    policy_gradient_loss | -0.000728    |
|    std                  | 0.943        |
|    value_loss           | 6.04e+03     |
------------------------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 401          |
|    iterations           | 13           |
|    time_e

------------------------------------------
| time/                   |              |
|    fps                  | 414          |
|    iterations           | 23           |
|    time_elapsed         | 113          |
|    total_timesteps      | 47104        |
| train/                  |              |
|    approx_kl            | 0.0020038595 |
|    clip_fraction        | 0.00801      |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.3         |
|    explained_variance   | 2.29e-05     |
|    learning_rate        | 0.0003       |
|    loss                 | 1.97e+03     |
|    n_updates            | 220          |
|    policy_gradient_loss | -0.000776    |
|    std                  | 0.883        |
|    value_loss           | 4.99e+03     |
------------------------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 415         |
|    iterations           | 24          |
|    time_elaps

------------------------------------------
| time/                   |              |
|    fps                  | 412          |
|    iterations           | 34           |
|    time_elapsed         | 169          |
|    total_timesteps      | 69632        |
| train/                  |              |
|    approx_kl            | 0.0039681373 |
|    clip_fraction        | 0.0219       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.2         |
|    explained_variance   | 2.68e-06     |
|    learning_rate        | 0.0003       |
|    loss                 | 1.36e+03     |
|    n_updates            | 330          |
|    policy_gradient_loss | -0.00165     |
|    std                  | 0.804        |
|    value_loss           | 3.96e+03     |
------------------------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 411         |
|    iterations           | 35          |
|    time_elaps

-----------------------------------------
| time/                   |             |
|    fps                  | 421         |
|    iterations           | 45          |
|    time_elapsed         | 218         |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.003076877 |
|    clip_fraction        | 0.00884     |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.2        |
|    explained_variance   | 0.0461      |
|    learning_rate        | 0.0003      |
|    loss                 | 1.84e+03    |
|    n_updates            | 440         |
|    policy_gradient_loss | -0.000378   |
|    std                  | 0.802       |
|    value_loss           | 4.49e+03    |
-----------------------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 421         |
|    iterations           | 46          |
|    time_elapsed         | 223   

<stable_baselines3.ppo.ppo.PPO at 0x222c5365b80>

In [9]:
# PPO_Path = os.path.join('Training', 'Saved Models', 'ppo_pendulum')
# model.save(PPO_Path)



### Evaluation

The mean score of about -1073 isn't too good: it is in the same range as the random-action episodes in the simple rendering above. We'll train for longer.

In [10]:
# Roll out the trained PPO policy for 10 episodes with rendering enabled;
# the result is a (mean_reward, std_reward) tuple.
evaluate_policy(model, env, n_eval_episodes=10, render = True)



(-1073.8983472109771, 74.81807268782887)

In [11]:
# Close the environment (and any render window opened during evaluation).
# NOTE(review): `env` is reused by later cells after this close() — confirm that is intended.
env.close()

## Model 2: A2C

In [12]:
# Same policy type, environment and TensorBoard log directory as the PPO run,
# but trained with A2C; verbose=0 keeps the training output quiet.
model2 = A2C('MlpPolicy', env, verbose = 0, tensorboard_log = log_path)
# Same 100k-timestep budget as the PPO run.
model2.learn(total_timesteps=100_000)

<stable_baselines3.a2c.a2c.A2C at 0x22308e3f310>

### Evaluation

The result is disappointing: we had expected the reward to improve, but the mean of about -1222 is worse than PPO's -1073. We'll try a different model.

In [13]:
# Roll out the trained A2C policy for 10 episodes with rendering enabled;
# the result is a (mean_reward, std_reward) tuple.
evaluate_policy(model2, env, n_eval_episodes=10, render = True)

(-1222.2364308070391, 142.59367841533705)

In [26]:
# Persist the trained A2C model at the relative path Training / Saved Models / a2c_pendulum.
path2 = os.path.join('Training', 'Saved Models', 'a2c_pendulum')
model2.save(path2)

## Model 3: DDPG

In [16]:
# Same policy type, environment and TensorBoard log directory as before,
# now trained with DDPG; verbose=0 keeps the training output quiet.
model3 = DDPG('MlpPolicy', env, verbose = 0, tensorboard_log = log_path)
# Same 100k-timestep budget as the PPO and A2C runs.
model3.learn(total_timesteps=100_000)

<stable_baselines3.ddpg.ddpg.DDPG at 0x223090977f0>

### Evaluation

Tremendous improvement! Perhaps DDPG's Q-learning-based approach is better suited to the continuous movement of the pendulum. We'll train some more.

In [17]:
# Roll out the trained DDPG policy for 10 episodes with rendering enabled;
# the result is a (mean_reward, std_reward) tuple.
evaluate_policy(model3, env, n_eval_episodes=10, render = True)

(-151.64272798357996, 69.64372933655484)

In [22]:
# Persist the 100k-step DDPG model at the relative path Training / Saved Models / DDPG_pend.
path1 = os.path.join('Training', 'Saved Models', 'DDPG_pend')
model3.save(path1)

In [18]:
# Close the environment (and any render window opened during evaluation).
# NOTE(review): `env` is reused by the next model after this close() — confirm that is intended.
env.close()

## Model 4: DDPG, 500k timesteps

In [19]:
%%time
# A fresh DDPG model (not a continuation of model3), same configuration as before.
model4 = DDPG('MlpPolicy', env, verbose = 0, tensorboard_log = log_path)
# 500k timesteps: five times the budget used for the earlier models.
model4.learn(total_timesteps=500_000)

Wall time: 54min 38s


<stable_baselines3.ddpg.ddpg.DDPG at 0x2237a55dac0>

### Evaluation

Not what we had hoped for, but the scores aren't too far off the previous DDPG run. A deeper implementation would need to be looked at, although the pendulum does stay up.

In [20]:
# Roll out the 500k-step DDPG policy for 10 episodes with rendering enabled;
# the result is a (mean_reward, std_reward) tuple.
evaluate_policy(model4, env, n_eval_episodes=10, render = True)

(-175.39949680198916, 56.19109718256043)

In [21]:
# Close the environment (and any render window opened during evaluation).
env.close()

In [27]:
# path4 = os.path.join('Training', 'Saved Models', 'ddpg_pendulum500k')
# model4.save(path4)

In [23]:
# path2 = os.path.join('Training', 'Saved Models', 'DDPG_pend_500k')
# model4.save(path2)