In [8]:
import gym

from stable_baselines.common.policies import MlpPolicy,MlpLstmPolicy
from stable_baselines import ACKTR
import tensorflow as tf
from stable_baselines.common.vec_env import DummyVecEnv
from machine import Machine
from GymMachEnv import MachineEnv
from stable_baselines.common.callbacks import EvalCallback
from stable_baselines.common import make_vec_env

# CartPole

In [5]:
# Vectorized CartPole-v0 environment with n_envs=4 copies.
env = make_vec_env('CartPole-v0', n_envs=4)

# ACKTR agent with the MLP policy; verbose=1 prints the training tables and
# the same run is logged to the TensorBoard directory given below.
model = ACKTR(
    policy=MlpPolicy,
    env=env,
    verbose=1,
    tensorboard_log="./ACKTR_cartpole_tensorboard/",
)

# Train for 25k timesteps, then persist the trained agent to disk.
model.learn(total_timesteps=25000)
model.save("acktr_cartpole")



---------------------------------
| ep_len_mean        | 13.7     |
| ep_reward_mean     | 13.7     |
| explained_variance | -0.0109  |
| fps                | 140      |
| nupdates           | 1        |
| policy_entropy     | 0.693    |
| policy_loss        | 5.08     |
| total_timesteps    | 80       |
| value_loss         | 75       |
---------------------------------
---------------------------------
| ep_len_mean        | 72.6     |
| ep_reward_mean     | 72.6     |
| explained_variance | 0.000732 |
| fps                | 3176     |
| nupdates           | 100      |
| policy_entropy     | 0.548    |
| policy_loss        | 4.01     |
| total_timesteps    | 8000     |
| value_loss         | 62.5     |
---------------------------------
---------------------------------
| ep_len_mean        | 138      |
| ep_reward_mean     | 138      |
| explained_variance | -0.0042  |
| fps                | 3814     |
| nupdates           | 200      |
| policy_entropy     | 0.592    |
| policy_los

# GMM Model

In [18]:
def _new_machine():
    """Create a Machine and set its curr_state to 0."""
    fresh = Machine()
    fresh.curr_state = 0
    return fresh


# Training environment, backed by its own Machine instance.
machine = _new_machine()
env = DummyVecEnv([lambda: MachineEnv(machine)])

# Evaluation environment on a second Machine instance (presumably so that
# evaluation episodes do not disturb the training machine -- TODO confirm).
machine2 = _new_machine()
eval_env = DummyVecEnv([lambda: MachineEnv(machine2)])

# Callback that evaluates on eval_env with eval_freq=500, deterministic
# actions and no rendering. The best-model checkpoint and the evaluation
# log are both written under the same trial directory.
trial_dir = './best/trial2'
eval_callback = EvalCallback(
    eval_env,
    best_model_save_path=trial_dir,
    log_path=trial_dir,
    eval_freq=500,
    deterministic=True,
    render=False,
)

# Disabled run: default MlpPolicy with TensorBoard logging, using eval_callback.
#model = ACKTR('MlpPolicy',env,verbose=1, tensorboard_log="./ACKTR_GMM_tensorboard/").learn(total_timesteps=100000,callback=eval_callback)



In [16]:
# Custom network: ReLU activations, with the critic (vf) and the actor (pi)
# each getting their own 128 -> 64 stack, i.e. no shared layers.
policy_kwargs = {
    "act_fun": tf.nn.relu,
    "net_arch": [{"vf": [128, 64], "pi": [128, 64]}],
}

# Evaluation callback on eval_env with eval_freq=500, deterministic actions
# and no rendering; no save or log paths are passed here.
eval_callback2 = EvalCallback(
    eval_env,
    eval_freq=500,
    deterministic=True,
    render=False,
)

# Build the ACKTR agent with the custom architecture and train it for 100k
# timesteps; learn() hands back the model, which is what model2 is bound to.
model2 = ACKTR(
    'MlpPolicy',
    env,
    verbose=1,
    policy_kwargs=policy_kwargs,
).learn(total_timesteps=100000, callback=eval_callback2)

----------------------------------
| explained_variance | -5.36e-06 |
| fps                | 36        |
| nupdates           | 1         |
| policy_entropy     | 0.693     |
| policy_loss        | 494       |
| total_timesteps    | 20        |
| value_loss         | 4.02e+06  |
----------------------------------
Eval num_timesteps=500, episode_reward=3800.00 +/- 2638.18
Episode length: 82.00 +/- 10.20
New best mean reward!
Eval num_timesteps=1000, episode_reward=-8100.00 +/- 7838.37
Episode length: 60.60 +/- 37.35
Eval num_timesteps=1500, episode_reward=6000.00 +/- 4940.04
Episode length: 11.40 +/- 6.28
New best mean reward!
Eval num_timesteps=2000, episode_reward=3580.00 +/- 2442.46
Episode length: 9.40 +/- 3.38
---------------------------------
| explained_variance | 0.000151 |
| fps                | 203      |
| nupdates           | 100      |
| policy_entropy     | 0.574    |
| policy_loss        | 1.08e+03 |
| total_timesteps    | 2000     |
| value_loss         | 1.92e+07 |
----

Eval num_timesteps=24500, episode_reward=6680.00 +/- 5322.93
Episode length: 14.80 +/- 7.93
Eval num_timesteps=25000, episode_reward=3580.00 +/- 2147.93
Episode length: 9.20 +/- 2.79
Eval num_timesteps=25500, episode_reward=10700.00 +/- 9046.10
Episode length: 17.00 +/- 9.61
Eval num_timesteps=26000, episode_reward=5720.00 +/- 2263.98
Episode length: 12.40 +/- 3.56
---------------------------------
| explained_variance | 0.00166  |
| fps                | 259      |
| nupdates           | 1300     |
| policy_entropy     | 0.598    |
| policy_loss        | 1.87e+03 |
| total_timesteps    | 26000    |
| value_loss         | 2.24e+07 |
---------------------------------
Eval num_timesteps=26500, episode_reward=5400.00 +/- 3824.13
Episode length: 13.00 +/- 7.67
Eval num_timesteps=27000, episode_reward=2980.00 +/- 1030.34
Episode length: 8.60 +/- 1.85
Eval num_timesteps=27500, episode_reward=16900.00 +/- 8388.80
Episode length: 66.80 +/- 32.99
Eval num_timesteps=28000, episode_reward=24400.00

Eval num_timesteps=48500, episode_reward=42120.00 +/- 21441.12
Episode length: 69.20 +/- 31.98
Eval num_timesteps=49000, episode_reward=27460.00 +/- 21754.14
Episode length: 46.20 +/- 34.84
Eval num_timesteps=49500, episode_reward=19320.00 +/- 18116.22
Episode length: 35.40 +/- 28.93
Eval num_timesteps=50000, episode_reward=32720.00 +/- 19069.07
Episode length: 55.80 +/- 32.27
---------------------------------
| explained_variance | 0.00791  |
| fps                | 244      |
| nupdates           | 2500     |
| policy_entropy     | 0.456    |
| policy_loss        | 2.38e+03 |
| total_timesteps    | 50000    |
| value_loss         | 3.14e+07 |
---------------------------------
Eval num_timesteps=50500, episode_reward=24960.00 +/- 26461.94
Episode length: 46.60 +/- 41.82
Eval num_timesteps=51000, episode_reward=37800.00 +/- 20264.55
Episode length: 68.40 +/- 33.77
Eval num_timesteps=51500, episode_reward=31160.00 +/- 22204.47
Episode length: 52.80 +/- 38.26
Eval num_timesteps=52000, epi

Eval num_timesteps=72500, episode_reward=15520.00 +/- 18625.72
Episode length: 27.20 +/- 26.72
Eval num_timesteps=73000, episode_reward=11220.00 +/- 11435.80
Episode length: 20.80 +/- 17.86
Eval num_timesteps=73500, episode_reward=20480.00 +/- 12835.17
Episode length: 35.20 +/- 19.08
Eval num_timesteps=74000, episode_reward=19540.00 +/- 13926.32
Episode length: 35.80 +/- 22.79
---------------------------------
| explained_variance | 0.00357  |
| fps                | 227      |
| nupdates           | 3700     |
| policy_entropy     | 0.276    |
| policy_loss        | 533      |
| total_timesteps    | 74000    |
| value_loss         | 8.54e+06 |
---------------------------------
Eval num_timesteps=74500, episode_reward=35920.00 +/- 23099.04
Episode length: 60.60 +/- 34.81
Eval num_timesteps=75000, episode_reward=31560.00 +/- 22288.71
Episode length: 57.80 +/- 41.47
Eval num_timesteps=75500, episode_reward=32480.00 +/- 18574.00
Episode length: 54.80 +/- 30.37
Eval num_timesteps=76000, epi

Eval num_timesteps=96500, episode_reward=31160.00 +/- 19014.90
Episode length: 57.60 +/- 31.54
Eval num_timesteps=97000, episode_reward=36620.00 +/- 22728.26
Episode length: 62.40 +/- 28.54
Eval num_timesteps=97500, episode_reward=19340.00 +/- 15981.81
Episode length: 38.80 +/- 26.82
Eval num_timesteps=98000, episode_reward=8900.00 +/- 10356.06
Episode length: 17.20 +/- 15.60
---------------------------------
| explained_variance | 0.00061  |
| fps                | 204      |
| nupdates           | 4900     |
| policy_entropy     | 0.321    |
| policy_loss        | 1.71e+03 |
| total_timesteps    | 98000    |
| value_loss         | 4.94e+07 |
---------------------------------
Eval num_timesteps=98500, episode_reward=21540.00 +/- 8188.19
Episode length: 38.80 +/- 15.41
Eval num_timesteps=99000, episode_reward=30680.00 +/- 19155.72
Episode length: 55.60 +/- 32.20
Eval num_timesteps=99500, episode_reward=20900.00 +/- 15331.27
Episode length: 39.00 +/- 31.32
Eval num_timesteps=100000, epis

In [17]:
# Display the TensorFlow variables held by model2 to check the network layout
# (separate pi_* and vf_* layers, as requested through policy_kwargs).
model2.params

[<tf.Variable 'model/pi_fc0/w:0' shape=(4, 128) dtype=float32_ref>,
 <tf.Variable 'model/pi_fc0/b:0' shape=(128,) dtype=float32_ref>,
 <tf.Variable 'model/vf_fc0/w:0' shape=(4, 128) dtype=float32_ref>,
 <tf.Variable 'model/vf_fc0/b:0' shape=(128,) dtype=float32_ref>,
 <tf.Variable 'model/pi_fc1/w:0' shape=(128, 64) dtype=float32_ref>,
 <tf.Variable 'model/pi_fc1/b:0' shape=(64,) dtype=float32_ref>,
 <tf.Variable 'model/vf_fc1/w:0' shape=(128, 64) dtype=float32_ref>,
 <tf.Variable 'model/vf_fc1/b:0' shape=(64,) dtype=float32_ref>,
 <tf.Variable 'model/vf/w:0' shape=(64, 1) dtype=float32_ref>,
 <tf.Variable 'model/vf/b:0' shape=(1,) dtype=float32_ref>,
 <tf.Variable 'model/pi/w:0' shape=(64, 2) dtype=float32_ref>,
 <tf.Variable 'model/pi/b:0' shape=(2,) dtype=float32_ref>,
 <tf.Variable 'model/q/w:0' shape=(64, 2) dtype=float32_ref>,
 <tf.Variable 'model/q/b:0' shape=(2,) dtype=float32_ref>]