In [9]:
import random

import gym
import numpy as np
import tensorflow as tf

# Create the environment and seed every random source the experiment touches,
# so that a Restart & Run All produces comparable training runs.
SEED = 123
env = gym.make("CartPole-v1")
# Python's built-in RNG.
# NOTE(review): keras-rl's replay sampling appears to draw from `random` — confirm.
random.seed(SEED)
np.random.seed(SEED)
# TensorFlow's global seed governs the network's weight initialisation.
tf.random.set_seed(SEED)
env.seed(SEED)
# Size of the discrete action space; used later for the network's output width.
nb_actions = env.action_space.n

In [12]:
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Activation, Flatten

from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory

# Q-network for DQN / SARSA: flatten the length-1 observation window, run it
# through three 16-unit ReLU hidden layers, then emit one linear output per action.
CD_model = Sequential([
    Flatten(input_shape=(1,) + env.observation_space.shape),
    Dense(16),
    Activation('relu'),
    Dense(16),
    Activation('relu'),
    Dense(16),
    Activation('relu'),
    Dense(nb_actions),
    Activation('linear'),
])

# Exploration policy: Boltzmann Q policy with its default settings.
BQ_policy = BoltzmannQPolicy()

# Replay memory holding up to 50,000 entries; window_length=1 matches the
# leading (1,) in the network's input shape.
S_memory = SequentialMemory(limit=50000, window_length=1)

In [13]:
from tensorflow.keras.optimizers import Adam

from rl.agents.cem import CEMAgent
from rl.agents.dqn import DQNAgent

# Assemble the DQN agent from the network, replay memory and policy built earlier.
dqn = DQNAgent(
    model=CD_model,
    nb_actions=nb_actions,
    memory=S_memory,
    nb_steps_warmup=100,
    target_model_update=1e-2,
    policy=BQ_policy,
)

# Adam optimiser with a 1e-3 learning rate; mean absolute error is tracked as a metric.
adam_optimizer = Adam(learning_rate=1e-3)
dqn.compile(adam_optimizer, metrics=['mae'])

# Train for 50,000 environment steps without rendering. With verbose=2 the
# log shows one summary line per episode.
dqn.fit(env, nb_steps=50000, visualize=False, verbose=2)

Training for 50000 steps ...


  updates=self.state_updates,


    65/50000: episode: 1, duration: 0.289s, episode steps:  65, steps per second: 225, episode reward: 65.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.523 [0.000, 1.000],  loss: --, mae: --, mean_q: --


  updates=self.state_updates,


   101/50000: episode: 2, duration: 1.426s, episode steps:  36, steps per second:  25, episode reward: 36.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.528 [0.000, 1.000],  loss: --, mae: --, mean_q: --
   133/50000: episode: 3, duration: 0.299s, episode steps:  32, steps per second: 107, episode reward: 32.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.594 [0.000, 1.000],  loss: 0.420629, mae: 0.549729, mean_q: 0.201939
   147/50000: episode: 4, duration: 0.131s, episode steps:  14, steps per second: 107, episode reward: 14.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.500 [0.000, 1.000],  loss: 0.245270, mae: 0.555674, mean_q: 0.469608
   166/50000: episode: 5, duration: 0.184s, episode steps:  19, steps per second: 103, episode reward: 19.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.632 [0.000, 1.000],  loss: 0.122260, mae: 0.595050, mean_q: 0.810366
   199/50000: episode: 6, duration: 0.300s, episode steps:  33, steps per second: 110,

  1108/50000: episode: 38, duration: 0.333s, episode steps:  37, steps per second: 111, episode reward: 37.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.486 [0.000, 1.000],  loss: 0.470347, mae: 4.212126, mean_q: 8.100561
  1147/50000: episode: 39, duration: 0.352s, episode steps:  39, steps per second: 111, episode reward: 39.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.410 [0.000, 1.000],  loss: 0.468725, mae: 4.376017, mean_q: 8.456325
  1224/50000: episode: 40, duration: 0.847s, episode steps:  77, steps per second:  91, episode reward: 77.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.506 [0.000, 1.000],  loss: 0.403932, mae: 4.626245, mean_q: 9.024426
  1292/50000: episode: 41, duration: 0.639s, episode steps:  68, steps per second: 106, episode reward: 68.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.471 [0.000, 1.000],  loss: 0.410763, mae: 4.895397, mean_q: 9.595551
  1341/50000: episode: 42, duration: 0.446s, episode steps:  49,

<keras.callbacks.History at 0x7fe67036ae20>

In [14]:
# Evaluate the trained agent over 10 episodes with rendering switched on.
dqn.test(
    env,
    nb_episodes=10,
    visualize=True,
)

Testing for 10 episodes ...
Episode 1: reward: 159.000, steps: 159
Episode 2: reward: 131.000, steps: 131
Episode 3: reward: 183.000, steps: 183
Episode 4: reward: 143.000, steps: 143
Episode 5: reward: 185.000, steps: 185
Episode 6: reward: 188.000, steps: 188
Episode 7: reward: 153.000, steps: 153
Episode 8: reward: 129.000, steps: 129
Episode 9: reward: 147.000, steps: 147
Episode 10: reward: 135.000, steps: 135


<keras.callbacks.History at 0x7fe67036a910>