In [1]:
import numpy as np
import gym

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Flatten
from tensorflow.keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory


ENV_NAME = 'CartPole-v1'


  "Gym minimally supports python 3.6 as the python foundation not longer supports the version, please update your version to 3.7+"


In [2]:
# Get the environment and extract the number of actions.
env = gym.make(ENV_NAME,render_mode='human')
np.random.seed(123)
#env.seed(123)
nb_actions = env.action_space.n

# Next, we build a very simple model.
model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten (Flatten)            (None, 4)                 0         
_________________________________________________________________
dense (Dense)                (None, 16)                80        
_________________________________________________________________
activation (Activation)      (None, 16)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                272       
_________________________________________________________________
activation_1 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 16)                272       
_________________________________________________________________
activation_2 (Activation)    (None, 16)                0

In [3]:
# Finally, we configure and compile our agent. You can use every built-in tensorflow.keras optimizer and
# even the metrics!
memory = SequentialMemory(limit=50000, window_length=1)
policy = BoltzmannQPolicy()
print(nb_actions)
dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10,
               target_model_update=1e-2, policy=policy)
dqn.compile(Adam(learning_rate=1e-3), metrics=['mae'])

# Okay, now it's time to learn something! We visualize the training here for show, but this
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
dqn.fit(env, nb_steps=50000, visualize=True, verbose=2)

# After training is done, we save the final weights.
dqn.save_weights(f'dqn_{ENV_NAME}_weights.h5f', overwrite=True)

# Finally, evaluate our algorithm for 5 episodes.
dqn.test(env, nb_episodes=5, visualize=True)


2
Training for 50000 steps ...




    25/50000: episode: 1, duration: 2.389s, episode steps:  25, steps per second:  10, episode reward: 25.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.600 [0.000, 1.000],  loss: 0.484802, mae: 0.533674, mean_q: 0.118835




    46/50000: episode: 2, duration: 0.931s, episode steps:  21, steps per second:  23, episode reward: 21.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.714 [0.000, 1.000],  loss: 0.341105, mae: 0.535680, mean_q: 0.374708
    63/50000: episode: 3, duration: 0.746s, episode steps:  17, steps per second:  23, episode reward: 17.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.706 [0.000, 1.000],  loss: 0.227435, mae: 0.552815, mean_q: 0.695594
    86/50000: episode: 4, duration: 1.038s, episode steps:  23, steps per second:  22, episode reward: 23.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.522 [0.000, 1.000],  loss: 0.173656, mae: 0.613299, mean_q: 0.964402
   125/50000: episode: 5, duration: 1.714s, episode steps:  39, steps per second:  23, episode reward: 39.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.590 [0.000, 1.000],  loss: 0.107246, mae: 0.733769, mean_q: 1.247212
   141/50000: episode: 6, duration: 0.713s, episode steps:  16, step

<keras.callbacks.History at 0x7f65c1e87fd0>

In [6]:
dqn.test(env, nb_episodes=5, visualize=True)

Testing for 5 episodes ...
Episode 1: reward: 9.000, steps: 9
Episode 2: reward: 10.000, steps: 10
Episode 3: reward: 10.000, steps: 10
Episode 4: reward: 9.000, steps: 9
Episode 5: reward: 9.000, steps: 9


<keras.callbacks.History at 0x7ff82918bc50>