In [None]:
import gym

import numpy as np

# Print the list of available OpenAI Gym environments
print("List of available OpenAI Gym environments.\n")
for spec in gym.envs.registry.all():
	# Use the spec's id attribute rather than slicing its repr, which would
	# silently break if the repr format ever changed.
	print(spec.id)

# Get the environment name and number of episodes to run.
# Surrounding whitespace is stripped so a stray space does not make the
# environment lookup fail.
string = input("\nEnter the name of the environment: ").strip()
episodes = int(input("Enter the number of episodes to run: "))

# Create the environment and seed both NumPy and the environment
env = gym.make(string)
np.random.seed(123)
env.seed(123)

# The rest of the notebook needs a fixed number of actions (action_space.n),
# which only Discrete action spaces provide. Raise instead of calling exit():
# an exception stops this cell immediately and keeps the kernel alive, so the
# line below can never run against an unsupported space.
if not isinstance(env.action_space, gym.spaces.Discrete):
	raise ValueError(
		"Cannot train RL agent for environments with a non-discrete action "
		f"space (got {env.action_space!r})."
	)

nb_actions = env.action_space.n

In [8]:
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Activation, Flatten

from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory

# Q-network used by the DQN / SARSA agents.
# Input: a window of one observation, flattened; then three 16-unit dense
# layers with ReLU activations; output: one linear unit per action.
CD_model = Sequential([
    Flatten(input_shape=(1,) + env.observation_space.shape),
    Dense(16),
    Activation('relu'),
    Dense(16),
    Activation('relu'),
    Dense(16),
    Activation('relu'),
    Dense(nb_actions),
    Activation('linear'),
])

# Action-selection policy: Boltzmann Q policy with its default settings
BQ_policy = BoltzmannQPolicy()

# Experience memory holding up to 100000 entries; window_length=1 matches the
# leading (1,) dimension of the network's input shape above.
S_memory = SequentialMemory(window_length=1, limit=100000)

In [9]:
from tensorflow.keras.optimizers import Adam

from rl.agents.cem import CEMAgent
from rl.agents.dqn import DQNAgent

# Weights file for this run, named after the chosen environment
name = f'dqn_{string}_params.h5f'

# Assemble the DQN agent from the network, policy and memory built earlier
dqn = DQNAgent(
    model=CD_model,
    policy=BQ_policy,
    memory=S_memory,
    nb_actions=nb_actions,
    nb_steps_warmup=100,
    target_model_update=1e-2,
)

# Compile with Adam and track mean absolute error alongside the loss
dqn.compile(
    Adam(learning_rate=1e-3),
    metrics=['mae'],
)

# Train for 50000 environment steps without rendering; verbose=2 produces the
# one-line-per-episode log
dqn.fit(
    env,
    nb_steps=50000,
    verbose=2,
    visualize=False,
)

# Persist the trained weights, replacing any file left by a previous run
dqn.save_weights(name, overwrite=True)

Training for 50000 steps ...
    57/50000: episode: 1, duration: 0.133s, episode steps:  57, steps per second: 427, episode reward: 57.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.491 [0.000, 1.000],  loss: --, mae: --, mean_q: --
    68/50000: episode: 2, duration: 0.008s, episode steps:  11, steps per second: 1411, episode reward: 11.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.727 [0.000, 1.000],  loss: --, mae: --, mean_q: --
    85/50000: episode: 3, duration: 0.011s, episode steps:  17, steps per second: 1576, episode reward: 17.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.588 [0.000, 1.000],  loss: --, mae: --, mean_q: --


  updates=self.state_updates,
  updates=self.state_updates,


   112/50000: episode: 4, duration: 0.998s, episode steps:  27, steps per second:  27, episode reward: 27.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.444 [0.000, 1.000],  loss: 0.499148, mae: 0.520363, mean_q: 0.057240
   124/50000: episode: 5, duration: 0.053s, episode steps:  12, steps per second: 227, episode reward: 12.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.667 [0.000, 1.000],  loss: 0.420611, mae: 0.511147, mean_q: 0.147164
   179/50000: episode: 6, duration: 0.233s, episode steps:  55, steps per second: 236, episode reward: 55.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.527 [0.000, 1.000],  loss: 0.212256, mae: 0.552020, mean_q: 0.560593
   194/50000: episode: 7, duration: 0.066s, episode steps:  15, steps per second: 228, episode reward: 15.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.533 [0.000, 1.000],  loss: 0.047068, mae: 0.659355, mean_q: 1.118700
   208/50000: episode: 8, duration: 0.061s, episode steps:  14, step

   919/50000: episode: 42, duration: 0.149s, episode steps:  26, steps per second: 175, episode reward: 26.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.462 [0.000, 1.000],  loss: 0.321444, mae: 3.388262, mean_q: 6.468228
   947/50000: episode: 43, duration: 0.147s, episode steps:  28, steps per second: 190, episode reward: 28.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.464 [0.000, 1.000],  loss: 0.161534, mae: 3.510058, mean_q: 6.839166
   963/50000: episode: 44, duration: 0.076s, episode steps:  16, steps per second: 209, episode reward: 16.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.375 [0.000, 1.000],  loss: 0.204501, mae: 3.562030, mean_q: 6.946006
   997/50000: episode: 45, duration: 0.153s, episode steps:  34, steps per second: 222, episode reward: 34.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.471 [0.000, 1.000],  loss: 0.173304, mae: 3.672399, mean_q: 7.203074
  1047/50000: episode: 46, duration: 0.224s, episode steps:  50,

In [10]:
# Evaluate the trained agent for the requested number of episodes, rendering
# each step. The return value is bound to a name so the cell does not echo the
# object's repr as its output and the results stay available to later cells.
test_history = dqn.test(env, nb_episodes=episodes, visualize=True)

Testing for 10 episodes ...
Episode 1: reward: 212.000, steps: 212
Episode 2: reward: 201.000, steps: 201
Episode 3: reward: 172.000, steps: 172
Episode 4: reward: 491.000, steps: 491
Episode 5: reward: 186.000, steps: 186
Episode 6: reward: 261.000, steps: 261
Episode 7: reward: 412.000, steps: 412
Episode 8: reward: 300.000, steps: 300
Episode 9: reward: 169.000, steps: 169
Episode 10: reward: 220.000, steps: 220


<keras.callbacks.History at 0x7fd8bc6429a0>