In [5]:
import os
# Hide every CUDA device from this process (value "-1") so the run uses the CPU.
# NOTE(review): this is deliberately placed before the tensorflow import below;
# presumably the variable must be set before TensorFlow initializes — keep the order.
os.environ["CUDA_VISIBLE_DEVICES"]="-1"
import tensorflow as tf
import gym
from keras.models import Sequential
from keras.layers import Dense, Embedding, Reshape
from keras.optimizers import Adam
from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy
from rl.memory import SequentialMemory

In [6]:
# Build the Taxi-v3 environment and keep the object exposed by its `.env` attribute.
# NOTE(review): `.env` presumably unwraps gym's outer wrapper (e.g. the episode
# time limit), which is why later cells cap episode length themselves — confirm.
env = gym.make("Taxi-v3").env

In [7]:
action_size = env.action_space.n
# Size the embedding table from the environment rather than hard-coding the
# number of discrete states, so the network always matches `env`.
state_size = int(env.observation_space.n)

# Q-network: the integer state id is embedded into a 10-d vector, passed through
# three 50-unit ReLU layers, and mapped to one linear output per action.
model = Sequential()
model.add(Embedding(state_size, 10, input_length=1))
model.add(Reshape((10,)))  # drop the length-1 sequence axis: (None, 1, 10) -> (None, 10)
model.add(Dense(50, activation='relu'))
model.add(Dense(50, activation='relu'))
model.add(Dense(50, activation='relu'))
model.add(Dense(action_size, activation='linear'))
# summary() prints the table itself and returns None; wrapping it in print()
# would emit a stray "None" line after the table.
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1, 10)             5000      
_________________________________________________________________
reshape_1 (Reshape)          (None, 10)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 50)                550       
_________________________________________________________________
dense_5 (Dense)              (None, 50)                2550      
_________________________________________________________________
dense_6 (Dense)              (None, 50)                2550      
_________________________________________________________________
dense_7 (Dense)              (None, 6)                 306       
Total params: 10,956
Trainable params: 10,956
Non-trainable params: 0
__________________________________________________

In [13]:
# Replay memory and exploration policy handed to the agent below.
memory = SequentialMemory(window_length=1, limit=50000)
policy = EpsGreedyQPolicy()

# DQN agent wrapping the Keras model defined above.
dqn = DQNAgent(
    model=model,
    policy=policy,
    memory=memory,
    nb_actions=action_size,
    nb_steps_warmup=500,
    target_model_update=8000,  # alternative value noted by the author: 1e-2
)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

# Train on the environment; each episode is capped at 99 steps.
# The call is left as the cell's last expression so its return value is displayed.
dqn.fit(
    env,
    nb_steps=1000000,
    nb_max_episode_steps=99,
    log_interval=100000,
    verbose=1,
    visualize=False,
)

Training for 1000000 steps ...
Interval 1 (0 steps performed)
 11903/100000 [==>...........................] - ETA: 5:26 - reward: -1.3183done, took 44.246 seconds


<tensorflow.python.keras.callbacks.History at 0x7f4c8db75520>

In [14]:
import time
import numpy as np
from IPython.display import clear_output

In [15]:
def _get_action_for_state(state):
    """Return the index of the highest-scoring action for ``state``.

    The state is expanded to a batch of one, run through the module-level
    ``model``, and the argmax over the first (only) row of the prediction
    is returned.
    """
    batch = tf.expand_dims(state, axis=0)
    q_values = model.predict_on_batch(batch)
    return np.argmax(q_values[0])

In [16]:
# Playback settings: pause between rendered frames (seconds) and the maximum
# number of greedy steps to take before giving up on the episode.
sleep = 0.2
max_steps = 20


try:
    # Human-readable labels indexed by action id.
    actions_str = ["South", "North", "East", "West", "Pickup", "Dropoff"]

    iteration = 0
    state = env.reset()  # reset environment to a new, random state
    env.render()
    print(f"Iter: {iteration} - Action: *** - Reward ***")
    time.sleep(sleep)
    done = False

    while not done:
        # Always act greedily with respect to the trained network.
        action = _get_action_for_state(state)
        iteration += 1
        state, reward, done, info = env.step(action)
        clear_output(wait=True)  # redraw in place instead of appending frames
        env.render()
        print(f"Iter: {iteration} - Action: {action}({actions_str[action]}) - Reward {reward}")
        time.sleep(sleep)
        # Only report failure when the episode is still unfinished: an episode
        # that completes exactly on the last allowed step is a success.
        # `>=` also guarantees termination if max_steps is set to 0 or less.
        if not done and iteration >= max_steps:
            print("cannot converge :(")
            break
except KeyboardInterrupt:
    # Deliberate: interrupting the kernel stops the animation without a traceback.
    pass

+---------+
|R: | :[43m [0m:[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+
  (East)
Iter: 20 - Action: 2(East) - Reward -1
cannot converge :(
