# Sandbox

In [25]:
import gym
import numpy as np

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Flatten, Embedding, Reshape
from tensorflow.keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.callbacks import FileLogger, ModelIntervalCheckpoint
from rl.policy import EpsGreedyQPolicy
from rl.memory import SequentialMemory

In [3]:


# Create the Taxi-v3 environment and keep the object exposed by the returned
# wrapper's `.env` attribute.
# NOTE(review): presumably this strips gym's outer wrapper (e.g. the episode
# time limit) for free-form experimentation — confirm against the gym version in use.
env = gym.make("Taxi-v3").env


In [18]:
# Draw one random action from the environment's action space and display it
# as the cell output.
action = env.action_space.sample()
action

1

In [19]:
# Apply the sampled action to the environment; the cell displays the tuple
# returned by `step`.
# NOTE(review): the 4-tuple shown below looks like (observation, reward, done, info) — confirm.
env.step(action)

(162, -1, False, {'prob': 1.0})

In [23]:
# Configuration for the training environment.
ENV_NAME = 'Taxi-v3'
# Single source of truth for every RNG seed used in this cell, so the NumPy
# seed and the environment seed cannot silently diverge.
SEED = 123

# Get the environment and seed both NumPy's global RNG and the environment
# itself so runs are repeatable.
env = gym.make(ENV_NAME)
np.random.seed(SEED)
env.seed(SEED)

# Number of discrete actions; used below to size the network's output layer
# and to configure the agent.
action_size = env.action_space.n

In [24]:
# Q-network: takes a discrete state index and outputs one value per action.

# Size the embedding table from the environment instead of hard-coding it
# (Taxi-v3 reports 500 discrete states, matching the previous literal).
n_states = env.observation_space.n

model = Sequential()
# Look up a 10-dimensional vector for the single state index in each input.
# NOTE(review): input_length=1 presumably mirrors window_length=1 of the
# replay memory configured further down — confirm.
model.add(Embedding(n_states, 10, input_length=1))
# Drop the length-1 sequence axis: (batch, 1, 10) -> (batch, 10).
model.add(Reshape((10,)))
# Three hidden layers of 50 ReLU units each.
model.add(Dense(50, activation='relu'))
model.add(Dense(50, activation='relu'))
model.add(Dense(50, activation='relu'))
# Linear output head with one unit per action.
model.add(Dense(action_size, activation='linear'))
# summary() prints the table itself and returns None, so it must not be
# wrapped in print() (that would emit a stray "None" line).
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1, 10)             5000      
_________________________________________________________________
reshape_1 (Reshape)          (None, 10)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 50)                550       
_________________________________________________________________
dense_4 (Dense)              (None, 50)                2550      
_________________________________________________________________
dense_5 (Dense)              (None, 50)                2550      
_________________________________________________________________
dense_6 (Dense)              (None, 6)                 306       
Total params: 10,956
Trainable params: 10,956
Non-trainable params: 0
__________________________________________________

In [26]:
# Replay memory holding up to 50,000 entries; window_length=1 means a single
# observation per sample rather than a stack of frames.
memory = SequentialMemory(limit=50000, window_length=1)

# Epsilon-greedy exploration using the library's default epsilon.
policy = EpsGreedyQPolicy()

# DQN agent wrapping the Q-network defined above.
# - nb_steps_warmup=500: warm-up step count passed to the agent before training.
# - target_model_update=1e-2: NOTE(review): in keras-rl a value below 1 is
#   presumably a soft-update rate for the target network — confirm.
dqn = DQNAgent(
    model=model,
    nb_actions=action_size,
    memory=memory,
    nb_steps_warmup=500,
    target_model_update=1e-2,
    policy=policy,
)

# Use `learning_rate=`: the `lr=` keyword is a deprecated alias in
# tf.keras optimizers and is rejected by newer Keras releases.
dqn.compile(Adam(learning_rate=1e-3), metrics=['mae'])

In [29]:

# Training callbacks: a FileLogger pointed at 'train.log' with interval=100.
callbacks = [
    FileLogger('train.log', interval=100),
]

# Train the agent for 10,000 steps without rendering, capping each episode at
# 99 steps and reporting progress every 1,000 steps. The call is left as the
# last expression so its return value is shown as the cell output.
dqn.fit(
    env,
    nb_steps=10000,
    nb_max_episode_steps=99,
    log_interval=1000,
    callbacks=callbacks,
    verbose=1,
    visualize=False,
)

Training for 10000 steps ...
Interval 1 (0 steps performed)
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
10 episodes - episode_reward: -256.500 [-855.000, -99.000] - loss: 1.763 - mae: 3.372 - mean_q: -1.412 - prob: 1.000

Interval 2 (1000 steps performed)
10 episodes - episode_reward: -129.600 [-162.000, -108.000] - loss: 0.213 - mae: 8.187 - mean_q: -6.387 - prob: 1.000

Interval 3 (2000 steps performed)
10 episodes - episode_reward: -124.200 [-153.000, -108.000] - loss: 0.585 - mae: 12.614 - mean_q: -11.496 - prob: 1.000

Interval 4 (3000 steps performed)
10 episodes - episode_reward: -123.300 [-135.000, -108.000] - loss: 0.985 - mae: 15.368 - mean_q: -14.703 - prob: 1.000

Interval 5 (4000 steps performed)
10 episodes - episode_reward: -123.300 [-144.000, -108.000] - loss: 1.228 - mae: 16.794 - mean_q: -16.278 - prob: 1.000

Interval 6 (5000 steps performed)
10 episodes - episode_reward: -149.400 [-288.000, -99

<tensorflow.python.keras.callbacks.History at 0x7fb195cd4a60>

In [32]:

# Run the agent for 5 evaluation episodes with visualize=True (the environment
# is rendered, producing the grid output below), capping each episode at 99 steps.
dqn.test(env, nb_episodes=5, visualize=True, nb_max_episode_steps=99)

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m|[43m [0m: |[34;1mB[0m: |
+---------+
  (South)
+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m|[43m [0m: |[34;1mB[0m: |
+---------+
  (South)
+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m|[43m [0m: |[34;1mB[0m: |
+---------+
  (South)
+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m|[43m [0m: |[34;1mB[0m: |
+---------+
  (South)
+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m|[43m [0m: |[34;1mB[0m: |
+---------+
  (South)
+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m|[43m [0m: |[34;1mB[0m: |
+---------+
  (South)
+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m|[43m [0m: |[34;1mB[0m: |
+---------+
  (South)
+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m|[43m [0m: |[34;1mB[0m: |
+---------+
  (South)
+---------+
|R: |

<tensorflow.python.keras.callbacks.History at 0x7fb18c45a610>

ModuleNotFoundError: No module named 'taxi_v3'