In [1]:
from sim import Game, Environment

In [2]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers import Adam

In [3]:
from rl.agents import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory

In [4]:
# Q-network: the (1, 6) input is flattened to a 6-vector, passed through two
# hidden ReLU layers of 24 units each, and mapped to 2 linear outputs (the same
# count as nb_actions given to DQNAgent).
# NOTE(review): the leading 1 in the input shape presumably mirrors
# window_length = 1 of the replay memory, and 6 the observation size of
# Environment -- confirm against sim.Environment.
model = Sequential([
    Flatten(input_shape = (1, 6)),
    Dense(24, activation = "relu"),
    Dense(24, activation = "relu"),
    Dense(2, activation = "linear"),
])

In [5]:
# Print the layer stack with output shapes and parameter counts as a sanity check.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten (Flatten)            (None, 6)                 0         
_________________________________________________________________
dense (Dense)                (None, 24)                168       
_________________________________________________________________
dense_1 (Dense)              (None, 24)                600       
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 50        
Total params: 818
Trainable params: 818
Non-trainable params: 0
_________________________________________________________________


In [6]:
# Assemble the DQN agent around the Q-network `model`.
dqn = DQNAgent(
    # Network and action space: 2 actions, same as the network's 2 output units.
    nb_actions = 2,
    model = model,
    # Replay buffer holding up to 50000 entries, one observation per state window.
    memory = SequentialMemory(window_length = 1, limit = 50000),
    # Action selection is delegated to a Boltzmann Q policy with default settings.
    policy = BoltzmannQPolicy(),
    # NOTE(review): in keras-rl a target_model_update below 1 is presumably used
    # as a soft-update coefficient, and nb_steps_warmup as the number of steps
    # collected before learning starts -- confirm against the library docs.
    target_model_update = 1e-2,
    nb_steps_warmup = 10,
)

In [7]:
# Compile the agent with an Adam optimizer (learning rate 1e-3) and track mean
# absolute error as an extra metric.
# `learning_rate` is the supported keyword in TF2 Keras optimizers; the old
# `lr` alias is deprecated and rejected by newer optimizer implementations.
dqn.compile(Adam(learning_rate = 1e-3), metrics = ["mae"])

In [8]:
# Train the agent for 50000 steps on a fresh Environment instance, with
# visualization turned off; verbose = 1 yields the per-interval progress log.
dqn.fit(
    Environment(),
    nb_steps = 50000,
    verbose = 1,
    visualize = False,
)

Training for 50000 steps ...
Interval 1 (0 steps performed)
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
    1/10000 [..............................] - ETA: 4:59 - reward: 0.0000e+00



96 episodes - episode_reward: 1236.000 [1236.000, 1236.000] - loss: 1529.080 - mae: 30.117 - mean_q: 427.026 - time: 330.085

Interval 2 (10000 steps performed)
96 episodes - episode_reward: 1236.000 [1236.000, 1236.000] - loss: 976.927 - mae: 24.352 - mean_q: 497.723 - time: 330.270

Interval 3 (20000 steps performed)
96 episodes - episode_reward: 1236.000 [1236.000, 1236.000] - loss: 164.916 - mae: 18.784 - mean_q: 482.121 - time: 330.438

Interval 4 (30000 steps performed)
96 episodes - episode_reward: 1236.000 [1236.000, 1236.000] - loss: 114.891 - mae: 17.821 - mean_q: 466.858 - time: 330.596

Interval 5 (40000 steps performed)
done, took 207.541 seconds


<tensorflow.python.keras.callbacks.History at 0x1cd2c7c2408>

In [9]:
# Evaluate the trained agent for 100 episodes on a fresh Environment instance,
# with visualization turned off; per-episode reward and step count are printed.
dqn.test(
    Environment(),
    visualize = False,
    nb_episodes = 100,
)

Testing for 100 episodes ...
Episode 1: reward: 1236.000, steps: 104


Episode 2: reward: 1236.000, steps: 104
Episode 3: reward: 1236.000, steps: 104
Episode 4: reward: 1236.000, steps: 104
Episode 5: reward: 1236.000, steps: 104
Episode 6: reward: 1236.000, steps: 104
Episode 7: reward: 1236.000, steps: 104
Episode 8: reward: 1236.000, steps: 104
Episode 9: reward: 1236.000, steps: 104
Episode 10: reward: 1236.000, steps: 104
Episode 11: reward: 1236.000, steps: 104
Episode 12: reward: 1236.000, steps: 104
Episode 13: reward: 1236.000, steps: 104
Episode 14: reward: 1236.000, steps: 104
Episode 15: reward: 1236.000, steps: 104
Episode 16: reward: 1236.000, steps: 104
Episode 17: reward: 1236.000, steps: 104
Episode 18: reward: 1236.000, steps: 104
Episode 19: reward: 1236.000, steps: 104
Episode 20: reward: 1236.000, steps: 104
Episode 21: reward: 1236.000, steps: 104
Episode 22: reward: 1236.000, steps: 104
Episode 23: reward: 1236.000, steps: 104
Episode 24: reward: 1236.000, steps: 104
Episode 25: reward: 1236.000, steps: 104
Episode 26: reward: 1236

<tensorflow.python.keras.callbacks.History at 0x1cd2c7d1a48>