In [3]:
import tensorflow as tf

from tensorflow.keras import datasets, layers, models
import matplotlib.pyplot as plt
import numpy as np
import glob, re
from collections import deque

In [4]:
# Size of the discrete action space; also the width of the Q-network's output
# layer. The training loop treats actions 0-3 as one-cell cursor moves and
# action 4 as marking the current canvas cell.
NUM_ACTIONS = 5

In [5]:
class DQN:
    """Deep Q-network agent with experience replay and a soft-updated target net.

    Two identically built Keras CNNs are kept: ``model`` is fitted on replayed
    transitions, while ``target_model`` supplies the bootstrap targets and
    trails ``model`` through the weighted blend in ``target_train``.
    """

    def __init__(self):
        # Replay buffer; once 2000 transitions are stored the oldest are dropped.
        self.memory  = deque(maxlen=2000)

        self.gamma = 0.85            # discount on the bootstrapped future value
        self.epsilon = 1.0           # current exploration probability
        self.epsilon_min = 0.01      # floor for epsilon
        self.epsilon_decay = 0.995   # multiplicative decay applied on every act()
        self.learning_rate = 0.005   # Adam step size used by create_model()
        self.tau = .125              # share of online weights mixed in by target_train()

        self.model        = self.create_model()
        self.target_model = self.create_model()

    def create_model(self):
        """Build and compile the convolutional Q-network.

        The network takes inputs of shape (28, 28, 2) and emits one linear
        output per action (``NUM_ACTIONS`` in total).

        Returns:
            A compiled ``Sequential`` model trained with mean squared error.
        """
        model = models.Sequential()
        model.add(layers.Conv2D(32, (3, 3), activation='relu', input_shape=(28, 28, 2)))
        model.add(layers.MaxPooling2D((2, 2)))
        model.add(layers.Conv2D(64, (3, 3), activation='relu'))
        model.add(layers.MaxPooling2D((2, 2)))
        model.add(layers.Conv2D(64, (3, 3), activation='relu'))
        model.add(layers.Dropout(0.4))
        model.add(layers.Flatten())
        model.add(layers.Dense(64, activation='relu'))
        # Q-values are unbounded regression targets (reward + gamma * max Q'),
        # so the head must be linear and trained with MSE. A softmax /
        # cross-entropy head would squash the outputs onto a probability
        # simplex and could never match those targets.
        model.add(layers.Dense(NUM_ACTIONS, activation="linear"))
        model.compile(
            # Use the step size configured in __init__ instead of Adam's default.
            optimizer=tf.keras.optimizers.Adam(learning_rate=self.learning_rate),
            loss="mse",
        )
        return model

    def act(self, state):
        """Pick an action epsilon-greedily, decaying epsilon on every call.

        Args:
            state: Batched network input accepted by ``self.model.predict``
                (presumably shape (1, 28, 28, 2) -- confirm against the caller).

        Returns:
            int: The chosen action index.
        """
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)
        if np.random.random() < self.epsilon:
            # The agent holds no environment handle, so draw the exploratory
            # action directly from the discrete range [0, NUM_ACTIONS).
            return int(np.random.randint(NUM_ACTIONS))
        return int(np.argmax(self.model.predict(state, verbose=0)[0]))

    def remember(self, state, action, reward, new_state, done):
        """Append one transition to the replay buffer."""
        self.memory.append([state, action, reward, new_state, done])

    def replay(self, batch_size=32):
        """Fit ``model`` on a random minibatch of stored transitions.

        Does nothing until at least ``batch_size`` transitions are stored.
        For each sampled transition, the target network's prediction for the
        taken action is replaced by ``reward`` (terminal) or
        ``reward + gamma * max_a Q_target(new_state, a)`` (non-terminal), and
        ``model`` is fitted for one epoch on that single example.

        Args:
            batch_size: Number of transitions to sample without replacement.
        """
        if len(self.memory) < batch_size:
            return

        # Sample distinct buffer positions with NumPy, which is already in scope.
        picks = np.random.choice(len(self.memory), size=batch_size, replace=False)
        for idx in picks:
            state, action, reward, new_state, done = self.memory[int(idx)]
            target = self.target_model.predict(state, verbose=0)
            if done:
                target[0][action] = reward
            else:
                Q_future = max(self.target_model.predict(new_state, verbose=0)[0])
                target[0][action] = reward + Q_future * self.gamma
            self.model.fit(state, target, epochs=1, verbose=0)

    def target_train(self):
        """Move the target network a fraction ``tau`` toward the online network."""
        weights = self.model.get_weights()
        target_weights = self.target_model.get_weights()
        blended = [
            online * self.tau + frozen * (1 - self.tau)
            for online, frozen in zip(weights, target_weights)
        ]
        self.target_model.set_weights(blended)

    def save_model(self, fn):
        """Save the online network to the path ``fn``."""
        self.model.save(fn)

In [None]:
# Training driver: run up to `trials` episodes of at most `trial_len` steps,
# storing every transition, replaying a minibatch, and soft-updating the
# target network after each step. Training stops at the first episode that
# reports `done` before the step budget runs out.
gamma   = 0.9   # NOTE(review): unused in this cell; the agent uses its own DQN.gamma
epsilon = .95   # NOTE(review): unused in this cell; the agent uses its own DQN.epsilon

trials  = 1000
trial_len = 500

# Cursor displacement (row, col) for each movement action; action 4 paints.
MOVE_DELTAS = {0: (0, 1), 1: (0, -1), 2: (1, 0), 3: (-1, 0)}

dqn_agent = DQN()
steps = []  # NOTE(review): never appended to in this cell
for trial in range(trials):
    canvas = np.zeros((28, 28))
    pos = np.random.randint(0,28,size=(2))
    print("initial position", pos)
    # NOTE(review): `env` and the initial `cur_state` are not defined in the
    # visible cells, and `cur_state` is not reset here, so it carries over from
    # the previous trial's last step -- confirm how the state is meant to be
    # built from `canvas` / `pos`.
    for step in range(trial_len):
        action = dqn_agent.act(cur_state)

        if action in MOVE_DELTAS:
            # Clamping to the grid is equivalent to skipping a move that would
            # leave it, since `pos` always starts inside the canvas.
            pos = np.clip(pos + np.array(MOVE_DELTAS[action]), 0, np.array(canvas.shape) - 1)
        elif action == 4:
            # Index with both coordinates: `canvas[pos]` would treat the
            # two-element array as a list of row indices and fill two whole rows.
            canvas[pos[0], pos[1]] = 1

        new_state, reward, done, _ = env.step(action)

        # NOTE(review): a (1, 2) state does not match the (28, 28, 2) input
        # declared in DQN.create_model -- confirm the intended state encoding.
        new_state = new_state.reshape(1,2)
        dqn_agent.remember(cur_state, action, reward, new_state, done)

        dqn_agent.replay()       # internally iterates default (prediction) model
        dqn_agent.target_train() # iterates target model

        cur_state = new_state
        if done:
            break
    else:
        # The step budget ran out without `done`: count the trial as failed.
        # (The previous `step >= 199` test was unrelated to `trial_len`.)
        print("Failed to complete in trial {}".format(trial))
        # Checkpoint every tenth trial; keying on `step` made the save depend
        # on where the episode happened to stop.
        if trial % 10 == 0:
            dqn_agent.save_model("trial-{}.model".format(trial))
        continue

    print("Completed in {} trials".format(trial))
    dqn_agent.save_model("success.model")
    break
