In [1]:
import tensorflow as tf

from tensorflow.keras import datasets, layers, models
import matplotlib.pyplot as plt
import numpy as np
import glob, re, sys, random, time
from collections import deque

np.set_printoptions(threshold=sys.maxsize)

In [2]:
# Size of the agent's discrete action space: it is the width of the Q-network's
# output layer and the exclusive upper bound for random action sampling.
NUM_ACTIONS = 5

In [6]:
from tensorflow.keras.optimizers import Adam, RMSprop

class DQN:
    """Deep Q-learning agent with experience replay and a soft-updated target network.

    Two identically structured CNNs are kept: ``model`` is fitted on replayed
    transitions and used to choose actions, while ``target_model`` supplies the
    Q-value estimates used to build training targets and is blended toward
    ``model`` by ``target_train``.  Both networks take inputs of shape
    (28, 28, 2) and output one value per action.
    """

    def __init__(self):
        # Replay buffer; once full, the oldest transitions are discarded.
        self.memory  = deque(maxlen=50000)

        self.gamma = 0.85             # discount applied to the bootstrapped future value
        self.epsilon = 1.0            # current exploration probability
        self.epsilon_min = 0.1        # floor below which epsilon is never decayed
        self.epsilon_decay = 0.99995  # multiplicative decay applied on every act() call
        self.learning_rate = 0.005    # step size handed to the Adam optimizer
        self.tau = .125               # blend factor used by target_train()

        self.model        = self.create_model()
        self.target_model = self.create_model()

    def create_model(self):
        """Build and compile the convolutional Q-network.

        Returns:
            A compiled Keras ``Sequential`` model mapping a (28, 28, 2) input to
            ``NUM_ACTIONS`` linear outputs, trained with mean squared error.
        """
        model = models.Sequential()
        model.add(layers.Conv2D(32, (3, 3), activation='relu', input_shape=(28, 28, 2)))
        model.add(layers.MaxPooling2D((2, 2)))
        model.add(layers.Conv2D(64, (3, 3), activation='relu'))
        model.add(layers.MaxPooling2D((2, 2)))
        model.add(layers.Conv2D(64, (3, 3), activation='relu'))
        model.add(layers.Dropout(0.4))
        model.add(layers.Flatten())
        model.add(layers.Dense(64, activation='relu'))
        # Linear output layer: one unbounded Q-value per action.
        model.add(layers.Dense(NUM_ACTIONS))
        # `learning_rate` replaces the deprecated `lr` keyword, which newer
        # Keras releases no longer accept.
        model.compile(loss="mean_squared_error",
            optimizer=Adam(learning_rate=self.learning_rate))
        return model

    def act(self, state):
        """Choose an action epsilon-greedily and decay epsilon.

        Args:
            state: batch containing a single state, as passed to ``model.predict``.

        Returns:
            int: a uniformly random action with probability ``epsilon``,
            otherwise the index of the largest predicted Q-value.
        """
        # Epsilon is decayed on every call (not once per episode) and clamped.
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)
        if np.random.random() < self.epsilon:
            return np.random.randint(0, NUM_ACTIONS)
        q_values = self.model.predict(state, verbose=0)[0]
        return int(np.argmax(q_values))

    def remember(self, state, action, reward, new_state, done):
        """Append one transition to the replay buffer."""
        self.memory.append([state, action, reward, new_state, done])

    def replay(self, batch_size=32):
        """Fit ``model`` on one random minibatch drawn from the replay buffer.

        Does nothing until the buffer holds at least ``batch_size`` transitions.
        For each sampled transition the target of the taken action is
        ``reward`` when the transition is terminal, otherwise
        ``reward + gamma * max_a target_model(new_state)[a]``.

        Args:
            batch_size: number of transitions to sample (default 32).
        """
        if len(self.memory) < batch_size:
            return

        samples = random.sample(self.memory, batch_size)
        # Unpack column-wise instead of wrapping the ragged transition rows in
        # a single np.array(), which recent NumPy versions reject outright.
        states, actions, rewards, new_states, dones = zip(*samples)
        state = np.stack(states)
        action = np.asarray(actions, dtype=int)
        reward = np.asarray(rewards, dtype=float)
        new_state = np.stack(new_states)
        terminal = np.asarray(dones, dtype=bool)

        # NOTE(review): the baseline for the non-selected actions comes from
        # target_model (as in the original code), so those outputs are
        # regressed toward the target network's values rather than left alone.
        target = np.array(self.target_model.predict(state, verbose=0))
        Q_future = np.max(self.target_model.predict(new_state, verbose=0), axis=1)

        # Terminal transitions do not bootstrap from the next state.
        result = np.where(terminal, reward, reward + Q_future * self.gamma)
        target[np.arange(len(target)), action] = result

        self.model.fit(state, target, epochs=1, verbose=0)

    def target_train(self):
        """Move the target network toward the online network by a factor ``tau``."""
        weights = self.model.get_weights()
        target_weights = self.target_model.get_weights()
        blended = [
            online * self.tau + frozen * (1 - self.tau)
            for online, frozen in zip(weights, target_weights)
        ]
        self.target_model.set_weights(blended)

    def save_model(self, fn):
        """Save the online model to ``fn`` via ``model.save``."""
        self.model.save(fn)

    def load_model(self, fn):
        """Load the weights stored in ``fn`` into both the online and target networks."""
        self.model.load_weights(fn)
        self.target_model.load_weights(fn)

In [9]:
# Number of classes the critic classifier predicts (width of its softmax output layer).
NUM_CLASSES = 8

def create_critic_model():
    """Build and compile the CNN classifier used as the critic.

    Returns:
        A compiled Keras ``Sequential`` model that maps a (28, 28, 1) input to a
        softmax distribution over ``NUM_CLASSES`` classes.
    """
    critic_layers = [
        # Convolutional feature extractor.
        layers.Conv2D(32, (3, 3), activation='relu', input_shape=(28, 28, 1)),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(64, (3, 3), activation='relu'),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(64, (3, 3), activation='relu'),
        layers.Dropout(0.4),
        # Dense classification head.
        layers.Flatten(),
        layers.Dense(64, activation='relu'),
        layers.Dense(NUM_CLASSES, activation="softmax"),
    ]
    classifier = models.Sequential(critic_layers)
    classifier.compile(
        optimizer='adam',
        loss="categorical_crossentropy",
        metrics=['accuracy'],
    )
    return classifier

# Instantiate the critic network and load its weights from disk.
critic = create_critic_model()
critic.load_weights("models/classifier.h5")

AttributeError: 'Sequential' object has no attribute 'load_model'

In [13]:
def save_canvas(canvas, trial):
    """Render ``canvas`` in grayscale and write it to ``output/out-big<trial>.png``.

    The current matplotlib figure is cleared afterwards so it can be reused
    by the next call.
    """
    out_path = f"output/out-big{trial}.png"
    plt.imshow(canvas, cmap="gray")
    plt.savefig(out_path)
    plt.clf()
    
# NOTE: `gamma` and `epsilon` are not read anywhere in this cell; the agent
# uses its own `self.gamma` / `self.epsilon` set in DQN.__init__.
gamma   = 0.9
epsilon = .95

trials  = 100000
trial_len = 200

dqn_agent = DQN()
dqn_agent.load_model("models/bootstrapbig.h5")
steps = []

# (row, col) displacement of the brush for each movement action; action 4 paints.
moves = {0: (0, 1), 1: (0, -1), 2: (1, 0), 3: (-1, 0)}

for trial in range(trials):
    # Every trial starts from a blank canvas with the brush at a random cell.
    canvas = np.zeros((28, 28))
    pos = np.random.randint(0,28,size=(2))
    brush_map = np.zeros((28, 28))
    brush_map[pos[0], pos[1]] = 1

    reward_sum = 0
    reward_num = 0

    for step in range(trial_len):
        # State = canvas stacked with a one-hot map of the brush position.
        old_canvas = canvas.copy()
        old_map = brush_map.copy()
        old_state = np.dstack([old_canvas, old_map])

        action = dqn_agent.act(old_state.reshape(1,28,28,2))

        if action == 4:
            # Paint the cell under the brush.
            canvas[pos[0], pos[1]] = 1
        elif action in moves:
            # Move the brush only if it stays on the 28x28 canvas.
            moved = pos + moves[action]
            if 0 <= moved[0] <= 27 and 0 <= moved[1] <= 27:
                pos = moved

        new_map = np.zeros((28,28))
        new_map[pos[0], pos[1]] = 1
        new_state = np.dstack([canvas, new_map])

        done = False # TODO

        # Reward: twice the critic's probability for class index 6, minus the
        # sum of all class probabilities.
        probs = critic.predict(canvas.reshape(1,28,28,1))[0]
        reward = 2*probs[6] - np.sum(probs)

        reward_sum += reward
        reward_num += 1

        dqn_agent.remember(old_state, action, reward, new_state, done)

        dqn_agent.replay()       # fits the online (prediction) model on a minibatch

        if step % 20 == 0:
            dqn_agent.target_train() # blends the target model toward the online model

        brush_map = new_map

        if done:
            break

    print("completed trial", trial, "avg_r", reward_sum/reward_num, "epsilon", dqn_agent.epsilon)

    save_canvas(canvas, trial)

    if trial % 50 == 0:
        print("saving model")
        dqn_agent.save_model(f"models/painter-big{trial}.h5")


completed trial 0 avg_r -0.7510541124641895 epsilon 0.9900495862284902
saving model


KeyboardInterrupt: 

<Figure size 432x288 with 0 Axes>

In [None]:
# Scratch check: stacking two 2x2 arrays along a new leading axis already gives
# shape (2, 2, 2), so the trailing reshape leaves the result unchanged.
np.stack([np.array([[1,2],[3,4]]), np.array([[5,6],[7,8]])]).reshape(2,2,2)