In [1]:
import tensorflow as tf

from tensorflow.keras import datasets, layers, models
import matplotlib.pyplot as plt
import numpy as np
import glob, re, sys, random, time
from collections import deque

np.set_printoptions(threshold=sys.maxsize)

In [2]:
# Size of the agent's discrete action space: the Q-network emits one value per
# action. In the training loop, actions 0-3 move the brush position by one cell
# and action 4 paints the pixel under the brush.
NUM_ACTIONS = 5

In [3]:
from tensorflow.keras.optimizers import Adam, RMSprop

class DQN:
    """Deep Q-Network agent with an online network and a soft-updated target network.

    The agent keeps a bounded replay buffer of transitions, picks actions
    epsilon-greedily from the online network (`model`), and computes its
    bootstrap targets with a separate `target_model` whose weights are blended
    towards the online weights by `target_train`.
    """

    def __init__(self):
        # Replay buffer of [state, action, reward, new_state, done] transitions;
        # the oldest entries are dropped once 50000 are stored.
        self.memory  = deque(maxlen=50000)

        self.gamma = 0.999              # discount factor applied to future Q-values
        self.epsilon = 1.0              # current exploration probability
        self.epsilon_min = 0.1          # floor for the exploration probability
        self.epsilon_decay = 0.999995   # multiplicative decay applied on every act()
        self.learning_rate = 0.005      # Adam step size for the online network
        self.tau = .125                 # blend factor used by target_train()

        self.model        = self.create_model()
        self.target_model = self.create_model()

    def create_model(self):
        """Build and compile the convolutional Q-network.

        Returns:
            A compiled Keras ``Sequential`` model that takes inputs of shape
            ``(28, 28, 2)`` and outputs ``NUM_ACTIONS`` linear Q-values,
            trained with mean squared error and Adam.
        """
        model = models.Sequential()
        model.add(layers.Conv2D(32, (3, 3), activation='relu', input_shape=(28, 28, 2)))
        model.add(layers.MaxPooling2D((2, 2)))
        model.add(layers.Conv2D(64, (3, 3), activation='relu'))
        model.add(layers.MaxPooling2D((2, 2)))
        model.add(layers.Conv2D(64, (3, 3), activation='relu'))
        model.add(layers.Dropout(0.4))
        model.add(layers.Flatten())
        model.add(layers.Dense(64, activation='relu'))
        # No activation on the last layer: Q-values are unbounded regression outputs.
        model.add(layers.Dense(NUM_ACTIONS))
        # `learning_rate` is the supported keyword; the old `lr` alias is
        # deprecated and rejected by newer Keras releases.
        model.compile(loss="mean_squared_error",
            optimizer=Adam(learning_rate=self.learning_rate))
        return model

    def act(self, state):
        """Choose an action epsilon-greedily and decay epsilon.

        Args:
            state: a batch containing a single state, i.e. an array the online
                model can predict on (the training loop passes shape
                ``(1, 28, 28, 2)``).

        Returns:
            An integer action index in ``[0, NUM_ACTIONS)``.
        """
        # Epsilon is decayed on every call (not per episode) and clamped at the floor.
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)
        if np.random.random() < self.epsilon:
            return np.random.randint(0, NUM_ACTIONS)
        # verbose=0 keeps Keras from printing a progress bar on every step.
        return np.argmax(self.model.predict(state, verbose=0)[0])

    def remember(self, state, action, reward, new_state, done):
        """Append one transition to the replay buffer."""
        self.memory.append([state, action, reward, new_state, done])

    def replay(self, batch_size=32):
        """Run one batched Q-learning update from a random minibatch.

        Does nothing until the replay buffer holds at least ``batch_size``
        transitions. Otherwise it samples ``batch_size`` transitions, builds
        targets ``reward + gamma * max_a Q_target(new_state, a)`` (just
        ``reward`` for terminal transitions) for the actions that were taken,
        and fits the online model on them for one epoch.

        Args:
            batch_size: number of transitions per update (default 32).
        """
        if batch_size <= 0 or len(self.memory) < batch_size:
            return

        batch = random.sample(self.memory, batch_size)
        # Unpack column-wise instead of wrapping the ragged transitions in a
        # single np.array: mixing image arrays with scalars is rejected by
        # recent NumPy and yields object-dtype columns on older versions.
        states, actions, rewards, new_states, dones = zip(*batch)
        state = np.stack(states)
        new_state = np.stack(new_states)
        action = np.asarray(actions, dtype=int)
        reward = np.asarray(rewards, dtype=np.float64)
        terminal = np.asarray(dones, dtype=bool)

        # Start from the target network's own predictions so that only the
        # entries for the taken actions contribute to the loss. np.array makes
        # a writable copy of whatever the backend returns.
        target = np.array(self.target_model.predict(state, verbose=0))
        q_future = np.max(self.target_model.predict(new_state, verbose=0), axis=1)

        # Terminal transitions have no future value to bootstrap from.
        result = np.where(terminal, reward, reward + q_future * self.gamma)
        target[np.arange(len(target)), action] = result

        self.model.fit(state, target, epochs=1, verbose=0)

    def target_train(self):
        """Soft-update the target network towards the online network.

        Each target weight becomes ``tau * online + (1 - tau) * target``.
        """
        weights = self.model.get_weights()
        target_weights = self.target_model.get_weights()
        blended = [
            online * self.tau + current * (1 - self.tau)
            for online, current in zip(weights, target_weights)
        ]
        self.target_model.set_weights(blended)

    def save_model(self, fn):
        """Save the online model to the path ``fn`` via ``model.save``."""
        self.model.save(fn)

    def load_model(self, fn):
        """Load weights from ``fn`` into both the online and the target network."""
        self.model.load_weights(fn)
        self.target_model.load_weights(fn)

In [4]:
# Number of output classes of the critic classifier (width of its softmax layer).
NUM_CLASSES = 8

def create_critic_model():
    """Build and compile the critic: a small CNN classifier over 28x28x1 images.

    Returns:
        A compiled Keras ``Sequential`` model ending in a ``NUM_CLASSES``-way
        softmax, using the Adam optimizer, categorical cross-entropy loss and
        an accuracy metric.
    """
    # Convolutional feature extractor.
    feature_layers = [
        layers.Conv2D(32, (3, 3), activation='relu', input_shape=(28, 28, 1)),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(64, (3, 3), activation='relu'),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(64, (3, 3), activation='relu'),
        layers.Dropout(0.4),
    ]
    # Dense classification head producing class probabilities.
    head_layers = [
        layers.Flatten(),
        layers.Dense(64, activation='relu'),
        layers.Dense(NUM_CLASSES, activation="softmax"),
    ]

    classifier = models.Sequential(feature_layers + head_layers)
    classifier.compile(
        optimizer='adam',
        loss="categorical_crossentropy",
        metrics=['accuracy'],
    )
    return classifier

# Instantiate the critic and restore its weights from disk; the weights file
# must already exist (presumably produced by a separate training run — confirm).
critic = create_critic_model()
critic.load_weights("models/classifier.h5")

In [6]:
def save_canvas(canvas, trial):
    """Save ``canvas`` as a grayscale PNG for the given trial.

    The image is written to ``output/out-penaltyborder-{trial}.png``; the
    ``output`` directory must already exist.

    Args:
        canvas: 2-D array to render with a gray colormap.
        trial: value interpolated into the output file name.
    """
    # Use an explicit figure and close it afterwards, so no empty "current
    # figure" is left behind for the notebook to render and nothing
    # accumulates when this is called once per trial.
    fig, ax = plt.subplots()
    ax.imshow(canvas, cmap="gray")
    fig.savefig(f"output/out-penaltyborder-{trial}.png")
    plt.close(fig)
    
# Training script: the DQN agent moves a one-pixel brush over a 28x28 canvas
# and is rewarded by the critic's probability for class index 6, minus a large
# penalty for any painted border pixels.

# NOTE(review): `gamma` and `epsilon` are not read anywhere in this cell; the
# agent uses its own `DQN.gamma` / `DQN.epsilon` attributes instead.
gamma   = 0.9
epsilon = .95

trials  = 100000   # number of episodes
trial_len = 200    # steps per episode

dqn_agent = DQN()
# Start from previously saved weights (loaded into both online and target nets).
dqn_agent.load_model("models/bootstrapbig.h5")
# NOTE(review): `steps` is never appended to or read in this cell.
steps = []
for trial in range(trials):
    # Fresh blank canvas and a random brush position for every episode.
    canvas = np.zeros((28, 28))
    pos = np.random.randint(0,28,size=(2))
    # One-hot map marking the brush position; second channel of the state.
    brush_map = np.zeros((28, 28))
    brush_map[pos[0], pos[1]] = 1
    
    # Running totals used only for the per-episode average reward printout.
    reward_sum = 0
    reward_num = 0
    
    for step in range(trial_len):
        # Snapshot the state before acting; copies are needed because `canvas`
        # is mutated in place by the paint action below.
        old_canvas = canvas.copy()
        old_map = brush_map.copy()
        old_state = np.dstack([old_canvas, old_map])   # shape (28, 28, 2)
        
        # act() expects a batch of one state.
        action = dqn_agent.act(old_state.reshape(1,28,28,2))
        
        # Actions 0-3 move the brush one cell, staying inside the canvas;
        # a move that would leave the canvas is silently ignored.
        if action == 0:
            # increase the column index pos[1]
            if pos[1] < 27:
                pos = pos + [0, 1]
        elif action == 1:
            # decrease the column index pos[1]
            if pos[1] > 0:
                pos = pos - [0, 1]
        elif action == 2:
            # increase the row index pos[0]
            if pos[0] < 27:
                pos = pos + [1, 0]
        elif action == 3:
            # decrease the row index pos[0]
            if pos[0] > 0:
                pos = pos - [1, 0]
        elif action == 4:
            # paint the pixel under the brush
            canvas[pos[0], pos[1]] = 1
            
        # Rebuild the brush map for the (possibly moved) position.
        new_map = np.zeros((28,28))
        new_map[pos[0], pos[1]] = 1
        new_state = np.dstack([canvas, new_map])
        
        # TODO: no terminal condition is implemented, so every stored transition
        # is non-terminal and the `if done: break` below never fires.
        done = False # TODO
        
        # Critic's class probabilities for the current canvas.
        probs = critic.predict(canvas.reshape(1,28,28,1))[0]
        
        # Reward is the critic's share of probability on class index 6; the
        # small constant guards against division by zero.
        # NOTE(review): which class index 6 denotes is not visible here — confirm.
        reward = probs[6]/(np.sum(probs)+0.0000000001)
        # border penalty: subtract 100 per painted pixel in the first/last row
        # and first/last column (corner pixels are counted twice). It applies
        # on every step while such pixels remain painted.
        reward -= 100*(np.sum(canvas[0])+np.sum(canvas[27])+np.sum(canvas[:,0])+np.sum(canvas[:,27]))
        
        reward_sum += reward
        reward_num += 1
        
        dqn_agent.remember(old_state, action, reward, new_state, done)

        dqn_agent.replay()       # internally iterates default (prediction) model
        
        # Blend the online weights into the target network every 20 steps.
        if step % 20 == 0:
            dqn_agent.target_train() # iterates target model

        brush_map = new_map
        
        if done:
            break
    
    print("completed trial", trial, "avg_r", reward_sum/reward_num, "epsilon", dqn_agent.epsilon)
    
    # One image is written per episode.
    save_canvas(canvas, trial)
    
    # Checkpoint the online model every 50 episodes (including episode 0).
    if trial % 50 == 0:
        print("saving model")
        dqn_agent.save_model(f"models/painter-gamma-penaltyborder-{trial}.h5")
    # NOTE(review): the string literal below is an unused expression statement
    # holding old code; it has no effect at runtime.
    """
    if step >= 199:
        print("Failed to complete in trial {}".format(trial))
        if step % 10 == 0:
            dqn_agent.save_model("trial-{}.model".format(trial))
    else:
        print("Completed in {} trials".format(trial))
        dqn_agent.save_model("success.model")
        break
    """


completed trial 0 avg_r 0.09820423930979921 epsilon 0.9990004973358593
saving model
completed trial 1 avg_r -475.3932498286306 epsilon 0.998001993677293
completed trial 2 avg_r -268.91810300480046 epsilon 0.9970044880257947
completed trial 3 avg_r -21.974558341598712 epsilon 0.9960079793838524
completed trial 4 avg_r -34.43021605802636 epsilon 0.9950124667549526
completed trial 5 avg_r -9.931813235329138 epsilon 0.9940179491435779
completed trial 6 avg_r -385.39932756028225 epsilon 0.9930244255552048
completed trial 7 avg_r -27.95311104186308 epsilon 0.9920318949963041
completed trial 8 avg_r -25.91006161174032 epsilon 0.9910403564743417
completed trial 9 avg_r 0.023161578407419295 epsilon 0.9900498089977742
completed trial 10 avg_r -316.89597126845223 epsilon 0.9890602515760483
completed trial 11 avg_r -129.94039473864214 epsilon 0.9880716832196017
completed trial 12 avg_r -517.405368185892 epsilon 0.9870841029398615
completed trial 13 avg_r -76.43469074292541 epsilon 0.98609750974924

completed trial 114 avg_r 0.07352205783437628 epsilon 0.8913658876375748
completed trial 115 avg_r -85.92323027675306 epsilon 0.8904749650581566
completed trial 116 avg_r -52.94326742389262 epsilon 0.8895849329582296
completed trial 117 avg_r -61.43815766955322 epsilon 0.888695790447758
completed trial 118 avg_r -331.93679249175466 epsilon 0.8878075366375948
completed trial 119 avg_r 0.07474461437959197 epsilon 0.8869201706394808
completed trial 120 avg_r 0.06867415982773312 epsilon 0.8860336915660468
completed trial 121 avg_r -249.90344083007304 epsilon 0.8851480985308078
completed trial 122 avg_r -230.92249054634033 epsilon 0.884263390648168
completed trial 123 avg_r -199.91478537450655 epsilon 0.8833795670334126
completed trial 124 avg_r -184.4354105860204 epsilon 0.8824966268027146
completed trial 125 avg_r -99.44510851701969 epsilon 0.881614569073129
completed trial 126 avg_r -909.4022520407151 epsilon 0.8807333929625955
completed trial 127 avg_r -288.8937848407106 epsilon 0.87985

completed trial 227 avg_r -134.45655242835838 epsilon 0.796123806042055
completed trial 228 avg_r -167.40510217433908 epsilon 0.7953280781769294
completed trial 229 avg_r 0.09470704520817766 epsilon 0.7945331456439253
completed trial 230 avg_r -260.93695493514304 epsilon 0.7937390076481067
completed trial 231 avg_r 0.09088374349915083 epsilon 0.7929456633953297
completed trial 232 avg_r 0.034466891467433466 epsilon 0.7921531120922469
completed trial 233 avg_r -22.453951622274968 epsilon 0.7913613529463024
completed trial 234 avg_r -23.454852967167795 epsilon 0.7905703851657335
completed trial 235 avg_r 0.07201488795755445 epsilon 0.7897802079595686
completed trial 236 avg_r -296.4239145351941 epsilon 0.788990820537627
completed trial 237 avg_r -149.42432796305198 epsilon 0.7882022221105165
completed trial 238 avg_r 0.06052222047569163 epsilon 0.7874144118896342
completed trial 239 avg_r 0.03540452362946272 epsilon 0.7866273890871681
completed trial 240 avg_r -191.45019095476323 epsilon

completed trial 340 avg_r -188.43459901231176 epsilon 0.711058302025345
completed trial 341 avg_r -171.89624235691957 epsilon 0.7103475973581113
completed trial 342 avg_r -162.4763672497098 epsilon 0.7096376030420852
completed trial 343 avg_r 0.06758963006240705 epsilon 0.7089283183672695
completed trial 344 avg_r -203.42390767124832 epsilon 0.7082197426243764
completed trial 345 avg_r -74.44048907163966 epsilon 0.7075118751048263
completed trial 346 avg_r 0.06224407170138032 epsilon 0.706804715100747
completed trial 347 avg_r 0.09034396829392677 epsilon 0.7060982619049757
completed trial 348 avg_r -150.3955113084407 epsilon 0.705392514811056
completed trial 349 avg_r -260.904626586222 epsilon 0.7046874731132381
completed trial 350 avg_r -159.92313311087216 epsilon 0.7039831361064753
saving model
completed trial 351 avg_r -428.904880174967 epsilon 0.703279503086426
completed trial 352 avg_r -406.3989688444888 epsilon 0.7025765733494553
completed trial 353 avg_r -352.90855727949105 epsi

completed trial 453 avg_r 0.08456140194775895 epsilon 0.6350820124231575
completed trial 454 avg_r -254.41221037175845 epsilon 0.6344472462597933
completed trial 455 avg_r -338.90381052419303 epsilon 0.6338131145468994
completed trial 456 avg_r -99.46126530664536 epsilon 0.6331796166503427
completed trial 457 avg_r 0.11694563448574437 epsilon 0.6325467519366216
completed trial 458 avg_r -310.41040901019403 epsilon 0.6319145197728676
completed trial 459 avg_r 0.06670023665933845 epsilon 0.6312829195268451
completed trial 460 avg_r 0.02644942296343917 epsilon 0.6306519505669513
completed trial 461 avg_r 0.08739192621384362 epsilon 0.6300216122622141
completed trial 462 avg_r -252.4345739297469 epsilon 0.6293919039822918
completed trial 463 avg_r -142.9445294172102 epsilon 0.6287628250974726
completed trial 464 avg_r 0.07727542845151512 epsilon 0.6281343749786745
completed trial 465 avg_r -185.45624048369652 epsilon 0.6275065529974451
completed trial 466 avg_r -296.9377358896716 epsilon 0

completed trial 566 avg_r -179.41915588201687 epsilon 0.5672237583818633
completed trial 567 avg_r -400.40348532655537 epsilon 0.5666568167241963
completed trial 568 avg_r -308.40293155574227 epsilon 0.5660904417262266
completed trial 569 avg_r -267.40963768005105 epsilon 0.565524632821577
completed trial 570 avg_r -221.90583863072544 epsilon 0.5649593894444342
completed trial 571 avg_r -247.40758709170572 epsilon 0.5643947110295534
completed trial 572 avg_r -269.4093295305361 epsilon 0.5638305970122518
completed trial 573 avg_r -32.9215580892123 epsilon 0.5632670468284141
completed trial 574 avg_r -170.4319427443819 epsilon 0.5627040599144865
completed trial 575 avg_r -20.423614550860012 epsilon 0.5621416357074787
completed trial 576 avg_r -197.91436694211419 epsilon 0.5615797736449646
completed trial 577 avg_r -65.41867407134956 epsilon 0.5610184731650788
completed trial 578 avg_r -268.4224441852549 epsilon 0.5604577337065176
completed trial 579 avg_r -379.9045409115868 epsilon 0.559

KeyboardInterrupt: 

<Figure size 432x288 with 0 Axes>

In [None]:
# Scratch check of np.stack: stacking two 2x2 arrays already yields shape
# (2, 2, 2), so the reshape leaves the result unchanged.
np.stack([np.array([[1,2],[3,4]]), np.array([[5,6],[7,8]])]).reshape(2,2,2)