In [1]:
import glob, re, sys, random, time
import os
from collections import deque

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow.keras import datasets, layers, models

# Print NumPy arrays in full instead of summarising them with "..."
# (presumably a debugging aid for inspecting whole arrays).
np.set_printoptions(threshold=sys.maxsize)

In [2]:
# Size of the agent's discrete action space. In the training loop, actions 0-3 move
# the brush one cell (column +1, column -1, row +1, row -1) and action 4 paints the
# cell under the brush.
NUM_ACTIONS = 5

In [6]:
from tensorflow.keras.optimizers import Adam, RMSprop

class DQN:
    """Deep Q-learning agent with an online network and a slowly-tracking target network.

    Transitions are stored in a bounded replay buffer (`remember`), sampled in
    mini-batches to fit the online network (`replay`), and the target network is
    blended towards the online network with factor `tau` (`target_train`).
    """

    def __init__(self):
        # Replay buffer of [state, action, reward, new_state, done] transitions;
        # the oldest entries are discarded once 50000 are stored.
        self.memory  = deque(maxlen=50000)

        self.gamma = 0.85              # discount applied to the bootstrapped future value
        self.epsilon = 1.0             # current probability of taking a random action
        self.epsilon_min = 0.1         # lower bound for epsilon
        self.epsilon_decay = 0.999995  # multiplicative decay applied on every act() call
        self.learning_rate = 0.005     # Adam step size
        self.tau = .125                # blend factor used by target_train()

        self.model        = self.create_model()   # online network (trained by replay)
        self.target_model = self.create_model()   # target network (provides TD targets)

    def create_model(self):
        """Build and compile the Q-network.

        The network takes a 28x28 input with 2 channels and outputs one linear
        value per action (NUM_ACTIONS units). It is compiled with mean squared
        error loss and the Adam optimizer.

        Returns:
            A compiled Keras Sequential model.
        """
        model = models.Sequential()
        model.add(layers.Conv2D(32, (3, 3), activation='relu', input_shape=(28, 28, 2)))
        model.add(layers.MaxPooling2D((2, 2)))
        model.add(layers.Conv2D(64, (3, 3), activation='relu'))
        model.add(layers.MaxPooling2D((2, 2)))
        model.add(layers.Conv2D(64, (3, 3), activation='relu'))
        model.add(layers.Dropout(0.4))
        model.add(layers.Flatten())
        model.add(layers.Dense(64, activation='relu'))
        model.add(layers.Dense(NUM_ACTIONS))
        # `learning_rate` is the supported keyword; the old `lr` alias is
        # deprecated in TF 2.x and rejected by newer Keras releases.
        model.compile(loss="mean_squared_error",
            optimizer=Adam(learning_rate=self.learning_rate))
        return model

    def act(self, state):
        """Choose an action epsilon-greedily and decay epsilon.

        Epsilon is decayed (and clamped to `epsilon_min`) before the action is
        drawn, so every call reduces future exploration.

        Args:
            state: batch containing a single state; only the first row of the
                network's prediction is used.

        Returns:
            The chosen action index as a Python int.
        """
        self.epsilon *= self.epsilon_decay
        self.epsilon = max(self.epsilon_min, self.epsilon)
        if np.random.random() < self.epsilon:
            return int(np.random.randint(0, NUM_ACTIONS))
        # verbose=0 keeps Keras from printing a progress bar for every step.
        return int(np.argmax(self.model.predict(state, verbose=0)[0]))

    def remember(self, state, action, reward, new_state, done):
        """Append one transition to the replay buffer."""
        self.memory.append([state, action, reward, new_state, done])

    def replay(self, batch_size=32):
        """Fit the online network on one random mini-batch from the replay buffer.

        Does nothing until at least `batch_size` transitions have been stored.
        For each sampled transition the target for the taken action is `reward`
        when `done` is truthy, otherwise `reward + gamma * max_a Q_target(new_state, a)`.

        Args:
            batch_size: number of transitions to sample (default 32).
        """
        if len(self.memory) < batch_size:
            return

        batch = random.sample(self.memory, batch_size)
        # Unzip the transitions column by column. Wrapping the whole batch in a
        # single np.array() would build a ragged array (arrays mixed with
        # scalars), which NumPy >= 1.24 rejects with a ValueError.
        states, actions, rewards, new_states, dones = zip(*batch)
        states = np.stack(states)
        new_states = np.stack(new_states)
        actions = np.asarray(actions, dtype=int)
        rewards = np.asarray(rewards, dtype=float)
        dones = np.asarray(dones, dtype=bool)

        # NOTE(review): the baseline targets come from the target network, so the
        # untaken actions are also regressed towards the target network's values.
        # Standard DQN would start from self.model's predictions -- confirm intent.
        targets = np.array(self.target_model.predict(states, verbose=0))
        q_future = np.max(self.target_model.predict(new_states, verbose=0), axis=1)

        # Terminal transitions get the raw reward; others bootstrap from q_future.
        td_targets = np.where(dones, rewards, rewards + self.gamma * q_future)
        targets[np.arange(batch_size), actions] = td_targets

        self.model.fit(states, targets, batch_size=batch_size, epochs=1, verbose=0)

    def target_train(self):
        """Soft-update the target network: target <- tau * online + (1 - tau) * target."""
        weights = self.model.get_weights()
        target_weights = self.target_model.get_weights()
        blended = [
            online * self.tau + target * (1 - self.tau)
            for online, target in zip(weights, target_weights)
        ]
        self.target_model.set_weights(blended)

    def save_model(self, fn):
        """Save the online network to the path `fn`."""
        self.model.save(fn)

    def load_model(self, fn):
        """Load the weights stored at `fn` into both the online and target networks."""
        self.model.load_weights(fn)
        self.target_model.load_weights(fn)

In [7]:
# Number of classes the critic classifier predicts (width of its softmax output layer).
NUM_CLASSES = 8

def create_critic_model():
    """Build and compile the critic: a CNN classifier over single-channel 28x28 images.

    The network ends in a softmax over NUM_CLASSES classes and is compiled with
    categorical cross-entropy loss, the 'adam' optimizer and an accuracy metric.

    Returns:
        A compiled Keras Sequential model.
    """
    critic_layers = [
        # Convolutional feature extractor.
        layers.Conv2D(32, (3, 3), activation='relu', input_shape=(28, 28, 1)),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(64, (3, 3), activation='relu'),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(64, (3, 3), activation='relu'),
        layers.Dropout(0.4),
        # Classification head.
        layers.Flatten(),
        layers.Dense(64, activation='relu'),
        layers.Dense(NUM_CLASSES, activation="softmax"),
    ]
    classifier = models.Sequential(critic_layers)
    classifier.compile(
        loss="categorical_crossentropy",
        optimizer='adam',
        metrics=['accuracy'],
    )
    return classifier

# Build the critic and restore its weights from disk (presumably a classifier trained
# beforehand -- the file must already exist). The training loop below uses the
# critic's class probabilities to compute the agent's reward.
critic = create_critic_model()
critic.load_weights("models/classifier.h5")

In [None]:
def save_canvas(canvas, trial, out_dir="output"):
    """Render `canvas` as a grayscale image and save it to ``<out_dir>/out<trial>.png``.

    Args:
        canvas: 2-D array to draw with a gray colormap.
        trial: value interpolated into the file name.
        out_dir: directory to write into; created if it does not exist yet.
    """
    # Without this, savefig raises FileNotFoundError on a fresh checkout.
    os.makedirs(out_dir, exist_ok=True)

    # Use an explicit figure and close it afterwards so the function neither
    # depends on nor leaves behind pyplot's global "current figure" state.
    fig, ax = plt.subplots()
    ax.imshow(canvas, cmap="gray")
    fig.savefig(f"{out_dir}/out{trial}.png")
    plt.close(fig)

# NOTE(review): `gamma`, `epsilon` and `steps` are never read in this cell -- the agent
# keeps its own discount and exploration settings. They are kept only so that any
# other cell referencing them keeps working.
gamma   = 0.9
epsilon = .95
steps = []

trials  = 100000   # number of episodes
trial_len = 200    # steps per episode

CANVAS_SIZE = 28         # side length of the canvas; matches the DQN and critic inputs
TARGET_CLASS = 6         # index of the critic class whose probability is rewarded
TARGET_SYNC_EVERY = 20   # soft-update the target network every this many steps
CHECKPOINT_EVERY = 50    # save the agent every this many trials

# (row, col) displacement of the brush for each movement action.
MOVES = {0: (0, 1), 1: (0, -1), 2: (1, 0), 3: (-1, 0)}
PAINT_ACTION = 4


def make_brush_map(pos):
    """Return a one-hot CANVAS_SIZE x CANVAS_SIZE map marking the brush position."""
    brush = np.zeros((CANVAS_SIZE, CANVAS_SIZE))
    brush[pos[0], pos[1]] = 1
    return brush


def encode_state(canvas, brush_map):
    """Stack canvas and brush map into a channels-last (H, W, 2) state.

    Stacking on the last axis puts the canvas in channel 0 and the brush map in
    channel 1. Stacking on axis 0 and then reshaping to (28, 28, 2) would only
    reinterpret the (2, 28, 28) buffer and scramble both planes across rows and
    channels. np.stack returns a new array, so the result is a snapshot.
    """
    return np.stack([canvas, brush_map], axis=-1)


# Make sure the directories written to below exist.
os.makedirs("output", exist_ok=True)
os.makedirs("models", exist_ok=True)

# NOTE(review): checkpoints trained with the previous (scrambled) state layout will
# presumably not behave sensibly with the corrected encoding; retrain from scratch.
dqn_agent = DQN()
for trial in range(trials):
    canvas = np.zeros((CANVAS_SIZE, CANVAS_SIZE))
    pos = np.random.randint(0, CANVAS_SIZE, size=(2))
    brush_map = make_brush_map(pos)

    reward_sum = 0
    reward_num = 0

    for step in range(trial_len):
        old_state = encode_state(canvas, brush_map)

        # act() expects a batch of one state.
        action = int(dqn_agent.act(old_state[np.newaxis]))

        if action in MOVES:
            # Unit step clamped to the canvas: a move off the edge is a no-op.
            pos = np.clip(pos + MOVES[action], 0, CANVAS_SIZE - 1)
        elif action == PAINT_ACTION:
            canvas[pos[0], pos[1]] = 1

        brush_map = make_brush_map(pos)
        new_state = encode_state(canvas, brush_map)

        done = False # TODO

        # verbose=0 keeps Keras from printing a progress bar on every step.
        probs = critic.predict(
            canvas.reshape(1, CANVAS_SIZE, CANVAS_SIZE, 1), verbose=0)[0]

        # Reward the target class and penalise the total probability mass.
        reward = 2*probs[TARGET_CLASS] - np.sum(probs)

        reward_sum += reward
        reward_num += 1

        dqn_agent.remember(old_state, action, reward, new_state, done)

        dqn_agent.replay()       # internally iterates default (prediction) model

        if step % TARGET_SYNC_EVERY == 0:
            dqn_agent.target_train() # iterates target model

        if done:
            break

    print("completed trial", trial, "avg_r", reward_sum/reward_num, "epsilon", dqn_agent.epsilon)

    save_canvas(canvas, trial)

    if trial % CHECKPOINT_EVERY == 0:
        print("saving model")
        dqn_agent.save_model(f"models/painter{trial}.h5")


completed trial 0 avg_r -0.9027960537234321 epsilon 0.9990004973358593
saving model
completed trial 1 avg_r -0.8477145900717005 epsilon 0.998001993677293
completed trial 2 avg_r -0.8618156395107508 epsilon 0.9970044880257947
completed trial 3 avg_r -0.9610463162884116 epsilon 0.9960079793838524
completed trial 4 avg_r -0.8021558324247599 epsilon 0.9950124667549526
completed trial 5 avg_r -0.9238369425293058 epsilon 0.9940179491435779
completed trial 6 avg_r -0.9019183308817447 epsilon 0.9930244255552048
completed trial 7 avg_r -0.8436796277761459 epsilon 0.9920318949963041
completed trial 8 avg_r -0.8361261558532714 epsilon 0.9910403564743417
completed trial 9 avg_r -0.7986924652755261 epsilon 0.9900498089977742
completed trial 10 avg_r -0.8934165979549289 epsilon 0.9890602515760483
completed trial 11 avg_r -0.8766650379635393 epsilon 0.9880716832196017
completed trial 12 avg_r -0.8825178686901927 epsilon 0.9870841029398615
completed trial 13 avg_r -0.7968598581850529 epsilon 0.9860975

completed trial 114 avg_r -0.7990866686403751 epsilon 0.8913658876375748
completed trial 115 avg_r -0.9254344735760242 epsilon 0.8904749650581566
completed trial 116 avg_r -0.8210363841801882 epsilon 0.8895849329582296
completed trial 117 avg_r -0.8342485278844833 epsilon 0.888695790447758
completed trial 118 avg_r -0.8252910735574551 epsilon 0.8878075366375948
completed trial 119 avg_r -0.8327068918198347 epsilon 0.8869201706394808
completed trial 120 avg_r -0.8624187102168799 epsilon 0.8860336915660468
completed trial 121 avg_r -0.9647678471519612 epsilon 0.8851480985308078
completed trial 122 avg_r -0.8120898485183716 epsilon 0.884263390648168
completed trial 123 avg_r -0.8337413034588098 epsilon 0.8833795670334126
completed trial 124 avg_r -0.8212813670560718 epsilon 0.8824966268027146
completed trial 125 avg_r -0.8738590276613831 epsilon 0.881614569073129
completed trial 126 avg_r -0.8398866156488657 epsilon 0.8807333929625955
completed trial 127 avg_r -0.9587335894023999 epsilon 

completed trial 227 avg_r -0.7962488493323326 epsilon 0.796123806042055
completed trial 228 avg_r -0.8823189412802458 epsilon 0.7953280781769294
completed trial 229 avg_r -0.8569884122535586 epsilon 0.7945331456439253
completed trial 230 avg_r -0.7784396842867136 epsilon 0.7937390076481067
completed trial 231 avg_r -0.8746319182962179 epsilon 0.7929456633953297
completed trial 232 avg_r -0.8404233567416668 epsilon 0.7921531120922469
completed trial 233 avg_r -0.8225015987828374 epsilon 0.7913613529463024
completed trial 234 avg_r -0.8118639573454857 epsilon 0.7905703851657335
completed trial 235 avg_r -0.8808928336575628 epsilon 0.7897802079595686
completed trial 236 avg_r -0.7957492839172482 epsilon 0.788990820537627
completed trial 237 avg_r -0.8132007642090321 epsilon 0.7882022221105165
completed trial 238 avg_r -0.8221230021864175 epsilon 0.7874144118896342
completed trial 239 avg_r -0.8007060302048922 epsilon 0.7866273890871681
completed trial 240 avg_r -0.812528526559472 epsilon 

completed trial 340 avg_r -0.8249715375155211 epsilon 0.711058302025345
completed trial 341 avg_r -0.9116147679835558 epsilon 0.7103475973581113
completed trial 342 avg_r -0.7778383857756853 epsilon 0.7096376030420852
completed trial 343 avg_r -0.8015173398703337 epsilon 0.7089283183672695
completed trial 344 avg_r -0.9099839654378593 epsilon 0.7082197426243764
completed trial 345 avg_r -0.8611230596154928 epsilon 0.7075118751048263
completed trial 346 avg_r -0.7801901840418577 epsilon 0.706804715100747
completed trial 347 avg_r -0.7999015881121159 epsilon 0.7060982619049757
completed trial 348 avg_r -0.8127717061340809 epsilon 0.705392514811056
completed trial 349 avg_r -0.8951626062020659 epsilon 0.7046874731132381
completed trial 350 avg_r -0.840722163617611 epsilon 0.7039831361064753
saving model
completed trial 351 avg_r -0.8074987002462148 epsilon 0.703279503086426
completed trial 352 avg_r -0.8276051090657711 epsilon 0.7025765733494553
completed trial 353 avg_r -0.94449598259641

completed trial 453 avg_r -0.8261370968818664 epsilon 0.6350820124231575
completed trial 454 avg_r -0.9055459365993738 epsilon 0.6344472462597933
completed trial 455 avg_r -0.7924535851180553 epsilon 0.6338131145468994
completed trial 456 avg_r -0.824764065593481 epsilon 0.6331796166503427
completed trial 457 avg_r -0.8118761670589447 epsilon 0.6325467519366216
completed trial 458 avg_r -0.8252489711344242 epsilon 0.6319145197728676
completed trial 459 avg_r -0.8081396619975567 epsilon 0.6312829195268451
completed trial 460 avg_r -0.8248186857998371 epsilon 0.6306519505669513
completed trial 461 avg_r -0.8281417345255613 epsilon 0.6300216122622141
completed trial 462 avg_r -0.7957540109753609 epsilon 0.6293919039822918
completed trial 463 avg_r -0.8842962764389813 epsilon 0.6287628250974726
completed trial 464 avg_r -0.8781409868970513 epsilon 0.6281343749786745
completed trial 465 avg_r -0.9429120155237615 epsilon 0.6275065529974451
completed trial 466 avg_r -0.935850010374561 epsilon

completed trial 566 avg_r -0.9052110955864191 epsilon 0.5672237583818633
completed trial 567 avg_r -0.8098040726780892 epsilon 0.5666568167241963
completed trial 568 avg_r -0.9394170817732811 epsilon 0.5660904417262266
completed trial 569 avg_r -0.7862993161380291 epsilon 0.565524632821577
completed trial 570 avg_r -0.9645644168276339 epsilon 0.5649593894444342
completed trial 571 avg_r -0.8284014116227627 epsilon 0.5643947110295534
completed trial 572 avg_r -0.7889040768146515 epsilon 0.5638305970122518
completed trial 573 avg_r -0.8102194055914879 epsilon 0.5632670468284141
completed trial 574 avg_r -0.84621904887259 epsilon 0.5627040599144865
completed trial 575 avg_r -0.847728883549571 epsilon 0.5621416357074787
completed trial 576 avg_r -0.8701749563962221 epsilon 0.5615797736449646
completed trial 577 avg_r -0.7994004116207362 epsilon 0.5610184731650788
completed trial 578 avg_r -0.8498932892084121 epsilon 0.5604577337065176
completed trial 579 avg_r -0.8285602385550738 epsilon 0

completed trial 679 avg_r -0.7852928557246923 epsilon 0.5066161311123312
completed trial 680 avg_r -0.8432387129217386 epsilon 0.5061097669395875
completed trial 681 avg_r -0.9095582399144768 epsilon 0.5056039088791832
completed trial 682 avg_r -0.9077259378600866 epsilon 0.5050985564252585
completed trial 683 avg_r -0.8091245286166668 epsilon 0.5045937090724573
completed trial 684 avg_r -0.8976077158562838 epsilon 0.5040893663159313
completed trial 685 avg_r -0.8730140199884773 epsilon 0.5035855276513335
completed trial 686 avg_r -0.9143975894525647 epsilon 0.5030821925748228
completed trial 687 avg_r -0.856278814598918 epsilon 0.5025793605830632
completed trial 688 avg_r -0.8631398269720375 epsilon 0.5020770311732181
completed trial 689 avg_r -0.8257529079169035 epsilon 0.5015752038429572
completed trial 690 avg_r -0.8172617198526859 epsilon 0.5010738780904497
completed trial 691 avg_r -0.7938585667312146 epsilon 0.5005730534143664
completed trial 692 avg_r -0.8713243712484836 epsilo

completed trial 791 avg_r -0.8510910976678133 epsilon 0.45293711595486447
completed trial 792 avg_r -0.8207019370049238 epsilon 0.45248440410077895
completed trial 793 avg_r -0.8215235093981028 epsilon 0.45203214473339803
completed trial 794 avg_r -0.8943670210242272 epsilon 0.45158033740045966
completed trial 795 avg_r -0.8197880981117487 epsilon 0.4511289816501541
completed trial 796 avg_r -0.8037310726940632 epsilon 0.4506780770311235
completed trial 797 avg_r -0.8502309546619654 epsilon 0.4502276230924613
completed trial 798 avg_r -0.8054966835677624 epsilon 0.44977761938371047
completed trial 799 avg_r -0.8791825048439205 epsilon 0.44932806545486526
completed trial 800 avg_r -0.7888080945611 epsilon 0.4488789608563697
saving model
completed trial 801 avg_r -0.8325737477838993 epsilon 0.4484303051391168
completed trial 802 avg_r -0.8113732967525721 epsilon 0.4479820978544485
completed trial 803 avg_r -0.8153962672501802 epsilon 0.44753433855415553
completed trial 804 avg_r -0.82435

completed trial 903 avg_r -0.7736506511271 epsilon 0.40494571414272934
completed trial 904 avg_r -0.7805449851602316 epsilon 0.4045409698226115
completed trial 905 avg_r -0.8307234411686659 epsilon 0.40413663004551986
completed trial 906 avg_r -0.8535752876102924 epsilon 0.40373269440711224
completed trial 907 avg_r -0.8559184237197042 epsilon 0.4033291625034516
completed trial 908 avg_r -0.7996880439668894 epsilon 0.40292603393100374
completed trial 909 avg_r -0.8145732216536998 epsilon 0.4025233082866374
completed trial 910 avg_r -0.8314543139562011 epsilon 0.40212098516762573
completed trial 911 avg_r -0.8994585975632071 epsilon 0.40171906417164355
completed trial 912 avg_r -0.8453267172724008 epsilon 0.40131754489676746
completed trial 913 avg_r -0.8115882954746485 epsilon 0.4009164269414764
completed trial 914 avg_r -0.9313194210501388 epsilon 0.4005157099046507
completed trial 915 avg_r -0.8135279500484467 epsilon 0.40011539338557073
completed trial 916 avg_r -0.8247747976332903 

completed trial 1015 avg_r -0.8777973837405443 epsilon 0.3620392889570683
completed trial 1016 avg_r -0.8589914649352431 epsilon 0.36167742972323186
completed trial 1017 avg_r -0.8142391815781593 epsilon 0.3613159321686637
completed trial 1018 avg_r -0.8769439540384337 epsilon 0.3609547959318643
completed trial 1019 avg_r -0.8268270426243544 epsilon 0.36059402065169566
completed trial 1020 avg_r -0.7935642448067665 epsilon 0.3602336059673809
completed trial 1021 avg_r -0.8519697546213866 epsilon 0.35987355151850325
completed trial 1022 avg_r -0.9250856358744204 epsilon 0.3595138569450067
completed trial 1023 avg_r -0.8036014053225518 epsilon 0.35915452188719466
completed trial 1024 avg_r -0.7631584326177836 epsilon 0.35879554598573005
completed trial 1025 avg_r -0.8072309396415949 epsilon 0.35843692888163536
completed trial 1026 avg_r -0.8168544864654541 epsilon 0.35807867021629153
completed trial 1027 avg_r -0.9074577796086669 epsilon 0.3577207696314383
completed trial 1028 avg_r -0.8

completed trial 1125 avg_r -0.875608319863677 epsilon 0.32432706417573737
completed trial 1126 avg_r -0.8209954830259085 epsilon 0.32400289841104063
completed trial 1127 avg_r -0.8175519379228353 epsilon 0.3236790566508893
completed trial 1128 avg_r -0.8142093414068222 epsilon 0.3233555385714402
completed trial 1129 avg_r -0.8266812014952302 epsilon 0.32303234384917356
completed trial 1130 avg_r -0.933673177738674 epsilon 0.32270947216089274
completed trial 1131 avg_r -0.8460339304804801 epsilon 0.32238692318372486
completed trial 1132 avg_r -0.9109560196101666 epsilon 0.32206469659511855
completed trial 1133 avg_r -0.8366777127981186 epsilon 0.32174279207284534
completed trial 1134 avg_r -0.8429930011928082 epsilon 0.32142120929500023
completed trial 1135 avg_r -0.8047251265496016 epsilon 0.32109994793999846
completed trial 1136 avg_r -0.8865390511229634 epsilon 0.3207790076865765
completed trial 1137 avg_r -0.8084526088833809 epsilon 0.32045838821379347
completed trial 1138 avg_r -0.

completed trial 1235 avg_r -0.8666580283641815 epsilon 0.2905431751892774
completed trial 1236 avg_r -0.8941528368368745 epsilon 0.29025277651162773
completed trial 1237 avg_r -0.8241025368124246 epsilon 0.2899626680882302
completed trial 1238 avg_r -0.8985532929375768 epsilon 0.2896728496289746
completed trial 1239 avg_r -0.861129501350224 epsilon 0.289383320844041
completed trial 1240 avg_r -0.8286802623420954 epsilon 0.2890940814438995
completed trial 1241 avg_r -0.8823689927905798 epsilon 0.288805131139309
completed trial 1242 avg_r -0.8596647316217423 epsilon 0.28851646964131755
completed trial 1243 avg_r -0.7930793242156505 epsilon 0.28822809666126237
completed trial 1244 avg_r -0.93721993137151 epsilon 0.2879400119107691
completed trial 1245 avg_r -0.8116655611246825 epsilon 0.28765221510175104
completed trial 1246 avg_r -0.810436106696725 epsilon 0.28736470594641067
completed trial 1247 avg_r -0.8132561534643173 epsilon 0.28707748415723666
completed trial 1248 avg_r -0.81830332

completed trial 1345 avg_r -0.8413860649615527 epsilon 0.26027842253499295
completed trial 1346 avg_r -0.8380769491195679 epsilon 0.26001827355825075
completed trial 1347 avg_r -0.8966262555122375 epsilon 0.2597583846011042
completed trial 1348 avg_r -0.8135399542748928 epsilon 0.25949875540366246
completed trial 1349 avg_r -0.7906758429855109 epsilon 0.2592393857062953
completed trial 1350 avg_r -0.8174421597272158 epsilon 0.25898027524963163
saving model
completed trial 1351 avg_r -0.8106962969154119 epsilon 0.25872142377455976
completed trial 1352 avg_r -0.832940181195736 epsilon 0.2584628310222266
completed trial 1353 avg_r -0.7766512548923492 epsilon 0.25820449673403856
completed trial 1354 avg_r -0.8128487020730972 epsilon 0.25794642065165957
completed trial 1355 avg_r -0.8175448168069124 epsilon 0.25768860251701253
completed trial 1356 avg_r -0.9199407255090774 epsilon 0.2574310420722781
completed trial 1357 avg_r -0.8178379444777966 epsilon 0.2571737390598944
completed trial 13

completed trial 1455 avg_r -0.8649923948198557 epsilon 0.2331662314668759
completed trial 1456 avg_r -0.9162009048648179 epsilon 0.23293318119733702
completed trial 1457 avg_r -0.8706193841621279 epsilon 0.23270036386216347
completed trial 1458 avg_r -0.803100111708045 epsilon 0.23246777922853654
completed trial 1459 avg_r -0.8086013079434633 epsilon 0.23223542706387068
completed trial 1460 avg_r -0.8011206018924714 epsilon 0.23200330713581238
completed trial 1461 avg_r -0.8200531107187271 epsilon 0.2317714192122407
completed trial 1462 avg_r -0.855314811244607 epsilon 0.23153976306126647
completed trial 1463 avg_r -0.8118946637958289 epsilon 0.23130833845123214
completed trial 1464 avg_r -0.8044047686457634 epsilon 0.2310771451507121
completed trial 1465 avg_r -0.8419155035167932 epsilon 0.2308461829285119
completed trial 1466 avg_r -0.9246239960938692 epsilon 0.2306154515536681
completed trial 1467 avg_r -0.802766981497407 epsilon 0.230384950795448
completed trial 1468 avg_r -0.81284

completed trial 1565 avg_r -0.8144605772197246 epsilon 0.20887821190462266
completed trial 1566 avg_r -0.8122491406649351 epsilon 0.2086694375753429
completed trial 1567 avg_r -0.8006942094862461 epsilon 0.2084608719165614
completed trial 1568 avg_r -0.8018099514394998 epsilon 0.20825251471971162
completed trial 1569 avg_r -0.804661040008068 epsilon 0.20804436577643523
completed trial 1570 avg_r -0.8043033120036125 epsilon 0.20783642487858206
completed trial 1571 avg_r -0.7691696102917195 epsilon 0.20762869181821034
completed trial 1572 avg_r -0.8283663410693407 epsilon 0.20742116638758587
completed trial 1573 avg_r -0.8781742002815008 epsilon 0.2072138483791821
completed trial 1574 avg_r -0.798300882279873 epsilon 0.20700673758568014
completed trial 1575 avg_r -0.891775746718049 epsilon 0.20679983379996808
completed trial 1576 avg_r -0.8314977081120014 epsilon 0.20659313681514122
completed trial 1577 avg_r -0.8600052324682474 epsilon 0.2063866464245013
completed trial 1578 avg_r -0.81

completed trial 1675 avg_r -0.9566511312639341 epsilon 0.18712018088549853
completed trial 1676 avg_r -0.8004742686450481 epsilon 0.1869331537661889
completed trial 1677 avg_r -0.8128487020730972 epsilon 0.1867463135809834
completed trial 1678 avg_r -0.8108028961718082 epsilon 0.18655966014304085
completed trial 1679 avg_r -0.8118227875232696 epsilon 0.18637319326570634
completed trial 1680 avg_r -0.7905615784227848 epsilon 0.18618691276251262
completed trial 1681 avg_r -0.8787711955234409 epsilon 0.18600081844717825
completed trial 1682 avg_r -0.8115438464283943 epsilon 0.18581491013360799
completed trial 1683 avg_r -0.8876508018001914 epsilon 0.1856291876358923
completed trial 1684 avg_r -0.8119224061071872 epsilon 0.1854436507683081
completed trial 1685 avg_r -0.8247712893784046 epsilon 0.18525829934531707
completed trial 1686 avg_r -0.7829426968097687 epsilon 0.18507313318156735
completed trial 1687 avg_r -0.8048403177410364 epsilon 0.18488815209189147
completed trial 1688 avg_r -0

completed trial 1785 avg_r -0.7885657978057862 epsilon 0.16762859934194563
completed trial 1786 avg_r -0.8128487020730972 epsilon 0.16746105411031711
completed trial 1787 avg_r -0.9154954160004855 epsilon 0.16729367634059386
completed trial 1788 avg_r -0.8125646743923426 epsilon 0.16712646586539742
completed trial 1789 avg_r -0.7966593277454376 epsilon 0.16695942251751653
completed trial 1790 avg_r -0.8016037978976965 epsilon 0.16679254612990702
completed trial 1791 avg_r -0.8226737085729837 epsilon 0.16662583653569132
completed trial 1792 avg_r -0.8064687198400498 epsilon 0.1664592935681592
completed trial 1793 avg_r -0.8022586941719055 epsilon 0.16629291706076677
completed trial 1794 avg_r -0.7977822268009186 epsilon 0.16612670684713654
completed trial 1795 avg_r -0.7929181914031506 epsilon 0.16596066276105773
completed trial 1796 avg_r -0.8389245373010635 epsilon 0.16579478463648556
completed trial 1797 avg_r -0.8089036480337382 epsilon 0.16562907230754076
completed trial 1798 avg_r

completed trial 1895 avg_r -0.8128487020730972 epsilon 0.15016738004617794
completed trial 1896 avg_r -0.812368751168251 epsilon 0.15001728734975459
completed trial 1897 avg_r -0.7891671687364579 epsilon 0.1498673446713813
completed trial 1898 avg_r -0.8340830660611391 epsilon 0.14971755186111443
completed trial 1899 avg_r -0.8032913976162672 epsilon 0.14956790876916062
completed trial 1900 avg_r -0.822819535061717 epsilon 0.14941841524587615
saving model
completed trial 1901 avg_r -0.813567244708538 epsilon 0.14926907114176616
completed trial 1902 avg_r -0.7852917612344027 epsilon 0.14911987630748616
completed trial 1903 avg_r -0.8128419410437345 epsilon 0.1489708305938406
completed trial 1904 avg_r -0.7945478864014148 epsilon 0.14882193385178275
completed trial 1905 avg_r -0.8315313860774041 epsilon 0.1486731859324153
completed trial 1906 avg_r -0.8101609194278717 epsilon 0.14852458668698962
completed trial 1907 avg_r -0.8407541986554861 epsilon 0.1483761359669055
completed trial 190

In [None]:
# Scratch check of the stack-then-reshape pattern: np.stack on two (2, 2) arrays
# already yields shape (2, 2, 2), so this reshape is a no-op and the result is
# [[[1, 2], [3, 4]], [[5, 6], [7, 8]]]. The last axis holds neighbouring elements
# of the same input array, not one element from each array.
np.stack([np.array([[1,2],[3,4]]), np.array([[5,6],[7,8]])]).reshape(2,2,2)