# Simple q-learning agent with experience replay

We rewrite the Q-learning algorithm using _agentnet_, a helper library for lasagne that implements some RL techniques.

In [1]:
# Install (or upgrade to) AgentNet straight from the master-branch archive on GitHub.
# NOTE(review): the archive is unpinned, so the installed code changes as master moves -- consider pinning a tag/commit.
! pip install --upgrade https://github.com/yandexdataschool/AgentNet/archive/master.zip

Collecting https://github.com/yandexdataschool/AgentNet/archive/master.zip
  Downloading https://github.com/yandexdataschool/AgentNet/archive/master.zip
[K     - 11.7MB 73.4MB/s
Requirement already up-to-date: six in ./anaconda3/envs/py34/lib/python3.4/site-packages (from agentnet==0.10.6)
Requirement already up-to-date: lasagne in ./AgentNet/src/lasagne (from agentnet==0.10.6)
Requirement already up-to-date: theano>=0.8.2 in ./anaconda3/envs/py34/lib/python3.4/site-packages (from agentnet==0.10.6)
Requirement already up-to-date: numpy>=1.9 in ./anaconda3/envs/py34/lib/python3.4/site-packages (from agentnet==0.10.6)
Requirement already up-to-date: scipy>=0.14 in ./anaconda3/envs/py34/lib/python3.4/site-packages (from theano>=0.8.2->agentnet==0.10.6)
Installing collected packages: agentnet
  Found existing installation: agentnet 0.10.6
    Uninstalling agentnet-0.10.6:
      Successfully uninstalled agentnet-0.10.6
  Running setup.py install for agentnet ... [?25l- \ | done
[?2

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
%env THEANO_FLAGS='floatX=float32'

#XVFB will be launched if you run on a server
import os
if type(os.environ.get("DISPLAY")) is not str or len(os.environ.get("DISPLAY"))==0:
    !bash ../xvfb start
    %env DISPLAY=:1

env: THEANO_FLAGS='floatX=float32'
bash: ../xvfb: No such file or directory
env: DISPLAY=:1


### Experiment setup
* Here we simply load the game and check that it works

In [3]:
import gym


def make_env():
    """Create a fresh LunarLander-v2 environment (also used as the factory for the session pool)."""
    return gym.make("LunarLander-v2")


# One instance built right away so we can check the game loads and read its space sizes.
env = make_env()
env.reset()

# Shape of a single observation and the number of discrete actions.
state_shape = env.observation_space.shape
n_actions = env.action_space.n

In [4]:
# Show the shape of a single observation, as read from the environment's observation space.
print(state_shape)

(8,)


In [5]:
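# Optional visual check, disabled by default: uncomment to render one frame of the game
# (rendering needs a working display) and then drop the probe environment.
# plt.imshow(env.render("rgb_array"))
# del env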

In [None]:
# NOTE: leftover scratch cell, disabled on purpose. It was never executed (`In [None]`):
# `lasagne` is only imported in the following cell and `input_X` is not defined in any
# cell above, so running it on a fresh kernel raises NameError and stops "Run All".
# The network actually used by this notebook is built in the "Neural Network body" section.
# net = lasagne.layers.InputLayer(shape=(None, 3, 32, 32), input_var=input_X)

# Neural Network body

In [6]:
import lasagne
from lasagne.layers import *
from lasagne.nonlinearities import *
# from lasagne.nonlinearities import relu


# Observation at the current tick goes here: batch-first, i.e. (batch,) + state_shape
observation_layer = InputLayer((None,) + state_shape)

# Two fully-connected hidden layers of equal width with ELU activations
N_HIDDEN_UNITS = 200
hidden_1 = DenseLayer(observation_layer, N_HIDDEN_UNITS, nonlinearity=elu)
net = DenseLayer(hidden_1, N_HIDDEN_UNITS, nonlinearity=elu)

# Linear output layer (no nonlinearity) that predicts one Q-value per action
qvalues_layer = DenseLayer(
    net,
    num_units=n_actions,
    nonlinearity=None,
    name="q-values",
)

Picking actions is done by yet another layer that implements an $\epsilon$-greedy policy.

In [7]:
from agentnet.resolver import EpsilonGreedyResolver

In [8]:
# Epsilon-greedy action picker stacked on top of the Q-value layer
action_layer = EpsilonGreedyResolver(qvalues_layer)

# Starting epsilon, stored as float32
starting_epsilon = np.float32(0.05)
action_layer.epsilon.set_value(starting_epsilon)

### Agent

We define an agent entirely composed of a lasagne network:
* Observations as InputLayer(s)
* Actions as intermediate Layer(s)
* `policy_estimators` is "whatever else you want to keep track of"

Each parameter can be either one layer or a list of layers

In [9]:
from agentnet.agent import Agent

# Wire the three parts of the lasagne network into one agent:
#   observations -> observation_layer, actions -> action_layer,
#   and the Q-values are tracked as the policy estimator.
agent = Agent(
    observation_layers=observation_layer,
    action_layers=action_layer,
    policy_estimators=qvalues_layer,
)

In [10]:
# Since it's a single lasagne network, one can get its weights, output, etc.
# Here: every trainable parameter reachable from the action layer (displayed below).
weights = lasagne.layers.get_all_params(action_layer,trainable=True)
weights

[W, b, W, b, q-values.W, q-values.b]

# Create and manage a pool of game sessions to play with

* To make training more stable, we shall have an entire batch of game sessions, each happening independently of the others
* Why several parallel agents help training: http://arxiv.org/pdf/1602.01783v1.pdf
* Alternative approach: store more sessions: https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf

In [11]:
from agentnet.experiments.openai_gym.pool import EnvPool

# Session pool driven by our agent: environments come from make_env,
# one game is played at a time, and the pool's max_size is 10000.
pool = EnvPool(
    agent,
    make_env,
    n_games=1,
    max_size=10000,
)

In [12]:
%%time
#interact for 7 ticks
obs_log,action_log,reward_log,_,_,_  = pool.interact(5)


print('actions:',action_log)
print('rewards:',reward_log)

actions: [[1 1 3 1 0]]
rewards: [[-1.81280494 -2.10888041 -1.92483021 -2.06786121  0.        ]]
CPU times: user 28 ms, sys: 4 ms, total: 32 ms
Wall time: 23.7 ms


In [13]:
# We'll train on rollouts of 10 steps (required by n-step algorithms and rnns later)
SEQ_LENGTH = 10

# Number of rollouts recorded before training starts
N_WARMUP_SESSIONS = 100

# Load the first sessions (pool.update calls interact and stores the sessions in the pool).
# The first call uses the default append=False, so it replaces whatever the replay held before.
pool.update(SEQ_LENGTH)

# Every following call must append. Per the AgentNet EnvPool.update documentation,
# append=False throws the previously stored sessions away, so calling it in a loop
# would leave only the very last rollout in the pool.
for _ in range(N_WARMUP_SESSIONS - 1):
    pool.update(SEQ_LENGTH, append=True)

# q-learning

We shall now define a function that replays recent game sessions and updates network weights

In [14]:
# A batch of 100 sessions sampled from the pool's experience replay
REPLAY_BATCH_SIZE = 100
replay = pool.experience_replay.sample_session_batch(REPLAY_BATCH_SIZE)

# Run the agent over the sampled sessions in experience-replay mode;
# the last item it returns is what we use as the agent's Q-values.
session_outputs = agent.get_sessions(
    replay,
    session_length=SEQ_LENGTH,
    experience_replay=True,
)
qvalues_seq = session_outputs[-1]



In [15]:
from agentnet.learning import qlearning

# Q-learning loss = (Q(s,a) - (r + gamma*Q(s',a_max)))^2, like you implemented before in lasagne.
GAMMA = 0.99
elwise_mse_loss = qlearning.get_elementwise_objective(
    qvalues_seq,
    replay.actions[0],
    replay.rewards,
    replay.is_alive,
    gamma_or_gammas=GAMMA,
    n_steps=1,
)

# Mean loss over "alive" fragments: total elementwise loss divided by the summed is_alive mask
n_alive = replay.is_alive.sum()
loss = elwise_mse_loss.sum() / n_alive

In [16]:
#get weight updates
updates = lasagne.updates.adam(loss,weights,learning_rate=1e-4)

#compile train function
import theano
train_step = theano.function([],loss,updates=updates)

# Training loop

In [17]:
# State shared with the training-loop cell below.
epoch_counter = 1 # starting epoch; incremented by the training loop and used for the epsilon schedule
rewards = {} # epoch -> mean full-game reward measured at the periodic evaluations
target_score = 0 # the training loop stops once an evaluation's mean reward reaches this value

In [18]:
from tqdm import trange

N_ITERATIONS = 10500      # upper bound on iterations for one run of this cell
SESSIONS_PER_ITER = 5     # new rollouts appended to the pool before every training step
EVAL_EVERY = 50           # evaluate whenever epoch_counter is a multiple of this
N_EVAL_GAMES = 3          # games played per evaluation

# epoch_counter is initialised in the previous cell and advanced by hand here instead of
# using the loop index -- presumably so that re-running this cell continues the schedule.
for _ in trange(N_ITERATIONS):

    # play: record fresh rollouts and append them to the experience replay
    for _ in range(SESSIONS_PER_ITER):
        pool.update(SEQ_LENGTH, append=True)

    # train: one call applies the compiled updates once
    train_step()

    # update epsilon: exponential decay from ~1.0 towards a floor of 0.05
    epsilon = 0.05 + 0.95 * np.exp(-epoch_counter / 1000.)
    action_layer.epsilon.set_value(np.float32(epsilon))

    # play a few games for evaluation
    if epoch_counter % EVAL_EVERY == 0:
        mean_score = np.mean(pool.evaluate(n_games=N_EVAL_GAMES, record_video=False))
        rewards[epoch_counter] = mean_score
        print("iter=%i\tepsilon=%.3f"%(epoch_counter,action_layer.epsilon.get_value(),))
        print("Current score(mean over %i) = %.3f"%(N_EVAL_GAMES, mean_score))

        # stop early once the evaluation score reaches the target
        if mean_score >= target_score:
            print("You win!")
            break

    epoch_counter += 1


# Time to drink some coffee!

  0%|          | 51/10500 [00:03<11:20, 15.34it/s]

Episode finished after 102 timesteps with reward=-130.88768128867397
Episode finished after 67 timesteps with reward=-116.56477484809187
Episode finished after 71 timesteps with reward=-213.61946112192328
iter=50	epsilon=0.954
Current score(mean over 3) = -153.691


  1%|          | 99/10500 [00:06<11:32, 15.02it/s]

Episode finished after 104 timesteps with reward=-137.68896482463805
Episode finished after 90 timesteps with reward=-183.00277077741316
Episode finished after 95 timesteps with reward=-153.6674467105105
iter=100	epsilon=0.910
Current score(mean over 3) = -158.120


  1%|▏         | 149/10500 [00:10<11:53, 14.50it/s]

Episode finished after 95 timesteps with reward=-114.20588988174838
Episode finished after 71 timesteps with reward=-90.41802863748242
Episode finished after 155 timesteps with reward=-165.10304188059453
iter=150	epsilon=0.868
Current score(mean over 3) = -123.242


  2%|▏         | 199/10500 [00:13<12:04, 14.22it/s]

Episode finished after 114 timesteps with reward=-269.37410033680396
Episode finished after 135 timesteps with reward=-381.17985578482694
Episode finished after 111 timesteps with reward=-443.375005226774
iter=200	epsilon=0.828
Current score(mean over 3) = -364.643


  2%|▏         | 251/10500 [00:18<12:17, 13.90it/s]

Episode finished after 109 timesteps with reward=-276.6580313958543
Episode finished after 86 timesteps with reward=-101.61188317915395
Episode finished after 88 timesteps with reward=-148.88936648249114
iter=250	epsilon=0.790
Current score(mean over 3) = -175.720


  3%|▎         | 299/10500 [00:21<12:10, 13.97it/s]

Episode finished after 108 timesteps with reward=-449.91712988712266
Episode finished after 137 timesteps with reward=-159.22290695690546
Episode finished after 68 timesteps with reward=-338.7122813185746
iter=300	epsilon=0.754
Current score(mean over 3) = -315.951


  3%|▎         | 349/10500 [00:25<12:12, 13.87it/s]

Episode finished after 101 timesteps with reward=-608.6480126051329
Episode finished after 111 timesteps with reward=-127.00555173939404
Episode finished after 138 timesteps with reward=-328.6144751731985
iter=350	epsilon=0.719
Current score(mean over 3) = -354.756


  4%|▍         | 399/10500 [00:28<12:13, 13.78it/s]

Episode finished after 101 timesteps with reward=-645.523081112053
Episode finished after 99 timesteps with reward=-527.6510064734589
Episode finished after 117 timesteps with reward=-524.444664308598
iter=400	epsilon=0.687
Current score(mean over 3) = -565.873


  4%|▍         | 449/10500 [00:32<12:15, 13.67it/s]

Episode finished after 84 timesteps with reward=-186.07252856796475
Episode finished after 81 timesteps with reward=-269.656439530188
Episode finished after 113 timesteps with reward=-161.34731834103417
iter=450	epsilon=0.656
Current score(mean over 3) = -205.692


  5%|▍         | 499/10500 [00:36<12:15, 13.59it/s]

Episode finished after 108 timesteps with reward=-221.18461981132077
Episode finished after 97 timesteps with reward=-291.4748208215174
Episode finished after 91 timesteps with reward=-326.86549655463375
iter=500	epsilon=0.626
Current score(mean over 3) = -279.842


  5%|▌         | 549/10500 [00:40<12:18, 13.47it/s]

Episode finished after 115 timesteps with reward=-386.2570126080831
Episode finished after 127 timesteps with reward=-297.50012005392415


  5%|▌         | 551/10500 [00:41<12:24, 13.37it/s]

Episode finished after 150 timesteps with reward=-212.84650422747748
iter=550	epsilon=0.598
Current score(mean over 3) = -298.868


  6%|▌         | 599/10500 [00:45<12:24, 13.29it/s]

Episode finished after 99 timesteps with reward=-255.50183097432352
Episode finished after 93 timesteps with reward=-215.31262414609068
Episode finished after 85 timesteps with reward=-287.7910123644183
iter=600	epsilon=0.571
Current score(mean over 3) = -252.868


  6%|▌         | 649/10500 [00:49<12:29, 13.14it/s]

Episode finished after 84 timesteps with reward=-334.2680047138192
Episode finished after 106 timesteps with reward=-587.3772631308314
Episode finished after 111 timesteps with reward=-403.8352679398143
iter=650	epsilon=0.546
Current score(mean over 3) = -441.827


  7%|▋         | 699/10500 [00:53<12:29, 13.07it/s]

Episode finished after 126 timesteps with reward=-678.9667645941489
Episode finished after 90 timesteps with reward=-234.48861655020661
Episode finished after 93 timesteps with reward=-266.99125616718675
iter=700	epsilon=0.522
Current score(mean over 3) = -393.482


  7%|▋         | 749/10500 [00:57<12:33, 12.94it/s]

Episode finished after 120 timesteps with reward=-239.88172734200168
Episode finished after 97 timesteps with reward=-328.76401959096745
Episode finished after 111 timesteps with reward=-489.68651016881245
iter=750	epsilon=0.499
Current score(mean over 3) = -352.777


  8%|▊         | 799/10500 [01:02<12:38, 12.78it/s]

Episode finished after 206 timesteps with reward=-477.7697455177769
Episode finished after 232 timesteps with reward=-340.68288568666327
Episode finished after 147 timesteps with reward=-322.16033178049327
iter=800	epsilon=0.477
Current score(mean over 3) = -380.204


  8%|▊         | 849/10500 [01:07<12:44, 12.63it/s]

Episode finished after 293 timesteps with reward=-533.1118296206812
Episode finished after 197 timesteps with reward=-315.1486964746071


  8%|▊         | 851/10500 [01:08<12:52, 12.50it/s]

Episode finished after 179 timesteps with reward=-485.9334193559152
iter=850	epsilon=0.456
Current score(mean over 3) = -444.731


  9%|▊         | 899/10500 [01:12<12:50, 12.46it/s]

Episode finished after 168 timesteps with reward=-536.1583719664573
Episode finished after 139 timesteps with reward=-365.06422236013043
Episode finished after 95 timesteps with reward=-342.6036061509101
iter=900	epsilon=0.436
Current score(mean over 3) = -414.609


  9%|▉         | 949/10500 [01:16<12:52, 12.37it/s]

Episode finished after 89 timesteps with reward=-272.80458868916867
Episode finished after 221 timesteps with reward=-405.1287970430761


  9%|▉         | 951/10500 [01:17<12:57, 12.28it/s]

Episode finished after 227 timesteps with reward=-369.0666134471166
iter=950	epsilon=0.417
Current score(mean over 3) = -349.000


 10%|▉         | 999/10500 [01:21<12:57, 12.22it/s]

Episode finished after 275 timesteps with reward=-447.19992792504263
Episode finished after 219 timesteps with reward=-623.5317448863908
Episode finished after 145 timesteps with reward=-419.4153833594113
iter=1000	epsilon=0.399
Current score(mean over 3) = -496.716


 10%|▉         | 1049/10500 [01:26<13:02, 12.08it/s]

Episode finished after 260 timesteps with reward=-683.6136332145094
Episode finished after 100 timesteps with reward=-658.6536140815915
Episode finished after 121 timesteps with reward=-342.4824484617713
iter=1050	epsilon=0.382
Current score(mean over 3) = -561.583


 10%|█         | 1099/10500 [01:31<13:03, 11.99it/s]

Episode finished after 231 timesteps with reward=-473.05394936033633
Episode finished after 173 timesteps with reward=-409.7419079935207


 10%|█         | 1101/10500 [01:32<13:09, 11.90it/s]

Episode finished after 224 timesteps with reward=-305.1599352300258
iter=1100	epsilon=0.366
Current score(mean over 3) = -395.985


 11%|█         | 1149/10500 [01:36<13:07, 11.87it/s]

Episode finished after 105 timesteps with reward=-383.12003351334516
Episode finished after 91 timesteps with reward=-463.591829384786


 11%|█         | 1151/10500 [01:37<13:10, 11.83it/s]

Episode finished after 132 timesteps with reward=-166.34168614179674
iter=1150	epsilon=0.351
Current score(mean over 3) = -337.685


 11%|█▏        | 1199/10500 [01:41<13:10, 11.77it/s]

Episode finished after 205 timesteps with reward=-71.58695406077909
Episode finished after 189 timesteps with reward=-355.57082550071016


 11%|█▏        | 1201/10500 [01:42<13:14, 11.71it/s]

Episode finished after 163 timesteps with reward=-558.9762539077758
iter=1200	epsilon=0.336
Current score(mean over 3) = -328.711


 12%|█▏        | 1248/10500 [01:47<13:13, 11.65it/s]

Episode finished after 398 timesteps with reward=-862.7053222008567
Episode finished after 166 timesteps with reward=-356.6397010064491


 12%|█▏        | 1251/10500 [01:48<13:21, 11.55it/s]

Episode finished after 278 timesteps with reward=-719.9280568763156
iter=1250	epsilon=0.322
Current score(mean over 3) = -646.424


 12%|█▏        | 1299/10500 [01:52<13:19, 11.51it/s]

Episode finished after 259 timesteps with reward=-709.632967381235
Episode finished after 310 timesteps with reward=-829.2331869323477
Episode finished after 213 timesteps with reward=-305.80641023645666
iter=1300	epsilon=0.309
Current score(mean over 3) = -614.891


 13%|█▎        | 1349/10500 [01:58<13:22, 11.41it/s]

Episode finished after 151 timesteps with reward=-268.9833130241776
Episode finished after 83 timesteps with reward=-533.4650202080487
Episode finished after 86 timesteps with reward=-347.0390088884277
iter=1350	epsilon=0.296
Current score(mean over 3) = -383.162


 13%|█▎        | 1399/10500 [02:02<13:19, 11.38it/s]

Episode finished after 178 timesteps with reward=-259.63649486613315
Episode finished after 202 timesteps with reward=-346.33343941390785


 13%|█▎        | 1401/10500 [02:03<13:22, 11.34it/s]

Episode finished after 136 timesteps with reward=-175.13945942508929
iter=1400	epsilon=0.284
Current score(mean over 3) = -260.370


 14%|█▍        | 1448/10500 [02:07<13:19, 11.32it/s]

Episode finished after 97 timesteps with reward=-268.5544710001011


 14%|█▍        | 1450/10500 [02:09<13:25, 11.24it/s]

Episode finished after 446 timesteps with reward=-717.9591537127667
Episode finished after 168 timesteps with reward=-253.9655413636565
iter=1450	epsilon=0.273
Current score(mean over 3) = -413.493


 14%|█▍        | 1499/10500 [02:13<13:21, 11.23it/s]

Episode finished after 220 timesteps with reward=-403.87891967625677
Episode finished after 384 timesteps with reward=-603.3233467359768
Episode finished after 142 timesteps with reward=-347.9840386537762
iter=1500	epsilon=0.262
Current score(mean over 3) = -451.729


 15%|█▍        | 1548/10500 [02:19<13:24, 11.12it/s]

Episode finished after 289 timesteps with reward=-416.04799121890466


 15%|█▍        | 1550/10500 [02:20<13:29, 11.06it/s]

Episode finished after 285 timesteps with reward=-653.4893350787759
Episode finished after 148 timesteps with reward=-275.35738316491035
iter=1550	epsilon=0.252
Current score(mean over 3) = -448.298


 15%|█▌        | 1598/10500 [02:24<13:25, 11.06it/s]

Episode finished after 224 timesteps with reward=-468.21889336068483
Episode finished after 105 timesteps with reward=-317.87706106169077


 15%|█▌        | 1602/10500 [02:25<13:29, 11.00it/s]

Episode finished after 353 timesteps with reward=-664.8312636253622
iter=1600	epsilon=0.242
Current score(mean over 3) = -483.642


 16%|█▌        | 1649/10500 [02:30<13:25, 10.98it/s]

Episode finished after 274 timesteps with reward=-534.2374800418806
Episode finished after 117 timesteps with reward=-221.9352691371992


 16%|█▌        | 1651/10500 [02:30<13:28, 10.94it/s]

Episode finished after 143 timesteps with reward=-594.1330741816514
iter=1650	epsilon=0.232
Current score(mean over 3) = -450.102


 16%|█▌        | 1699/10500 [02:35<13:25, 10.93it/s]

Episode finished after 239 timesteps with reward=-477.2439698205049
Episode finished after 308 timesteps with reward=-310.0748966115334
Episode finished after 213 timesteps with reward=-232.17024756571567
iter=1700	epsilon=0.224
Current score(mean over 3) = -339.830


 17%|█▋        | 1749/10500 [02:41<13:26, 10.85it/s]

Episode finished after 192 timesteps with reward=-440.0020336603283
Episode finished after 320 timesteps with reward=-353.5485374657319


 17%|█▋        | 1751/10500 [02:42<13:33, 10.75it/s]

Episode finished after 529 timesteps with reward=-566.2430124358157
iter=1750	epsilon=0.215
Current score(mean over 3) = -453.265


 17%|█▋        | 1799/10500 [02:48<13:32, 10.71it/s]

Episode finished after 345 timesteps with reward=-287.6494544429742


 17%|█▋        | 1801/10500 [02:49<13:37, 10.64it/s]

Episode finished after 316 timesteps with reward=-220.31792834680405
Episode finished after 68 timesteps with reward=-415.8008965119201
iter=1800	epsilon=0.207
Current score(mean over 3) = -307.923


 18%|█▊        | 1849/10500 [02:54<13:35, 10.60it/s]

Episode finished after 343 timesteps with reward=-637.0089858476547
Episode finished after 271 timesteps with reward=-556.5330440804485
Episode finished after 204 timesteps with reward=-537.3174006517855
iter=1850	epsilon=0.199
Current score(mean over 3) = -576.953


 18%|█▊        | 1899/10500 [03:00<13:36, 10.53it/s]

Episode finished after 201 timesteps with reward=-297.01018695337723
Episode finished after 265 timesteps with reward=-313.0712312821303


 18%|█▊        | 1901/10500 [03:01<13:40, 10.48it/s]

Episode finished after 290 timesteps with reward=-173.45445371281025
iter=1900	epsilon=0.192
Current score(mean over 3) = -261.179


 19%|█▊        | 1949/10500 [03:06<13:39, 10.44it/s]

Episode finished after 204 timesteps with reward=-423.6022348105554
Episode finished after 202 timesteps with reward=-239.56751004932244


 19%|█▊        | 1952/10500 [03:07<13:42, 10.39it/s]

Episode finished after 338 timesteps with reward=-476.77012150292023
iter=1950	epsilon=0.185
Current score(mean over 3) = -379.980


 19%|█▉        | 1999/10500 [03:13<13:41, 10.34it/s]

Episode finished after 231 timesteps with reward=-270.817255380206
Episode finished after 358 timesteps with reward=-211.4003651921288


 19%|█▉        | 2002/10500 [03:14<13:47, 10.27it/s]

Episode finished after 431 timesteps with reward=-235.52118210544307
iter=2000	epsilon=0.179
Current score(mean over 3) = -239.246


 20%|█▉        | 2049/10500 [03:19<13:44, 10.25it/s]

Episode finished after 197 timesteps with reward=-599.3134259279743
Episode finished after 81 timesteps with reward=-253.19744136438743
Episode finished after 80 timesteps with reward=-464.1940547546929
iter=2050	epsilon=0.172
Current score(mean over 3) = -438.902


 20%|█▉        | 2099/10500 [03:25<13:42, 10.21it/s]

Episode finished after 449 timesteps with reward=-325.04110499845353


 20%|██        | 2100/10500 [03:27<13:48, 10.14it/s]

Episode finished after 393 timesteps with reward=-186.53545297962108
Episode finished after 94 timesteps with reward=-317.51462845955615
iter=2100	epsilon=0.166
Current score(mean over 3) = -276.364


 20%|██        | 2149/10500 [03:33<13:48, 10.08it/s]

Episode finished after 164 timesteps with reward=-341.63222996988645


 20%|██        | 2150/10500 [03:34<13:54, 10.01it/s]

Episode finished after 730 timesteps with reward=-462.19761566925337
Episode finished after 199 timesteps with reward=-283.0836939418042
iter=2150	epsilon=0.161
Current score(mean over 3) = -362.305


 21%|██        | 2198/10500 [03:40<13:51,  9.98it/s]

Episode finished after 235 timesteps with reward=-297.2672422912757


 21%|██        | 2201/10500 [03:41<13:54,  9.94it/s]

Episode finished after 355 timesteps with reward=-782.4455954638083
Episode finished after 94 timesteps with reward=-393.9047481239197
iter=2200	epsilon=0.155
Current score(mean over 3) = -491.206


 21%|██▏       | 2249/10500 [03:46<13:52,  9.92it/s]

Episode finished after 424 timesteps with reward=-439.7629438924776


 21%|██▏       | 2250/10500 [03:47<13:55,  9.87it/s]

Episode finished after 257 timesteps with reward=-323.178451277078
Episode finished after 104 timesteps with reward=-483.6325735558219
iter=2250	epsilon=0.150
Current score(mean over 3) = -415.525


 22%|██▏       | 2298/10500 [03:53<13:53,  9.84it/s]

Episode finished after 216 timesteps with reward=-422.86616875361824
Episode finished after 288 timesteps with reward=-513.4219955172928


 22%|██▏       | 2302/10500 [03:54<13:56,  9.80it/s]

Episode finished after 280 timesteps with reward=-475.14719771467554
iter=2300	epsilon=0.145
Current score(mean over 3) = -470.478


 22%|██▏       | 2348/10500 [03:59<13:51,  9.81it/s]

Episode finished after 81 timesteps with reward=-479.3383559961085
Episode finished after 229 timesteps with reward=-356.2151610201501


 22%|██▏       | 2351/10500 [04:00<13:53,  9.78it/s]

Episode finished after 291 timesteps with reward=-279.4728733033532
iter=2350	epsilon=0.141
Current score(mean over 3) = -371.675


 23%|██▎       | 2399/10500 [04:05<13:49,  9.77it/s]

Episode finished after 328 timesteps with reward=-391.21791597652845
Episode finished after 306 timesteps with reward=-268.113337145822


 23%|██▎       | 2401/10500 [04:07<13:53,  9.71it/s]

Episode finished after 388 timesteps with reward=-458.604569290555
iter=2400	epsilon=0.136
Current score(mean over 3) = -372.645


 23%|██▎       | 2448/10500 [04:12<13:49,  9.70it/s]

Episode finished after 590 timesteps with reward=-512.2118286436648
Episode finished after 227 timesteps with reward=-371.41023497272914


 23%|██▎       | 2452/10500 [04:14<13:55,  9.63it/s]

Episode finished after 335 timesteps with reward=-625.9451039568239
iter=2450	epsilon=0.132
Current score(mean over 3) = -503.189


 24%|██▍       | 2499/10500 [04:19<13:51,  9.62it/s]

Episode finished after 104 timesteps with reward=-172.1326042421976
Episode finished after 364 timesteps with reward=-389.424153602574


 24%|██▍       | 2502/10500 [04:20<13:53,  9.59it/s]

Episode finished after 226 timesteps with reward=-264.3372229472993
iter=2500	epsilon=0.128
Current score(mean over 3) = -275.298


 24%|██▍       | 2548/10500 [04:25<13:49,  9.59it/s]

Episode finished after 84 timesteps with reward=-463.009571915876
Episode finished after 150 timesteps with reward=-377.6064131662051


 24%|██▍       | 2551/10500 [04:26<13:49,  9.58it/s]

Episode finished after 118 timesteps with reward=-448.23010119156606
iter=2550	epsilon=0.124
Current score(mean over 3) = -429.615


 25%|██▍       | 2599/10500 [04:31<13:45,  9.57it/s]

Episode finished after 219 timesteps with reward=-442.5987021833759


 25%|██▍       | 2600/10500 [04:32<13:47,  9.55it/s]

Episode finished after 247 timesteps with reward=-239.65142758049606
Episode finished after 174 timesteps with reward=-240.79604142022228
iter=2600	epsilon=0.121
Current score(mean over 3) = -307.682


 25%|██▌       | 2648/10500 [04:37<13:42,  9.55it/s]

Episode finished after 269 timesteps with reward=-518.9625302169213
Episode finished after 335 timesteps with reward=-235.05299343807465


 25%|██▌       | 2651/10500 [04:38<13:44,  9.52it/s]

Episode finished after 205 timesteps with reward=-367.76037696952005
iter=2650	epsilon=0.117
Current score(mean over 3) = -373.925


 26%|██▌       | 2699/10500 [04:43<13:40,  9.51it/s]

Episode finished after 97 timesteps with reward=-221.94972348694046
Episode finished after 116 timesteps with reward=-81.28848599327937
Episode finished after 81 timesteps with reward=-302.35500485686896
iter=2700	epsilon=0.114
Current score(mean over 3) = -201.864


 26%|██▌       | 2748/10500 [04:48<13:34,  9.51it/s]

Episode finished after 285 timesteps with reward=-262.60371617155204
Episode finished after 141 timesteps with reward=-305.2927547852837


 26%|██▌       | 2752/10500 [04:49<13:36,  9.49it/s]

Episode finished after 254 timesteps with reward=-365.1478852014667
iter=2750	epsilon=0.111
Current score(mean over 3) = -311.015


 27%|██▋       | 2799/10500 [04:54<13:30,  9.50it/s]

Episode finished after 92 timesteps with reward=-136.53289291275354
Episode finished after 81 timesteps with reward=-374.40071146009376


 27%|██▋       | 2801/10500 [04:55<13:30,  9.49it/s]

Episode finished after 174 timesteps with reward=-39.15248813987844
iter=2800	epsilon=0.108
Current score(mean over 3) = -183.362


 27%|██▋       | 2848/10500 [04:59<13:25,  9.50it/s]

Episode finished after 212 timesteps with reward=-369.4758432052097
Episode finished after 149 timesteps with reward=-424.02239806799525


 27%|██▋       | 2852/10500 [05:00<13:26,  9.49it/s]

Episode finished after 128 timesteps with reward=-268.2201897538333
iter=2850	epsilon=0.105
Current score(mean over 3) = -353.906


 28%|██▊       | 2899/10500 [05:05<13:20,  9.50it/s]

Episode finished after 144 timesteps with reward=-345.731695581939
Episode finished after 89 timesteps with reward=-354.5500330247613
Episode finished after 92 timesteps with reward=-401.8651076779268
iter=2900	epsilon=0.102
Current score(mean over 3) = -367.382


 28%|██▊       | 2951/10500 [05:10<13:15,  9.49it/s]

Episode finished after 181 timesteps with reward=-197.9616526418966
Episode finished after 54 timesteps with reward=-318.6904126215985
Episode finished after 55 timesteps with reward=-354.7410976361251
iter=2950	epsilon=0.100
Current score(mean over 3) = -290.464


 29%|██▊       | 2999/10500 [05:15<13:09,  9.51it/s]

Episode finished after 238 timesteps with reward=-532.1217319775612
Episode finished after 232 timesteps with reward=-602.307515950426


 29%|██▊       | 3002/10500 [05:16<13:10,  9.48it/s]

Episode finished after 295 timesteps with reward=-386.7700440435541
iter=3000	epsilon=0.097
Current score(mean over 3) = -507.066


 29%|██▉       | 3049/10500 [05:21<13:06,  9.48it/s]

Episode finished after 309 timesteps with reward=-230.38099428564632
Episode finished after 162 timesteps with reward=-405.71992827482177


 29%|██▉       | 3051/10500 [05:22<13:07,  9.45it/s]

Episode finished after 305 timesteps with reward=-270.13128935198506
iter=3050	epsilon=0.095
Current score(mean over 3) = -302.077


 30%|██▉       | 3099/10500 [05:27<13:01,  9.47it/s]

Episode finished after 98 timesteps with reward=-391.87039144904935
Episode finished after 273 timesteps with reward=-579.7996129765563
Episode finished after 139 timesteps with reward=-395.16921016270953
iter=3100	epsilon=0.093
Current score(mean over 3) = -455.613


 30%|██▉       | 3149/10500 [05:32<12:57,  9.46it/s]

Episode finished after 221 timesteps with reward=-285.3504076013411


 30%|███       | 3150/10500 [05:33<12:58,  9.44it/s]

Episode finished after 235 timesteps with reward=-396.7498066058906
Episode finished after 138 timesteps with reward=-442.21284131285745
iter=3150	epsilon=0.091
Current score(mean over 3) = -374.771


 30%|███       | 3199/10500 [05:38<12:52,  9.45it/s]

Episode finished after 173 timesteps with reward=-408.5942975802257
Episode finished after 320 timesteps with reward=-305.41602278065204


 30%|███       | 3201/10500 [05:39<12:53,  9.43it/s]

Episode finished after 202 timesteps with reward=-469.7251396775598
iter=3200	epsilon=0.089
Current score(mean over 3) = -394.578


 31%|███       | 3248/10500 [05:43<12:47,  9.45it/s]

Episode finished after 154 timesteps with reward=-483.0235277234428


 31%|███       | 3250/10500 [05:44<12:48,  9.43it/s]

Episode finished after 248 timesteps with reward=-467.34386903023835
Episode finished after 193 timesteps with reward=-327.050021376785
iter=3250	epsilon=0.087
Current score(mean over 3) = -425.806


 31%|███▏      | 3298/10500 [05:49<12:43,  9.44it/s]

Episode finished after 82 timesteps with reward=-476.3832686787316


 31%|███▏      | 3300/10500 [05:49<12:43,  9.43it/s]

Episode finished after 222 timesteps with reward=-434.7238493944217
Episode finished after 91 timesteps with reward=-405.91075314908295
iter=3300	epsilon=0.085
Current score(mean over 3) = -439.006


 32%|███▏      | 3348/10500 [05:54<12:37,  9.45it/s]

Episode finished after 139 timesteps with reward=-348.41851812352917
Episode finished after 107 timesteps with reward=-85.9405506591921


 32%|███▏      | 3352/10500 [05:55<12:37,  9.44it/s]

Episode finished after 142 timesteps with reward=-205.6137500304668
iter=3350	epsilon=0.083
Current score(mean over 3) = -213.324


 32%|███▏      | 3400/10500 [06:00<12:31,  9.44it/s]

Episode finished after 177 timesteps with reward=-236.3470386622874
Episode finished after 96 timesteps with reward=-166.09542185415054
Episode finished after 104 timesteps with reward=-156.62572218461798
iter=3400	epsilon=0.082
Current score(mean over 3) = -186.356


 33%|███▎      | 3448/10500 [06:04<12:25,  9.46it/s]

Episode finished after 132 timesteps with reward=-393.08380659665823
Episode finished after 195 timesteps with reward=-249.86974715898566


 33%|███▎      | 3452/10500 [06:05<12:25,  9.45it/s]

Episode finished after 135 timesteps with reward=-674.2087111613616
iter=3450	epsilon=0.080
Current score(mean over 3) = -439.054


 33%|███▎      | 3499/10500 [06:09<12:19,  9.47it/s]

Episode finished after 236 timesteps with reward=-336.9070789816704
Episode finished after 192 timesteps with reward=-155.12777056595803


 33%|███▎      | 3501/10500 [06:10<12:20,  9.45it/s]

Episode finished after 185 timesteps with reward=-479.6788181818821
iter=3500	epsilon=0.079
Current score(mean over 3) = -323.905


 34%|███▍      | 3548/10500 [06:14<12:14,  9.47it/s]

Episode finished after 208 timesteps with reward=-218.79401098439493
Episode finished after 137 timesteps with reward=-317.03900253008146


 34%|███▍      | 3551/10500 [06:15<12:15,  9.45it/s]

Episode finished after 169 timesteps with reward=-268.52848828252905
iter=3550	epsilon=0.077
Current score(mean over 3) = -268.121


 34%|███▍      | 3598/10500 [06:19<12:08,  9.47it/s]

Episode finished after 113 timesteps with reward=-19.72315338492669
Episode finished after 156 timesteps with reward=-35.15005996873131


 34%|███▍      | 3602/10500 [06:20<12:08,  9.46it/s]

Episode finished after 90 timesteps with reward=-416.14815243846147
iter=3600	epsilon=0.076
Current score(mean over 3) = -157.007


 35%|███▍      | 3648/10500 [06:24<12:02,  9.48it/s]

Episode finished after 74 timesteps with reward=-496.2500319868068
Episode finished after 152 timesteps with reward=-131.29467227296286


 35%|███▍      | 3652/10500 [06:25<12:03,  9.47it/s]

Episode finished after 136 timesteps with reward=-526.5057765473496
iter=3650	epsilon=0.075
Current score(mean over 3) = -384.683


 35%|███▌      | 3698/10500 [06:29<11:57,  9.48it/s]

Episode finished after 86 timesteps with reward=-254.65640666544965
Episode finished after 137 timesteps with reward=-358.5101789703584


 35%|███▌      | 3702/10500 [06:30<11:57,  9.48it/s]

Episode finished after 144 timesteps with reward=-380.55909946565254
iter=3700	epsilon=0.073
Current score(mean over 3) = -331.242


 36%|███▌      | 3750/10500 [06:35<11:51,  9.48it/s]

Episode finished after 149 timesteps with reward=-389.63916256489074
Episode finished after 65 timesteps with reward=-460.33588401047143
Episode finished after 89 timesteps with reward=-407.3699983042478
iter=3750	epsilon=0.072
Current score(mean over 3) = -419.115


 36%|███▌      | 3799/10500 [06:40<11:45,  9.49it/s]

Episode finished after 93 timesteps with reward=-81.50094883155829
Episode finished after 135 timesteps with reward=-94.7137337908085


 36%|███▌      | 3802/10500 [06:40<11:46,  9.49it/s]

Episode finished after 198 timesteps with reward=-510.70494319604285
iter=3800	epsilon=0.071
Current score(mean over 3) = -228.973


 37%|███▋      | 3850/10500 [06:45<11:40,  9.49it/s]

Episode finished after 77 timesteps with reward=-662.8992229311615
Episode finished after 89 timesteps with reward=-740.6499603710173
Episode finished after 87 timesteps with reward=-225.85156581205217
iter=3850	epsilon=0.070
Current score(mean over 3) = -543.134


 37%|███▋      | 3898/10500 [06:49<11:34,  9.51it/s]

Episode finished after 77 timesteps with reward=-481.0140905512356
Episode finished after 198 timesteps with reward=-311.58147532092437


 37%|███▋      | 3902/10500 [06:50<11:34,  9.50it/s]

Episode finished after 101 timesteps with reward=-69.2125875483678
iter=3900	epsilon=0.069
Current score(mean over 3) = -287.269


 38%|███▊      | 3949/10500 [06:54<11:28,  9.52it/s]

Episode finished after 144 timesteps with reward=-433.21594250036674
Episode finished after 77 timesteps with reward=-391.16296055082813
Episode finished after 66 timesteps with reward=-349.42260259235786
iter=3950	epsilon=0.068
Current score(mean over 3) = -391.267


 38%|███▊      | 3999/10500 [06:59<11:22,  9.53it/s]

Episode finished after 121 timesteps with reward=-236.81885212000907
Episode finished after 82 timesteps with reward=-309.6768818919974
Episode finished after 89 timesteps with reward=-374.4055442937997
iter=4000	epsilon=0.067
Current score(mean over 3) = -306.967


 39%|███▊      | 4049/10500 [07:04<11:16,  9.54it/s]

Episode finished after 161 timesteps with reward=-460.7569990569969
Episode finished after 112 timesteps with reward=-300.91837863028536
Episode finished after 92 timesteps with reward=-202.21252406719213
iter=4050	epsilon=0.067
Current score(mean over 3) = -321.296


 39%|███▉      | 4098/10500 [07:09<11:11,  9.54it/s]

Episode finished after 74 timesteps with reward=-229.2328479108419
Episode finished after 84 timesteps with reward=-257.4963498679483


 39%|███▉      | 4101/10500 [07:10<11:11,  9.53it/s]

Episode finished after 198 timesteps with reward=-371.03094293531484
iter=4100	epsilon=0.066
Current score(mean over 3) = -285.920


 40%|███▉      | 4148/10500 [07:14<11:05,  9.55it/s]

Episode finished after 104 timesteps with reward=-317.0477047271323
Episode finished after 124 timesteps with reward=-156.68222954511123


 40%|███▉      | 4152/10500 [07:15<11:05,  9.54it/s]

Episode finished after 163 timesteps with reward=-167.76028968290728
iter=4150	epsilon=0.065
Current score(mean over 3) = -213.830


 40%|████      | 4200/10500 [07:19<10:59,  9.56it/s]

Episode finished after 106 timesteps with reward=-335.22111709455464
Episode finished after 68 timesteps with reward=-565.2508313486733
Episode finished after 110 timesteps with reward=-440.4202683814498
iter=4200	epsilon=0.064
Current score(mean over 3) = -446.964


 40%|████      | 4250/10500 [07:24<10:53,  9.57it/s]

Episode finished after 83 timesteps with reward=-505.4553629543176
Episode finished after 99 timesteps with reward=-615.543450652437
Episode finished after 127 timesteps with reward=-477.7171972664348
iter=4250	epsilon=0.064
Current score(mean over 3) = -532.905


 41%|████      | 4300/10500 [07:29<10:47,  9.57it/s]

Episode finished after 111 timesteps with reward=-122.39062003251664
Episode finished after 57 timesteps with reward=-329.70137178024856
Episode finished after 109 timesteps with reward=-136.22996024931336
iter=4300	epsilon=0.063
Current score(mean over 3) = -196.107


 41%|████▏     | 4348/10500 [07:33<10:41,  9.59it/s]

Episode finished after 138 timesteps with reward=-498.3697686704397
Episode finished after 72 timesteps with reward=-162.54809082933951


 41%|████▏     | 4352/10500 [07:34<10:41,  9.58it/s]

Episode finished after 247 timesteps with reward=-816.8364722005374
iter=4350	epsilon=0.062
Current score(mean over 3) = -492.585


 42%|████▏     | 4400/10500 [07:38<10:36,  9.59it/s]

Episode finished after 171 timesteps with reward=-323.3136098883955
Episode finished after 130 timesteps with reward=-419.9972990567874
Episode finished after 69 timesteps with reward=-362.2994401215357
iter=4400	epsilon=0.062
Current score(mean over 3) = -368.537


 42%|████▏     | 4449/10500 [07:43<10:30,  9.60it/s]

Episode finished after 114 timesteps with reward=-138.63910386854565
Episode finished after 161 timesteps with reward=-258.2132677234074
Episode finished after 55 timesteps with reward=-344.804296716676
iter=4450	epsilon=0.061
Current score(mean over 3) = -247.219


 43%|████▎     | 4499/10500 [07:48<10:24,  9.61it/s]

Episode finished after 216 timesteps with reward=-578.4524035508621
Episode finished after 111 timesteps with reward=-328.29089454347746
Episode finished after 63 timesteps with reward=-260.5014828714484
iter=4500	epsilon=0.061
Current score(mean over 3) = -389.082


 43%|████▎     | 4549/10500 [07:53<10:18,  9.62it/s]

Episode finished after 239 timesteps with reward=-608.0792714884815
Episode finished after 173 timesteps with reward=-308.76794722158013


 43%|████▎     | 4551/10500 [07:53<10:19,  9.60it/s]

Episode finished after 182 timesteps with reward=-237.63016274214877
iter=4550	epsilon=0.060
Current score(mean over 3) = -384.826


 44%|████▍     | 4599/10500 [07:58<10:13,  9.62it/s]

Episode finished after 60 timesteps with reward=-264.5659330394849
Episode finished after 128 timesteps with reward=-475.69618324285943
Episode finished after 69 timesteps with reward=-220.0495451136035
iter=4600	epsilon=0.060
Current score(mean over 3) = -320.104


 44%|████▍     | 4650/10500 [08:03<10:07,  9.62it/s]

Episode finished after 113 timesteps with reward=-164.70017014089763
Episode finished after 65 timesteps with reward=-323.0001524449129
Episode finished after 127 timesteps with reward=-279.91820278501507
iter=4650	epsilon=0.059
Current score(mean over 3) = -255.873


 45%|████▍     | 4698/10500 [08:07<10:02,  9.63it/s]

Episode finished after 200 timesteps with reward=-604.9743996706588
Episode finished after 94 timesteps with reward=-169.93299968140786


 45%|████▍     | 4702/10500 [08:08<10:02,  9.63it/s]

Episode finished after 164 timesteps with reward=-386.703523324256
iter=4700	epsilon=0.059
Current score(mean over 3) = -387.204


 45%|████▌     | 4749/10500 [08:12<09:56,  9.64it/s]

Episode finished after 204 timesteps with reward=-361.59351409895675
Episode finished after 122 timesteps with reward=-300.2109045439383
Episode finished after 100 timesteps with reward=-162.33467212213685
iter=4750	epsilon=0.058
Current score(mean over 3) = -274.713


 46%|████▌     | 4799/10500 [08:17<09:51,  9.64it/s]

Episode finished after 235 timesteps with reward=-729.3758802042634
Episode finished after 170 timesteps with reward=-397.22013632467684


 46%|████▌     | 4801/10500 [08:18<09:51,  9.63it/s]

Episode finished after 251 timesteps with reward=-379.910662694263
iter=4800	epsilon=0.058
Current score(mean over 3) = -502.169


 46%|████▌     | 4849/10500 [08:22<09:46,  9.64it/s]

Episode finished after 154 timesteps with reward=-499.5604946805315
Episode finished after 156 timesteps with reward=-143.94310723385894
Episode finished after 76 timesteps with reward=-101.126350063747
iter=4850	epsilon=0.057
Current score(mean over 3) = -248.210


 47%|████▋     | 4899/10500 [08:27<09:40,  9.65it/s]

Episode finished after 150 timesteps with reward=-308.740391720508
Episode finished after 81 timesteps with reward=-455.6228773016912
Episode finished after 78 timesteps with reward=-306.04199027719176
iter=4900	epsilon=0.057
Current score(mean over 3) = -356.802


 47%|████▋     | 4949/10500 [08:32<09:34,  9.66it/s]

Episode finished after 119 timesteps with reward=-335.87801976822686
Episode finished after 94 timesteps with reward=-198.8376707424934
Episode finished after 76 timesteps with reward=-146.00437523511482
iter=4950	epsilon=0.057
Current score(mean over 3) = -226.907


 48%|████▊     | 4999/10500 [08:37<09:29,  9.67it/s]

Episode finished after 314 timesteps with reward=-452.2994158144447
Episode finished after 185 timesteps with reward=-468.03422906120306


 48%|████▊     | 5001/10500 [08:37<09:29,  9.66it/s]

Episode finished after 92 timesteps with reward=-135.11329651401556
iter=5000	epsilon=0.056
Current score(mean over 3) = -351.816


 48%|████▊     | 5049/10500 [08:42<09:23,  9.67it/s]

Episode finished after 83 timesteps with reward=-235.42356709275455
Episode finished after 85 timesteps with reward=11.8855714716926
Episode finished after 106 timesteps with reward=-333.98519785369353
iter=5050	epsilon=0.056
Current score(mean over 3) = -185.841


 49%|████▊     | 5099/10500 [08:46<09:18,  9.68it/s]

Episode finished after 148 timesteps with reward=-368.0433225779762
Episode finished after 92 timesteps with reward=-300.43983615985405
Episode finished after 83 timesteps with reward=-126.4608095032547
iter=5100	epsilon=0.056
Current score(mean over 3) = -264.981


 49%|████▉     | 5149/10500 [08:51<09:12,  9.68it/s]

Episode finished after 139 timesteps with reward=-353.4584387390498
Episode finished after 133 timesteps with reward=-312.31024852705264


 49%|████▉     | 5151/10500 [08:52<09:12,  9.68it/s]

Episode finished after 131 timesteps with reward=-388.4404061750373
iter=5150	epsilon=0.056
Current score(mean over 3) = -351.403


 50%|████▉     | 5199/10500 [08:56<09:07,  9.69it/s]

Episode finished after 82 timesteps with reward=-135.93889414053524
Episode finished after 127 timesteps with reward=-417.3395577806504
Episode finished after 106 timesteps with reward=-172.0265720373443
iter=5200	epsilon=0.055
Current score(mean over 3) = -241.768


 50%|█████     | 5250/10500 [09:01<09:01,  9.69it/s]

Episode finished after 103 timesteps with reward=-294.3708797175361
Episode finished after 141 timesteps with reward=-200.17282528596814
Episode finished after 84 timesteps with reward=-124.16968077532171
iter=5250	epsilon=0.055
Current score(mean over 3) = -206.238


 50%|█████     | 5299/10500 [09:06<08:56,  9.70it/s]

Episode finished after 110 timesteps with reward=-86.20072290869099
Episode finished after 103 timesteps with reward=-256.5292856453566
Episode finished after 113 timesteps with reward=-257.2069605894461
iter=5300	epsilon=0.055
Current score(mean over 3) = -199.979


 51%|█████     | 5350/10500 [09:11<08:51,  9.70it/s]

Episode finished after 233 timesteps with reward=-219.54466640105653
Episode finished after 81 timesteps with reward=-74.81033690477898
Episode finished after 83 timesteps with reward=-92.60304154787569
iter=5350	epsilon=0.055
Current score(mean over 3) = -128.986


 51%|█████▏    | 5399/10500 [09:16<08:45,  9.71it/s]

Episode finished after 114 timesteps with reward=-253.06581391050986
Episode finished after 95 timesteps with reward=-237.1954871201315
Episode finished after 69 timesteps with reward=-258.16625215195165
iter=5400	epsilon=0.054
Current score(mean over 3) = -249.476


 52%|█████▏    | 5449/10500 [09:20<08:39,  9.72it/s]

Episode finished after 73 timesteps with reward=-252.37661027529995
Episode finished after 72 timesteps with reward=-234.7982097510863
Episode finished after 144 timesteps with reward=-256.7225568956411
iter=5450	epsilon=0.054
Current score(mean over 3) = -247.966


 52%|█████▏    | 5499/10500 [09:25<08:34,  9.72it/s]

Episode finished after 128 timesteps with reward=-227.01550581824733
Episode finished after 99 timesteps with reward=-145.04128890979052
Episode finished after 145 timesteps with reward=-188.1802920054505
iter=5500	epsilon=0.054
Current score(mean over 3) = -186.746


 53%|█████▎    | 5549/10500 [09:30<08:28,  9.73it/s]

Episode finished after 120 timesteps with reward=-150.4452859975167
Episode finished after 172 timesteps with reward=-251.38814973788382


 53%|█████▎    | 5551/10500 [09:30<08:28,  9.72it/s]

Episode finished after 173 timesteps with reward=-145.3792132879727
iter=5550	epsilon=0.054
Current score(mean over 3) = -182.404


 53%|█████▎    | 5599/10500 [09:35<08:23,  9.74it/s]

Episode finished after 107 timesteps with reward=-290.82502581707223
Episode finished after 165 timesteps with reward=-291.18717132531606
Episode finished after 69 timesteps with reward=-117.10953458149477
iter=5600	epsilon=0.054

 53%|█████▎    | 5601/10500 [09:35<08:23,  9.73it/s]


Current score(mean over 3) = -233.041


 54%|█████▍    | 5650/10500 [09:40<08:18,  9.73it/s]

Episode finished after 134 timesteps with reward=-196.30785354549784
Episode finished after 113 timesteps with reward=-182.85896903854956
Episode finished after 129 timesteps with reward=-171.7905264413105
iter=5650	epsilon=0.053
Current score(mean over 3) = -183.652


 54%|█████▍    | 5698/10500 [09:44<08:12,  9.74it/s]

Episode finished after 133 timesteps with reward=-149.28331485497813
Episode finished after 92 timesteps with reward=-114.94785134918179


 54%|█████▍    | 5702/10500 [09:45<08:12,  9.74it/s]

Episode finished after 165 timesteps with reward=-153.33651064857406
iter=5700	epsilon=0.053
Current score(mean over 3) = -139.189


 55%|█████▍    | 5748/10500 [09:49<08:07,  9.75it/s]

Episode finished after 231 timesteps with reward=-315.56931739812114
Episode finished after 201 timesteps with reward=-176.9396247854308


 55%|█████▍    | 5752/10500 [09:50<08:07,  9.74it/s]

Episode finished after 202 timesteps with reward=-268.86078504527495
iter=5750	epsilon=0.053
Current score(mean over 3) = -253.790


 55%|█████▌    | 5798/10500 [09:55<08:02,  9.74it/s]

Episode finished after 142 timesteps with reward=-222.77628434204257
Episode finished after 146 timesteps with reward=-97.81057639568533


 55%|█████▌    | 5800/10500 [09:55<08:02,  9.74it/s]

Episode finished after 184 timesteps with reward=-209.6377705156603
iter=5800	epsilon=0.053
Current score(mean over 3) = -176.742


 56%|█████▌    | 5849/10500 [10:00<07:57,  9.74it/s]

Episode finished after 136 timesteps with reward=-137.25359649203406
Episode finished after 176 timesteps with reward=-104.25075900470372


 56%|█████▌    | 5851/10500 [10:00<07:57,  9.74it/s]

Episode finished after 159 timesteps with reward=-120.57175160231131
iter=5850	epsilon=0.053
Current score(mean over 3) = -120.692


 56%|█████▌    | 5899/10500 [10:05<07:52,  9.74it/s]

Episode finished after 262 timesteps with reward=-87.14267481524176
Episode finished after 171 timesteps with reward=-158.3631758172506


 56%|█████▌    | 5901/10500 [10:06<07:52,  9.73it/s]

Episode finished after 293 timesteps with reward=-137.98485088377933
iter=5900	epsilon=0.053
Current score(mean over 3) = -127.830


 57%|█████▋    | 5948/10500 [10:10<07:47,  9.74it/s]

Episode finished after 179 timesteps with reward=-6.544825332314716


 57%|█████▋    | 5950/10500 [10:11<07:47,  9.73it/s]

Episode finished after 242 timesteps with reward=-212.43678948314877
Episode finished after 181 timesteps with reward=-91.33454947009325
iter=5950	epsilon=0.052
Current score(mean over 3) = -103.439


 57%|█████▋    | 5999/10500 [10:16<07:42,  9.73it/s]

Episode finished after 154 timesteps with reward=-244.50164013789285
Episode finished after 237 timesteps with reward=-244.86071201655915


 57%|█████▋    | 6002/10500 [10:17<07:42,  9.72it/s]

Episode finished after 341 timesteps with reward=-276.98034321232984
iter=6000	epsilon=0.052
Current score(mean over 3) = -255.448


 58%|█████▊    | 6048/10500 [10:21<07:37,  9.73it/s]

Episode finished after 193 timesteps with reward=-302.3478846527085


 58%|█████▊    | 6050/10500 [10:22<07:38,  9.72it/s]

Episode finished after 262 timesteps with reward=-264.6796306891145
Episode finished after 169 timesteps with reward=-290.8261125135651
iter=6050	epsilon=0.052
Current score(mean over 3) = -285.951


 58%|█████▊    | 6099/10500 [10:27<07:32,  9.72it/s]

Episode finished after 186 timesteps with reward=-324.7142389795994
Episode finished after 171 timesteps with reward=-278.6080984488649


 58%|█████▊    | 6101/10500 [10:27<07:32,  9.72it/s]

Episode finished after 153 timesteps with reward=-347.3542360530221
iter=6100	epsilon=0.052
Current score(mean over 3) = -316.892


 59%|█████▊    | 6149/10500 [10:32<07:27,  9.73it/s]

Episode finished after 138 timesteps with reward=-408.448975113186
Episode finished after 144 timesteps with reward=-406.5610368247249
Episode finished after 97 timesteps with reward=-375.29656222451405
iter=6150	epsilon=0.052
Current score(mean over 3) = -396.769


 59%|█████▉    | 6199/10500 [10:37<07:22,  9.73it/s]

Episode finished after 214 timesteps with reward=-358.1369582824507
Episode finished after 98 timesteps with reward=-267.21382212156016
Episode finished after 103 timesteps with reward=-277.1213525585729
iter=6200	epsilon=0.052
Current score(mean over 3) = -300.824


 60%|█████▉    | 6249/10500 [10:41<07:16,  9.73it/s]

Episode finished after 102 timesteps with reward=-309.2668031719329
Episode finished after 125 timesteps with reward=-363.6566286469449
Episode finished after 107 timesteps with reward=-289.2665546319528
iter=6250	epsilon=0.052
Current score(mean over 3) = -320.730


 60%|█████▉    | 6299/10500 [10:46<07:11,  9.74it/s]

Episode finished after 135 timesteps with reward=-201.42749632341554
Episode finished after 126 timesteps with reward=-228.33216506458203


 60%|██████    | 6301/10500 [10:47<07:11,  9.73it/s]

Episode finished after 204 timesteps with reward=-384.6591855564251
iter=6300	epsilon=0.052
Current score(mean over 3) = -271.473


 60%|██████    | 6348/10500 [10:51<07:06,  9.74it/s]

Episode finished after 133 timesteps with reward=-303.78125823353025
Episode finished after 183 timesteps with reward=-132.39217519556755


 60%|██████    | 6352/10500 [10:52<07:05,  9.74it/s]

Episode finished after 110 timesteps with reward=-309.5610708680854
iter=6350	epsilon=0.052
Current score(mean over 3) = -248.578


 61%|██████    | 6399/10500 [10:56<07:00,  9.75it/s]

Episode finished after 106 timesteps with reward=-286.94554540937474
Episode finished after 131 timesteps with reward=-262.0945739794014
Episode finished after 113 timesteps with reward=-334.39578790579174
iter=6400	epsilon=0.052
Current score(mean over 3) = -294.479


 61%|██████▏   | 6450/10500 [11:01<06:55,  9.75it/s]

Episode finished after 197 timesteps with reward=-173.691110567212
Episode finished after 112 timesteps with reward=-270.63181807705587
Episode finished after 106 timesteps with reward=-303.86479089548095
iter=6450	epsilon=0.052
Current score(mean over 3) = -249.396


 62%|██████▏   | 6500/10500 [11:06<06:50,  9.75it/s]

Episode finished after 112 timesteps with reward=-271.7471070254645
Episode finished after 103 timesteps with reward=-270.53984465949094
Episode finished after 101 timesteps with reward=-212.07742466268445
iter=6500	epsilon=0.051
Current score(mean over 3) = -251.455


 62%|██████▏   | 6548/10500 [11:11<06:45,  9.76it/s]

Episode finished after 100 timesteps with reward=-292.42268858023976
Episode finished after 193 timesteps with reward=-120.3364583534198


 62%|██████▏   | 6552/10500 [11:11<06:44,  9.75it/s]

Episode finished after 109 timesteps with reward=-294.2203407620516
iter=6550	epsilon=0.051
Current score(mean over 3) = -235.660


 63%|██████▎   | 6600/10500 [11:16<06:39,  9.76it/s]

Episode finished after 97 timesteps with reward=-236.29549377576419
Episode finished after 113 timesteps with reward=-290.7597698785199
Episode finished after 108 timesteps with reward=-270.9107502813474
iter=6600	epsilon=0.051
Current score(mean over 3) = -265.989


 63%|██████▎   | 6648/10500 [11:20<06:34,  9.77it/s]

Episode finished after 123 timesteps with reward=-276.7019209106659
Episode finished after 101 timesteps with reward=-329.4147309767234
Episode finished after 147 timesteps with reward=-327.1584193927499
iter=6650	epsilon=0.051
Current score(mean over 3) = -311.092


 64%|██████▍   | 6698/10500 [11:25<06:28,  9.77it/s]

Episode finished after 142 timesteps with reward=-302.18879967957616
Episode finished after 220 timesteps with reward=-134.8177132767758


 64%|██████▍   | 6702/10500 [11:26<06:28,  9.77it/s]

Episode finished after 217 timesteps with reward=-160.79333566623603
iter=6700	epsilon=0.051
Current score(mean over 3) = -199.267


 64%|██████▍   | 6749/10500 [11:30<06:23,  9.78it/s]

Episode finished after 125 timesteps with reward=-304.8850979793539
Episode finished after 111 timesteps with reward=-307.05813880196706
Episode finished after 92 timesteps with reward=-142.28104491610037
iter=6750	epsilon=0.051
Current score(mean over 3) = -251.408


 65%|██████▍   | 6799/10500 [11:35<06:18,  9.78it/s]

Episode finished after 162 timesteps with reward=-285.53271916685725
Episode finished after 104 timesteps with reward=-352.01483071826976
Episode finished after 132 timesteps with reward=-369.29212579315634
iter=6800	epsilon=0.051
Current score(mean over 3) = -335.613


 65%|██████▌   | 6849/10500 [11:40<06:13,  9.78it/s]

Episode finished after 94 timesteps with reward=-172.24380721156663
Episode finished after 100 timesteps with reward=-363.65675935197544
Episode finished after 89 timesteps with reward=-118.19229353854617
iter=6850	epsilon=0.051
Current score(mean over 3) = -218.031


 66%|██████▌   | 6899/10500 [11:44<06:07,  9.79it/s]

Episode finished after 123 timesteps with reward=-71.16922621014157
Episode finished after 147 timesteps with reward=-230.0375352806887


 66%|██████▌   | 6901/10500 [11:45<06:07,  9.78it/s]

Episode finished after 124 timesteps with reward=-333.24990752366375
iter=6900	epsilon=0.051
Current score(mean over 3) = -211.486


 66%|██████▌   | 6950/10500 [11:50<06:02,  9.79it/s]

Episode finished after 113 timesteps with reward=-313.3613512176788
Episode finished after 102 timesteps with reward=-265.7786713301165
Episode finished after 131 timesteps with reward=-363.8607808346924
iter=6950	epsilon=0.051
Current score(mean over 3) = -314.334


 67%|██████▋   | 7000/10500 [11:55<05:57,  9.78it/s]

Episode finished after 231 timesteps with reward=-23.417716677272225
Episode finished after 97 timesteps with reward=-221.1726201987594
Episode finished after 121 timesteps with reward=-264.44364570229214
iter=7000	epsilon=0.051
Current score(mean over 3) = -169.678


 67%|██████▋   | 7049/10500 [11:59<05:52,  9.79it/s]

Episode finished after 142 timesteps with reward=-454.5131702892624
Episode finished after 148 timesteps with reward=-361.9658864537512


 67%|██████▋   | 7051/10500 [12:00<05:52,  9.79it/s]

Episode finished after 158 timesteps with reward=-419.24905013629296
iter=7050	epsilon=0.051
Current score(mean over 3) = -411.909


 68%|██████▊   | 7099/10500 [12:04<05:47,  9.80it/s]

Episode finished after 150 timesteps with reward=-297.9133036668667
Episode finished after 252 timesteps with reward=-166.12291642603276
Episode finished after 177 timesteps with reward=-129.21682498549907
iter=7100	epsilon=0.051
Current score(mean over 3) = -197.751


 68%|██████▊   | 7149/10500 [12:09<05:42,  9.80it/s]

Episode finished after 117 timesteps with reward=-219.520177065359
Episode finished after 121 timesteps with reward=-272.1098180936824
Episode finished after 116 timesteps with reward=-260.6950605947957

 68%|██████▊   | 7151/10500 [12:10<05:42,  9.79it/s]


iter=7150	epsilon=0.051
Current score(mean over 3) = -250.775


 69%|██████▊   | 7199/10500 [12:14<05:36,  9.80it/s]

Episode finished after 135 timesteps with reward=-161.28487652266222
Episode finished after 129 timesteps with reward=-252.78249748704584
Episode finished after 97 timesteps with reward=-92.91783720015546
iter=7200	epsilon=0.051
Current score(mean over 3) = -168.995


 69%|██████▉   | 7248/10500 [12:19<05:31,  9.80it/s]

Episode finished after 144 timesteps with reward=-263.4404059508197
Episode finished after 170 timesteps with reward=-243.378679631122


 69%|██████▉   | 7251/10500 [12:20<05:31,  9.79it/s]

Episode finished after 159 timesteps with reward=-340.4413811197081
iter=7250	epsilon=0.051
Current score(mean over 3) = -282.420


 70%|██████▉   | 7299/10500 [12:24<05:26,  9.80it/s]

Episode finished after 153 timesteps with reward=-301.8337829043892
Episode finished after 146 timesteps with reward=-97.86838636267547


 70%|██████▉   | 7301/10500 [12:25<05:26,  9.80it/s]

Episode finished after 136 timesteps with reward=-328.08348471929486
iter=7300	epsilon=0.051
Current score(mean over 3) = -242.595


 70%|██████▉   | 7349/10500 [12:29<05:21,  9.80it/s]

Episode finished after 250 timesteps with reward=-277.5781478793142
Episode finished after 166 timesteps with reward=-253.0732502391263


 70%|███████   | 7351/10500 [12:30<05:21,  9.79it/s]

Episode finished after 101 timesteps with reward=-102.7538358998715
iter=7350	epsilon=0.051
Current score(mean over 3) = -211.135


 70%|███████   | 7398/10500 [12:34<05:16,  9.80it/s]

Episode finished after 205 timesteps with reward=-304.21201584398983
Episode finished after 187 timesteps with reward=-312.240412263104


 70%|███████   | 7402/10500 [12:35<05:16,  9.79it/s]

Episode finished after 242 timesteps with reward=-397.42514284231106
iter=7400	epsilon=0.051
Current score(mean over 3) = -337.959


 71%|███████   | 7449/10500 [12:40<05:11,  9.80it/s]

Episode finished after 165 timesteps with reward=-59.85956325668576
Episode finished after 255 timesteps with reward=-233.46863568359817
Episode finished after 175 timesteps with reward=-225.09249695795575
iter=7450	epsilon=0.051
Current score(mean over 3) = -172.807


 71%|███████▏  | 7498/10500 [12:45<05:06,  9.79it/s]

Episode finished after 200 timesteps with reward=-331.8568043893364


 71%|███████▏  | 7500/10500 [12:46<05:06,  9.78it/s]

Episode finished after 364 timesteps with reward=-358.8989772335699
Episode finished after 190 timesteps with reward=-177.41737636287024
iter=7500	epsilon=0.051
Current score(mean over 3) = -289.391


 72%|███████▏  | 7549/10500 [12:51<05:01,  9.78it/s]

Episode finished after 178 timesteps with reward=-342.23428433513766
Episode finished after 157 timesteps with reward=-250.92527710284912


 72%|███████▏  | 7552/10500 [12:52<05:01,  9.78it/s]

Episode finished after 174 timesteps with reward=-294.51114582783276
iter=7550	epsilon=0.050
Current score(mean over 3) = -295.890


 72%|███████▏  | 7599/10500 [12:57<04:56,  9.78it/s]

Episode finished after 195 timesteps with reward=-208.23130359740736
Episode finished after 146 timesteps with reward=-235.3706973017293


 72%|███████▏  | 7601/10500 [12:57<04:56,  9.77it/s]

Episode finished after 247 timesteps with reward=-275.2343315885502
iter=7600	epsilon=0.050
Current score(mean over 3) = -239.612


 73%|███████▎  | 7649/10500 [13:02<04:51,  9.77it/s]

Episode finished after 295 timesteps with reward=-379.63737826947977
Episode finished after 321 timesteps with reward=-370.88405780160235


 73%|███████▎  | 7651/10500 [13:03<04:51,  9.76it/s]

Episode finished after 247 timesteps with reward=-391.4304180233166
iter=7650	epsilon=0.050
Current score(mean over 3) = -380.651


 73%|███████▎  | 7699/10500 [13:08<04:46,  9.76it/s]

Episode finished after 279 timesteps with reward=-453.7604954497121
Episode finished after 215 timesteps with reward=-156.76430848739034


 73%|███████▎  | 7701/10500 [13:09<04:47,  9.75it/s]

Episode finished after 214 timesteps with reward=-151.39036364212575
iter=7700	epsilon=0.050
Current score(mean over 3) = -253.972


 74%|███████▍  | 7749/10500 [13:14<04:42,  9.75it/s]

Episode finished after 305 timesteps with reward=-253.1411016157693
Episode finished after 204 timesteps with reward=-323.02935977292253


 74%|███████▍  | 7752/10500 [13:16<04:42,  9.74it/s]

Episode finished after 268 timesteps with reward=-295.2682204295424
iter=7750	epsilon=0.050
Current score(mean over 3) = -290.480


 74%|███████▍  | 7799/10500 [13:20<04:37,  9.74it/s]

Episode finished after 353 timesteps with reward=-238.17569241966365
Episode finished after 255 timesteps with reward=-224.15951526523602
Episode finished after 155 timesteps with reward=-221.42928225147915
iter=7800	epsilon=0.050
Current score(mean over 3) = -227.921


 75%|███████▍  | 7848/10500 [13:26<04:32,  9.73it/s]

Episode finished after 252 timesteps with reward=-202.79828479404438
Episode finished after 165 timesteps with reward=-184.91321231401494


 75%|███████▍  | 7852/10500 [13:27<04:32,  9.72it/s]

Episode finished after 274 timesteps with reward=-140.12380218734526
iter=7850	epsilon=0.050
Current score(mean over 3) = -175.945


 75%|███████▌  | 7899/10500 [13:32<04:27,  9.72it/s]

Episode finished after 263 timesteps with reward=-65.3678331228866
Episode finished after 224 timesteps with reward=-168.36148636975457


 75%|███████▌  | 7902/10500 [13:33<04:27,  9.71it/s]

Episode finished after 285 timesteps with reward=-140.41950380420062
iter=7900	epsilon=0.050
Current score(mean over 3) = -124.716


 76%|███████▌  | 7949/10500 [13:38<04:22,  9.71it/s]

Episode finished after 333 timesteps with reward=-236.75994084879517


 76%|███████▌  | 7950/10500 [13:39<04:22,  9.70it/s]

Episode finished after 328 timesteps with reward=-135.58407580620315
Episode finished after 171 timesteps with reward=-178.73930573976696
iter=7950	epsilon=0.050
Current score(mean over 3) = -183.694


 76%|███████▌  | 7999/10500 [13:45<04:18,  9.69it/s]

Episode finished after 155 timesteps with reward=-163.81932562079547
Episode finished after 344 timesteps with reward=-78.40853735422692


 76%|███████▌  | 8001/10500 [13:46<04:18,  9.68it/s]

Episode finished after 324 timesteps with reward=-238.6116410359829
iter=8000	epsilon=0.050
Current score(mean over 3) = -160.280


 77%|███████▋  | 8048/10500 [13:51<04:13,  9.67it/s]

Episode finished after 603 timesteps with reward=-240.45947843338712


 77%|███████▋  | 8050/10500 [13:53<04:13,  9.66it/s]

Episode finished after 212 timesteps with reward=-181.38471098876823
Episode finished after 194 timesteps with reward=-197.75585772492857
iter=8050	epsilon=0.050
Current score(mean over 3) = -206.533


 77%|███████▋  | 8098/10500 [13:58<04:08,  9.65it/s]

Episode finished after 224 timesteps with reward=-165.74479869319273
Episode finished after 273 timesteps with reward=-264.6613542271898


 77%|███████▋  | 8101/10500 [14:00<04:08,  9.64it/s]

Episode finished after 412 timesteps with reward=-271.9342004510661
iter=8100	epsilon=0.050
Current score(mean over 3) = -234.113


 78%|███████▊  | 8149/10500 [14:05<04:04,  9.63it/s]

Episode finished after 498 timesteps with reward=-147.48886181305107
Episode finished after 171 timesteps with reward=-223.14262021368515


 78%|███████▊  | 8150/10500 [14:07<04:04,  9.62it/s]

Episode finished after 304 timesteps with reward=-49.25689469615898
iter=8150	epsilon=0.050
Current score(mean over 3) = -139.963


 78%|███████▊  | 8199/10500 [14:12<03:59,  9.61it/s]

Episode finished after 465 timesteps with reward=-206.19234172372182
Episode finished after 609 timesteps with reward=-199.88484399606722


 78%|███████▊  | 8201/10500 [14:15<03:59,  9.58it/s]

Episode finished after 391 timesteps with reward=-219.62331274855643
iter=8200	epsilon=0.050
Current score(mean over 3) = -208.567


 79%|███████▊  | 8249/10500 [14:21<03:55,  9.58it/s]

Episode finished after 311 timesteps with reward=-181.0850383536312
Episode finished after 781 timesteps with reward=-370.78480599188975


 79%|███████▊  | 8251/10500 [14:23<03:55,  9.55it/s]

Episode finished after 236 timesteps with reward=-253.3905005961519
iter=8250	epsilon=0.050
Current score(mean over 3) = -268.420


 79%|███████▉  | 8299/10500 [14:30<03:50,  9.54it/s]

Episode finished after 455 timesteps with reward=-359.11075407668744
Episode finished after 250 timesteps with reward=-270.41784529481515


 79%|███████▉  | 8300/10500 [14:31<03:51,  9.52it/s]

Episode finished after 294 timesteps with reward=-277.36412329835275
iter=8300	epsilon=0.050
Current score(mean over 3) = -302.298


 80%|███████▉  | 8349/10500 [14:38<03:46,  9.51it/s]

Episode finished after 889 timesteps with reward=-270.61374810479646
Episode finished after 161 timesteps with reward=-172.25768841477378


 80%|███████▉  | 8351/10500 [14:40<03:46,  9.49it/s]

Episode finished after 284 timesteps with reward=-308.6640496154283
iter=8350	epsilon=0.050
Current score(mean over 3) = -250.512


 80%|███████▉  | 8399/10500 [14:46<03:41,  9.48it/s]

Episode finished after 325 timesteps with reward=-298.0107066654481
Episode finished after 284 timesteps with reward=-223.1128424299703


 80%|████████  | 8401/10500 [14:47<03:41,  9.46it/s]

Episode finished after 510 timesteps with reward=-351.6381920906549
iter=8400	epsilon=0.050
Current score(mean over 3) = -290.921


 80%|████████  | 8449/10500 [14:53<03:36,  9.45it/s]

Episode finished after 658 timesteps with reward=-427.10565302638014
Episode finished after 295 timesteps with reward=-267.0901289176617


 80%|████████  | 8452/10500 [14:57<03:37,  9.42it/s]

Episode finished after 787 timesteps with reward=-436.7476523397041
iter=8450	epsilon=0.050
Current score(mean over 3) = -376.981


 81%|████████  | 8498/10500 [15:03<03:32,  9.41it/s]

Episode finished after 420 timesteps with reward=-326.76311668643825
Episode finished after 333 timesteps with reward=-288.01761056387977


 81%|████████  | 8501/10500 [15:04<03:32,  9.39it/s]

Episode finished after 234 timesteps with reward=-214.40058703518363
iter=8500	epsilon=0.050
Current score(mean over 3) = -276.394


 81%|████████▏ | 8549/10500 [15:11<03:27,  9.38it/s]

Episode finished after 415 timesteps with reward=-306.09260553860383
Episode finished after 918 timesteps with reward=-406.78689431562947


 81%|████████▏ | 8551/10500 [15:14<03:28,  9.35it/s]

Episode finished after 561 timesteps with reward=-258.9620790745363
iter=8550	epsilon=0.050
Current score(mean over 3) = -323.947


 82%|████████▏ | 8599/10500 [15:20<03:23,  9.34it/s]

Episode finished after 261 timesteps with reward=-224.0892100119124
Episode finished after 235 timesteps with reward=-203.1846498041956


 82%|████████▏ | 8601/10500 [15:21<03:23,  9.33it/s]

Episode finished after 240 timesteps with reward=-261.9871610167685
iter=8600	epsilon=0.050
Current score(mean over 3) = -229.754


 82%|████████▏ | 8649/10500 [15:27<03:18,  9.33it/s]

Episode finished after 201 timesteps with reward=-167.78667223215598
Episode finished after 201 timesteps with reward=-175.36368663591708


 82%|████████▏ | 8651/10500 [15:28<03:18,  9.32it/s]

Episode finished after 428 timesteps with reward=-234.97864483215767
iter=8650	epsilon=0.050
Current score(mean over 3) = -192.710


 83%|████████▎ | 8699/10500 [15:34<03:13,  9.31it/s]

Episode finished after 481 timesteps with reward=-322.63413679970176
Episode finished after 208 timesteps with reward=-194.18702753365955


 83%|████████▎ | 8702/10500 [15:35<03:13,  9.30it/s]

Episode finished after 344 timesteps with reward=-219.0555682068603
iter=8700	epsilon=0.050
Current score(mean over 3) = -245.292


 83%|████████▎ | 8748/10500 [15:41<03:08,  9.29it/s]

Episode finished after 210 timesteps with reward=-206.75411278050484
Episode finished after 281 timesteps with reward=-261.0621724859014


 83%|████████▎ | 8752/10500 [15:42<03:08,  9.28it/s]

Episode finished after 272 timesteps with reward=-314.9237813321183
iter=8750	epsilon=0.050
Current score(mean over 3) = -260.913


 84%|████████▍ | 8799/10500 [15:48<03:03,  9.28it/s]

Episode finished after 220 timesteps with reward=-221.09540949761447


 84%|████████▍ | 8800/10500 [15:49<03:03,  9.27it/s]

Episode finished after 294 timesteps with reward=-260.48954990117323
Episode finished after 198 timesteps with reward=-165.54675034635997
iter=8800	epsilon=0.050
Current score(mean over 3) = -215.711


 84%|████████▍ | 8849/10500 [15:55<02:58,  9.26it/s]

Episode finished after 237 timesteps with reward=-206.93779163431776
Episode finished after 435 timesteps with reward=-217.06746771717206


 84%|████████▍ | 8851/10500 [15:57<02:58,  9.24it/s]

Episode finished after 479 timesteps with reward=-280.26610357704806
iter=8850	epsilon=0.050
Current score(mean over 3) = -234.757


 85%|████████▍ | 8899/10500 [16:03<02:53,  9.23it/s]

Episode finished after 448 timesteps with reward=-267.9291142744629
Episode finished after 172 timesteps with reward=-176.7876917882205


 85%|████████▍ | 8900/10500 [16:04<02:53,  9.22it/s]

Episode finished after 307 timesteps with reward=-289.6976530797839
iter=8900	epsilon=0.050
Current score(mean over 3) = -244.805


 85%|████████▌ | 8948/10500 [16:10<02:48,  9.22it/s]

Episode finished after 663 timesteps with reward=-359.8328873850883
Episode finished after 1000 timesteps with reward=-197.91794610225247


 85%|████████▌ | 8952/10500 [16:15<02:48,  9.18it/s]

Episode finished after 375 timesteps with reward=-225.7128520023228
iter=8950	epsilon=0.050
Current score(mean over 3) = -261.155


 86%|████████▌ | 8999/10500 [16:20<02:43,  9.17it/s]

Episode finished after 141 timesteps with reward=-71.23070421898305
Episode finished after 218 timesteps with reward=-220.74706546289778


 86%|████████▌ | 9001/10500 [16:22<02:43,  9.16it/s]

Episode finished after 374 timesteps with reward=-236.97238260630513
iter=9000	epsilon=0.050
Current score(mean over 3) = -176.317


 86%|████████▌ | 9049/10500 [16:28<02:38,  9.16it/s]

Episode finished after 349 timesteps with reward=-238.9817084865524
Episode finished after 187 timesteps with reward=-161.70314831183524


 86%|████████▌ | 9051/10500 [16:29<02:38,  9.15it/s]

Episode finished after 211 timesteps with reward=-186.67189727886046
iter=9050	epsilon=0.050
Current score(mean over 3) = -195.786


 87%|████████▋ | 9098/10500 [16:34<02:33,  9.15it/s]

Episode finished after 1000 timesteps with reward=-139.41736823993497
Episode finished after 202 timesteps with reward=-202.46348713055227


 87%|████████▋ | 9102/10500 [16:38<02:33,  9.11it/s]

Episode finished after 245 timesteps with reward=-233.5609497909821
iter=9100	epsilon=0.050
Current score(mean over 3) = -191.814


 87%|████████▋ | 9149/10500 [16:43<02:28,  9.11it/s]

Episode finished after 404 timesteps with reward=-216.50350398746104
Episode finished after 183 timesteps with reward=-155.81754513729726


 87%|████████▋ | 9151/10500 [16:46<02:28,  9.09it/s]

Episode finished after 892 timesteps with reward=-327.4234627560027
iter=9150	epsilon=0.050
Current score(mean over 3) = -233.248


 88%|████████▊ | 9199/10500 [16:54<02:23,  9.07it/s]

Episode finished after 276 timesteps with reward=-252.53141381660623
Episode finished after 408 timesteps with reward=-235.23931897846128


 88%|████████▊ | 9202/10500 [16:56<02:23,  9.05it/s]

Episode finished after 508 timesteps with reward=-231.32903820188594
iter=9200	epsilon=0.050
Current score(mean over 3) = -239.700


 88%|████████▊ | 9249/10500 [17:01<02:18,  9.05it/s]

Episode finished after 249 timesteps with reward=-197.31808755897646
Episode finished after 1000 timesteps with reward=-191.31506146809878


 88%|████████▊ | 9251/10500 [17:05<02:18,  9.02it/s]

Episode finished after 350 timesteps with reward=-292.8558201318599
iter=9250	epsilon=0.050
Current score(mean over 3) = -227.163


 89%|████████▊ | 9298/10500 [17:12<02:13,  9.01it/s]

Episode finished after 209 timesteps with reward=-209.2649192130816


 89%|████████▊ | 9300/10500 [17:13<02:13,  9.00it/s]

Episode finished after 223 timesteps with reward=-226.80853385134407
Episode finished after 175 timesteps with reward=-222.46919381090646
iter=9300	epsilon=0.050
Current score(mean over 3) = -219.514


 89%|████████▉ | 9349/10500 [17:19<02:08,  8.99it/s]

Episode finished after 252 timesteps with reward=-240.13933221131248
Episode finished after 451 timesteps with reward=-256.1604209229224


 89%|████████▉ | 9352/10500 [17:21<02:07,  8.98it/s]

Episode finished after 242 timesteps with reward=-120.95524235234915
iter=9350	epsilon=0.050
Current score(mean over 3) = -205.752


 90%|████████▉ | 9399/10500 [17:26<02:02,  8.98it/s]

Episode finished after 527 timesteps with reward=-249.15939851275158


 90%|████████▉ | 9400/10500 [17:28<02:02,  8.97it/s]

Episode finished after 400 timesteps with reward=-220.6801065619891
Episode finished after 179 timesteps with reward=-193.36764485889154
iter=9400	epsilon=0.050
Current score(mean over 3) = -221.069


 90%|████████▉ | 9449/10500 [17:33<01:57,  8.97it/s]

Episode finished after 206 timesteps with reward=-207.68035707629878
Episode finished after 204 timesteps with reward=-197.35186974350836


 90%|█████████ | 9451/10500 [17:34<01:57,  8.96it/s]

Episode finished after 200 timesteps with reward=-173.15662869937103
iter=9450	epsilon=0.050
Current score(mean over 3) = -192.730


 90%|█████████ | 9498/10500 [17:40<01:51,  8.96it/s]

Episode finished after 289 timesteps with reward=-255.25298521788676
Episode finished after 221 timesteps with reward=-139.43013474615222


 90%|█████████ | 9501/10500 [17:42<01:51,  8.94it/s]

Episode finished after 906 timesteps with reward=-286.7139526707573
iter=9500	epsilon=0.050
Current score(mean over 3) = -227.132


 91%|█████████ | 9549/10500 [17:48<01:46,  8.94it/s]

Episode finished after 242 timesteps with reward=-224.29610199286313
Episode finished after 249 timesteps with reward=-236.65516326504766


 91%|█████████ | 9551/10500 [17:49<01:46,  8.93it/s]

Episode finished after 291 timesteps with reward=-241.03729549745736
iter=9550	epsilon=0.050
Current score(mean over 3) = -233.996


 91%|█████████▏| 9599/10500 [17:55<01:40,  8.93it/s]

Episode finished after 201 timesteps with reward=-188.7817804639035
Episode finished after 426 timesteps with reward=-301.26433824718697


 91%|█████████▏| 9602/10500 [17:56<01:40,  8.92it/s]

Episode finished after 361 timesteps with reward=-231.11210904907134
iter=9600	epsilon=0.050
Current score(mean over 3) = -240.386


 92%|█████████▏| 9649/10500 [18:02<01:35,  8.92it/s]

Episode finished after 369 timesteps with reward=-267.72634335256396
Episode finished after 853 timesteps with reward=-325.3026455485723


 92%|█████████▏| 9651/10500 [18:04<01:35,  8.90it/s]

Episode finished after 317 timesteps with reward=-243.43474127493536
iter=9650	epsilon=0.050
Current score(mean over 3) = -278.821


 92%|█████████▏| 9699/10500 [18:10<01:30,  8.89it/s]

Episode finished after 420 timesteps with reward=-257.1375785124547
Episode finished after 303 timesteps with reward=-221.5944969030149


 92%|█████████▏| 9701/10500 [18:12<01:29,  8.88it/s]

Episode finished after 231 timesteps with reward=-201.59136283157926
iter=9700	epsilon=0.050
Current score(mean over 3) = -226.774


 93%|█████████▎| 9748/10500 [18:17<01:24,  8.88it/s]

Episode finished after 1000 timesteps with reward=-145.47853361280056
Episode finished after 272 timesteps with reward=-248.1097969502387


 93%|█████████▎| 9751/10500 [18:21<01:24,  8.85it/s]

Episode finished after 497 timesteps with reward=-285.6406026978185
iter=9750	epsilon=0.050
Current score(mean over 3) = -226.410


 93%|█████████▎| 9799/10500 [18:26<01:19,  8.86it/s]

Episode finished after 243 timesteps with reward=-111.40171339202206
Episode finished after 279 timesteps with reward=-160.904481874682
Episode finished after 172 timesteps with reward=-169.08422423719338
iter=9800	epsilon=0.050
Current score(mean over 3) = -147.130


 94%|█████████▍| 9848/10500 [18:32<01:13,  8.85it/s]

Episode finished after 718 timesteps with reward=-272.92550474767324
Episode finished after 295 timesteps with reward=-169.95533695643417


 94%|█████████▍| 9852/10500 [18:36<01:13,  8.82it/s]

Episode finished after 944 timesteps with reward=-314.9408241807514
iter=9850	epsilon=0.050
Current score(mean over 3) = -252.607


 94%|█████████▍| 9899/10500 [18:41<01:08,  8.83it/s]

Episode finished after 270 timesteps with reward=-227.44374176213458
Episode finished after 229 timesteps with reward=-230.0650971201195


 94%|█████████▍| 9901/10500 [18:42<01:07,  8.82it/s]

Episode finished after 288 timesteps with reward=-227.18816336362772
iter=9900	epsilon=0.050
Current score(mean over 3) = -228.232


 95%|█████████▍| 9949/10500 [18:48<01:02,  8.82it/s]

Episode finished after 441 timesteps with reward=-255.75615935743025
Episode finished after 247 timesteps with reward=-196.60234598107672


 95%|█████████▍| 9951/10500 [18:49<01:02,  8.81it/s]

Episode finished after 229 timesteps with reward=-200.79805024639398
iter=9950	epsilon=0.050
Current score(mean over 3) = -217.719


 95%|█████████▌| 9999/10500 [18:55<00:56,  8.81it/s]

Episode finished after 258 timesteps with reward=-134.6407125974494
Episode finished after 322 timesteps with reward=-200.94015346478136


 95%|█████████▌| 10002/10500 [18:56<00:56,  8.80it/s]

Episode finished after 294 timesteps with reward=-244.82736329149475
iter=10000	epsilon=0.050
Current score(mean over 3) = -193.469


 96%|█████████▌| 10048/10500 [19:01<00:51,  8.80it/s]

Episode finished after 687 timesteps with reward=-293.37430145856706
Episode finished after 656 timesteps with reward=-291.92196819835897


 96%|█████████▌| 10052/10500 [19:06<00:51,  8.77it/s]

Episode finished after 765 timesteps with reward=-300.763489285
iter=10050	epsilon=0.050
Current score(mean over 3) = -295.353


 96%|█████████▌| 10098/10500 [19:12<00:45,  8.76it/s]

Episode finished after 364 timesteps with reward=-270.718473554354
Episode finished after 349 timesteps with reward=-232.8027476333894


 96%|█████████▌| 10101/10500 [19:14<00:45,  8.75it/s]

Episode finished after 475 timesteps with reward=-281.6264551634356
iter=10100	epsilon=0.050
Current score(mean over 3) = -261.716


 97%|█████████▋| 10149/10500 [19:21<00:40,  8.74it/s]

Episode finished after 456 timesteps with reward=-261.1438733869768
Episode finished after 779 timesteps with reward=-303.94039062195554


 97%|█████████▋| 10150/10500 [19:24<00:40,  8.72it/s]

Episode finished after 442 timesteps with reward=-244.3832839817826
iter=10150	epsilon=0.050
Current score(mean over 3) = -269.823


 97%|█████████▋| 10199/10500 [19:30<00:34,  8.71it/s]

Episode finished after 520 timesteps with reward=-236.38816038873486
Episode finished after 243 timesteps with reward=-195.86067802399964


 97%|█████████▋| 10201/10500 [19:33<00:34,  8.70it/s]

Episode finished after 526 timesteps with reward=-268.4823707373922
iter=10200	epsilon=0.050
Current score(mean over 3) = -233.577


 98%|█████████▊| 10249/10500 [19:39<00:28,  8.69it/s]

Episode finished after 505 timesteps with reward=-315.5243767119666
Episode finished after 1000 timesteps with reward=-184.6453412563946


 98%|█████████▊| 10251/10500 [19:44<00:28,  8.65it/s]

Episode finished after 604 timesteps with reward=-274.09800737692046
iter=10250	epsilon=0.050
Current score(mean over 3) = -258.089


 98%|█████████▊| 10299/10500 [19:50<00:23,  8.65it/s]

Episode finished after 977 timesteps with reward=-346.4629430117999
Episode finished after 235 timesteps with reward=-165.2613239188026


 98%|█████████▊| 10302/10500 [19:55<00:22,  8.62it/s]

Episode finished after 671 timesteps with reward=-292.7066907168563
iter=10300	epsilon=0.050
Current score(mean over 3) = -268.144


 99%|█████████▊| 10349/10500 [20:02<00:17,  8.61it/s]

Episode finished after 164 timesteps with reward=-151.36739229717264
Episode finished after 305 timesteps with reward=-188.2871158060284


 99%|█████████▊| 10352/10500 [20:04<00:17,  8.59it/s]

Episode finished after 921 timesteps with reward=-292.7147566346384
iter=10350	epsilon=0.050
Current score(mean over 3) = -210.790


 99%|█████████▉| 10399/10500 [20:10<00:11,  8.59it/s]

Episode finished after 527 timesteps with reward=-244.35548682970736
Episode finished after 975 timesteps with reward=-294.7326441161448


 99%|█████████▉| 10401/10500 [20:14<00:11,  8.56it/s]

Episode finished after 358 timesteps with reward=-202.24487022965857
iter=10400	epsilon=0.050
Current score(mean over 3) = -247.111


100%|█████████▉| 10449/10500 [20:21<00:05,  8.55it/s]

Episode finished after 306 timesteps with reward=-158.7883213352794
Episode finished after 573 timesteps with reward=-279.34108338344436


100%|█████████▉| 10452/10500 [20:25<00:05,  8.53it/s]

Episode finished after 976 timesteps with reward=-400.878857921913
iter=10450	epsilon=0.050
Current score(mean over 3) = -279.669


100%|█████████▉| 10499/10500 [20:33<00:00,  8.51it/s]

Episode finished after 312 timesteps with reward=-292.5437785368547
Episode finished after 447 timesteps with reward=-190.57506065380454


100%|██████████| 10500/10500 [20:34<00:00,  8.50it/s]

Episode finished after 392 timesteps with reward=-222.733239235747
iter=10500	epsilon=0.050
Current score(mean over 3) = -235.284





In [None]:
import pandas as pd

# Learning curve: plot the recorded session rewards against the keys of
# `rewards` (used as the x axis), smoothed with an exponentially weighted
# moving average so the trend is visible through the per-evaluation noise.

# Sort by key so the line is drawn left to right. Tuple-unpacking lambdas
# (`lambda (k, v): k`) are a SyntaxError in Python 3, so index the pair instead.
iters, session_rewards = zip(*sorted(rewards.items(), key=lambda item: item[0]))

# `pandas.ewma` was deprecated and later removed; `Series.ewm(...).mean()` is the
# replacement and uses the same defaults (adjust=True, min_periods=0).
smoothed_rewards = pd.Series(np.array(session_rewards)).ewm(span=10).mean().values

plt.plot(iters, smoothed_rewards)
plt.xlabel("iteration")
plt.ylabel("session reward (EWMA, span=10)");

In [None]:
from IPython.display import HTML, display

# Final evaluation: play 10 games with video recording enabled, saving under ./records.
final_reward = pool.evaluate(n_games=10,save_path="./records",record_video=True)

# np.mean leaves a scalar unchanged and averages a sequence of per-game rewards,
# so the printed number matches its "average reward" label either way.
print("average reward:",np.mean(final_reward))

# os.listdir returns entries in arbitrary order; sort so the videos are shown
# in a stable order across runs.
video_names = sorted(name for name in os.listdir("./records/") if name.endswith(".mp4"))

for video_name in video_names:
    # A bare HTML(...) expression inside a loop is discarded by the notebook;
    # it has to be passed to display() explicitly to be rendered.
    display(HTML("""
    <video width="640" height="480" controls>
      <source src="{}" type="video/mp4">
    </video>
    """.format("./records/"+video_name)))