# DQN

In [232]:
# import package needed
%matplotlib inline
import matplotlib.pyplot as plt
import os
os.environ["SDL_VIDEODRIVER"] = "dummy"  # this line make pop-out window not appear
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import tensorflow as tf
import numpy as np
import skimage.color
import skimage.transform
from ple.games.flappybird import FlappyBird
from ple import PLE
game = FlappyBird()
env = PLE(game, fps=30, display_screen=False)  # environment interface to game

In [233]:
# define input size
screen_width = 80
screen_height = 80
num_stack = 4

In [234]:
import math
import copy
from collections import defaultdict
MIN_EXPLORING_RATE = 10e-4
state = game.getGameState()
class Agent:

  def __init__(self, name, num_action, t=0, discount_factor=0.99):
    self.exploring_rate = 0.1
    self.discount_factor = discount_factor
    self.num_action = num_action
    self.name = name
    self.bucket_range_per_feature = bucket_range_per_feature
    with tf.variable_scope(name):
        
      self.build_model()
    
  def get_state_idx(self, state):
    # instead of using absolute position of pipe, use relative position
    state = copy.deepcopy(state)
    state['next_next_pipe_bottom_y'] -= state['player_y']
    state['next_next_pipe_top_y'] -= state['player_y']
    state['next_pipe_bottom_y'] -= state['player_y']
    state['next_pipe_top_y'] -= state['player_y']

    # sort to make list converted from dict ordered in alphabet order
    state_key = [k for k, v in sorted(state.items())]

    # do bucketing to decrease state space to speed up training
    state_idx = []
    for key in state_key:
        state_idx.append(state[key] / self.bucket_range_per_feature[key])
    return tuple(state_idx)

  def build_model(self):
    # input: current screen, selected action and reward
    #self.input_screen = tf.placeholder(tf.float32, shape=[None, screen_width, screen_height, num_stack])
    self.input_state = tf.placeholder(tf.float32, shape=[None, 8, num_stack])
    self.action = tf.placeholder(tf.int32, [None])
    self.reward = tf.placeholder(tf.float32, [None])
    self.is_training = tf.placeholder(tf.bool, shape=[])

    def net(state, reuse=False):
      with tf.variable_scope("fc", initializer=tf.truncated_normal_initializer(stddev=1e-2)):
        for idx in range(num_stack):
          if idx == 0:
            pre_dense = tf.layers.dense(inputs=state[:, :, idx], units=6400, activation=tf.nn.relu, reuse=reuse)
            pre_dense = tf.reshape(pre_dense, [-1, 8, 1])
          else:
            pre_stack = tf.layers.dense(inputs=state[:, :, idx], units=6400, activation=tf.nn.relu, reuse=True)
            pre_stack = tf.reshape(pre_stack, [-1, 8, 1])
            pre_dense = tf.concat([pre_dense, pre_stack], 2)
                
      with tf.variable_scope("layers", reuse=reuse, initializer=tf.truncated_normal_initializer(stddev=1e-2)):
        state_in = tf.reshape(pre_dense, [-1, screen_width, screen_height, num_stack]) 
        conv1 = tf.layers.conv2d(inputs=state_in, filters=32, kernel_size=[8, 8], 
                                 strides=[4, 4], padding='SAME', activation=tf.nn.relu)
        pool1 = tf.layers.max_pooling2d(conv1, pool_size=[2, 2], strides=[2, 2], padding='SAME')
        conv2 = tf.layers.conv2d(inputs=pool1, filters=64, kernel_size=[4, 4],
                                 strides=[2, 2], padding='SAME', activation=tf.nn.relu)
        conv3 = tf.layers.conv2d(inputs=conv2, filters=64, kernel_size=[3, 3],
                                 strides=[1, 1], padding='SAME', activation=tf.nn.relu)
        flat = tf.contrib.layers.flatten(conv3)
        dense = tf.layers.dense(inputs=flat, units=512, activation=tf.nn.relu)
        Q = tf.layers.dense(inputs=dense, units=self.num_action, activation=None)
        return Q

    # optimize
    self.output = net(self.input_state)  # Q(s,a,theta) for all a, shape (batch_size, num_action)
    index = tf.stack([tf.range(tf.shape(self.action)[0]), self.action], axis=1)
    self.esti_Q = tf.gather_nd(self.output,index)  # Q(s,a,theta) for selected action, shape (batch_size, 1)
    self.max_Q = tf.reduce_max(self.output, axis=1)  # max(Q(s',a',theta')), shape (batch_size, 1)
    self.tar_Q = tf.placeholder(tf.float32, [None])
    # loss = E[r+max(Q(s',a',theta'))-Q(s,a,theta)]
    self.loss = tf.reduce_mean(tf.square(self.reward + self.discount_factor * self.tar_Q - self.esti_Q))

    optimizer = tf.train.AdamOptimizer(learning_rate=1e-5)
    self.g_gvs = optimizer.compute_gradients(self.loss, var_list=[v for v in tf.global_variables() if self.name in v.name])
    self.train_op = optimizer.apply_gradients(self.g_gvs)
    self.pred = tf.argmax(self.output, axis=1)  # select action with highest action-value, only used in inference

  def select_action(self, input_state, sess):
    # epsilon-greedy
    if np.random.rand() < self.exploring_rate:
      action = np.random.choice(num_action)  # Select a random action
    else:
      input_states = np.array(input_state).transpose([1, 0])
      feed_dict = {
          self.input_state: input_states[None, :],
          self.is_training: False,
      }
      action = sess.run(
          self.pred,
          feed_dict=feed_dict)[0]  # Select the action with the highest q
    return action

  def update_policy(self, input_state, actions, rewards, input_screens_plum,
                    terminal, target_netwrok):
    # use max_Q estimate from target one to update online one
    feed_dict = {
        target_netwrok.input_state:
            np.array(input_screens_plum).transpose([0, 2, 1]),
        target_netwrok.is_training:
            True,
    }
    max_Q = sess.run(target_netwrok.max_Q, feed_dict=feed_dict)
    max_Q *= ~np.array(terminal)
    feed_dict = {
        self.input_state: np.array(input_state).transpose([0, 2, 1]),
        self.tar_Q: max_Q,
        self.action: actions,
        self.reward: rewards,
        self.is_training: True,
    }
    loss, _ = sess.run([self.loss, self.train_op], feed_dict=feed_dict)
    return loss

  def update_parameters(self, episode):
    if self.exploring_rate > MIN_EXPLORING_RATE:
      self.exploring_rate -= (0.1 - MIN_EXPLORING_RATE) / 3000000

  def shutdown_explore(self):
    # make action selection greedy
    self.exploring_rate = 0


In [235]:
def get_update_ops():
  # return operations assign weight to target network
  src_vars = [v for v in tf.global_variables() if 'online' in v.name]
  tar_vars = [v for v in tf.global_variables() if 'target' in v.name]
  update_ops = []
  for src_var, tar_var in zip(src_vars, tar_vars):
    update_ops.append(tar_var.assign(src_var))
  return update_ops


def update_target(update_ops, sess):
  sess.run(update_ops)

In [236]:
# init agent
tf.reset_default_graph()
num_action = len(env.getActionSet())
bucket_range_per_feature = {
  'next_next_pipe_bottom_y': 40,
  'next_next_pipe_dist_to_player': 512,
  'next_next_pipe_top_y': 40,
  'next_pipe_bottom_y': 20,
  'next_pipe_dist_to_player': 20,
  'next_pipe_top_y': 20,
  'player_vel': 4,
  'player_y': 16
}
# agent for frequently updating
online_agent = Agent('online', num_action, bucket_range_per_feature)

# agent for slow updating
target_agent = Agent('target', num_action, bucket_range_per_feature)
update_ops = get_update_ops()

In [237]:
class Replay_buffer():

  def __init__(self, buffer_size=50000):
    self.experiences = []
    self.buffer_size = buffer_size

  def add(self, experience):
    if len(self.experiences) >= self.buffer_size:
      self.experiences.pop(0)
    self.experiences.append(experience)

  def sample(self, size):
    """
        sameple experience from buffer
        """
    if size > len(self.experiences):
      experiences_idx = np.random.choice(len(self.experiences), size=size)
    else:
      experiences_idx = np.random.choice(
          len(self.experiences), size=size, replace=False)
    # from all sampled experiences, extract a tuple of (s,a,r,s')
    states = []
    actions = []
    rewards = []
    screens_plum = []
    terminal = []
    for i in range(size):
      states.append(self.experiences[experiences_idx[i]][0])
      actions.append(self.experiences[experiences_idx[i]][1])
      rewards.append(self.experiences[experiences_idx[i]][2])
      screens_plum.append(self.experiences[experiences_idx[i]][3])
      terminal.append(self.experiences[experiences_idx[i]][4])
    return states, actions, rewards, screens_plum, terminal

In [238]:
# init buffer
buffer = Replay_buffer()

In [239]:
def make_anim(images, fps=60, true_image=False):
  duration = len(images) / fps
  import moviepy.editor as mpy

  def make_frame(t):
    try:
      x = images[int(len(images) / duration * t)]
    except:
      x = images[-1]

    if true_image:
      return x.astype(np.uint8)
    else:
      return ((x + 1) / 2 * 255).astype(np.uint8)

  clip = mpy.VideoClip(make_frame, duration=duration)
  clip.fps = fps
  return clip

In [240]:
# init all
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.InteractiveSession(config=config)
sess.run(tf.global_variables_initializer())

In [244]:
from IPython.display import Image, display

update_every_t_step = 3
print_every_episode = 10
save_video_every_episode = 1000
NUM_EPISODE = 30000
NUM_EXPLORE = 20

# we can redefine origin reward function
reward_values = {
    "positive": 1,  # reward pass a pipe
    "tick": 0.1,  # reward per timestamp
    "loss": -1,  # reward of gameover
}
for episode in range(0, NUM_EPISODE + 1):

  # Reset the environment
  game = FlappyBird()
  # for demo purpose, the following code is trained in the same scene,
  env = PLE(
      game,
      fps=30,
      display_screen=False,
      reward_values=reward_values,
      rng=np.random.RandomState(1))
  env.reset_game()
  env.act(0)  # dummy input to make sure input screen is correct

  # record frame
  if episode % save_video_every_episode == 0:
    frames = [env.getScreenRGB()]

  # for every 500 episodes, shutdown exploration to see performance of greedy action
  if episode % print_every_episode == 0:
    online_agent.shutdown_explore()

  # grayscale input screen for this episode
  #input_screens = [preprocess(env.getScreenGrayscale())] * 4
  input_state = [online_agent.get_state_idx(game.getGameState())] * 4
    
  # experience for this episode, store all (s,a,r,s') tuple
  experience = []

  # cumulate reward for this episode
  cum_reward = 0

  t = 0
  while not env.game_over():
    #print(np.array(input_state).shape)
    # feed four previous screen, select an action
    action = online_agent.select_action(input_state[-4:], sess)

    # execute the action and get reward
    reward = env.act(env.getActionSet()[action])

    # record frame
    if episode % save_video_every_episode == 0:
      frames.append(env.getScreenRGB())

    # cumulate reward
    cum_reward += reward

    # append grayscale screen for this episode
    #input_screens.append(preprocess(env.getScreenGrayscale()))
    input_state.append(online_agent.get_state_idx(game.getGameState()))
    # append experience for this episode
    buffer.add((input_state[-5:-1], action, reward, input_state[-4:],
                env.game_over()))
    t += 1

    # update agent
  if episode > NUM_EXPLORE:
    train_states, train_actions, train_rewards, train_states_plum, terminal = buffer.sample(
        32)
    loss = online_agent.update_policy(train_states, train_actions,
                                      train_rewards, train_states_plum,
                                      terminal, target_agent)
  if t % update_every_t_step == 0 and episode > NUM_EXPLORE:
    update_target(update_ops, sess)

  # update explore rating and learning rate
  online_agent.update_parameters(episode)
  target_agent.update_parameters(episode)

  if episode % print_every_episode == 0 and episode > NUM_EXPLORE:
    print(
        "[{}] time live:{}, cumulated reward: {}, exploring rate: {}, loss: {}".
        format(episode, t, cum_reward, target_agent.exploring_rate, loss))

  if episode % save_video_every_episode == 0:  # for every 100 episode, record an animation
    clip = make_anim(frames, fps=60, true_image=True).rotate(-90)
    clip.write_videofile("movie/DQN-{}.webm".format(episode), fps=60)

[MoviePy] >>>> Building video movie/DQN-0.webm
[MoviePy] Writing video movie/DQN-0.webm


100%|██████████| 63/63 [00:00<00:00, 89.73it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-0.webm 






[30] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.0998999440000106, loss: 0.0014970132615417242
[40] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09989961400001063, loss: 0.05540362000465393
[50] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09989928400001066, loss: 0.027302715927362442
[60] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.0998989540000107, loss: 0.0279642753303051
[70] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09989862400001073, loss: 0.055154576897621155
[80] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09989829400001077, loss: 0.0004940094659104943
[90] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.0998979640000108, loss: 0.00044269702630117536
[100] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09989763400001084, loss: 0.027507835999131203
[110] time live:61, cumulated reward: 5.0

[710] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09987750400001297, loss: 0.021696938201785088
[720] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.099877174000013, loss: 0.000530610210262239
[730] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09987684400001304, loss: 0.0005528137553483248
[740] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09987651400001307, loss: 0.02068259008228779
[750] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09987618400001311, loss: 0.0009218280902132392
[760] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09987585400001314, loss: 0.04127369076013565
[770] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09987552400001318, loss: 0.08157572150230408
[780] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09987519400001321, loss: 0.04060322046279907
[790] time live:61, cumulated reward

100%|██████████| 63/63 [00:00<00:00, 100.54it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-1000.webm 






[1010] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09986760400001402, loss: 0.014769770205020905
[1020] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09986727400001405, loss: 0.0020178803242743015
[1030] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09986694400001409, loss: 0.01271409448236227
[1040] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09986661400001412, loss: 0.0010119931539520621
[1050] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09986628400001415, loss: 0.0033694179728627205
[1060] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09986595400001419, loss: 0.0014905157731845975
[1070] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09986562400001422, loss: 0.0022176355123519897
[1080] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09986529400001426, loss: 0.0007907754043117166
[1090] time live:61

[1680] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09984549400001635, loss: 0.00018012085638474673
[1690] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09984516400001639, loss: 0.0002484449069015682
[1700] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09984483400001642, loss: 0.0020857262425124645
[1710] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09984450400001646, loss: 0.0012518414296209812
[1720] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.0998441740000165, loss: 0.0026367492973804474
[1730] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09984384400001653, loss: 0.00046790033229626715
[1740] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09984351400001656, loss: 0.00023456223425455391
[1750] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.0998431840000166, loss: 0.0003148185205645859
[1760] time liv

100%|██████████| 63/63 [00:00<00:00, 88.62it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-2000.webm 






[2010] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09983460400001751, loss: 0.00010880485933739692
[2020] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09983427400001754, loss: 9.916868293657899e-05
[2030] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09983394400001758, loss: 0.00021295403712429106
[2040] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09983361400001761, loss: 6.335727812256664e-05
[2050] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09983328400001765, loss: 0.0010229393374174833
[2060] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09983295400001768, loss: 0.0004961332306265831
[2070] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09983262400001772, loss: 0.00011413671018090099
[2080] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09983229400001775, loss: 0.0028015412390232086
[2090] time l

[2680] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09981249400001985, loss: 7.817221194272861e-05
[2690] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09981216400001988, loss: 0.00011691299732774496
[2700] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09981183400001992, loss: 0.0011484521673992276
[2710] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09981150400001995, loss: 9.775212674867362e-05
[2720] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09981117400001999, loss: 7.41952535463497e-05
[2730] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09981084400002002, loss: 2.799350841087289e-05
[2740] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09981051400002006, loss: 0.00010092278535012156
[2750] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09981018400002009, loss: 3.740761530934833e-05
[2760] time liv

100%|██████████| 63/63 [00:00<00:00, 103.52it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-3000.webm 






[3010] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.099801604000021, loss: 1.8251015717396513e-05
[3020] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09980127400002103, loss: 5.2830655477009714e-05
[3030] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09980094400002107, loss: 9.271307499147952e-05
[3040] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.0998006140000211, loss: 0.0001387523952871561
[3050] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09980028400002114, loss: 4.162804543739185e-05
[3060] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09979995400002117, loss: 0.00015389952750410885
[3070] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09979962400002121, loss: 4.305974653107114e-05
[3080] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09979929400002124, loss: 8.180116128642112e-05
[3090] time live

[3680] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09977949400002334, loss: 8.247554433182813e-06
[3690] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09977916400002337, loss: 5.109660378366243e-06
[3700] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09977883400002341, loss: 6.887895324325655e-06
[3710] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09977850400002344, loss: 5.2301024879852775e-06
[3720] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09977817400002348, loss: 6.010312972648535e-06
[3730] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09977784400002351, loss: 8.993875781015959e-06
[3740] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09977751400002355, loss: 3.4684487673075637e-06
[3750] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09977718400002358, loss: 0.000138249815790914
[3760] time liv

100%|██████████| 63/63 [00:00<00:00, 100.29it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-4000.webm 






[4010] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09976860400002449, loss: 0.00014506543811876327
[4020] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09976827400002453, loss: 3.853076486848295e-05
[4030] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09976794400002456, loss: 5.968345431028865e-05
[4040] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.0997676140000246, loss: 0.0030972552485764027
[4050] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09976728400002463, loss: 0.001587308943271637
[4060] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09976695400002467, loss: 0.0007097387569956481
[4070] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.0997666240000247, loss: 0.0006183425430208445
[4080] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09976629400002474, loss: 0.0003250133013352752
[4090] time live:6

[4680] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09974649400002683, loss: 0.0021003237925469875
[4690] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09974616400002687, loss: 0.004000289365649223
[4700] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.0997458340000269, loss: 0.00045196490827947855
[4710] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09974550400002694, loss: 0.0038433740846812725
[4720] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09974517400002697, loss: 0.0011068183230236173
[4730] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.099744844000027, loss: 0.004140446428209543
[4740] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09974451400002704, loss: 0.004354111384600401
[4750] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09974418400002708, loss: 0.0032204152084887028
[4760] time live:61, 

100%|██████████| 63/63 [00:00<00:00, 88.74it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-5000.webm 






[5010] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09973560400002798, loss: 0.0038273048121482134
[5020] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09973527400002802, loss: 0.0010684231529012322
[5030] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09973494400002805, loss: 0.0005781054496765137
[5040] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09973461400002809, loss: 0.0005394779145717621
[5050] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09973428400002812, loss: 0.0003303741686977446
[5060] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09973395400002816, loss: 0.0032685219775885344
[5070] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09973362400002819, loss: 0.0032637910917401314
[5080] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09973329400002823, loss: 0.0003176906902808696
[5090] time live

[5680] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09971349400003032, loss: 0.0004092780000064522
[5690] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09971316400003036, loss: 0.003051354316994548
[5700] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09971283400003039, loss: 0.00031791318906471133
[5710] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09971250400003043, loss: 0.0021653645671904087
[5720] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09971217400003046, loss: 0.0020047365687787533
[5730] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.0997118440000305, loss: 0.0024583337362855673
[5740] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09971151400003053, loss: 0.0042220354080200195
[5750] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09971118400003057, loss: 0.0012650011340156198
[5760] time live:

100%|██████████| 63/63 [00:00<00:00, 99.88it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-6000.webm 






[6010] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09970260400003148, loss: 0.0009745702263899148
[6020] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09970227400003151, loss: 0.0005350529681891203
[6030] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09970194400003154, loss: 0.00260781473480165
[6040] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09970161400003158, loss: 0.0018573000561445951
[6050] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09970128400003161, loss: 0.0005448235897347331
[6060] time live:57, cumulated reward: 4.699999999999996, exploring rate: 0.09970095400003165, loss: 0.0029456017073243856
[6070] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09970062400003168, loss: 0.0026586949825286865
[6080] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09970029400003172, loss: 0.0046918257139623165
[6090] time live:6

[6680] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09968049400003381, loss: 0.0005289405817165971
[6690] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09968016400003385, loss: 0.0018477627309039235
[6700] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09967983400003388, loss: 0.0004811867547687143
[6710] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09967950400003392, loss: 0.0002805071708280593
[6720] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09967917400003395, loss: 0.0898323580622673
[6730] time live:60, cumulated reward: 4.999999999999995, exploring rate: 0.09967884400003399, loss: 0.0012626282405108213
[6740] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09967851400003402, loss: 0.001812349772080779
[6750] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09967818400003406, loss: 0.000733880908228457
[6760] time live:61, 

100%|██████████| 63/63 [00:00<00:00, 101.36it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-7000.webm 






[7010] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09966960400003497, loss: 0.08060082793235779
[7020] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.099669274000035, loss: 0.0017808160046115518
[7030] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09966894400003504, loss: 0.0013011321425437927
[7040] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09966861400003507, loss: 0.0005484741413965821
[7050] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.0996682840000351, loss: 0.001539937686175108
[7060] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09966795400003514, loss: 0.0018568772356957197
[7070] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09966762400003518, loss: 0.0005989829660393298
[7080] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09966729400003521, loss: 0.0024314960464835167
[7090] time live:59, c

[7680] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.0996474940000373, loss: 0.0023197298869490623
[7690] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09964716400003734, loss: 0.005660065449774265
[7700] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09964683400003738, loss: 0.002030750038102269
[7710] time live:73, cumulated reward: 7.29999999999999, exploring rate: 0.09964650400003741, loss: 0.0009347492014057934
[7720] time live:70, cumulated reward: 6.999999999999991, exploring rate: 0.09964617400003745, loss: 0.004117172211408615
[7730] time live:77, cumulated reward: 7.699999999999989, exploring rate: 0.09964584400003748, loss: 0.003461644286289811
[7740] time live:71, cumulated reward: 7.099999999999991, exploring rate: 0.09964551400003752, loss: 0.0006158800679259002
[7750] time live:77, cumulated reward: 7.699999999999989, exploring rate: 0.09964518400003755, loss: 0.001891981577500701
[7760] time live:61, cu

100%|██████████| 63/63 [00:00<00:00, 94.99it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-8000.webm 

[8010] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09963660400003846, loss: 0.0017616325058043003
[8020] time live:71, cumulated reward: 7.099999999999991, exploring rate: 0.0996362740000385, loss: 0.004248049110174179
[8030] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09963594400003853, loss: 0.000893118791282177
[8040] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09963561400003856, loss: 0.0014359313063323498
[8050] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.0996352840000386, loss: 0.0026450147852301598
[8060] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09963495400003863, loss: 0.0018307799473404884
[8070] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09963462400003867, loss: 0.06885968148708344
[8080] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.099

[8680] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.0996144940000408, loss: 0.04192488268017769
[8690] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09961416400004083, loss: 0.0838233008980751
[8700] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09961383400004087, loss: 0.002018436323851347
[8710] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.0996135040000409, loss: 0.03833160176873207
[8720] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09961317400004094, loss: 0.0013711220817640424
[8730] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09961284400004097, loss: 0.002694424707442522
[8740] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09961251400004101, loss: 0.03727913275361061
[8750] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09961218400004104, loss: 0.0745513066649437
[8760] time live:62, cumulated r

100%|██████████| 63/63 [00:00<00:00, 97.40it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-9000.webm 






[9010] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09960360400004195, loss: 0.0010254679946228862
[9020] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09960327400004199, loss: 0.021427759900689125
[9030] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09960294400004202, loss: 0.0009730717865750194
[9040] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09960261400004206, loss: 0.0007805897039361298
[9050] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09960228400004209, loss: 0.001354325097054243
[9060] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09960195400004213, loss: 0.0023411845322698355
[9070] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09960162400004216, loss: 0.003447979688644409
[9080] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.0996012940000422, loss: 0.0018874923698604107
[9090] time live:61,

[9690] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09958116400004433, loss: 0.004055578727275133
[9700] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09958083400004436, loss: 0.0019283888395875692
[9710] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.0995805040000444, loss: 0.0004456789174582809
[9720] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09958017400004443, loss: 0.009000182151794434
[9730] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09957984400004447, loss: 0.0011673083063215017
[9740] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.0995795140000445, loss: 0.0023265592753887177
[9750] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09957918400004453, loss: 0.0008006065618246794
[9760] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09957885400004457, loss: 0.0007547904970124364
[9770] time live:61,

100%|██████████| 63/63 [00:00<00:00, 108.54it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-10000.webm 






[10010] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09957060400004544, loss: 0.003203370375558734
[10020] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09957027400004548, loss: 0.013392630033195019
[10030] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09956994400004551, loss: 0.0010971175506711006
[10040] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09956961400004555, loss: 0.004451838321983814
[10050] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09956928400004558, loss: 0.000731209060177207
[10060] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09956895400004562, loss: 0.0010244878940284252
[10070] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09956862400004565, loss: 0.00037226476706564426
[10080] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09956829400004569, loss: 0.0032190189231187105
[10090] tim

[10680] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09954849400004778, loss: 0.0010165529092773795
[10690] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09954816400004782, loss: 0.0001269989152206108
[10700] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09954783400004785, loss: 0.00020196600235067308
[10710] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09954750400004789, loss: 0.0001927208504639566
[10720] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09954717400004792, loss: 0.0012605688534677029
[10730] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09954684400004796, loss: 0.0003130377735942602
[10740] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09954651400004799, loss: 2.9439699574140832e-05
[10750] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09954618400004803, loss: 0.00031469581881538033
[1076

 99%|█████████▊| 68/69 [00:00<00:00, 113.45it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-11000.webm 






[11010] time live:75, cumulated reward: 7.499999999999989, exploring rate: 0.09953760400004893, loss: 0.001559027354232967
[11020] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09953727400004897, loss: 0.00026526517467573285
[11030] time live:75, cumulated reward: 7.499999999999989, exploring rate: 0.099536944000049, loss: 0.0009130770340561867
[11040] time live:70, cumulated reward: 6.999999999999991, exploring rate: 0.09953661400004904, loss: 0.000272328092250973
[11050] time live:64, cumulated reward: 5.399999999999993, exploring rate: 0.09953628400004907, loss: 0.00021607844973914325
[11060] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09953595400004911, loss: 0.0001879448100226
[11070] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09953562400004914, loss: 0.0005645417259074748
[11080] time live:64, cumulated reward: 5.399999999999993, exploring rate: 0.09953529400004918, loss: 0.00028065979131497443
[11090] time

[11680] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09951549400005127, loss: 0.0014653339749202132
[11690] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09951516400005131, loss: 0.0011168455239385366
[11700] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09951483400005134, loss: 0.0005340576171875
[11710] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09951450400005138, loss: 0.0009175066370517015
[11720] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09951417400005141, loss: 0.0007997670909389853
[11730] time live:60, cumulated reward: 4.999999999999995, exploring rate: 0.09951384400005145, loss: 0.0007645590230822563
[11740] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09951351400005148, loss: 0.0059937103651463985
[11750] time live:67, cumulated reward: 6.699999999999992, exploring rate: 0.09951318400005152, loss: 0.0005380334332585335
[11760] tim

100%|██████████| 63/63 [00:00<00:00, 107.36it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-12000.webm 






[12010] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09950460400005243, loss: 0.0003149860422126949
[12020] time live:90, cumulated reward: 8.999999999999984, exploring rate: 0.09950427400005246, loss: 0.013171217404305935
[12030] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.0995039440000525, loss: 0.0009143953793682158
[12040] time live:90, cumulated reward: 8.999999999999984, exploring rate: 0.09950361400005253, loss: 0.0003354927757754922
[12050] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09950328400005257, loss: 0.0011636712588369846
[12060] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.0995029540000526, loss: 0.0013584597036242485
[12070] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09950262400005264, loss: 0.000997732626274228
[12080] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09950229400005267, loss: 0.015345599502325058
[12090] time 

[12680] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09948249400005477, loss: 0.026447564363479614
[12690] time live:64, cumulated reward: 5.399999999999993, exploring rate: 0.0994821640000548, loss: 0.07492679357528687
[12700] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09948183400005484, loss: 0.04282497614622116
[12710] time live:60, cumulated reward: 4.999999999999995, exploring rate: 0.09948150400005487, loss: 0.026643304154276848
[12720] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.0994811740000549, loss: 0.0006280823145061731
[12730] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09948084400005494, loss: 0.0005233426345512271
[12740] time live:67, cumulated reward: 6.699999999999992, exploring rate: 0.09948051400005498, loss: 0.0007721915026195347
[12750] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09948018400005501, loss: 0.0010238183895125985
[12760] time liv

100%|██████████| 63/63 [00:00<00:00, 95.58it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-13000.webm 






[13010] time live:75, cumulated reward: 7.499999999999989, exploring rate: 0.09947160400005592, loss: 0.03392968326807022
[13020] time live:70, cumulated reward: 6.999999999999991, exploring rate: 0.09947127400005595, loss: 0.001609241939149797
[13030] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09947094400005599, loss: 0.0020305707585066557
[13040] time live:75, cumulated reward: 7.499999999999989, exploring rate: 0.09947061400005602, loss: 0.0011741917114704847
[13050] time live:90, cumulated reward: 8.999999999999984, exploring rate: 0.09947028400005606, loss: 0.09519288688898087
[13060] time live:75, cumulated reward: 7.499999999999989, exploring rate: 0.0994699540000561, loss: 0.0013633331982418895
[13070] time live:70, cumulated reward: 6.999999999999991, exploring rate: 0.09946962400005613, loss: 0.03354058414697647
[13080] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09946929400005616, loss: 0.0002861526154447347
[13090] time liv

[13680] time live:90, cumulated reward: 8.999999999999984, exploring rate: 0.09944949400005826, loss: 0.010768676176667213
[13690] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.0994491640000583, loss: 0.01955900527536869
[13700] time live:90, cumulated reward: 8.999999999999984, exploring rate: 0.09944883400005833, loss: 0.039950959384441376
[13710] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09944850400005836, loss: 0.0012585732620209455
[13720] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.0994481740000584, loss: 0.002444857731461525
[13730] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09944784400005843, loss: 0.018858667463064194
[13740] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09944751400005847, loss: 0.0004804245545528829
[13750] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.0994471840000585, loss: 0.05068645998835564
[13760] time live:6

 98%|█████████▊| 63/64 [00:00<00:00, 112.06it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-14000.webm 






[14010] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09943860400005941, loss: 0.0009685647673904896
[14020] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09943827400005945, loss: 0.0012439822312444448
[14030] time live:67, cumulated reward: 6.699999999999992, exploring rate: 0.09943794400005948, loss: 0.058996837586164474
[14040] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09943761400005952, loss: 0.009976968169212341
[14050] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09943728400005955, loss: 0.017508238554000854
[14060] time live:67, cumulated reward: 6.699999999999992, exploring rate: 0.09943695400005959, loss: 0.001970976358279586
[14070] time live:75, cumulated reward: 7.499999999999989, exploring rate: 0.09943662400005962, loss: 0.06124459207057953
[14080] time live:75, cumulated reward: 7.499999999999989, exploring rate: 0.09943629400005966, loss: 0.0011887861182913184
[14090] time l

[14680] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09941649400006175, loss: 0.0016119801439344883
[14690] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09941616400006179, loss: 0.0036043867003172636
[14700] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09941583400006182, loss: 0.006522048264741898
[14710] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09941550400006186, loss: 0.018754858523607254
[14720] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09941517400006189, loss: 0.0014786106767132878
[14730] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09941484400006192, loss: 0.0012929990189149976
[14740] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09941451400006196, loss: 0.008309765718877316
[14750] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.099414184000062, loss: 0.0008296614978462458
[14760] time 

 98%|█████████▊| 61/62 [00:00<00:00, 111.17it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-15000.webm 






[15010] time live:60, cumulated reward: 4.999999999999995, exploring rate: 0.0994056040000629, loss: 0.0018512008246034384
[15020] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09940527400006294, loss: 0.0016213860362768173
[15030] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09940494400006297, loss: 0.002039876999333501
[15040] time live:65, cumulated reward: 6.499999999999993, exploring rate: 0.09940461400006301, loss: 0.0006408224580809474
[15050] time live:71, cumulated reward: 7.099999999999991, exploring rate: 0.09940428400006304, loss: 0.002002231078222394
[15060] time live:64, cumulated reward: 5.399999999999993, exploring rate: 0.09940395400006308, loss: 0.04547456279397011
[15070] time live:71, cumulated reward: 7.099999999999991, exploring rate: 0.09940362400006311, loss: 0.049032002687454224
[15080] time live:75, cumulated reward: 7.499999999999989, exploring rate: 0.09940329400006315, loss: 0.0012635956518352032
[15090] time l

[15680] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09938349400006524, loss: 0.0021555391140282154
[15690] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09938316400006528, loss: 0.0026914821937680244
[15700] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09938283400006531, loss: 0.0033318938221782446
[15710] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09938250400006535, loss: 0.003233330324292183
[15720] time live:60, cumulated reward: 4.999999999999995, exploring rate: 0.09938217400006538, loss: 0.0009761503897607327
[15730] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09938184400006542, loss: 0.0015173107385635376
[15740] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09938151400006545, loss: 0.0037520327605307102
[15750] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09938118400006549, loss: 0.0015613993164151907
[15760] t

100%|██████████| 63/63 [00:00<00:00, 105.84it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-16000.webm 






[16010] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.0993726040000664, loss: 0.001829273533076048
[16020] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09937227400006643, loss: 0.0012896081898361444
[16030] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09937194400006646, loss: 0.03757194057106972
[16040] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.0993716140000665, loss: 0.009861343540251255
[16050] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09937128400006653, loss: 0.033079080283641815
[16060] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09937095400006657, loss: 0.002354515017941594
[16070] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.0993706240000666, loss: 0.02720487304031849
[16080] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09937029400006664, loss: 0.0013378658331930637
[16090] time live:6

[16680] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09935049400006873, loss: 0.0015745381824672222
[16690] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09935016400006877, loss: 0.012391871772706509
[16700] time live:32, cumulated reward: 2.200000000000002, exploring rate: 0.0993498340000688, loss: 0.03792620822787285
[16710] time live:32, cumulated reward: 2.200000000000002, exploring rate: 0.09934950400006884, loss: 0.005372580140829086
[16720] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09934917400006887, loss: 0.006368624046444893
[16730] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09934884400006891, loss: 0.004409546032547951
[16740] time live:59, cumulated reward: 4.899999999999995, exploring rate: 0.09934851400006894, loss: 0.029386956244707108
[16750] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09934818400006898, loss: 0.012587244622409344
[16760] time live

100%|██████████| 63/63 [00:00<00:00, 96.38it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-17000.webm 






[17010] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09933960400006989, loss: 0.00249613169580698
[17020] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09933927400006992, loss: 0.002342076040804386
[17030] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09933894400006996, loss: 0.0016621649265289307
[17040] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09933861400006999, loss: 0.0029437574557960033
[17050] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09933828400007003, loss: 0.001474442658945918
[17060] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09933795400007006, loss: 0.002841008361428976
[17070] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.0993376240000701, loss: 0.06912916898727417
[17080] time live:75, cumulated reward: 7.499999999999989, exploring rate: 0.09933729400007013, loss: 0.04010830074548721
[17090] time live:

[17680] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09931749400007223, loss: 0.00593099370598793
[17690] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09931716400007226, loss: 0.009171550162136555
[17700] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.0993168340000723, loss: 0.012061551213264465
[17710] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09931650400007233, loss: 0.001877735136076808
[17720] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09931617400007237, loss: 0.005117814987897873
[17730] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.0993158440000724, loss: 0.010452467948198318
[17740] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09931551400007244, loss: 0.00914289802312851
[17750] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09931518400007247, loss: 0.005667052231729031
[17760] time live:61

100%|██████████| 63/63 [00:00<00:00, 99.44it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-18000.webm 






[18010] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09930660400007338, loss: 0.004295618273317814
[18020] time live:58, cumulated reward: 4.799999999999995, exploring rate: 0.09930627400007341, loss: 0.007656892761588097
[18030] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09930594400007345, loss: 0.006427725311368704
[18040] time live:32, cumulated reward: 2.200000000000002, exploring rate: 0.09930561400007348, loss: 0.0059121339581906796
[18050] time live:75, cumulated reward: 7.499999999999989, exploring rate: 0.09930528400007352, loss: 0.013629525899887085
[18060] time live:74, cumulated reward: 7.39999999999999, exploring rate: 0.09930495400007355, loss: 0.0021265801042318344
[18070] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09930462400007359, loss: 0.016370339319109917
[18080] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09930429400007362, loss: 0.011026439256966114
[18090] time li

[18680] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09928449400007572, loss: 0.0104037681594491
[18690] time live:63, cumulated reward: 5.299999999999994, exploring rate: 0.09928416400007575, loss: 0.0010382734471932054
[18700] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09928383400007579, loss: 0.0032369722612202168
[18710] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09928350400007582, loss: 0.005760741885751486
[18720] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09928317400007586, loss: 0.0014146468602120876
[18730] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09928284400007589, loss: 0.0017915036296471953
[18740] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09928251400007593, loss: 0.0023149827029556036
[18750] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09928218400007596, loss: 0.0064899008721113205
[18760] time

100%|██████████| 63/63 [00:00<00:00, 91.97it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-19000.webm 






[19010] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09927360400007687, loss: 0.0031617607455700636
[19020] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.0992732740000769, loss: 0.0025594329927116632
[19030] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09927294400007694, loss: 0.0015680200885981321
[19040] time live:68, cumulated reward: 6.799999999999992, exploring rate: 0.09927261400007698, loss: 0.0018524568295106292
[19050] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09927228400007701, loss: 0.0011446925345808268
[19060] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09927195400007705, loss: 0.0009711220045574009
[19070] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09927162400007708, loss: 0.009985614567995071
[19080] time live:75, cumulated reward: 7.499999999999989, exploring rate: 0.09927129400007711, loss: 0.005884348414838314
[19090] tim

[19680] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09925149400007921, loss: 0.015127642080187798
[19690] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09925116400007924, loss: 0.0490594282746315
[19700] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09925083400007928, loss: 0.005983351264148951
[19710] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09925050400007931, loss: 0.0952351838350296
[19720] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09925017400007935, loss: 0.02750002220273018
[19730] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09924984400007938, loss: 0.002084900625050068
[19740] time live:67, cumulated reward: 6.699999999999992, exploring rate: 0.09924951400007942, loss: 0.06405012309551239
[19750] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09924918400007945, loss: 0.015024781227111816
[19760] time live:61, 

 98%|█████████▊| 65/66 [00:00<00:00, 112.03it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-20000.webm 






[20010] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09924060400008036, loss: 0.0022233533672988415
[20020] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.0992402740000804, loss: 0.004301213193684816
[20030] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09923994400008043, loss: 0.03041742742061615
[20040] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09923961400008047, loss: 0.021988816559314728
[20050] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.0992392840000805, loss: 0.006239915266633034
[20060] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09923895400008054, loss: 0.03007211536169052
[20070] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09923862400008057, loss: 0.0026259697042405605
[20080] time live:62, cumulated reward: 5.199999999999994, exploring rate: 0.0992382940000806, loss: 0.009522642008960247
[20090] time live:6

[20680] time live:91, cumulated reward: 9.099999999999984, exploring rate: 0.0992184940000827, loss: 0.0009262498351745307
[20690] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09921816400008274, loss: 0.004814844578504562
[20700] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09921783400008277, loss: 0.012314867228269577
[20710] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.0992175040000828, loss: 0.01503769401460886
[20720] time live:74, cumulated reward: 7.39999999999999, exploring rate: 0.09921717400008284, loss: 0.006688022054731846
[20730] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09921684400008288, loss: 0.003885571612045169
[20740] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09921651400008291, loss: 0.02311951294541359
[20750] time live:95, cumulated reward: 9.499999999999982, exploring rate: 0.09921618400008295, loss: 0.00339326960965991
[20760] time live:61,

100%|██████████| 63/63 [00:00<00:00, 92.00it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-21000.webm 






[21010] time live:58, cumulated reward: 4.799999999999995, exploring rate: 0.09920760400008385, loss: 0.038194745779037476
[21020] time live:60, cumulated reward: 4.999999999999995, exploring rate: 0.09920727400008389, loss: 0.010313399136066437
[21030] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09920694400008392, loss: 0.0821063369512558
[21040] time live:60, cumulated reward: 4.999999999999995, exploring rate: 0.09920661400008396, loss: 0.012312056496739388
[21050] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.099206284000084, loss: 0.008876355364918709
[21060] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09920595400008403, loss: 0.07520066946744919
[21070] time live:58, cumulated reward: 4.799999999999995, exploring rate: 0.09920562400008406, loss: 0.015450725331902504
[21080] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.0992052940000841, loss: 0.04151618480682373
[21090] time live:61, c

[21680] time live:90, cumulated reward: 8.999999999999984, exploring rate: 0.0991854940000862, loss: 0.003062676638364792
[21690] time live:91, cumulated reward: 9.099999999999984, exploring rate: 0.09918516400008623, loss: 0.028577420860528946
[21700] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09918483400008626, loss: 0.003942892421036959
[21710] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.0991845040000863, loss: 0.004004941787570715
[21720] time live:97, cumulated reward: 9.699999999999982, exploring rate: 0.09918417400008633, loss: 0.018386729061603546
[21730] time live:97, cumulated reward: 9.699999999999982, exploring rate: 0.09918384400008637, loss: 0.006203336641192436
[21740] time live:96, cumulated reward: 9.599999999999982, exploring rate: 0.0991835140000864, loss: 0.0027343768160790205
[21750] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09918318400008644, loss: 0.022047659382224083
[21760] time live:

 99%|█████████▉| 95/96 [00:00<00:00, 105.75it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-22000.webm 






[22010] time live:68, cumulated reward: 6.799999999999992, exploring rate: 0.09917460400008735, loss: 0.033975064754486084
[22020] time live:95, cumulated reward: 9.499999999999982, exploring rate: 0.09917427400008738, loss: 0.03455711156129837
[22030] time live:107, cumulated reward: 11.699999999999978, exploring rate: 0.09917394400008742, loss: 0.0056881834752857685
[22040] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09917361400008745, loss: 0.012986168265342712
[22050] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09917328400008749, loss: 0.008455944247543812
[22060] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09917295400008752, loss: 0.008650298230350018
[22070] time live:106, cumulated reward: 11.599999999999978, exploring rate: 0.09917262400008756, loss: 0.009511567652225494
[22080] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09917229400008759, loss: 0.05288274586200714
[22090] time 

[22680] time live:70, cumulated reward: 6.999999999999991, exploring rate: 0.09915249400008969, loss: 0.054219335317611694
[22690] time live:68, cumulated reward: 6.799999999999992, exploring rate: 0.09915216400008972, loss: 0.02375788241624832
[22700] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09915183400008976, loss: 0.1132025420665741
[22710] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09915150400008979, loss: 0.029832279309630394
[22720] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09915117400008983, loss: 0.07690316438674927
[22730] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09915084400008986, loss: 0.10453430563211441
[22740] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.0991505140000899, loss: 0.10968461632728577
[22750] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09915018400008993, loss: 0.04816262423992157
[22760] time live:61, cu

100%|██████████| 63/63 [00:00<00:00, 96.70it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-23000.webm 






[23010] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09914160400009084, loss: 0.14805951714515686
[23020] time live:63, cumulated reward: 5.299999999999994, exploring rate: 0.09914127400009087, loss: 0.10047242045402527
[23030] time live:69, cumulated reward: 6.8999999999999915, exploring rate: 0.09914094400009091, loss: 0.037090450525283813
[23040] time live:73, cumulated reward: 7.29999999999999, exploring rate: 0.09914061400009094, loss: 0.05065689980983734
[23050] time live:66, cumulated reward: 6.5999999999999925, exploring rate: 0.09914028400009098, loss: 0.02727380394935608
[23060] time live:68, cumulated reward: 6.799999999999992, exploring rate: 0.09913995400009101, loss: 0.0718454048037529
[23070] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09913962400009105, loss: 0.047372959554195404
[23080] time live:64, cumulated reward: 5.399999999999993, exploring rate: 0.09913929400009108, loss: 0.171494722366333
[23090] time live:69, cu

[23680] time live:101, cumulated reward: 11.09999999999998, exploring rate: 0.09911949400009318, loss: 0.08685144037008286
[23690] time live:98, cumulated reward: 9.799999999999981, exploring rate: 0.09911916400009321, loss: 0.03423793613910675
[23700] time live:98, cumulated reward: 9.799999999999981, exploring rate: 0.09911883400009325, loss: 0.6440918445587158
[23710] time live:99, cumulated reward: 9.89999999999998, exploring rate: 0.09911850400009328, loss: 0.07670635730028152
[23720] time live:75, cumulated reward: 7.499999999999989, exploring rate: 0.09911817400009332, loss: 0.031830206513404846
[23730] time live:73, cumulated reward: 7.29999999999999, exploring rate: 0.09911784400009335, loss: 0.1966005265712738
[23740] time live:71, cumulated reward: 7.099999999999991, exploring rate: 0.09911751400009339, loss: 0.015447143465280533
[23750] time live:68, cumulated reward: 6.799999999999992, exploring rate: 0.09911718400009342, loss: 0.02330741658806801
[23760] time live:69, cum

100%|██████████| 63/63 [00:00<00:00, 97.32it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-24000.webm 






[24010] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09910860400009433, loss: 0.021384771913290024
[24020] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09910827400009437, loss: 0.08374041318893433
[24030] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.0991079440000944, loss: 0.04200952500104904
[24040] time live:63, cumulated reward: 5.299999999999994, exploring rate: 0.09910761400009444, loss: 0.02733982354402542
[24050] time live:63, cumulated reward: 5.299999999999994, exploring rate: 0.09910728400009447, loss: 0.16932567954063416
[24060] time live:64, cumulated reward: 5.399999999999993, exploring rate: 0.0991069540000945, loss: 0.02264340966939926
[24070] time live:71, cumulated reward: 7.099999999999991, exploring rate: 0.09910662400009454, loss: 0.018841583281755447
[24080] time live:72, cumulated reward: 7.19999999999999, exploring rate: 0.09910629400009457, loss: 0.021996378898620605
[24090] time live:76, cu

[24680] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09908649400009667, loss: 0.0059876032173633575
[24690] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.0990861640000967, loss: 0.05055394768714905
[24700] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09908583400009674, loss: 0.5998615026473999
[24710] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09908550400009677, loss: 0.03829944133758545
[24720] time live:67, cumulated reward: 6.699999999999992, exploring rate: 0.09908517400009681, loss: 0.6213468313217163
[24730] time live:133, cumulated reward: 14.299999999999969, exploring rate: 0.09908484400009684, loss: 0.465131938457489
[24740] time live:77, cumulated reward: 7.699999999999989, exploring rate: 0.09908451400009688, loss: 0.07663708180189133
[24750] time live:73, cumulated reward: 7.29999999999999, exploring rate: 0.09908418400009691, loss: 0.07734546810388565
[24760] time live:61, cumu

 99%|█████████▉| 108/109 [00:00<00:00, 115.51it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-25000.webm 






[25010] time live:97, cumulated reward: 9.699999999999982, exploring rate: 0.09907560400009782, loss: 0.04163457825779915
[25020] time live:97, cumulated reward: 9.699999999999982, exploring rate: 0.09907527400009786, loss: 0.060703471302986145
[25030] time live:108, cumulated reward: 11.799999999999978, exploring rate: 0.09907494400009789, loss: 0.046133432537317276
[25040] time live:106, cumulated reward: 11.599999999999978, exploring rate: 0.09907461400009793, loss: 0.06732064485549927
[25050] time live:134, cumulated reward: 14.399999999999968, exploring rate: 0.09907428400009796, loss: 0.3518289625644684
[25060] time live:77, cumulated reward: 7.699999999999989, exploring rate: 0.099073954000098, loss: 0.019376792013645172
[25070] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09907362400009803, loss: 0.0640028789639473
[25080] time live:133, cumulated reward: 14.299999999999969, exploring rate: 0.09907329400009807, loss: 0.24472109973430634
[25090] time live

[25680] time live:62, cumulated reward: 5.199999999999994, exploring rate: 0.09905349400010016, loss: 0.25185272097587585
[25690] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.0990531640001002, loss: 0.03329048305749893
[25700] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09905283400010023, loss: 0.05347372218966484
[25710] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09905250400010027, loss: 0.031465619802474976
[25720] time live:76, cumulated reward: 7.599999999999989, exploring rate: 0.0990521740001003, loss: 0.36066585779190063
[25730] time live:107, cumulated reward: 11.699999999999978, exploring rate: 0.09905184400010034, loss: 0.022963842377066612
[25740] time live:110, cumulated reward: 11.999999999999977, exploring rate: 0.09905151400010037, loss: 0.09239652752876282
[25750] time live:133, cumulated reward: 14.299999999999969, exploring rate: 0.0990511840001004, loss: 0.24841096997261047
[25760] time live:1

100%|██████████| 63/63 [00:00<00:00, 103.24it/s]

[MoviePy] Done.





[MoviePy] >>>> Video ready: movie/DQN-26000.webm 

[26010] time live:108, cumulated reward: 11.799999999999978, exploring rate: 0.09904260400010131, loss: 0.02962658926844597
[26020] time live:142, cumulated reward: 16.199999999999974, exploring rate: 0.09904227400010135, loss: 0.0235802810639143
[26030] time live:140, cumulated reward: 15.999999999999972, exploring rate: 0.09904194400010138, loss: 0.02509259432554245
[26040] time live:140, cumulated reward: 15.999999999999972, exploring rate: 0.09904161400010142, loss: 0.06316260248422623
[26050] time live:138, cumulated reward: 15.799999999999969, exploring rate: 0.09904128400010145, loss: 0.04154828563332558
[26060] time live:140, cumulated reward: 15.999999999999972, exploring rate: 0.09904095400010149, loss: 0.017614176496863365
[26070] time live:112, cumulated reward: 12.199999999999976, exploring rate: 0.09904062400010152, loss: 0.02182164415717125
[26080] time live:183, cumulated reward: 21.300000000000033, exploring rate: 0.09

[26680] time live:133, cumulated reward: 14.299999999999969, exploring rate: 0.09902049400010365, loss: 0.04984809830784798
[26690] time live:135, cumulated reward: 14.499999999999968, exploring rate: 0.09902016400010369, loss: 0.01912693679332733
[26700] time live:137, cumulated reward: 15.699999999999969, exploring rate: 0.09901983400010372, loss: 0.018334846943616867
[26710] time live:133, cumulated reward: 14.299999999999969, exploring rate: 0.09901950400010376, loss: 0.1722627431154251
[26720] time live:133, cumulated reward: 14.299999999999969, exploring rate: 0.0990191740001038, loss: 0.20175997912883759
[26730] time live:113, cumulated reward: 12.299999999999976, exploring rate: 0.09901884400010383, loss: 0.09345750510692596
[26740] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09901851400010386, loss: 0.02169700339436531
[26750] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.0990181840001039, loss: 0.14521370828151703
[26760] time l

100%|██████████| 63/63 [00:00<00:00, 106.96it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-27000.webm 






[27010] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.0990096040001048, loss: 0.02026353031396866
[27020] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09900927400010484, loss: 0.025800921022892
[27030] time live:97, cumulated reward: 9.699999999999982, exploring rate: 0.09900894400010488, loss: 0.029761046171188354
[27040] time live:97, cumulated reward: 9.699999999999982, exploring rate: 0.09900861400010491, loss: 0.02240387536585331
[27050] time live:97, cumulated reward: 9.699999999999982, exploring rate: 0.09900828400010495, loss: 0.03263629600405693
[27060] time live:143, cumulated reward: 16.299999999999976, exploring rate: 0.09900795400010498, loss: 1.3573462963104248
[27070] time live:146, cumulated reward: 16.59999999999998, exploring rate: 0.09900762400010502, loss: 0.03777194023132324
[27080] time live:188, cumulated reward: 21.80000000000004, exploring rate: 0.09900729400010505, loss: 0.02867252007126808
[27090] time live:97, c

[27680] time live:184, cumulated reward: 21.400000000000034, exploring rate: 0.09898749400010715, loss: 0.050165947526693344
[27690] time live:141, cumulated reward: 16.099999999999973, exploring rate: 0.09898716400010718, loss: 0.04060719534754753
[27700] time live:77, cumulated reward: 7.699999999999989, exploring rate: 0.09898683400010722, loss: 0.028245095163583755
[27710] time live:74, cumulated reward: 7.39999999999999, exploring rate: 0.09898650400010725, loss: 0.1373964250087738
[27720] time live:74, cumulated reward: 7.39999999999999, exploring rate: 0.09898617400010729, loss: 0.023557694628834724
[27730] time live:68, cumulated reward: 6.799999999999992, exploring rate: 0.09898584400010732, loss: 0.033844538033008575
[27740] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09898551400010736, loss: 0.054501503705978394
[27750] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09898518400010739, loss: 0.05818507820367813
[27760] time live:

 99%|█████████▉| 134/135 [00:01<00:00, 105.67it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-28000.webm 






[28010] time live:109, cumulated reward: 11.899999999999977, exploring rate: 0.0989766040001083, loss: 0.33137544989585876
[28020] time live:76, cumulated reward: 7.599999999999989, exploring rate: 0.09897627400010833, loss: 0.07311725616455078
[28030] time live:137, cumulated reward: 15.699999999999969, exploring rate: 0.09897594400010837, loss: 0.04859202727675438
[28040] time live:145, cumulated reward: 16.49999999999998, exploring rate: 0.0989756140001084, loss: 0.026594962924718857
[28050] time live:260, cumulated reward: 31.000000000000142, exploring rate: 0.09897528400010844, loss: 0.10877969115972519
[28060] time live:97, cumulated reward: 9.699999999999982, exploring rate: 0.09897495400010847, loss: 0.057037532329559326
[28070] time live:259, cumulated reward: 30.90000000000014, exploring rate: 0.09897462400010851, loss: 0.1788313090801239
[28080] time live:185, cumulated reward: 21.500000000000036, exploring rate: 0.09897429400010854, loss: 0.11189711093902588
[28090] time li

[28680] time live:73, cumulated reward: 7.29999999999999, exploring rate: 0.09895449400011064, loss: 0.26329725980758667
[28690] time live:73, cumulated reward: 7.29999999999999, exploring rate: 0.09895416400011067, loss: 0.11914896219968796
[28700] time live:72, cumulated reward: 7.19999999999999, exploring rate: 0.09895383400011071, loss: 0.05788159742951393
[28710] time live:70, cumulated reward: 6.999999999999991, exploring rate: 0.09895350400011074, loss: 0.07099562883377075
[28720] time live:71, cumulated reward: 7.099999999999991, exploring rate: 0.09895317400011078, loss: 0.10443515330553055
[28730] time live:70, cumulated reward: 6.999999999999991, exploring rate: 0.09895284400011081, loss: 0.0551333986222744
[28740] time live:133, cumulated reward: 14.299999999999969, exploring rate: 0.09895251400011085, loss: 0.05595637857913971
[28750] time live:133, cumulated reward: 14.299999999999969, exploring rate: 0.09895218400011088, loss: 0.21063868701457977
[28760] time live:140, c

 98%|█████████▊| 65/66 [00:00<00:00, 114.38it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-29000.webm 






[29010] time live:64, cumulated reward: 5.399999999999993, exploring rate: 0.09894360400011179, loss: 0.19456252455711365
[29020] time live:74, cumulated reward: 7.39999999999999, exploring rate: 0.09894327400011182, loss: 0.047723181545734406
[29030] time live:73, cumulated reward: 7.29999999999999, exploring rate: 0.09894294400011186, loss: 0.1134653314948082
[29040] time live:73, cumulated reward: 7.29999999999999, exploring rate: 0.0989426140001119, loss: 0.07905898988246918
[29050] time live:74, cumulated reward: 7.39999999999999, exploring rate: 0.09894228400011193, loss: 0.04939939081668854
[29060] time live:74, cumulated reward: 7.39999999999999, exploring rate: 0.09894195400011196, loss: 0.06536512821912766
[29070] time live:66, cumulated reward: 6.5999999999999925, exploring rate: 0.098941624000112, loss: 0.06542250514030457
[29080] time live:97, cumulated reward: 9.699999999999982, exploring rate: 0.09894129400011203, loss: 0.02556007355451584
[29090] time live:61, cumulated

[29680] time live:184, cumulated reward: 21.400000000000034, exploring rate: 0.09892149400011413, loss: 0.11246661096811295
[29690] time live:140, cumulated reward: 15.999999999999972, exploring rate: 0.09892116400011416, loss: 0.1630907505750656
[29700] time live:73, cumulated reward: 7.29999999999999, exploring rate: 0.0989208340001142, loss: 0.03218374773859978
[29710] time live:72, cumulated reward: 7.19999999999999, exploring rate: 0.09892050400011423, loss: 0.048642948269844055
[29720] time live:75, cumulated reward: 7.499999999999989, exploring rate: 0.09892017400011427, loss: 0.2433781921863556
[29730] time live:138, cumulated reward: 15.799999999999969, exploring rate: 0.0989198440001143, loss: 0.131490096449852
[29740] time live:136, cumulated reward: 14.599999999999968, exploring rate: 0.09891951400011434, loss: 0.028123706579208374
[29750] time live:134, cumulated reward: 14.399999999999968, exploring rate: 0.09891918400011437, loss: 0.07230013608932495
[29760] time live:13

100%|██████████| 140/140 [00:01<00:00, 90.79it/s] 

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-30000.webm 






#### 最好的結果

In [247]:
from moviepy.editor import *
clip = VideoFileClip("movie/DQN-30000.webm")
display(clip.ipython_display(fps=60, autoplay=1, loop=1))

100%|██████████| 140/140 [00:00<00:00, 644.19it/s]


#### 進入第一個障礙物

In [280]:
from moviepy.editor import *
clip = VideoFileClip("movie/DQN-11000.webm")
display(clip.ipython_display(fps=60, autoplay=1, loop=1))

100%|██████████| 68/68 [00:00<00:00, 1048.46it/s]


## Policy Gradient

In [262]:
import math
import copy
from collections import defaultdict
MIN_EXPLORING_RATE = 0.01
MIN_LEARNING_RATE = 0.1


class Policy_Gradiebt_Agent:

  def __init__(self, name, num_action, t=0, discount_factor=0.99):
    self.discount_factor = discount_factor
    self.num_action = num_action
    self.name = name
    self.bucket_range_per_feature = bucket_range_per_feature
    with tf.variable_scope(name):
      self.build_model()
    
  def get_state_idx(self, state):
    # instead of using absolute position of pipe, use relative position
    state = copy.deepcopy(state)
    state['next_next_pipe_bottom_y'] -= state['player_y']
    state['next_next_pipe_top_y'] -= state['player_y']
    state['next_pipe_bottom_y'] -= state['player_y']
    state['next_pipe_top_y'] -= state['player_y']

    # sort to make list converted from dict ordered in alphabet order
    state_key = [k for k, v in sorted(state.items())]

    # do bucketing to decrease state space to speed up training
    state_idx = []
    for key in state_key:
        state_idx.append(state[key] / self.bucket_range_per_feature[key])
    return tuple(state_idx)

  def build_model(self):
    # input: current screen, selected action and reward
    self.input_state = tf.placeholder(
        tf.float32, shape=[None, 8, num_stack])
    self.action = tf.placeholder(tf.int32, [None])
    self.reward = tf.placeholder(tf.float32, [None])
    self.is_training = tf.placeholder(tf.bool, shape=[])

    def net(state, reuse=False):
      with tf.variable_scope("fc"):
        for idx in range(num_stack):
          if idx == 0:
            pre_dense = tf.layers.dense(inputs=state[:, :, idx], units=6400, activation=tf.nn.relu, reuse=reuse)
            pre_dense = tf.reshape(pre_dense, [-1, 8, 1])
          else:
            pre_stack = tf.layers.dense(inputs=state[:, :, idx], units=6400, activation=tf.nn.relu, reuse=True)
            pre_stack = tf.reshape(pre_stack, [-1, 8, 1])
            pre_dense = tf.concat([pre_dense, pre_stack], 2)
      with tf.variable_scope("layers", reuse=reuse):
        state_in = tf.reshape(pre_dense, [-1, screen_width, screen_height, num_stack]) 
        conv1 = tf.layers.conv2d(
            inputs=state_in,
            filters=32,
            kernel_size=[8, 8],
            strides=[4, 4],
            padding='SAME',
            activation=tf.nn.relu)
        pool1 = tf.layers.max_pooling2d(
            conv1, pool_size=[2, 2], strides=[2, 2], padding='SAME')

        conv2 = tf.layers.conv2d(
            inputs=pool1,
            filters=64,
            kernel_size=[4, 4],
            strides=[2, 2],
            padding='SAME',
            activation=tf.nn.relu)
        conv3 = tf.layers.conv2d(
            inputs=conv2,
            filters=64,
            kernel_size=[3, 3],
            strides=[1, 1],
            padding='SAME',
            activation=tf.nn.relu)
        self.flat = tf.contrib.layers.flatten(conv3)

        self.dense1 = tf.layers.dense(
            inputs=self.flat, units=512, activation=tf.nn.relu)
        self.dense2 = tf.layers.dense(
            inputs=self.dense1, units=self.num_action, activation=None)
        return self.dense2

    # optimize
    self.output_logit = net(
        self.input_state
    )  # logit of probility(P(s,a,theta)) for all a, shape (batch_size, num_action)
    index = tf.stack([tf.range(tf.shape(self.action)[0]), self.action], axis=1)
    self.prob = tf.gather_nd(
        tf.nn.softmax(self.output_logit),
        index)  # P(s,a,theta) for selected action, shape (batch_size, 1)

    # loss = E[log(p(s,a))*r]
    # because we want to maximize objective, add negative sign before loss
    self.loss = -tf.reduce_mean(tf.log(self.prob + 0.00000001) * self.reward)
    optimizer = tf.train.AdamOptimizer(learning_rate=1e-6)
    g_gvs = optimizer.compute_gradients(
        self.loss,
        var_list=[v for v in tf.global_variables() if self.name in v.name])
    self.train_op = optimizer.apply_gradients(g_gvs)

    self.pred = tf.multinomial(self.output_logit,
                               1)  # sample action from distribution

  def select_action(self, input_state, sess):
    input_states = np.array(input_state).transpose([1, 0])
    feed_dict = {
        self.input_state: input_states[None, :],
        self.is_training: False,
    }
    action = sess.run(
        self.pred,
        feed_dict=feed_dict)[0][0]  # sameple action from distribution
    return action

  def update_policy(self, input_state, actions, rewards, input_states_plum):
    feed_dict = {
        self.input_state: np.array(input_state).transpose([0, 2, 1]),
        self.action: actions,
        self.reward: rewards,
        self.is_training: True,
    }
    loss, _ = sess.run([self.loss, self.train_op], feed_dict=feed_dict)
    return loss

In [263]:
# init agent
tf.reset_default_graph()
# agent for frequently updating
pg_agent = Policy_Gradiebt_Agent('PG_Agent', num_action)
# init all
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.InteractiveSession(config=config)
sess.run(tf.global_variables_initializer())

In [264]:
from IPython.display import Image, display

update_every_episode = 1
print_every_episode = 10
save_video_every_episode = 1000
NUM_EPISODE = 30000
NUM_EXPLORE = 10
NUM_PASS = 20
reward_values = {
    "positive": 1,
    "tick": 0.1,  # reward per timestamp
    "loss": -1,
}
for episode in range(0, NUM_EPISODE + 1):

  # Reset the environment
  game = FlappyBird()
  env = PLE(
      game,
      fps=30,
      display_screen=False,
      reward_values=reward_values,
      rng=np.random.RandomState(1))
  env.reset_game()
  env.act(0)  # dummy input to make sure input screen is correct

  # record frame
  if episode % save_video_every_episode == 0:
    frames = [env.getScreenRGB()]

  # grayscale input screen for this episode
  #input_screens = [preprocess(env.getScreenGrayscale())] * 4
  input_state = [pg_agent.get_state_idx(game.getGameState())] * 4
  # cumulate reward for this episode
  cum_reward = 0

  experiences = []
  t = 0
  while not env.game_over():
    # feed four previous screen, select an action
    action = pg_agent.select_action(input_state[-4:], sess)

    # execute the action and get reward
    reward = env.act(env.getActionSet()[action])

    # record frame
    if episode % save_video_every_episode == 0:
      frames.append(env.getScreenRGB())

    # cumulate reward
    cum_reward += reward

    # append grayscale screen for this episode
    #input_screens.append(preprocess(env.getScreenGrayscale()))
    input_state.append(pg_agent.get_state_idx(game.getGameState()))
    # append experience for this episode
    experiences.append(
        [input_state[-5:-1], action, reward, input_state[-4:]])

    t += 1

  def discount_reward(x, discount_rate):
    discounted_r = np.zeros(len(x))
    num_r = len(x)
    for i in range(num_r):
      discounted_r[i] = x[i] * math.pow(discount_rate, i)
    discounted_r = np.cumsum(discounted_r[::-1])
    return discounted_r[::-1]

  rewards = [e[2] for e in experiences]
  discounted_reward = discount_reward(rewards, pg_agent.discount_factor)

  # normalize
  discounted_reward -= np.mean(discounted_reward)
  discounted_reward /= np.std(discounted_reward)
  train_states = []
  train_actions = []
  train_rewards = []
  train_input_states_plum = []
  for i in range(len(experiences)):
    experiences[i][2] = discounted_reward[i]
    train_states.append(experiences[i][0])
    train_actions.append(experiences[i][1])
    train_rewards.append(experiences[i][2])
    train_input_states_plum.append(experiences[i][3])
  loss = pg_agent.update_policy(train_states, train_actions, train_rewards,
                                train_input_states_plum)

  if episode % print_every_episode == 0 and episode > NUM_EXPLORE:
    print("[{}] time live:{}, cumulated reward: {}, loss: {}".format(
        episode, t, cum_reward, loss))

  if episode % save_video_every_episode == 0 and episode > NUM_EXPLORE:  # for every 5000 episode, record an animation
    clip = make_anim(frames, fps=60, true_image=True).rotate(-90)
    clip.write_videofile("movie/pg_{}.webm".format(episode), fps=60)
    #display(clip.ipython_display(fps=60, autoplay=1, loop=1))

[20] time live:61, cumulated reward: 5.099999999999994, loss: -0.0017613895470276475
[30] time live:61, cumulated reward: 5.099999999999994, loss: 0.0004440057382453233
[40] time live:61, cumulated reward: 5.099999999999994, loss: -0.0012428720947355032
[50] time live:48, cumulated reward: 3.799999999999999, loss: -0.0006696780328638852
[60] time live:46, cumulated reward: 3.5999999999999996, loss: 0.0015977568691596389
[70] time live:41, cumulated reward: 3.1000000000000014, loss: 0.0010510653955861926
[80] time live:53, cumulated reward: 4.299999999999997, loss: 0.0008013383485376835
[90] time live:61, cumulated reward: 5.099999999999994, loss: -5.5000429711071774e-05
[100] time live:61, cumulated reward: 5.099999999999994, loss: -0.0002705933584365994
[110] time live:49, cumulated reward: 3.8999999999999986, loss: 0.00026915024500340223
[120] time live:52, cumulated reward: 4.1999999999999975, loss: 0.0005383308161981404
[130] time live:60, cumulated reward: 4.999999999999995, loss:

[990] time live:54, cumulated reward: 4.399999999999997, loss: 0.001070128520950675
[1000] time live:61, cumulated reward: 5.099999999999994, loss: -0.002358639845624566
[MoviePy] >>>> Building video movie/pg_1000.webm
[MoviePy] Writing video movie/pg_1000.webm


100%|██████████| 63/63 [00:00<00:00, 97.18it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_1000.webm 






[1010] time live:42, cumulated reward: 3.200000000000001, loss: -0.0043884688057005405
[1020] time live:49, cumulated reward: 3.8999999999999986, loss: 0.0018799061654135585
[1030] time live:55, cumulated reward: 4.4999999999999964, loss: -0.0032919971272349358
[1040] time live:47, cumulated reward: 3.6999999999999993, loss: -0.0013284480664879084
[1050] time live:47, cumulated reward: 3.6999999999999993, loss: 0.00038370172842405736
[1060] time live:61, cumulated reward: 5.099999999999994, loss: -0.0013746042968705297
[1070] time live:56, cumulated reward: 4.599999999999996, loss: 0.0004116807540412992
[1080] time live:61, cumulated reward: 5.099999999999994, loss: 0.0016664286376908422
[1090] time live:61, cumulated reward: 5.099999999999994, loss: -0.0012039747089147568
[1100] time live:46, cumulated reward: 3.5999999999999996, loss: 0.004214680753648281
[1110] time live:62, cumulated reward: 5.199999999999994, loss: -0.0015704554971307516
[1120] time live:48, cumulated reward: 3.79

[1970] time live:53, cumulated reward: 4.299999999999997, loss: -0.000691107998136431
[1980] time live:61, cumulated reward: 5.099999999999994, loss: -0.0007641276461072266
[1990] time live:50, cumulated reward: 3.9999999999999982, loss: -0.0003900718584191054
[2000] time live:69, cumulated reward: 6.8999999999999915, loss: -0.0027096166741102934
[MoviePy] >>>> Building video movie/pg_2000.webm
[MoviePy] Writing video movie/pg_2000.webm


 99%|█████████▊| 70/71 [00:00<00:00, 94.09it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_2000.webm 






[2010] time live:51, cumulated reward: 4.099999999999998, loss: -0.0010855805594474077
[2020] time live:46, cumulated reward: 3.5999999999999996, loss: 0.0007238802500069141
[2030] time live:58, cumulated reward: 4.799999999999995, loss: -0.00018922213348560035
[2040] time live:48, cumulated reward: 3.799999999999999, loss: 0.005629757884889841
[2050] time live:61, cumulated reward: 5.099999999999994, loss: -0.0011128598125651479
[2060] time live:61, cumulated reward: 5.099999999999994, loss: 0.00041189350304193795
[2070] time live:53, cumulated reward: 4.299999999999997, loss: 0.0007746354676783085
[2080] time live:61, cumulated reward: 5.099999999999994, loss: -0.00020258543372619897
[2090] time live:46, cumulated reward: 3.5999999999999996, loss: 0.0006895687547512352
[2100] time live:58, cumulated reward: 4.799999999999995, loss: 0.0005665812059305608
[2110] time live:52, cumulated reward: 4.1999999999999975, loss: 0.001377179054543376
[2120] time live:51, cumulated reward: 4.09999

[2960] time live:49, cumulated reward: 3.8999999999999986, loss: 0.0008807279518805444
[2970] time live:44, cumulated reward: 3.4000000000000004, loss: 0.004640514031052589
[2980] time live:59, cumulated reward: 4.899999999999995, loss: -0.011111793108284473
[2990] time live:49, cumulated reward: 3.8999999999999986, loss: 0.003272659843787551
[3000] time live:55, cumulated reward: 4.4999999999999964, loss: 0.0005403518443927169
[MoviePy] >>>> Building video movie/pg_3000.webm
[MoviePy] Writing video movie/pg_3000.webm


 98%|█████████▊| 56/57 [00:00<00:00, 96.04it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_3000.webm 






[3010] time live:43, cumulated reward: 3.3000000000000007, loss: 0.004695182666182518
[3020] time live:41, cumulated reward: 3.1000000000000014, loss: 0.0017579939449205995
[3030] time live:59, cumulated reward: 4.899999999999995, loss: -0.0033012002240866423
[3040] time live:58, cumulated reward: 4.799999999999995, loss: -0.003000752767547965
[3050] time live:51, cumulated reward: 4.099999999999998, loss: -0.0011575175449252129
[3060] time live:61, cumulated reward: 5.099999999999994, loss: 0.007821567356586456
[3070] time live:61, cumulated reward: 5.099999999999994, loss: -0.0050473446026444435
[3080] time live:55, cumulated reward: 4.4999999999999964, loss: 0.004301695618778467
[3090] time live:66, cumulated reward: 6.5999999999999925, loss: -0.00014227809151634574
[3100] time live:50, cumulated reward: 3.9999999999999982, loss: -0.0036941529251635075
[3110] time live:55, cumulated reward: 4.4999999999999964, loss: -3.81643112632446e-05
[3120] time live:58, cumulated reward: 4.7999

[3970] time live:61, cumulated reward: 5.099999999999994, loss: -0.010448174551129341
[3980] time live:38, cumulated reward: 2.8000000000000025, loss: -0.00225638085976243
[3990] time live:68, cumulated reward: 6.799999999999992, loss: 0.00618237629532814
[4000] time live:41, cumulated reward: 3.1000000000000014, loss: -5.0474959607527126e-06
[MoviePy] >>>> Building video movie/pg_4000.webm
[MoviePy] Writing video movie/pg_4000.webm


 98%|█████████▊| 42/43 [00:00<00:00, 120.76it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_4000.webm 






[4010] time live:61, cumulated reward: 5.099999999999994, loss: 0.0030989726074039936
[4020] time live:61, cumulated reward: 5.099999999999994, loss: -0.009157618507742882
[4030] time live:58, cumulated reward: 4.799999999999995, loss: -0.003643627744168043
[4040] time live:40, cumulated reward: 3.0000000000000018, loss: 0.004952597431838512
[4050] time live:64, cumulated reward: 5.399999999999993, loss: -0.00013828277587890625
[4060] time live:43, cumulated reward: 3.3000000000000007, loss: 0.005081199109554291
[4070] time live:64, cumulated reward: 5.399999999999993, loss: -0.0024765431880950928
[4080] time live:58, cumulated reward: 4.799999999999995, loss: 0.008059435524046421
[4090] time live:64, cumulated reward: 5.399999999999993, loss: -0.0013894140720367432
[4100] time live:60, cumulated reward: 4.999999999999995, loss: -0.0016163508407771587
[4110] time live:54, cumulated reward: 4.399999999999997, loss: 0.003495799144729972
[4120] time live:49, cumulated reward: 3.8999999999

[4970] time live:47, cumulated reward: 3.6999999999999993, loss: 0.010073885321617126
[4980] time live:57, cumulated reward: 4.699999999999996, loss: 0.0017016895581036806
[4990] time live:41, cumulated reward: 3.1000000000000014, loss: 0.0038184653967618942
[5000] time live:53, cumulated reward: 4.299999999999997, loss: -0.002195538254454732
[MoviePy] >>>> Building video movie/pg_5000.webm
[MoviePy] Writing video movie/pg_5000.webm


 98%|█████████▊| 54/55 [00:00<00:00, 108.58it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_5000.webm 






[5010] time live:61, cumulated reward: 5.099999999999994, loss: 0.0010620179818943143
[5020] time live:61, cumulated reward: 5.099999999999994, loss: 0.004607497714459896
[5030] time live:43, cumulated reward: 3.3000000000000007, loss: 0.0021977978758513927
[5040] time live:43, cumulated reward: 3.3000000000000007, loss: -0.002037137048318982
[5050] time live:52, cumulated reward: 4.1999999999999975, loss: 0.0010158282238990068
[5060] time live:68, cumulated reward: 6.799999999999992, loss: 0.0025881598703563213
[5070] time live:44, cumulated reward: 3.4000000000000004, loss: 0.003969647455960512
[5080] time live:52, cumulated reward: 4.1999999999999975, loss: -0.0014655039412900805
[5090] time live:48, cumulated reward: 3.799999999999999, loss: -0.0021945040207356215
[5100] time live:50, cumulated reward: 3.9999999999999982, loss: -0.0010714340023696423
[5110] time live:48, cumulated reward: 3.799999999999999, loss: 0.006292164325714111
[5120] time live:55, cumulated reward: 4.4999999

[5960] time live:48, cumulated reward: 3.799999999999999, loss: 0.0034948389511555433
[5970] time live:55, cumulated reward: 4.4999999999999964, loss: 0.004729860462248325
[5980] time live:52, cumulated reward: 4.1999999999999975, loss: 0.004492264706641436
[5990] time live:52, cumulated reward: 4.1999999999999975, loss: 0.004127667285501957
[6000] time live:55, cumulated reward: 4.4999999999999964, loss: 0.00045259649050422013
[MoviePy] >>>> Building video movie/pg_6000.webm
[MoviePy] Writing video movie/pg_6000.webm


 98%|█████████▊| 56/57 [00:00<00:00, 103.96it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_6000.webm 






[6010] time live:61, cumulated reward: 5.099999999999994, loss: 0.005616735201328993
[6020] time live:61, cumulated reward: 5.099999999999994, loss: -0.0004671753558795899
[6030] time live:61, cumulated reward: 5.099999999999994, loss: -0.0037575080059468746
[6040] time live:47, cumulated reward: 3.6999999999999993, loss: -0.002161452081054449
[6050] time live:49, cumulated reward: 3.8999999999999986, loss: -0.0013716950779780746
[6060] time live:57, cumulated reward: 4.699999999999996, loss: -0.00526485126465559
[6070] time live:65, cumulated reward: 6.499999999999993, loss: -0.00375206652097404
[6080] time live:60, cumulated reward: 4.999999999999995, loss: 0.00531400041654706
[6090] time live:49, cumulated reward: 3.8999999999999986, loss: 2.46982181124622e-05
[6100] time live:61, cumulated reward: 5.099999999999994, loss: -0.001452492899261415
[6110] time live:58, cumulated reward: 4.799999999999995, loss: -0.0001474906603107229
[6120] time live:37, cumulated reward: 2.700000000000

[6970] time live:43, cumulated reward: 3.3000000000000007, loss: 0.00026226043701171875
[6980] time live:61, cumulated reward: 5.099999999999994, loss: 0.00039432087214663625
[6990] time live:48, cumulated reward: 3.799999999999999, loss: 0.0066939592361450195
[7000] time live:42, cumulated reward: 3.200000000000001, loss: -0.002009232761338353
[MoviePy] >>>> Building video movie/pg_7000.webm
[MoviePy] Writing video movie/pg_7000.webm


 98%|█████████▊| 43/44 [00:00<00:00, 94.91it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_7000.webm 






[7010] time live:61, cumulated reward: 5.099999999999994, loss: -0.004918739665299654
[7020] time live:37, cumulated reward: 2.7000000000000024, loss: -0.008118178695440292
[7030] time live:61, cumulated reward: 5.099999999999994, loss: -0.0056492541916668415
[7040] time live:61, cumulated reward: 5.099999999999994, loss: 5.709538527298719e-05
[7050] time live:48, cumulated reward: 3.799999999999999, loss: -0.003971537109464407
[7060] time live:66, cumulated reward: 6.5999999999999925, loss: 0.006135052070021629
[7070] time live:52, cumulated reward: 4.1999999999999975, loss: -0.0007041784701868892
[7080] time live:49, cumulated reward: 3.8999999999999986, loss: 0.003909967374056578
[7090] time live:52, cumulated reward: 4.1999999999999975, loss: 0.00482113566249609
[7100] time live:53, cumulated reward: 4.299999999999997, loss: -0.007205369416624308
[7110] time live:47, cumulated reward: 3.6999999999999993, loss: 0.001085849478840828
[7120] time live:62, cumulated reward: 5.1999999999

[7980] time live:61, cumulated reward: 5.099999999999994, loss: -0.00012660417996812612
[7990] time live:61, cumulated reward: 5.099999999999994, loss: 0.0006923050386831164
[8000] time live:59, cumulated reward: 4.899999999999995, loss: 0.0013938839547336102
[MoviePy] >>>> Building video movie/pg_8000.webm
[MoviePy] Writing video movie/pg_8000.webm


 98%|█████████▊| 60/61 [00:00<00:00, 102.31it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_8000.webm 






[8010] time live:59, cumulated reward: 4.899999999999995, loss: 0.0027730262372642756
[8020] time live:43, cumulated reward: 3.3000000000000007, loss: -0.003756767138838768
[8030] time live:49, cumulated reward: 3.8999999999999986, loss: 0.0022393441759049892
[8040] time live:59, cumulated reward: 4.899999999999995, loss: -0.006941391155123711
[8050] time live:54, cumulated reward: 4.399999999999997, loss: -0.005825148895382881
[8060] time live:58, cumulated reward: 4.799999999999995, loss: 0.004996694158762693
[8070] time live:57, cumulated reward: 4.699999999999996, loss: 0.009318937547504902
[8080] time live:45, cumulated reward: 3.5, loss: 0.008571879006922245
[8090] time live:52, cumulated reward: 4.1999999999999975, loss: 0.004435429349541664
[8100] time live:61, cumulated reward: 5.099999999999994, loss: 0.002637112746015191
[8110] time live:55, cumulated reward: 4.4999999999999964, loss: -0.00335381249897182
[8120] time live:42, cumulated reward: 3.200000000000001, loss: 0.0024

[8970] time live:54, cumulated reward: 4.399999999999997, loss: -0.0035533905029296875
[8980] time live:60, cumulated reward: 4.999999999999995, loss: 0.003529961919412017
[8990] time live:51, cumulated reward: 4.099999999999998, loss: 0.003525060834363103
[9000] time live:61, cumulated reward: 5.099999999999994, loss: -0.002665691776201129
[MoviePy] >>>> Building video movie/pg_9000.webm
[MoviePy] Writing video movie/pg_9000.webm


100%|██████████| 63/63 [00:00<00:00, 95.84it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_9000.webm 






[9010] time live:48, cumulated reward: 3.799999999999999, loss: 0.0030387442093342543
[9020] time live:48, cumulated reward: 3.799999999999999, loss: 0.002744515659287572
[9030] time live:43, cumulated reward: 3.3000000000000007, loss: -0.0003251142334192991
[9040] time live:56, cumulated reward: 4.599999999999996, loss: -0.0025515896268188953
[9050] time live:51, cumulated reward: 4.099999999999998, loss: -0.002781307091936469
[9060] time live:55, cumulated reward: 4.4999999999999964, loss: 0.005433602724224329
[9070] time live:55, cumulated reward: 4.4999999999999964, loss: -0.00027812609914690256
[9080] time live:42, cumulated reward: 3.200000000000001, loss: 0.0002419835072942078
[9090] time live:54, cumulated reward: 4.399999999999997, loss: -0.0009738780790939927
[9100] time live:57, cumulated reward: 4.699999999999996, loss: 0.00025531701976433396
[9110] time live:51, cumulated reward: 4.099999999999998, loss: -0.003249748144298792
[9120] time live:61, cumulated reward: 5.099999

[9970] time live:48, cumulated reward: 3.799999999999999, loss: -0.007439871784299612
[9980] time live:75, cumulated reward: 7.499999999999989, loss: -0.0043761697597801685
[9990] time live:48, cumulated reward: 3.799999999999999, loss: 0.0027879674453288317
[10000] time live:43, cumulated reward: 3.3000000000000007, loss: 0.0047319987788796425
[MoviePy] >>>> Building video movie/pg_10000.webm
[MoviePy] Writing video movie/pg_10000.webm


 98%|█████████▊| 44/45 [00:00<00:00, 110.60it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_10000.webm 






[10010] time live:48, cumulated reward: 3.799999999999999, loss: -0.004287858959287405
[10020] time live:48, cumulated reward: 3.799999999999999, loss: -0.0037931203842163086
[10030] time live:59, cumulated reward: 4.899999999999995, loss: 0.0037745137233287096
[10040] time live:54, cumulated reward: 4.399999999999997, loss: 0.0034545084927231073
[10050] time live:56, cumulated reward: 4.599999999999996, loss: 0.0019984585233032703
[10060] time live:58, cumulated reward: 4.799999999999995, loss: -0.0016062177019193769
[10070] time live:61, cumulated reward: 5.099999999999994, loss: 0.003547480795532465
[10080] time live:61, cumulated reward: 5.099999999999994, loss: 0.00023082045663613826
[10090] time live:45, cumulated reward: 3.5, loss: -0.0037144978996366262
[10100] time live:60, cumulated reward: 4.999999999999995, loss: -0.0016222635749727488
[10110] time live:52, cumulated reward: 4.1999999999999975, loss: 0.00473535992205143
[10120] time live:61, cumulated reward: 5.099999999999

[10960] time live:61, cumulated reward: 5.099999999999994, loss: -0.0013450873084366322
[10970] time live:43, cumulated reward: 3.3000000000000007, loss: 7.707019540248439e-05
[10980] time live:61, cumulated reward: 5.099999999999994, loss: 0.002400538884103298
[10990] time live:61, cumulated reward: 5.099999999999994, loss: 0.00041702145244926214
[11000] time live:68, cumulated reward: 6.799999999999992, loss: -0.0062342179007828236
[MoviePy] >>>> Building video movie/pg_11000.webm
[MoviePy] Writing video movie/pg_11000.webm


 99%|█████████▊| 69/70 [00:00<00:00, 109.39it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_11000.webm 






[11010] time live:58, cumulated reward: 4.799999999999995, loss: 0.0020069582387804985
[11020] time live:50, cumulated reward: 3.9999999999999982, loss: -0.008200893178582191
[11030] time live:47, cumulated reward: 3.6999999999999993, loss: 0.017045285552740097
[11040] time live:47, cumulated reward: 3.6999999999999993, loss: -0.007395581807941198
[11050] time live:49, cumulated reward: 3.8999999999999986, loss: 0.007392163388431072
[11060] time live:55, cumulated reward: 4.4999999999999964, loss: 0.0006497816648334265
[11070] time live:51, cumulated reward: 4.099999999999998, loss: 0.004184423480182886
[11080] time live:49, cumulated reward: 3.8999999999999986, loss: 0.001879186020232737
[11090] time live:50, cumulated reward: 3.9999999999999982, loss: 0.002865810412913561
[11100] time live:53, cumulated reward: 4.299999999999997, loss: -0.01655270904302597
[11110] time live:61, cumulated reward: 5.099999999999994, loss: -0.01086350716650486
[11120] time live:38, cumulated reward: 2.8

[11960] time live:44, cumulated reward: 3.4000000000000004, loss: -0.02395445667207241
[11970] time live:50, cumulated reward: 3.9999999999999982, loss: 0.000508804339915514
[11980] time live:61, cumulated reward: 5.099999999999994, loss: -0.0033039655536413193
[11990] time live:43, cumulated reward: 3.3000000000000007, loss: 0.003615401452407241
[12000] time live:47, cumulated reward: 3.6999999999999993, loss: 0.011913847178220749
[MoviePy] >>>> Building video movie/pg_12000.webm
[MoviePy] Writing video movie/pg_12000.webm


 98%|█████████▊| 48/49 [00:00<00:00, 106.81it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_12000.webm 






[12010] time live:61, cumulated reward: 5.099999999999994, loss: -0.008105043321847916
[12020] time live:61, cumulated reward: 5.099999999999994, loss: -0.007403201889246702
[12030] time live:61, cumulated reward: 5.099999999999994, loss: 0.008294496685266495
[12040] time live:55, cumulated reward: 4.4999999999999964, loss: 0.004586375784128904
[12050] time live:48, cumulated reward: 3.799999999999999, loss: 0.0033828418236225843
[12060] time live:47, cumulated reward: 3.6999999999999993, loss: -0.0027227401733398438
[12070] time live:51, cumulated reward: 4.099999999999998, loss: -0.003041809657588601
[12080] time live:52, cumulated reward: 4.1999999999999975, loss: -0.00441617239266634
[12090] time live:50, cumulated reward: 3.9999999999999982, loss: 0.014232502318918705
[12100] time live:56, cumulated reward: 4.599999999999996, loss: 0.001098258187994361
[12110] time live:54, cumulated reward: 4.399999999999997, loss: -0.0026932116597890854
[12120] time live:35, cumulated reward: 2.

[12960] time live:55, cumulated reward: 4.4999999999999964, loss: 0.01579803042113781
[12970] time live:46, cumulated reward: 3.5999999999999996, loss: 0.00039106866461224854
[12980] time live:61, cumulated reward: 5.099999999999994, loss: -0.014859340153634548
[12990] time live:47, cumulated reward: 3.6999999999999993, loss: -0.01360095851123333
[13000] time live:55, cumulated reward: 4.4999999999999964, loss: -0.006010142154991627
[MoviePy] >>>> Building video movie/pg_13000.webm
[MoviePy] Writing video movie/pg_13000.webm


 98%|█████████▊| 56/57 [00:00<00:00, 112.58it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_13000.webm 






[13010] time live:57, cumulated reward: 4.699999999999996, loss: -0.0031474262941628695
[13020] time live:56, cumulated reward: 4.599999999999996, loss: 0.0077416556887328625
[13030] time live:61, cumulated reward: 5.099999999999994, loss: -0.013155264779925346
[13040] time live:52, cumulated reward: 4.1999999999999975, loss: 0.0004287866468075663
[13050] time live:42, cumulated reward: 3.200000000000001, loss: 0.0004874184087384492
[13060] time live:61, cumulated reward: 5.099999999999994, loss: 0.011851607821881771
[13070] time live:54, cumulated reward: 4.399999999999997, loss: 0.009526022709906101
[13080] time live:52, cumulated reward: 4.1999999999999975, loss: 0.01421352569013834
[13090] time live:41, cumulated reward: 3.1000000000000014, loss: 0.00029447601991705596
[13100] time live:50, cumulated reward: 3.9999999999999982, loss: 0.007221469655632973
[13110] time live:42, cumulated reward: 3.200000000000001, loss: 0.007197380065917969
[13120] time live:59, cumulated reward: 4.8

[13960] time live:60, cumulated reward: 4.999999999999995, loss: 0.003632068634033203
[13970] time live:51, cumulated reward: 4.099999999999998, loss: -0.0014151965733617544
[13980] time live:55, cumulated reward: 4.4999999999999964, loss: -0.0019200064707547426
[13990] time live:61, cumulated reward: 5.099999999999994, loss: 0.005346767138689756
[14000] time live:61, cumulated reward: 5.099999999999994, loss: 0.0014102185377851129
[MoviePy] >>>> Building video movie/pg_14000.webm
[MoviePy] Writing video movie/pg_14000.webm


100%|██████████| 63/63 [00:00<00:00, 95.40it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_14000.webm 






[14010] time live:61, cumulated reward: 5.099999999999994, loss: 0.011417764239013195
[14020] time live:61, cumulated reward: 5.099999999999994, loss: 0.00752542819827795
[14030] time live:56, cumulated reward: 4.599999999999996, loss: -0.014779976569116116
[14040] time live:47, cumulated reward: 3.6999999999999993, loss: -0.017137588933110237
[14050] time live:41, cumulated reward: 3.1000000000000014, loss: -0.005296753719449043
[14060] time live:49, cumulated reward: 3.8999999999999986, loss: -0.003943871706724167
[14070] time live:46, cumulated reward: 3.5999999999999996, loss: -0.00011537386308191344
[14080] time live:40, cumulated reward: 3.0000000000000018, loss: 0.000385606283089146
[14090] time live:52, cumulated reward: 4.1999999999999975, loss: -0.004930386319756508
[14100] time live:46, cumulated reward: 3.5999999999999996, loss: 0.012616634368896484
[14110] time live:52, cumulated reward: 4.1999999999999975, loss: 0.012171965092420578
[14120] time live:60, cumulated reward:

[14960] time live:61, cumulated reward: 5.099999999999994, loss: -0.018258798867464066
[14970] time live:45, cumulated reward: 3.5, loss: -0.004482184536755085
[14980] time live:54, cumulated reward: 4.399999999999997, loss: 0.037695303559303284
[14990] time live:47, cumulated reward: 3.6999999999999993, loss: -0.003308336716145277
[15000] time live:42, cumulated reward: 3.200000000000001, loss: -0.03721866011619568
[MoviePy] >>>> Building video movie/pg_15000.webm
[MoviePy] Writing video movie/pg_15000.webm


 98%|█████████▊| 43/44 [00:00<00:00, 129.98it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_15000.webm 






[15010] time live:54, cumulated reward: 4.399999999999997, loss: -0.006468402221798897
[15020] time live:57, cumulated reward: 4.699999999999996, loss: 0.002250537509098649
[15030] time live:56, cumulated reward: 4.599999999999996, loss: -0.0013338497374206781
[15040] time live:54, cumulated reward: 4.399999999999997, loss: -0.0034569103736430407
[15050] time live:70, cumulated reward: 6.999999999999991, loss: -0.019947474822402
[15060] time live:52, cumulated reward: 4.1999999999999975, loss: 0.00954829715192318
[15070] time live:41, cumulated reward: 3.1000000000000014, loss: -0.03334866464138031
[15080] time live:43, cumulated reward: 3.3000000000000007, loss: -0.029172342270612717
[15090] time live:53, cumulated reward: 4.299999999999997, loss: 0.00659125717356801
[15100] time live:54, cumulated reward: 4.399999999999997, loss: 0.007210678420960903
[15110] time live:61, cumulated reward: 5.099999999999994, loss: 0.005799809470772743
[15120] time live:55, cumulated reward: 4.4999999

[15960] time live:61, cumulated reward: 5.099999999999994, loss: 0.02363952249288559
[15970] time live:47, cumulated reward: 3.6999999999999993, loss: -0.025120126083493233
[15980] time live:45, cumulated reward: 3.5, loss: -0.02377641387283802
[15990] time live:49, cumulated reward: 3.8999999999999986, loss: 0.024384187534451485
[16000] time live:59, cumulated reward: 4.899999999999995, loss: -0.022552845999598503
[MoviePy] >>>> Building video movie/pg_16000.webm
[MoviePy] Writing video movie/pg_16000.webm


 98%|█████████▊| 60/61 [00:00<00:00, 104.69it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_16000.webm 






[16010] time live:48, cumulated reward: 3.799999999999999, loss: 0.019609352573752403
[16020] time live:42, cumulated reward: 3.200000000000001, loss: -0.02791961096227169
[16030] time live:48, cumulated reward: 3.799999999999999, loss: 0.015229344367980957
[16040] time live:46, cumulated reward: 3.5999999999999996, loss: -0.030093027278780937
[16050] time live:68, cumulated reward: 6.799999999999992, loss: 0.005607289262115955
[16060] time live:40, cumulated reward: 3.0000000000000018, loss: -0.016986440867185593
[16070] time live:61, cumulated reward: 5.099999999999994, loss: 0.0045547797344625
[16080] time live:58, cumulated reward: 4.799999999999995, loss: -0.014283871278166771
[16090] time live:74, cumulated reward: 7.39999999999999, loss: 0.006257997825741768
[16100] time live:57, cumulated reward: 4.699999999999996, loss: -0.033656369894742966
[16110] time live:44, cumulated reward: 3.4000000000000004, loss: -0.01307461503893137
[16120] time live:54, cumulated reward: 4.39999999

[16960] time live:58, cumulated reward: 4.799999999999995, loss: -0.010761589743196964
[16970] time live:40, cumulated reward: 3.0000000000000018, loss: -0.039523255079984665
[16980] time live:50, cumulated reward: 3.9999999999999982, loss: -0.013645286671817303
[16990] time live:55, cumulated reward: 4.4999999999999964, loss: -0.01023804023861885
[17000] time live:49, cumulated reward: 3.8999999999999986, loss: 0.027442485094070435
[MoviePy] >>>> Building video movie/pg_17000.webm
[MoviePy] Writing video movie/pg_17000.webm


 98%|█████████▊| 50/51 [00:00<00:00, 106.34it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_17000.webm 






[17010] time live:49, cumulated reward: 3.8999999999999986, loss: 5.720099579775706e-05
[17020] time live:57, cumulated reward: 4.699999999999996, loss: 0.041879888623952866
[17030] time live:61, cumulated reward: 5.099999999999994, loss: -0.044194892048835754
[17040] time live:57, cumulated reward: 4.699999999999996, loss: -0.0131467180326581
[17050] time live:43, cumulated reward: 3.3000000000000007, loss: -0.01582127995789051
[17060] time live:36, cumulated reward: 2.6000000000000023, loss: -0.003231578506529331
[17070] time live:59, cumulated reward: 4.899999999999995, loss: -0.06822660565376282
[17080] time live:38, cumulated reward: 2.8000000000000025, loss: -0.03748194873332977
[17090] time live:39, cumulated reward: 2.900000000000002, loss: -0.030372032895684242
[17100] time live:50, cumulated reward: 3.9999999999999982, loss: -0.02778257429599762
[17110] time live:46, cumulated reward: 3.5999999999999996, loss: -0.03213341161608696
[17120] time live:61, cumulated reward: 5.099

[17960] time live:61, cumulated reward: 5.099999999999994, loss: 0.017759542912244797
[17970] time live:64, cumulated reward: 5.399999999999993, loss: 0.005757272243499756
[17980] time live:57, cumulated reward: 4.699999999999996, loss: -0.029617661610245705
[17990] time live:61, cumulated reward: 5.099999999999994, loss: -0.041530296206474304
[18000] time live:50, cumulated reward: 3.9999999999999982, loss: 0.004865875467658043
[MoviePy] >>>> Building video movie/pg_18000.webm
[MoviePy] Writing video movie/pg_18000.webm


 98%|█████████▊| 51/52 [00:00<00:00, 112.10it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_18000.webm 






[18010] time live:41, cumulated reward: 3.1000000000000014, loss: -0.011036965996026993
[18020] time live:48, cumulated reward: 3.799999999999999, loss: -0.014940321445465088
[18030] time live:56, cumulated reward: 4.599999999999996, loss: 0.022538525983691216
[18040] time live:44, cumulated reward: 3.4000000000000004, loss: -0.06820431351661682
[18050] time live:61, cumulated reward: 5.099999999999994, loss: 0.03238077461719513
[18060] time live:43, cumulated reward: 3.3000000000000007, loss: -0.019099190831184387
[18070] time live:50, cumulated reward: 3.9999999999999982, loss: -0.0483202189207077
[18080] time live:53, cumulated reward: 4.299999999999997, loss: 0.032481320202350616
[18090] time live:55, cumulated reward: 4.4999999999999964, loss: -0.0077180517837405205
[18100] time live:55, cumulated reward: 4.4999999999999964, loss: -0.024495817720890045
[18110] time live:46, cumulated reward: 3.5999999999999996, loss: -0.05627458170056343
[18120] time live:61, cumulated reward: 5.0

[18960] time live:62, cumulated reward: 5.199999999999994, loss: -0.028714025393128395
[18970] time live:61, cumulated reward: 5.099999999999994, loss: -0.040662139654159546
[18980] time live:45, cumulated reward: 3.5, loss: -0.026731278747320175
[18990] time live:61, cumulated reward: 5.099999999999994, loss: -0.04552096873521805
[19000] time live:55, cumulated reward: 4.4999999999999964, loss: -0.019964080303907394
[MoviePy] >>>> Building video movie/pg_19000.webm
[MoviePy] Writing video movie/pg_19000.webm


 98%|█████████▊| 56/57 [00:00<00:00, 110.21it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_19000.webm 






[19010] time live:42, cumulated reward: 3.200000000000001, loss: 0.0069926124997437
[19020] time live:57, cumulated reward: 4.699999999999996, loss: -0.01136108860373497
[19030] time live:44, cumulated reward: 3.4000000000000004, loss: -0.011077490635216236
[19040] time live:54, cumulated reward: 4.399999999999997, loss: 0.012771429494023323
[19050] time live:61, cumulated reward: 5.099999999999994, loss: 0.016009220853447914
[19060] time live:43, cumulated reward: 3.3000000000000007, loss: -0.047328196465969086
[19070] time live:58, cumulated reward: 4.799999999999995, loss: -0.017839957028627396
[19080] time live:51, cumulated reward: 4.099999999999998, loss: 0.002832618309184909
[19090] time live:69, cumulated reward: 6.8999999999999915, loss: 0.009678695350885391
[19100] time live:43, cumulated reward: 3.3000000000000007, loss: 0.0523822046816349
[19110] time live:38, cumulated reward: 2.8000000000000025, loss: 0.02593187242746353
[19120] time live:42, cumulated reward: 3.200000000

[19960] time live:50, cumulated reward: 3.9999999999999982, loss: -0.028034476563334465
[19970] time live:51, cumulated reward: 4.099999999999998, loss: 0.019505763426423073
[19980] time live:53, cumulated reward: 4.299999999999997, loss: -0.006329122465103865
[19990] time live:50, cumulated reward: 3.9999999999999982, loss: -0.04669244587421417
[20000] time live:58, cumulated reward: 4.799999999999995, loss: -0.013038931414484978
[MoviePy] >>>> Building video movie/pg_20000.webm
[MoviePy] Writing video movie/pg_20000.webm


 98%|█████████▊| 59/60 [00:00<00:00, 92.23it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_20000.webm 






[20010] time live:61, cumulated reward: 5.099999999999994, loss: -0.03143701329827309
[20020] time live:50, cumulated reward: 3.9999999999999982, loss: -0.037419699132442474
[20030] time live:43, cumulated reward: 3.3000000000000007, loss: -0.025668876245617867
[20040] time live:48, cumulated reward: 3.799999999999999, loss: -0.012172996997833252
[20050] time live:48, cumulated reward: 3.799999999999999, loss: -0.005327621940523386
[20060] time live:48, cumulated reward: 3.799999999999999, loss: -0.0009258190984837711
[20070] time live:38, cumulated reward: 2.8000000000000025, loss: -0.05382103845477104
[20080] time live:61, cumulated reward: 5.099999999999994, loss: -0.014465081505477428
[20090] time live:72, cumulated reward: 7.19999999999999, loss: -0.01637040264904499
[20100] time live:63, cumulated reward: 5.299999999999994, loss: -0.011405732482671738
[20110] time live:61, cumulated reward: 5.099999999999994, loss: 0.030526677146553993
[20120] time live:48, cumulated reward: 3.79

[20960] time live:56, cumulated reward: 4.599999999999996, loss: -0.043443746864795685
[20970] time live:42, cumulated reward: 3.200000000000001, loss: 0.009562583640217781
[20980] time live:59, cumulated reward: 4.899999999999995, loss: 0.02662862278521061
[20990] time live:61, cumulated reward: 5.099999999999994, loss: 0.0010384732158854604
[21000] time live:57, cumulated reward: 4.699999999999996, loss: -0.01317688450217247
[MoviePy] >>>> Building video movie/pg_21000.webm
[MoviePy] Writing video movie/pg_21000.webm


 98%|█████████▊| 58/59 [00:00<00:00, 124.20it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_21000.webm 






[21010] time live:57, cumulated reward: 4.699999999999996, loss: 0.004909481853246689
[21020] time live:61, cumulated reward: 5.099999999999994, loss: -0.0067143989726901054
[21030] time live:61, cumulated reward: 5.099999999999994, loss: -0.018974367529153824
[21040] time live:61, cumulated reward: 5.099999999999994, loss: -0.029735127463936806
[21050] time live:45, cumulated reward: 3.5, loss: -0.00777770159766078
[21060] time live:61, cumulated reward: 5.099999999999994, loss: -0.015396774746477604
[21070] time live:44, cumulated reward: 3.4000000000000004, loss: 0.013969356194138527
[21080] time live:61, cumulated reward: 5.099999999999994, loss: -0.009261522442102432
[21090] time live:58, cumulated reward: 4.799999999999995, loss: -0.03825792670249939
[21100] time live:42, cumulated reward: 3.200000000000001, loss: -0.0056580132804811
[21110] time live:37, cumulated reward: 2.7000000000000024, loss: 0.0007640091353096068
[21120] time live:56, cumulated reward: 4.599999999999996, l

[21970] time live:65, cumulated reward: 6.499999999999993, loss: -0.039741553366184235
[21980] time live:59, cumulated reward: 4.899999999999995, loss: -0.028223684057593346
[21990] time live:61, cumulated reward: 5.099999999999994, loss: -0.03818954527378082
[22000] time live:49, cumulated reward: 3.8999999999999986, loss: -0.01845025084912777
[MoviePy] >>>> Building video movie/pg_22000.webm
[MoviePy] Writing video movie/pg_22000.webm


 98%|█████████▊| 50/51 [00:00<00:00, 104.56it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_22000.webm 






[22010] time live:54, cumulated reward: 4.399999999999997, loss: -0.025195863097906113
[22020] time live:43, cumulated reward: 3.3000000000000007, loss: -0.012962762266397476
[22030] time live:47, cumulated reward: 3.6999999999999993, loss: 0.028410078957676888
[22040] time live:61, cumulated reward: 5.099999999999994, loss: 0.020966701209545135
[22050] time live:51, cumulated reward: 4.099999999999998, loss: -0.009370729327201843
[22060] time live:45, cumulated reward: 3.5, loss: 0.009473546408116817
[22070] time live:50, cumulated reward: 3.9999999999999982, loss: 0.004995041061192751
[22080] time live:34, cumulated reward: 2.400000000000002, loss: -0.032755304127931595
[22090] time live:58, cumulated reward: 4.799999999999995, loss: -0.003580257762223482
[22100] time live:61, cumulated reward: 5.099999999999994, loss: 0.012868411839008331
[22110] time live:54, cumulated reward: 4.399999999999997, loss: -0.03106330893933773
[22120] time live:57, cumulated reward: 4.699999999999996, l

[22970] time live:66, cumulated reward: 6.5999999999999925, loss: 0.00685947947204113
[22980] time live:51, cumulated reward: 4.099999999999998, loss: -0.00493008503690362
[22990] time live:56, cumulated reward: 4.599999999999996, loss: 0.0175639558583498
[23000] time live:52, cumulated reward: 4.1999999999999975, loss: -0.03961830958724022
[MoviePy] >>>> Building video movie/pg_23000.webm
[MoviePy] Writing video movie/pg_23000.webm


 98%|█████████▊| 53/54 [00:00<00:00, 120.63it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_23000.webm 






[23010] time live:51, cumulated reward: 4.099999999999998, loss: -0.022409215569496155
[23020] time live:39, cumulated reward: 2.900000000000002, loss: -0.0039302753284573555
[23030] time live:61, cumulated reward: 5.099999999999994, loss: -0.037446726113557816
[23040] time live:54, cumulated reward: 4.399999999999997, loss: -0.006830303929746151
[23050] time live:61, cumulated reward: 5.099999999999994, loss: -0.008117926307022572
[23060] time live:56, cumulated reward: 4.599999999999996, loss: -0.017595598474144936
[23070] time live:61, cumulated reward: 5.099999999999994, loss: 0.04077520594000816
[23080] time live:60, cumulated reward: 4.999999999999995, loss: -0.01564718969166279
[23090] time live:41, cumulated reward: 3.1000000000000014, loss: 0.02489301562309265
[23100] time live:61, cumulated reward: 5.099999999999994, loss: 0.0013147885911166668
[23110] time live:59, cumulated reward: 4.899999999999995, loss: -0.0076025621965527534
[23120] time live:55, cumulated reward: 4.499

[23960] time live:61, cumulated reward: 5.099999999999994, loss: -0.008764891885221004
[23970] time live:52, cumulated reward: 4.1999999999999975, loss: -0.012703382410109043
[23980] time live:42, cumulated reward: 3.200000000000001, loss: -0.0654185488820076
[23990] time live:58, cumulated reward: 4.799999999999995, loss: -0.004333397373557091
[24000] time live:50, cumulated reward: 3.9999999999999982, loss: -0.05130636319518089
[MoviePy] >>>> Building video movie/pg_24000.webm
[MoviePy] Writing video movie/pg_24000.webm


 98%|█████████▊| 51/52 [00:00<00:00, 107.85it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_24000.webm 






[24010] time live:48, cumulated reward: 3.799999999999999, loss: 0.006570279598236084
[24020] time live:48, cumulated reward: 3.799999999999999, loss: -0.013944089412689209
[24030] time live:52, cumulated reward: 4.1999999999999975, loss: 0.008427252992987633
[24040] time live:60, cumulated reward: 4.999999999999995, loss: 0.008063698187470436
[24050] time live:60, cumulated reward: 4.999999999999995, loss: 0.027084732428193092
[24060] time live:43, cumulated reward: 3.3000000000000007, loss: -0.003070188220590353
[24070] time live:45, cumulated reward: 3.5, loss: -0.0315762422978878
[24080] time live:49, cumulated reward: 3.8999999999999986, loss: -0.03666793927550316
[24090] time live:54, cumulated reward: 4.399999999999997, loss: 0.0021921794395893812
[24100] time live:44, cumulated reward: 3.4000000000000004, loss: 0.016076087951660156
[24110] time live:50, cumulated reward: 3.9999999999999982, loss: 0.013210201635956764
[24120] time live:38, cumulated reward: 2.8000000000000025, l

[24970] time live:55, cumulated reward: 4.4999999999999964, loss: -0.03392774239182472
[24980] time live:58, cumulated reward: 4.799999999999995, loss: 0.04126279056072235
[24990] time live:61, cumulated reward: 5.099999999999994, loss: -0.012423374690115452
[25000] time live:59, cumulated reward: 4.899999999999995, loss: -0.027117518708109856
[MoviePy] >>>> Building video movie/pg_25000.webm
[MoviePy] Writing video movie/pg_25000.webm


 98%|█████████▊| 60/61 [00:00<00:00, 101.45it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_25000.webm 






[25010] time live:56, cumulated reward: 4.599999999999996, loss: 0.028342416509985924
[25020] time live:53, cumulated reward: 4.299999999999997, loss: -0.02921232208609581
[25030] time live:58, cumulated reward: 4.799999999999995, loss: -0.03738582506775856
[25040] time live:45, cumulated reward: 3.5, loss: -0.022906260564923286
[25050] time live:68, cumulated reward: 6.799999999999992, loss: -0.010534741915762424
[25060] time live:50, cumulated reward: 3.9999999999999982, loss: -0.047562673687934875
[25070] time live:46, cumulated reward: 3.5999999999999996, loss: -0.005936270114034414
[25080] time live:43, cumulated reward: 3.3000000000000007, loss: 0.000888824462890625
[25090] time live:61, cumulated reward: 5.099999999999994, loss: -0.010151972994208336
[25100] time live:37, cumulated reward: 2.7000000000000024, loss: 0.006130965892225504
[25110] time live:61, cumulated reward: 5.099999999999994, loss: -0.0024603859055787325
[25120] time live:68, cumulated reward: 6.799999999999992

[25960] time live:47, cumulated reward: 3.6999999999999993, loss: -0.0038760164752602577
[25970] time live:49, cumulated reward: 3.8999999999999986, loss: -0.0009368585306219757
[25980] time live:61, cumulated reward: 5.099999999999994, loss: 0.032208144664764404
[25990] time live:52, cumulated reward: 4.1999999999999975, loss: -0.01247591245919466
[26000] time live:62, cumulated reward: 5.199999999999994, loss: 0.012773360125720501
[MoviePy] >>>> Building video movie/pg_26000.webm
[MoviePy] Writing video movie/pg_26000.webm


 98%|█████████▊| 63/64 [00:00<00:00, 97.69it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_26000.webm 






[26010] time live:61, cumulated reward: 5.099999999999994, loss: 0.009269620291888714
[26020] time live:51, cumulated reward: 4.099999999999998, loss: -0.02240949496626854
[26030] time live:59, cumulated reward: 4.899999999999995, loss: -0.0173578429967165
[26040] time live:55, cumulated reward: 4.4999999999999964, loss: 0.008074170909821987
[26050] time live:52, cumulated reward: 4.1999999999999975, loss: -0.0012652140576392412
[26060] time live:44, cumulated reward: 3.4000000000000004, loss: 0.011203180998563766
[26070] time live:61, cumulated reward: 5.099999999999994, loss: 0.006705299951136112
[26080] time live:52, cumulated reward: 4.1999999999999975, loss: -0.03545370325446129
[26090] time live:41, cumulated reward: 3.1000000000000014, loss: -0.016339417546987534
[26100] time live:51, cumulated reward: 4.099999999999998, loss: 0.0011002222308889031
[26110] time live:45, cumulated reward: 3.5, loss: -0.012276861816644669
[26120] time live:74, cumulated reward: 7.39999999999999, l

[26970] time live:60, cumulated reward: 4.999999999999995, loss: 0.0022862751502543688
[26980] time live:41, cumulated reward: 3.1000000000000014, loss: -0.011372100561857224
[26990] time live:59, cumulated reward: 4.899999999999995, loss: -0.012804677709937096
[27000] time live:41, cumulated reward: 3.1000000000000014, loss: -0.04311015084385872
[MoviePy] >>>> Building video movie/pg_27000.webm
[MoviePy] Writing video movie/pg_27000.webm


 98%|█████████▊| 42/43 [00:00<00:00, 117.59it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_27000.webm 






[27010] time live:60, cumulated reward: 4.999999999999995, loss: 0.03253011777997017
[27020] time live:61, cumulated reward: 5.099999999999994, loss: -0.007089177146553993
[27030] time live:46, cumulated reward: 3.5999999999999996, loss: -0.002473292173817754
[27040] time live:46, cumulated reward: 3.5999999999999996, loss: -0.010934787802398205
[27050] time live:57, cumulated reward: 4.699999999999996, loss: -0.0033803237602114677
[27060] time live:44, cumulated reward: 3.4000000000000004, loss: -0.00912536308169365
[27070] time live:54, cumulated reward: 4.399999999999997, loss: -0.02204451709985733
[27080] time live:59, cumulated reward: 4.899999999999995, loss: -0.015827955678105354
[27090] time live:45, cumulated reward: 3.5, loss: -0.041733380407094955
[27100] time live:65, cumulated reward: 6.499999999999993, loss: -0.011471678502857685
[27110] time live:61, cumulated reward: 5.099999999999994, loss: 0.015565621666610241
[27120] time live:61, cumulated reward: 5.099999999999994,

[27960] time live:61, cumulated reward: 5.099999999999994, loss: 0.01665293611586094
[27970] time live:71, cumulated reward: 7.099999999999991, loss: -0.04812197759747505
[27980] time live:42, cumulated reward: 3.200000000000001, loss: -0.012003036215901375
[27990] time live:52, cumulated reward: 4.1999999999999975, loss: -0.0027674345765262842
[28000] time live:53, cumulated reward: 4.299999999999997, loss: -0.016526203602552414
[MoviePy] >>>> Building video movie/pg_28000.webm
[MoviePy] Writing video movie/pg_28000.webm


 98%|█████████▊| 54/55 [00:00<00:00, 86.63it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_28000.webm 






[28010] time live:61, cumulated reward: 5.099999999999994, loss: 0.008829335682094097
[28020] time live:55, cumulated reward: 4.4999999999999964, loss: 0.01168008241802454
[28030] time live:53, cumulated reward: 4.299999999999997, loss: -0.048237547278404236
[28040] time live:61, cumulated reward: 5.099999999999994, loss: -0.001523033482953906
[28050] time live:61, cumulated reward: 5.099999999999994, loss: -0.025438934564590454
[28060] time live:60, cumulated reward: 4.999999999999995, loss: -0.009450435638427734
[28070] time live:43, cumulated reward: 3.3000000000000007, loss: -0.02357458509504795
[28080] time live:55, cumulated reward: 4.4999999999999964, loss: -0.0008418170036748052
[28090] time live:45, cumulated reward: 3.5, loss: 0.010164642706513405
[28100] time live:60, cumulated reward: 4.999999999999995, loss: -0.023056093603372574
[28110] time live:45, cumulated reward: 3.5, loss: -0.06093546375632286
[28120] time live:52, cumulated reward: 4.1999999999999975, loss: -0.0252

[28970] time live:51, cumulated reward: 4.099999999999998, loss: -0.015849748626351357
[28980] time live:52, cumulated reward: 4.1999999999999975, loss: -0.011078963056206703
[28990] time live:61, cumulated reward: 5.099999999999994, loss: -0.016978153958916664
[29000] time live:63, cumulated reward: 5.299999999999994, loss: -0.0003389933845028281
[MoviePy] >>>> Building video movie/pg_29000.webm
[MoviePy] Writing video movie/pg_29000.webm


 98%|█████████▊| 64/65 [00:00<00:00, 92.75it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_29000.webm 






[29010] time live:40, cumulated reward: 3.0000000000000018, loss: -0.05674027279019356
[29020] time live:57, cumulated reward: 4.699999999999996, loss: 0.016977477818727493
[29030] time live:72, cumulated reward: 7.19999999999999, loss: 0.009418540634214878
[29040] time live:51, cumulated reward: 4.099999999999998, loss: 0.009768298827111721
[29050] time live:61, cumulated reward: 5.099999999999994, loss: -0.0028805029578506947
[29060] time live:61, cumulated reward: 5.099999999999994, loss: 0.015320105478167534
[29070] time live:45, cumulated reward: 3.5, loss: 0.0019775815308094025
[29080] time live:60, cumulated reward: 4.999999999999995, loss: -0.030988328158855438
[29090] time live:61, cumulated reward: 5.099999999999994, loss: -0.040673647075891495
[29100] time live:61, cumulated reward: 5.099999999999994, loss: 0.014625079929828644
[29110] time live:55, cumulated reward: 4.4999999999999964, loss: 0.02301434613764286
[29120] time live:51, cumulated reward: 4.099999999999998, loss

[29960] time live:50, cumulated reward: 3.9999999999999982, loss: -0.027254618704319
[29970] time live:57, cumulated reward: 4.699999999999996, loss: -0.0026633278466761112
[29980] time live:64, cumulated reward: 5.399999999999993, loss: -0.020064860582351685
[29990] time live:52, cumulated reward: 4.1999999999999975, loss: -0.016146954149007797
[30000] time live:61, cumulated reward: 5.099999999999994, loss: -0.015461218543350697
[MoviePy] >>>> Building video movie/pg_30000.webm
[MoviePy] Writing video movie/pg_30000.webm


100%|██████████| 63/63 [00:00<00:00, 101.28it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_30000.webm 






#### 最好的結果

In [282]:
from moviepy.editor import *
clip = VideoFileClip("movie/pg_11000.webm")
display(clip.ipython_display(fps=60, autoplay=1, loop=1))

 99%|█████████▊| 69/70 [00:00<00:00, 1025.95it/s]


#### 進入第一個障礙物

In [281]:
from moviepy.editor import *
clip = VideoFileClip("movie/pg_2000.webm")
display(clip.ipython_display(fps=60, autoplay=1, loop=1))

100%|██████████| 71/71 [00:00<00:00, 1110.16it/s]


## Actor-Critic

In [277]:
class Actor_critic:

  def __init__(self, name, num_action, discount_factor=0.99):
    self.exploring_rate = 0.1
    self.discount_factor = discount_factor
    self.num_action = num_action
    self.name = name
    self.bucket_range_per_feature = bucket_range_per_feature
    with tf.variable_scope(name):
      self.build_model()

  def get_state_idx(self, state):
    # instead of using absolute position of pipe, use relative position
    state = copy.deepcopy(state)
    state['next_next_pipe_bottom_y'] -= state['player_y']
    state['next_next_pipe_top_y'] -= state['player_y']
    state['next_pipe_bottom_y'] -= state['player_y']
    state['next_pipe_top_y'] -= state['player_y']

    # sort to make list converted from dict ordered in alphabet order
    state_key = [k for k, v in sorted(state.items())]

    # do bucketing to decrease state space to speed up training
    state_idx = []
    for key in state_key:
        state_idx.append(state[key] / self.bucket_range_per_feature[key])
    return tuple(state_idx)    

  def build_model(self):
    # input: current screen, selected action and reward
    self.input_state = tf.placeholder(
        tf.float32, shape=[None, 8, num_stack])
    self.action = tf.placeholder(tf.int32, [None])
    self.reward = tf.placeholder(tf.float32, [None])
    self.is_training = tf.placeholder(tf.bool, shape=[])

    def value_net(state, reuse=False):
      with tf.variable_scope("fc"):
        for idx in range(num_stack):
          if idx == 0:
            pre_dense = tf.layers.dense(inputs=state[:, :, idx], units=6400, activation=tf.nn.relu, reuse=reuse)
            pre_dense = tf.reshape(pre_dense, [-1, 8, 1])
          else:
            pre_stack = tf.layers.dense(inputs=state[:, :, idx], units=6400, activation=tf.nn.relu, reuse=True)
            pre_stack = tf.reshape(pre_stack, [-1, 8, 1])
            pre_dense = tf.concat([pre_dense, pre_stack], 2) 
            
      with tf.variable_scope(
          "value_net",
          reuse=reuse,
          initializer=tf.truncated_normal_initializer(stddev=1e-2)):
        state_in = tf.reshape(pre_dense, [-1, screen_width, screen_height, num_stack]) 
        conv1 = tf.layers.conv2d(
            inputs=state_in,
            filters=32,
            kernel_size=[8, 8],
            strides=[4, 4],
            padding='SAME',
            activation=tf.nn.relu)
        pool1 = tf.layers.max_pooling2d(
            conv1, pool_size=[2, 2], strides=[2, 2], padding='SAME')

        conv2 = tf.layers.conv2d(
            inputs=pool1,
            filters=64,
            kernel_size=[4, 4],
            strides=[2, 2],
            padding='SAME',
            activation=tf.nn.relu)
        conv3 = tf.layers.conv2d(
            inputs=conv2,
            filters=64,
            kernel_size=[3, 3],
            strides=[1, 1],
            padding='SAME',
            activation=tf.nn.relu)
        flat = tf.contrib.layers.flatten(conv3)
        dense = tf.layers.dense(inputs=flat, units=512, activation=tf.nn.relu)
        V = tf.layers.dense(inputs=dense, units=1, activation=None)
        return V

    def policy_net(state, reuse=False):
      with tf.variable_scope("policy_fc"):
        for idx in range(num_stack):
          if idx == 0:
            pre_dense = tf.layers.dense(inputs=state[:, :, idx], units=6400, activation=tf.nn.relu, reuse=reuse)
            pre_dense = tf.reshape(pre_dense, [-1, 8, 1])
          else:
            pre_stack = tf.layers.dense(inputs=state[:, :, idx], units=6400, activation=tf.nn.relu, reuse=True)
            pre_stack = tf.reshape(pre_stack, [-1, 8, 1])
            pre_dense = tf.concat([pre_dense, pre_stack], 2)
            
      with tf.variable_scope("policy_net", reuse=reuse):
        state_in = tf.reshape(pre_dense, [-1, screen_width, screen_height, num_stack]) 
        conv1 = tf.layers.conv2d(
            inputs=state_in,
            filters=32,
            kernel_size=[8, 8],
            strides=[4, 4],
            padding='SAME',
            activation=tf.nn.relu)
        pool1 = tf.layers.max_pooling2d(
            conv1, pool_size=[2, 2], strides=[2, 2], padding='SAME')

        conv2 = tf.layers.conv2d(
            inputs=pool1,
            filters=64,
            kernel_size=[4, 4],
            strides=[2, 2],
            padding='SAME',
            activation=tf.nn.relu)
        conv3 = tf.layers.conv2d(
            inputs=conv2,
            filters=64,
            kernel_size=[3, 3],
            strides=[1, 1],
            padding='SAME',
            activation=tf.nn.relu)
        self.flat = tf.contrib.layers.flatten(conv3)

        self.dense1 = tf.layers.dense(
            inputs=self.flat, units=512, activation=tf.nn.relu)
        self.dense2 = tf.layers.dense(
            inputs=self.dense1, units=self.num_action, activation=None)
        return self.dense2

    # value
    self.v_output = value_net(
        self.input_state
    )  # Q(s,a,theta) for all a, shape (batch_size, num_action)
    self.tar_V = tf.placeholder(tf.float32, [None])
    self.V_loss = tf.reduce_mean(
        tf.square(self.reward + self.discount_factor * self.tar_V -
                  self.v_output))
    optimizer = tf.train.AdamOptimizer(learning_rate=1e-6)
    g_gvs = optimizer.compute_gradients(
        self.V_loss,
        var_list=[v for v in tf.global_variables() if 'value_net' in v.name])
    self.V_train_op = optimizer.apply_gradients(g_gvs)

    # policy
    self.policy_logit = policy_net(
        self.input_state
    )  # logit of probility(P(s,a,theta)) for all a, shape (batch_size, num_action)
    index = tf.stack([tf.range(tf.shape(self.action)[0]), self.action], axis=1)
    self.prob = tf.gather_nd(
        tf.nn.softmax(self.policy_logit),
        index)  # P(s,a,theta) for selected action, shape (batch_size, 1)

    # loss = E[log(p(s,a))*r]
    self.policy_loss = -tf.reduce_mean(
        tf.log(self.prob + 0.00000001) * self.reward)
    optimizer = tf.train.AdamOptimizer(learning_rate=1e-6)
    g_gvs = optimizer.compute_gradients(
        self.policy_loss,
        var_list=[v for v in tf.global_variables() if 'policy_net' in v.name])
    self.train_op = optimizer.apply_gradients(g_gvs)
    self.pred = tf.multinomial(self.policy_logit,
                               1)  # sample action from distribution

  def select_action(self, input_state, sess):
    input_state = np.array(input_state).transpose([1, 0])
    feed_dict = {
        self.input_state: input_state[None, :],
    }
    action = sess.run(
        self.pred,
        feed_dict=feed_dict)[0][0]  # sameple action from distribution
    return action

  def update_policy(self, input_state, actions, rewards, input_states_plum):
    feed_dict = {
        self.input_state: np.array(input_states_plum).transpose([0, 2, 1]),
    }
    esti_V = sess.run(self.v_output, feed_dict=feed_dict).flatten()
    td_target = rewards + self.discount_factor * esti_V

    feed_dict = {
        self.input_state: np.array(input_state).transpose([0, 2, 1]),
    }
    esti_V = sess.run(self.v_output, feed_dict=feed_dict).flatten()
    td_error = td_target - esti_V
    feed_dict = {
        self.input_state: np.array(input_states_plum).transpose([0, 2, 1]),
    }
    feed_dict = {
        self.input_state: np.array(input_state).transpose([0, 2, 1]),
        self.tar_V: td_target,
        self.reward: rewards,
    }

    V_loss, _ = sess.run([self.V_loss, self.V_train_op], feed_dict=feed_dict)

    feed_dict = {
        self.input_state: np.array(input_state).transpose([0, 2, 1]),
        self.action: actions,
        self.reward: td_error,
    }
    policy_loss, _ = sess.run(
        [self.policy_loss, self.train_op], feed_dict=feed_dict)
    return V_loss, policy_loss

  def update_parameters(self, episode):
    if self.exploring_rate > MIN_EXPLORING_RATE:
      self.exploring_rate -= (0.1 - MIN_EXPLORING_RATE) / 3000000

  def shutdown_explore(self):
    # make action selection greedy
    self.exploring_rate = 0

In [278]:
# init agent
tf.reset_default_graph()
# agent for frequently updating
ac_agent = Actor_critic('AC_Agent', num_action)
# init all
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.InteractiveSession(config=config)
sess.run(tf.global_variables_initializer())

In [279]:
from IPython.display import Image, display

update_every_episode = 1
print_every_episode = 10
save_video_every_episode = 1000
NUM_EPISODE = 30000
NUM_EXPLORE = 0
reward_values = {
    "positive": 1,
    "tick": 0.1,  # reward per timestamp
    "loss": -1,
}
for episode in range(0, NUM_EPISODE + 1):

  # Reset the environment
  game = FlappyBird()
  env = PLE(
      game,
      fps=30,
      display_screen=False,
      reward_values=reward_values,
      rng=np.random.RandomState(1))
  env.reset_game()
  env.act(0)  # dummy input to make sure input screen is correct

  # record frame
  if episode % save_video_every_episode == 0:
    frames = [env.getScreenRGB()]

  # grayscale input screen for this episode
  #input_screens = [preprocess(env.getScreenGrayscale())] * 4
  input_state = [ac_agent.get_state_idx(game.getGameState())] * 4
  # cumulate reward for this episode
  cum_reward = 0

  experiences = []
  t = 0
  while not env.game_over():
    # feed four previous screen, select an action
    action = ac_agent.select_action(input_state[-4:], sess)

    # execute the action and get reward
    reward = env.act(env.getActionSet()[action])

    # record frame
    if episode % save_video_every_episode == 0:
      frames.append(env.getScreenRGB())

    # cumulate reward
    cum_reward += reward

    # append grayscale screen for this episode
    #input_screens.append(preprocess(env.getScreenGrayscale()))
    input_state.append(ac_agent.get_state_idx(game.getGameState()))
    # append experience for this episode
    experiences.append(
        [input_state[-5:-1], action, reward, input_state[-4:]])

    t += 1

  def discount_reward(x, discount_rate):
    discounted_r = np.zeros(len(x))
    num_r = len(x)
    for i in range(num_r):
      discounted_r[i] = x[i] * math.pow(discount_rate, i)
    discounted_r = np.cumsum(discounted_r[::-1])
    return discounted_r[::-1]

  rewards = [e[2] for e in experiences]
  discounted_reward = discount_reward(rewards, ac_agent.discount_factor)

  # normalize
  discounted_reward -= np.mean(discounted_reward)
  discounted_reward /= np.std(discounted_reward)
  train_states = []
  train_actions = []
  train_rewards = []
  train_input_states_plum = []
  for i in range(len(experiences)):
    experiences[i][2] = discounted_reward[i]
    train_states.append(experiences[i][0])
    train_actions.append(experiences[i][1])
    train_rewards.append(experiences[i][2])
    train_input_states_plum.append(experiences[i][3])
  loss = ac_agent.update_policy(train_states, train_actions, train_rewards,
                                train_input_states_plum)

  if episode % print_every_episode == 0 and episode > NUM_EXPLORE:
    print("[{}] time live:{}, cumulated reward: {}, loss: {}".format(
        episode, t, cum_reward, loss))

  if episode % save_video_every_episode == 0 and episode > NUM_EXPLORE:  # for every 5000 episode, record an animation
    clip = make_anim(frames, fps=60, true_image=True).rotate(-90)
    clip.write_videofile("movie/ac_{}.webm".format(episode), fps=60)
    #display(clip.ipython_display(fps=60, autoplay=1, loop=1))

[10] time live:58, cumulated reward: 4.799999999999995, loss: (3.9601116, -0.0016315066)
[20] time live:46, cumulated reward: 3.5999999999999996, loss: (3.960111, 0.0053244676)
[30] time live:47, cumulated reward: 3.6999999999999993, loss: (3.9601085, -0.0025144333)
[40] time live:49, cumulated reward: 3.8999999999999986, loss: (3.9601057, -0.0017766174)
[50] time live:65, cumulated reward: 6.499999999999993, loss: (3.9601119, -0.001960461)
[60] time live:55, cumulated reward: 4.4999999999999964, loss: (3.960107, -0.00024115822)
[70] time live:53, cumulated reward: 4.299999999999997, loss: (3.9601045, -0.0003839169)
[80] time live:58, cumulated reward: 4.799999999999995, loss: (3.9601068, -0.0020105098)
[90] time live:45, cumulated reward: 3.5, loss: (3.960104, 5.086263e-06)
[100] time live:61, cumulated reward: 5.099999999999994, loss: (3.9601057, 0.001795972)
[110] time live:41, cumulated reward: 3.1000000000000014, loss: (3.960102, -0.003611588)
[120] time live:61, cumulated reward:

[940] time live:51, cumulated reward: 4.099999999999998, loss: (3.9600987, 0.00012764276)
[950] time live:60, cumulated reward: 4.999999999999995, loss: (3.9601, -0.0055971146)
[960] time live:56, cumulated reward: 4.599999999999996, loss: (3.9601, -0.0012027536)
[970] time live:46, cumulated reward: 3.5999999999999996, loss: (3.9600985, 0.00356782)
[980] time live:61, cumulated reward: 5.099999999999994, loss: (3.960101, 0.0016161809)
[990] time live:46, cumulated reward: 3.5999999999999996, loss: (3.9600995, -0.0008530824)
[1000] time live:55, cumulated reward: 4.4999999999999964, loss: (3.9600997, 0.0027024008)
[MoviePy] >>>> Building video movie/ac_1000.webm
[MoviePy] Writing video movie/ac_1000.webm


 98%|█████████▊| 56/57 [00:00<00:00, 103.25it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_1000.webm 

[1010] time live:35, cumulated reward: 2.500000000000002, loss: (3.9600987, -0.001048599)
[1020] time live:57, cumulated reward: 4.699999999999996, loss: (3.960101, -0.0024415401)
[1030] time live:54, cumulated reward: 4.399999999999997, loss: (3.9601014, 0.0006099277)
[1040] time live:61, cumulated reward: 5.099999999999994, loss: (3.9601007, -0.0026456488)
[1050] time live:46, cumulated reward: 3.5999999999999996, loss: (3.9600995, -0.0010487308)
[1060] time live:54, cumulated reward: 4.399999999999997, loss: (3.9601018, -0.0053539984)
[1070] time live:48, cumulated reward: 3.799999999999999, loss: (3.9601004, -0.0041612387)
[1080] time live:49, cumulated reward: 3.8999999999999986, loss: (3.9600987, 0.0026848656)
[1090] time live:61, cumulated reward: 5.099999999999994, loss: (3.9601011, 0.0036040135)
[1100] time live:49, cumulated reward: 3.8999999999999986, loss: (3.9600985, 0.0036683958)
[1110] time live:49, cumulat

[1920] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600997, 0.003614113)
[1930] time live:52, cumulated reward: 4.1999999999999975, loss: (3.9600997, 0.0060604904)
[1940] time live:53, cumulated reward: 4.299999999999997, loss: (3.9601, 0.0029858283)
[1950] time live:61, cumulated reward: 5.099999999999994, loss: (3.9601007, -0.0054405713)
[1960] time live:61, cumulated reward: 5.099999999999994, loss: (3.9601004, 0.0020656274)
[1970] time live:52, cumulated reward: 4.1999999999999975, loss: (3.9601011, -0.0031021743)
[1980] time live:57, cumulated reward: 4.699999999999996, loss: (3.9600997, -0.00451359)
[1990] time live:49, cumulated reward: 3.8999999999999986, loss: (3.9600997, -0.0047158613)
[2000] time live:54, cumulated reward: 4.399999999999997, loss: (3.9601007, -0.0027956432)
[MoviePy] >>>> Building video movie/ac_2000.webm
[MoviePy] Writing video movie/ac_2000.webm


 98%|█████████▊| 55/56 [00:00<00:00, 114.47it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_2000.webm 






[2010] time live:44, cumulated reward: 3.4000000000000004, loss: (3.9600997, -0.0076180156)
[2020] time live:46, cumulated reward: 3.5999999999999996, loss: (3.960099, 0.00483347)
[2030] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600995, 0.005032555)
[2040] time live:57, cumulated reward: 4.699999999999996, loss: (3.9600992, -0.00024437485)
[2050] time live:48, cumulated reward: 3.799999999999999, loss: (3.9601004, 0.002932012)
[2060] time live:49, cumulated reward: 3.8999999999999986, loss: (3.9600997, 0.00044423706)
[2070] time live:51, cumulated reward: 4.099999999999998, loss: (3.9600992, 0.001746533)
[2080] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600992, 0.0051777326)
[2090] time live:55, cumulated reward: 4.4999999999999964, loss: (3.9600997, 0.004437377)
[2100] time live:46, cumulated reward: 3.5999999999999996, loss: (3.9600995, -0.0011374847)
[2110] time live:72, cumulated reward: 7.19999999999999, loss: (3.9600997, 0.0010360214)
[2120] t

[2930] time live:42, cumulated reward: 3.200000000000001, loss: (3.960099, -0.0022566433)
[2940] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600995, 0.0018155457)
[2950] time live:46, cumulated reward: 3.5999999999999996, loss: (3.9600995, 0.0032117055)
[2960] time live:53, cumulated reward: 4.299999999999997, loss: (3.9601004, -0.002945828)
[2970] time live:61, cumulated reward: 5.099999999999994, loss: (3.9601004, 2.5577232e-05)
[2980] time live:46, cumulated reward: 3.5999999999999996, loss: (3.9600995, 0.002083882)
[2990] time live:61, cumulated reward: 5.099999999999994, loss: (3.9601016, -0.003002073)
[3000] time live:66, cumulated reward: 6.5999999999999925, loss: (3.9601, 0.008645661)
[MoviePy] >>>> Building video movie/ac_3000.webm
[MoviePy] Writing video movie/ac_3000.webm


 99%|█████████▊| 67/68 [00:00<00:00, 101.00it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_3000.webm 






[3010] time live:61, cumulated reward: 5.099999999999994, loss: (3.9601014, -0.0079465145)
[3020] time live:58, cumulated reward: 4.799999999999995, loss: (3.9600995, 0.0009121073)
[3030] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600997, 0.0003541415)
[3040] time live:46, cumulated reward: 3.5999999999999996, loss: (3.9600995, -0.0015188922)
[3050] time live:54, cumulated reward: 4.399999999999997, loss: (3.960099, 0.0024846394)
[3060] time live:61, cumulated reward: 5.099999999999994, loss: (3.9601011, -0.0029274987)
[3070] time live:59, cumulated reward: 4.899999999999995, loss: (3.9601002, -0.0036335962)
[3080] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600995, -0.0041408227)
[3090] time live:51, cumulated reward: 4.099999999999998, loss: (3.9600992, 0.0020156)
[3100] time live:59, cumulated reward: 4.899999999999995, loss: (3.9600992, -0.0022273306)
[3110] time live:48, cumulated reward: 3.799999999999999, loss: (3.9601, -0.0029117465)
[3120] ti

[3930] time live:40, cumulated reward: 3.0000000000000018, loss: (3.9600997, -0.005776167)
[3940] time live:65, cumulated reward: 6.499999999999993, loss: (3.9601004, -0.010862534)
[3950] time live:67, cumulated reward: 6.699999999999992, loss: (3.9601014, -0.009762365)
[3960] time live:54, cumulated reward: 4.399999999999997, loss: (3.9600997, -0.0053577423)
[3970] time live:59, cumulated reward: 4.899999999999995, loss: (3.9600992, 0.005505998)
[3980] time live:49, cumulated reward: 3.8999999999999986, loss: (3.9601, 0.007283775)
[3990] time live:58, cumulated reward: 4.799999999999995, loss: (3.9601007, -0.005670679)
[4000] time live:49, cumulated reward: 3.8999999999999986, loss: (3.9600997, -0.012010555)
[MoviePy] >>>> Building video movie/ac_4000.webm
[MoviePy] Writing video movie/ac_4000.webm


 98%|█████████▊| 50/51 [00:00<00:00, 106.92it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_4000.webm 






[4010] time live:50, cumulated reward: 3.9999999999999982, loss: (3.9600997, 0.008390865)
[4020] time live:45, cumulated reward: 3.5, loss: (3.960101, -0.008930567)
[4030] time live:55, cumulated reward: 4.4999999999999964, loss: (3.9600995, -0.006190647)
[4040] time live:42, cumulated reward: 3.200000000000001, loss: (3.9601, -0.004994551)
[4050] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600997, 0.00573743)
[4060] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600995, 0.0019877073)
[4070] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600992, 0.009517357)
[4080] time live:61, cumulated reward: 5.099999999999994, loss: (3.9601004, -0.0015170614)
[4090] time live:46, cumulated reward: 3.5999999999999996, loss: (3.9601, -0.010367705)
[4100] time live:59, cumulated reward: 4.899999999999995, loss: (3.9600997, -0.0029537072)
[4110] time live:56, cumulated reward: 4.599999999999996, loss: (3.9600992, 0.0039891857)
[4120] time live:55, cumulated

[4930] time live:56, cumulated reward: 4.599999999999996, loss: (3.9600995, 0.0066748685)
[4940] time live:61, cumulated reward: 5.099999999999994, loss: (3.9601004, -0.004789446)
[4950] time live:50, cumulated reward: 3.9999999999999982, loss: (3.9601, 0.008780861)
[4960] time live:61, cumulated reward: 5.099999999999994, loss: (3.9601004, -0.010281766)
[4970] time live:61, cumulated reward: 5.099999999999994, loss: (3.9601007, -0.0013585325)
[4980] time live:51, cumulated reward: 4.099999999999998, loss: (3.9600992, -0.006776099)
[4990] time live:61, cumulated reward: 5.099999999999994, loss: (3.9601004, -0.0012561923)
[5000] time live:45, cumulated reward: 3.5, loss: (3.9601004, -0.008577813)
[MoviePy] >>>> Building video movie/ac_5000.webm
[MoviePy] Writing video movie/ac_5000.webm


100%|██████████| 47/47 [00:00<00:00, 93.31it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_5000.webm 






[5010] time live:63, cumulated reward: 5.299999999999994, loss: (3.9601007, -0.0010302105)
[5020] time live:55, cumulated reward: 4.4999999999999964, loss: (3.9601007, -0.006247347)
[5030] time live:58, cumulated reward: 4.799999999999995, loss: (3.9601002, 0.009067437)
[5040] time live:44, cumulated reward: 3.4000000000000004, loss: (3.9601, -0.013509664)
[5050] time live:56, cumulated reward: 4.599999999999996, loss: (3.9600995, -0.0048069274)
[5060] time live:50, cumulated reward: 3.9999999999999982, loss: (3.9600992, -0.0020841789)
[5070] time live:63, cumulated reward: 5.299999999999994, loss: (3.9601004, -0.0011543395)
[5080] time live:43, cumulated reward: 3.3000000000000007, loss: (3.9600992, 0.001259471)
[5090] time live:52, cumulated reward: 4.1999999999999975, loss: (3.9600997, -0.0048581455)
[5100] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600995, -0.0059201913)
[5110] time live:64, cumulated reward: 5.399999999999993, loss: (3.9601016, 0.004352957)
[5120

[5930] time live:55, cumulated reward: 4.4999999999999964, loss: (3.9600995, 0.0025404843)
[5940] time live:50, cumulated reward: 3.9999999999999982, loss: (3.9600997, 0.003141117)
[5950] time live:51, cumulated reward: 4.099999999999998, loss: (3.9600995, 0.00083588617)
[5960] time live:46, cumulated reward: 3.5999999999999996, loss: (3.960099, 0.0076513705)
[5970] time live:53, cumulated reward: 4.299999999999997, loss: (3.9601004, -0.0011171125)
[5980] time live:46, cumulated reward: 3.5999999999999996, loss: (3.9600995, -0.008810085)
[5990] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600997, -0.007177697)
[6000] time live:66, cumulated reward: 6.5999999999999925, loss: (3.9600995, 0.00048070966)
[MoviePy] >>>> Building video movie/ac_6000.webm
[MoviePy] Writing video movie/ac_6000.webm


 99%|█████████▊| 67/68 [00:00<00:00, 107.25it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_6000.webm 

[6010] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600997, -0.0015358534)
[6020] time live:47, cumulated reward: 3.6999999999999993, loss: (3.9600992, 0.0033472346)
[6030] time live:60, cumulated reward: 4.999999999999995, loss: (3.9600995, 0.008594195)
[6040] time live:52, cumulated reward: 4.1999999999999975, loss: (3.960099, 0.006973542)
[6050] time live:52, cumulated reward: 4.1999999999999975, loss: (3.9600997, -0.0069474806)
[6060] time live:49, cumulated reward: 3.8999999999999986, loss: (3.9600997, -0.008036049)
[6070] time live:48, cumulated reward: 3.799999999999999, loss: (3.9600995, 0.012469649)
[6080] time live:61, cumulated reward: 5.099999999999994, loss: (3.9601004, -0.0015996401)
[6090] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600995, 0.0038900843)
[6100] time live:52, cumulated reward: 4.1999999999999975, loss: (3.9600985, 0.004953898)
[6110] time live:41, cumulated r

[6930] time live:51, cumulated reward: 4.099999999999998, loss: (3.9600992, 0.0065754163)
[6940] time live:57, cumulated reward: 4.699999999999996, loss: (3.9600992, 0.0002952375)
[6950] time live:61, cumulated reward: 5.099999999999994, loss: (3.960099, 0.0008408906)
[6960] time live:57, cumulated reward: 4.699999999999996, loss: (3.9601002, -0.0064949905)
[6970] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600992, -0.010231831)
[6980] time live:57, cumulated reward: 4.699999999999996, loss: (3.9600997, -0.011290332)
[6990] time live:64, cumulated reward: 5.399999999999993, loss: (3.9601002, -0.004950613)
[7000] time live:45, cumulated reward: 3.5, loss: (3.960101, -0.005175612)
[MoviePy] >>>> Building video movie/ac_7000.webm
[MoviePy] Writing video movie/ac_7000.webm


100%|██████████| 47/47 [00:00<00:00, 91.26it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_7000.webm 






[7010] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600995, -0.0007379876)
[7020] time live:45, cumulated reward: 3.5, loss: (3.9601002, -0.0035487493)
[7030] time live:39, cumulated reward: 2.900000000000002, loss: (3.960099, -0.00057605596)
[7040] time live:45, cumulated reward: 3.5, loss: (3.9601002, -0.0033668517)
[7050] time live:48, cumulated reward: 3.799999999999999, loss: (3.9601004, 0.0061845183)
[7060] time live:59, cumulated reward: 4.899999999999995, loss: (3.9600992, 0.0064476468)
[7070] time live:61, cumulated reward: 5.099999999999994, loss: (3.9601, 0.004237159)
[7080] time live:42, cumulated reward: 3.200000000000001, loss: (3.9601, -0.01852794)
[7090] time live:60, cumulated reward: 4.999999999999995, loss: (3.9600992, 0.003746605)
[7100] time live:48, cumulated reward: 3.799999999999999, loss: (3.9601, -0.004535874)
[7110] time live:48, cumulated reward: 3.799999999999999, loss: (3.9601, -0.00785925)
[7120] time live:41, cumulated reward: 3.100000000

[7930] time live:50, cumulated reward: 3.9999999999999982, loss: (3.9601, 0.0024628257)
[7940] time live:48, cumulated reward: 3.799999999999999, loss: (3.9600995, -0.0019441446)
[7950] time live:53, cumulated reward: 4.299999999999997, loss: (3.9600997, -0.0027364695)
[7960] time live:61, cumulated reward: 5.099999999999994, loss: (3.9601, -0.001086407)
[7970] time live:55, cumulated reward: 4.4999999999999964, loss: (3.9600992, 0.005575839)
[7980] time live:56, cumulated reward: 4.599999999999996, loss: (3.9600995, -0.0027273041)
[7990] time live:57, cumulated reward: 4.699999999999996, loss: (3.9600995, 0.004279555)
[8000] time live:76, cumulated reward: 7.599999999999989, loss: (3.960099, -0.0030541169)
[MoviePy] >>>> Building video movie/ac_8000.webm
[MoviePy] Writing video movie/ac_8000.webm


100%|██████████| 78/78 [00:00<00:00, 100.29it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_8000.webm 






[8010] time live:57, cumulated reward: 4.699999999999996, loss: (3.9600997, -0.0049267486)
[8020] time live:60, cumulated reward: 4.999999999999995, loss: (3.9600995, -0.0028940202)
[8030] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600995, 0.004269209)
[8040] time live:51, cumulated reward: 4.099999999999998, loss: (3.9600995, 0.006429261)
[8050] time live:43, cumulated reward: 3.3000000000000007, loss: (3.9601, 0.0039241924)
[8060] time live:51, cumulated reward: 4.099999999999998, loss: (3.9600995, 0.0038130705)
[8070] time live:52, cumulated reward: 4.1999999999999975, loss: (3.9601004, 0.0003491915)
[8080] time live:54, cumulated reward: 4.399999999999997, loss: (3.9601, -0.0014820276)
[8090] time live:48, cumulated reward: 3.799999999999999, loss: (3.9601, -0.006227533)
[8100] time live:55, cumulated reward: 4.4999999999999964, loss: (3.9601002, -0.0069242306)
[8110] time live:45, cumulated reward: 3.5, loss: (3.9601, -0.0037913853)
[8120] time live:62, cumulated

[8930] time live:52, cumulated reward: 4.1999999999999975, loss: (3.9600997, -0.0005781467)
[8940] time live:48, cumulated reward: 3.799999999999999, loss: (3.9601, -6.7810215e-05)
[8950] time live:56, cumulated reward: 4.599999999999996, loss: (3.9601002, 0.0006473405)
[8960] time live:42, cumulated reward: 3.200000000000001, loss: (3.960099, -0.0023567562)
[8970] time live:60, cumulated reward: 4.999999999999995, loss: (3.9600995, -0.00040365855)
[8980] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600997, -0.0025740138)
[8990] time live:53, cumulated reward: 4.299999999999997, loss: (3.960099, 0.0021591727)
[9000] time live:45, cumulated reward: 3.5, loss: (3.9600997, 0.0013554891)
[MoviePy] >>>> Building video movie/ac_9000.webm
[MoviePy] Writing video movie/ac_9000.webm


100%|██████████| 47/47 [00:00<00:00, 101.27it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_9000.webm 






[9010] time live:56, cumulated reward: 4.599999999999996, loss: (3.9600995, 0.00181474)
[9020] time live:49, cumulated reward: 3.8999999999999986, loss: (3.9600997, -0.0048947237)
[9030] time live:57, cumulated reward: 4.699999999999996, loss: (3.9600995, 0.0009665908)
[9040] time live:51, cumulated reward: 4.099999999999998, loss: (3.9601002, -0.0005240908)
[9050] time live:61, cumulated reward: 5.099999999999994, loss: (3.9601, 0.002917618)
[9060] time live:46, cumulated reward: 3.5999999999999996, loss: (3.960099, 0.0035237437)
[9070] time live:59, cumulated reward: 4.899999999999995, loss: (3.9601002, 0.00067594496)
[9080] time live:58, cumulated reward: 4.799999999999995, loss: (3.9601007, -0.000104739745)
[9090] time live:47, cumulated reward: 3.6999999999999993, loss: (3.9601002, 0.001966294)
[9100] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600992, 0.0006130719)
[9110] time live:54, cumulated reward: 4.399999999999997, loss: (3.9601, 0.00083545403)
[9120] time

[9930] time live:56, cumulated reward: 4.599999999999996, loss: (3.9600992, 0.00010957037)
[9940] time live:44, cumulated reward: 3.4000000000000004, loss: (3.9601, -0.0040000137)
[9950] time live:58, cumulated reward: 4.799999999999995, loss: (3.9600992, 0.0005086702)
[9960] time live:46, cumulated reward: 3.5999999999999996, loss: (3.9601, 0.002753382)
[9970] time live:43, cumulated reward: 3.3000000000000007, loss: (3.9600985, 0.004655217)
[9980] time live:47, cumulated reward: 3.6999999999999993, loss: (3.9601002, -0.0023675067)
[9990] time live:47, cumulated reward: 3.6999999999999993, loss: (3.9601002, 0.0015939347)
[10000] time live:49, cumulated reward: 3.8999999999999986, loss: (3.9600997, 0.0039151055)
[MoviePy] >>>> Building video movie/ac_10000.webm
[MoviePy] Writing video movie/ac_10000.webm


 98%|█████████▊| 50/51 [00:00<00:00, 97.98it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_10000.webm 






[10010] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600995, 0.00348088)
[10020] time live:56, cumulated reward: 4.599999999999996, loss: (3.9600995, 0.0028256348)
[10030] time live:62, cumulated reward: 5.199999999999994, loss: (3.9601, 0.00358237)
[10040] time live:50, cumulated reward: 3.9999999999999982, loss: (3.9600992, -0.0003698349)
[10050] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600997, 0.0008005236)
[10060] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600997, 0.0026418967)
[10070] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600995, -0.003900278)
[10080] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600997, 0.00073439174)
[10090] time live:48, cumulated reward: 3.799999999999999, loss: (3.9600992, 0.005607208)
[10100] time live:56, cumulated reward: 4.599999999999996, loss: (3.9600995, 0.001338516)
[10110] time live:46, cumulated reward: 3.5999999999999996, loss: (3.9600995, -0.003550032)
[101

[10920] time live:73, cumulated reward: 7.29999999999999, loss: (3.9600992, -0.004084313)
[10930] time live:47, cumulated reward: 3.6999999999999993, loss: (3.9600997, -0.005295875)
[10940] time live:51, cumulated reward: 4.099999999999998, loss: (3.9601, -0.0026273914)
[10950] time live:56, cumulated reward: 4.599999999999996, loss: (3.9601002, 0.0010065691)
[10960] time live:61, cumulated reward: 5.099999999999994, loss: (3.9601007, -0.0046751928)
[10970] time live:52, cumulated reward: 4.1999999999999975, loss: (3.9600997, 0.003558434)
[10980] time live:51, cumulated reward: 4.099999999999998, loss: (3.9600995, 0.0030508603)
[10990] time live:61, cumulated reward: 5.099999999999994, loss: (3.9601, -0.0043950942)
[11000] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600997, 0.003316254)
[MoviePy] >>>> Building video movie/ac_11000.webm
[MoviePy] Writing video movie/ac_11000.webm


100%|██████████| 63/63 [00:00<00:00, 102.00it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_11000.webm 






[11010] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600997, 0.0049767415)
[11020] time live:47, cumulated reward: 3.6999999999999993, loss: (3.9600992, -0.0011316665)
[11030] time live:54, cumulated reward: 4.399999999999997, loss: (3.9600997, 0.0032753414)
[11040] time live:60, cumulated reward: 4.999999999999995, loss: (3.9601004, -0.00056765875)
[11050] time live:64, cumulated reward: 5.399999999999993, loss: (3.9601007, -0.0031464696)
[11060] time live:48, cumulated reward: 3.799999999999999, loss: (3.9600995, -0.008949299)
[11070] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600995, -0.005045844)
[11080] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600995, -0.0048518884)
[11090] time live:45, cumulated reward: 3.5, loss: (3.9601004, 0.0026109272)
[11100] time live:46, cumulated reward: 3.5999999999999996, loss: (3.9600995, -0.0005018193)
[11110] time live:60, cumulated reward: 4.999999999999995, loss: (3.9601002, -0.0042524976)
[111

[11920] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600997, 0.008214232)
[11930] time live:61, cumulated reward: 5.099999999999994, loss: (3.9601007, 0.004671003)
[11940] time live:51, cumulated reward: 4.099999999999998, loss: (3.9600992, 0.004876754)
[11950] time live:51, cumulated reward: 4.099999999999998, loss: (3.9600995, 0.0013912612)
[11960] time live:44, cumulated reward: 3.4000000000000004, loss: (3.9600997, 0.0011250973)
[11970] time live:54, cumulated reward: 4.399999999999997, loss: (3.9601, 0.0010311162)
[11980] time live:49, cumulated reward: 3.8999999999999986, loss: (3.9600992, 0.0057678614)
[11990] time live:46, cumulated reward: 3.5999999999999996, loss: (3.9601, 0.0078047253)
[12000] time live:47, cumulated reward: 3.6999999999999993, loss: (3.9600997, 5.1275212e-05)
[MoviePy] >>>> Building video movie/ac_12000.webm
[MoviePy] Writing video movie/ac_12000.webm


 98%|█████████▊| 48/49 [00:00<00:00, 101.70it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_12000.webm 






[12010] time live:54, cumulated reward: 4.399999999999997, loss: (3.9600997, 0.0033033865)
[12020] time live:56, cumulated reward: 4.599999999999996, loss: (3.9601004, 0.0007337161)
[12030] time live:47, cumulated reward: 3.6999999999999993, loss: (3.9601002, -0.00045508528)
[12040] time live:60, cumulated reward: 4.999999999999995, loss: (3.9600995, -0.0018669446)
[12050] time live:56, cumulated reward: 4.599999999999996, loss: (3.9600995, 0.008248738)
[12060] time live:42, cumulated reward: 3.200000000000001, loss: (3.9600997, 0.007023993)
[12070] time live:61, cumulated reward: 5.099999999999994, loss: (3.9601, 0.0026998988)
[12080] time live:54, cumulated reward: 4.399999999999997, loss: (3.9600992, -0.0025454273)
[12090] time live:54, cumulated reward: 4.399999999999997, loss: (3.9601, -0.002821569)
[12100] time live:61, cumulated reward: 5.099999999999994, loss: (3.9601004, 0.00049122045)
[12110] time live:61, cumulated reward: 5.099999999999994, loss: (3.9601023, -0.00022584884)

[12930] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600997, -0.0048859394)
[12940] time live:56, cumulated reward: 4.599999999999996, loss: (3.9600992, 0.0004335642)
[12950] time live:50, cumulated reward: 3.9999999999999982, loss: (3.9600997, -0.0135129355)
[12960] time live:47, cumulated reward: 3.6999999999999993, loss: (3.9600992, 0.006144138)
[12970] time live:59, cumulated reward: 4.899999999999995, loss: (3.9600992, 0.0010653028)
[12980] time live:48, cumulated reward: 3.799999999999999, loss: (3.9600995, 0.0036567848)
[12990] time live:63, cumulated reward: 5.299999999999994, loss: (3.9600995, 0.007978712)
[13000] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600995, -0.0070619425)
[MoviePy] >>>> Building video movie/ac_13000.webm
[MoviePy] Writing video movie/ac_13000.webm


100%|██████████| 63/63 [00:00<00:00, 93.86it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_13000.webm 






[13010] time live:53, cumulated reward: 4.299999999999997, loss: (3.9601, -0.0005376024)
[13020] time live:71, cumulated reward: 7.099999999999991, loss: (3.9601, 0.002928969)
[13030] time live:58, cumulated reward: 4.799999999999995, loss: (3.9601018, -8.2114646e-05)
[13040] time live:59, cumulated reward: 4.899999999999995, loss: (3.9600992, -0.0032674822)
[13050] time live:59, cumulated reward: 4.899999999999995, loss: (3.9600992, 0.00056881015)
[13060] time live:55, cumulated reward: 4.4999999999999964, loss: (3.9600995, -0.003927924)
[13070] time live:61, cumulated reward: 5.099999999999994, loss: (3.9601, 0.0009312239)
[13080] time live:47, cumulated reward: 3.6999999999999993, loss: (3.960099, -0.0046135923)
[13090] time live:52, cumulated reward: 4.1999999999999975, loss: (3.9600997, -0.0064572003)
[13100] time live:54, cumulated reward: 4.399999999999997, loss: (3.9600997, 0.0061217765)
[13110] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600997, 0.004081945)
[

[13920] time live:58, cumulated reward: 4.799999999999995, loss: (3.9600995, -0.0032431176)
[13930] time live:47, cumulated reward: 3.6999999999999993, loss: (3.9600997, 0.003948719)
[13940] time live:53, cumulated reward: 4.299999999999997, loss: (3.9601004, -0.008173223)
[13950] time live:56, cumulated reward: 4.599999999999996, loss: (3.9601002, -0.0031039033)
[13960] time live:53, cumulated reward: 4.299999999999997, loss: (3.9600997, 0.0048909998)
[13970] time live:49, cumulated reward: 3.8999999999999986, loss: (3.9601, -0.00066751364)
[13980] time live:56, cumulated reward: 4.599999999999996, loss: (3.9601002, -0.0023540088)
[13990] time live:46, cumulated reward: 3.5999999999999996, loss: (3.9600995, -0.007151645)
[14000] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600992, -0.00792347)
[MoviePy] >>>> Building video movie/ac_14000.webm
[MoviePy] Writing video movie/ac_14000.webm


100%|██████████| 63/63 [00:00<00:00, 97.04it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_14000.webm 






[14010] time live:53, cumulated reward: 4.299999999999997, loss: (3.9600997, 0.0077757025)
[14020] time live:65, cumulated reward: 6.499999999999993, loss: (3.9600995, -0.0012257466)
[14030] time live:59, cumulated reward: 4.899999999999995, loss: (3.9601004, 0.008458348)
[14040] time live:50, cumulated reward: 3.9999999999999982, loss: (3.9600997, 0.008506374)
[14050] time live:52, cumulated reward: 4.1999999999999975, loss: (3.9600997, 0.0018623426)
[14060] time live:46, cumulated reward: 3.5999999999999996, loss: (3.9601, 0.0071735797)
[14070] time live:54, cumulated reward: 4.399999999999997, loss: (3.9601, 0.00023822431)
[14080] time live:57, cumulated reward: 4.699999999999996, loss: (3.9600997, -0.0028018283)
[14090] time live:38, cumulated reward: 2.8000000000000025, loss: (3.9600995, 0.0069952263)
[14100] time live:59, cumulated reward: 4.899999999999995, loss: (3.9601002, 0.0029463687)
[14110] time live:50, cumulated reward: 3.9999999999999982, loss: (3.9600997, 0.0035625838)

[14920] time live:57, cumulated reward: 4.699999999999996, loss: (3.9601004, -0.0053459303)
[14930] time live:49, cumulated reward: 3.8999999999999986, loss: (3.9601004, -0.0014940379)
[14940] time live:57, cumulated reward: 4.699999999999996, loss: (3.9600995, 0.001561282)
[14950] time live:61, cumulated reward: 5.099999999999994, loss: (3.9601004, -0.0038055044)
[14960] time live:48, cumulated reward: 3.799999999999999, loss: (3.9601, 0.0067576566)
[14970] time live:59, cumulated reward: 4.899999999999995, loss: (3.9600992, -0.003920733)
[14980] time live:68, cumulated reward: 6.799999999999992, loss: (3.9600995, -0.0022006948)
[14990] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600997, 0.0040740967)
[15000] time live:44, cumulated reward: 3.4000000000000004, loss: (3.9601002, -0.0018934987)
[MoviePy] >>>> Building video movie/ac_15000.webm
[MoviePy] Writing video movie/ac_15000.webm


 98%|█████████▊| 45/46 [00:00<00:00, 103.58it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_15000.webm 






[15010] time live:40, cumulated reward: 3.0000000000000018, loss: (3.9600997, 0.004423797)
[15020] time live:55, cumulated reward: 4.4999999999999964, loss: (3.9600995, 0.0010868072)
[15030] time live:50, cumulated reward: 3.9999999999999982, loss: (3.9601, 0.0024918746)
[15040] time live:50, cumulated reward: 3.9999999999999982, loss: (3.9600997, -0.0014625168)
[15050] time live:61, cumulated reward: 5.099999999999994, loss: (3.9601, -0.001116393)
[15060] time live:52, cumulated reward: 4.1999999999999975, loss: (3.960099, 0.00059844897)
[15070] time live:44, cumulated reward: 3.4000000000000004, loss: (3.9601, 0.00080325385)
[15080] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600995, -0.0071317954)
[15090] time live:47, cumulated reward: 3.6999999999999993, loss: (3.9600997, -0.010517161)
[15100] time live:49, cumulated reward: 3.8999999999999986, loss: (3.9600992, -0.0054739155)
[15110] time live:47, cumulated reward: 3.6999999999999993, loss: (3.9601007, -0.0030855

[15920] time live:61, cumulated reward: 5.099999999999994, loss: (3.9601004, -0.005962372)
[15930] time live:51, cumulated reward: 4.099999999999998, loss: (3.9601002, 0.019462474)
[15940] time live:39, cumulated reward: 2.900000000000002, loss: (3.9601002, -0.006146969)
[15950] time live:59, cumulated reward: 4.899999999999995, loss: (3.9601002, 0.0061054877)
[15960] time live:46, cumulated reward: 3.5999999999999996, loss: (3.9601, 0.0046426523)
[15970] time live:51, cumulated reward: 4.099999999999998, loss: (3.9601002, -0.004752851)
[15980] time live:55, cumulated reward: 4.4999999999999964, loss: (3.9600995, -0.0068507283)
[15990] time live:68, cumulated reward: 6.799999999999992, loss: (3.9600995, 0.0025872905)
[16000] time live:51, cumulated reward: 4.099999999999998, loss: (3.9601, -0.0034251306)
[MoviePy] >>>> Building video movie/ac_16000.webm
[MoviePy] Writing video movie/ac_16000.webm


 98%|█████████▊| 52/53 [00:00<00:00, 98.82it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_16000.webm 






[16010] time live:50, cumulated reward: 3.9999999999999982, loss: (3.9601, -0.005981426)
[16020] time live:44, cumulated reward: 3.4000000000000004, loss: (3.9601, -0.012151783)
[16030] time live:47, cumulated reward: 3.6999999999999993, loss: (3.9601002, -0.007169906)
[16040] time live:53, cumulated reward: 4.299999999999997, loss: (3.9601, -0.0025661036)
[16050] time live:48, cumulated reward: 3.799999999999999, loss: (3.9601, -0.00036863485)
[16060] time live:49, cumulated reward: 3.8999999999999986, loss: (3.9601, 0.0020843039)
[16070] time live:55, cumulated reward: 4.4999999999999964, loss: (3.9600992, 0.002102401)
[16080] time live:47, cumulated reward: 3.6999999999999993, loss: (3.9600997, -0.0009917604)
[16090] time live:49, cumulated reward: 3.8999999999999986, loss: (3.9601, -0.008645836)
[16100] time live:61, cumulated reward: 5.099999999999994, loss: (3.9601, -0.008099572)
[16110] time live:62, cumulated reward: 5.199999999999994, loss: (3.9600997, 0.0005367648)
[16120] ti

[16920] time live:47, cumulated reward: 3.6999999999999993, loss: (3.9600992, 0.009133887)
[16930] time live:54, cumulated reward: 4.399999999999997, loss: (3.9601002, 0.0047540134)
[16940] time live:45, cumulated reward: 3.5, loss: (3.9601004, -0.0019662858)
[16950] time live:58, cumulated reward: 4.799999999999995, loss: (3.9600987, -0.019205093)
[16960] time live:60, cumulated reward: 4.999999999999995, loss: (3.9600992, -0.024496237)
[16970] time live:51, cumulated reward: 4.099999999999998, loss: (3.9601002, 0.0040131174)
[16980] time live:53, cumulated reward: 4.299999999999997, loss: (3.9600997, 0.0036060226)
[16990] time live:50, cumulated reward: 3.9999999999999982, loss: (3.9600997, 0.022983246)
[17000] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600995, 0.0039545903)
[MoviePy] >>>> Building video movie/ac_17000.webm
[MoviePy] Writing video movie/ac_17000.webm


100%|██████████| 63/63 [00:00<00:00, 99.14it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_17000.webm 






[17010] time live:56, cumulated reward: 4.599999999999996, loss: (3.9601, -0.0048782486)
[17020] time live:59, cumulated reward: 4.899999999999995, loss: (3.9601004, 0.00853684)
[17030] time live:43, cumulated reward: 3.3000000000000007, loss: (3.9600997, -0.012924172)
[17040] time live:47, cumulated reward: 3.6999999999999993, loss: (3.9601002, -0.0014230647)
[17050] time live:41, cumulated reward: 3.1000000000000014, loss: (3.9601002, 0.01792403)
[17060] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600997, -0.002252516)
[17070] time live:50, cumulated reward: 3.9999999999999982, loss: (3.9600997, -0.02227337)
[17080] time live:48, cumulated reward: 3.799999999999999, loss: (3.9601, 0.01572591)
[17090] time live:45, cumulated reward: 3.5, loss: (3.9600997, -0.0054401397)
[17100] time live:61, cumulated reward: 5.099999999999994, loss: (3.9601004, -0.0005127641)
[17110] time live:46, cumulated reward: 3.5999999999999996, loss: (3.960101, 0.018390946)
[17120] time live:4

[17920] time live:45, cumulated reward: 3.5, loss: (3.9601004, 0.005281936)
[17930] time live:58, cumulated reward: 4.799999999999995, loss: (3.9601002, 0.010292448)
[17940] time live:55, cumulated reward: 4.4999999999999964, loss: (3.9600995, -0.025321284)
[17950] time live:53, cumulated reward: 4.299999999999997, loss: (3.9600997, 0.0108008655)
[17960] time live:58, cumulated reward: 4.799999999999995, loss: (3.9600992, -0.0006839818)
[17970] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600997, -0.00118265)
[17980] time live:56, cumulated reward: 4.599999999999996, loss: (3.9600995, 0.012942331)
[17990] time live:56, cumulated reward: 4.599999999999996, loss: (3.9600995, -0.010241611)
[18000] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600995, -0.005467368)
[MoviePy] >>>> Building video movie/ac_18000.webm
[MoviePy] Writing video movie/ac_18000.webm


100%|██████████| 63/63 [00:00<00:00, 90.30it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_18000.webm 






[18010] time live:42, cumulated reward: 3.200000000000001, loss: (3.9600997, -0.00061450683)
[18020] time live:46, cumulated reward: 3.5999999999999996, loss: (3.9600995, 0.0053575765)
[18030] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600997, -0.018787477)
[18040] time live:58, cumulated reward: 4.799999999999995, loss: (3.9601, 0.010372129)
[18050] time live:46, cumulated reward: 3.5999999999999996, loss: (3.9601, 0.02868756)
[18060] time live:48, cumulated reward: 3.799999999999999, loss: (3.9600995, -0.0029449065)
[18070] time live:58, cumulated reward: 4.799999999999995, loss: (3.9600997, 0.009684333)
[18080] time live:60, cumulated reward: 4.999999999999995, loss: (3.9601, -0.020375887)
[18090] time live:66, cumulated reward: 6.5999999999999925, loss: (3.9600995, 0.009012786)
[18100] time live:59, cumulated reward: 4.899999999999995, loss: (3.9600992, -0.027173527)
[18110] time live:53, cumulated reward: 4.299999999999997, loss: (3.9601004, -0.004523709)
[18120]

[18930] time live:58, cumulated reward: 4.799999999999995, loss: (3.9600997, -0.009743658)
[18940] time live:60, cumulated reward: 4.999999999999995, loss: (3.9601004, 0.017789586)
[18950] time live:55, cumulated reward: 4.4999999999999964, loss: (3.9600995, 0.05513559)
[18960] time live:47, cumulated reward: 3.6999999999999993, loss: (3.9601002, 0.034959145)
[18970] time live:53, cumulated reward: 4.299999999999997, loss: (3.9600997, 0.053651504)
[18980] time live:61, cumulated reward: 5.099999999999994, loss: (3.9601, -0.00018104177)
[18990] time live:39, cumulated reward: 2.900000000000002, loss: (3.9600997, -0.014347761)
[19000] time live:58, cumulated reward: 4.799999999999995, loss: (3.9600997, 0.00043566473)
[MoviePy] >>>> Building video movie/ac_19000.webm
[MoviePy] Writing video movie/ac_19000.webm


 98%|█████████▊| 59/60 [00:00<00:00, 106.62it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_19000.webm 






[19010] time live:42, cumulated reward: 3.200000000000001, loss: (3.9601004, 0.035090834)
[19020] time live:61, cumulated reward: 5.099999999999994, loss: (3.9601004, -0.00032746987)
[19030] time live:51, cumulated reward: 4.099999999999998, loss: (3.9601, -0.028421234)
[19040] time live:59, cumulated reward: 4.899999999999995, loss: (3.9600997, -0.0017235966)
[19050] time live:55, cumulated reward: 4.4999999999999964, loss: (3.9600995, -0.02811815)
[19060] time live:61, cumulated reward: 5.099999999999994, loss: (3.9601, -0.019281324)
[19070] time live:52, cumulated reward: 4.1999999999999975, loss: (3.9600997, -0.02463315)
[19080] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600995, -0.006322392)
[19090] time live:55, cumulated reward: 4.4999999999999964, loss: (3.9600997, -0.02652744)
[19100] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600997, 0.023054529)
[19110] time live:53, cumulated reward: 4.299999999999997, loss: (3.9601004, 0.011760802)
[1912

[19920] time live:53, cumulated reward: 4.299999999999997, loss: (3.9600997, 0.036967117)
[19930] time live:55, cumulated reward: 4.4999999999999964, loss: (3.9600995, 0.003801554)
[19940] time live:44, cumulated reward: 3.4000000000000004, loss: (3.9601, -0.015785066)
[19950] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600997, -0.056064792)
[19960] time live:47, cumulated reward: 3.6999999999999993, loss: (3.9601002, 0.014947242)
[19970] time live:50, cumulated reward: 3.9999999999999982, loss: (3.9600997, -0.0018422126)
[19980] time live:55, cumulated reward: 4.4999999999999964, loss: (3.9601002, -0.0029762269)
[19990] time live:54, cumulated reward: 4.399999999999997, loss: (3.9600997, -0.03125256)
[20000] time live:56, cumulated reward: 4.599999999999996, loss: (3.9601, -0.008901818)
[MoviePy] >>>> Building video movie/ac_20000.webm
[MoviePy] Writing video movie/ac_20000.webm


 98%|█████████▊| 57/58 [00:00<00:00, 90.70it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_20000.webm 






[20010] time live:53, cumulated reward: 4.299999999999997, loss: (3.9600997, -0.014403757)
[20020] time live:43, cumulated reward: 3.3000000000000007, loss: (3.9601, 0.031639677)
[20030] time live:61, cumulated reward: 5.099999999999994, loss: (3.9601, -0.038423914)
[20040] time live:61, cumulated reward: 5.099999999999994, loss: (3.9601004, 0.0046667503)
[20050] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600997, -0.001778712)
[20060] time live:54, cumulated reward: 4.399999999999997, loss: (3.9600997, -0.035026673)
[20070] time live:55, cumulated reward: 4.4999999999999964, loss: (3.9600997, -0.05054961)
[20080] time live:41, cumulated reward: 3.1000000000000014, loss: (3.9600997, 0.056145344)
[20090] time live:61, cumulated reward: 5.099999999999994, loss: (3.9601, -0.0049052006)
[20100] time live:61, cumulated reward: 5.099999999999994, loss: (3.9601, -0.039495952)
[20110] time live:46, cumulated reward: 3.5999999999999996, loss: (3.9601, 0.004845184)
[20120] time 

[20930] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600997, -0.048044845)
[20940] time live:53, cumulated reward: 4.299999999999997, loss: (3.9601, -0.03636319)
[20950] time live:45, cumulated reward: 3.5, loss: (3.9600997, -0.05049324)
[20960] time live:54, cumulated reward: 4.399999999999997, loss: (3.9600997, -0.030008705)
[20970] time live:57, cumulated reward: 4.699999999999996, loss: (3.9600995, -0.012343123)
[20980] time live:53, cumulated reward: 4.299999999999997, loss: (3.9601, -0.031017177)
[20990] time live:57, cumulated reward: 4.699999999999996, loss: (3.9600995, -0.07771249)
[21000] time live:55, cumulated reward: 4.4999999999999964, loss: (3.9600995, 0.034546036)
[MoviePy] >>>> Building video movie/ac_21000.webm
[MoviePy] Writing video movie/ac_21000.webm


 98%|█████████▊| 56/57 [00:00<00:00, 105.18it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_21000.webm 

[21010] time live:44, cumulated reward: 3.4000000000000004, loss: (3.9601004, 0.06653916)
[21020] time live:59, cumulated reward: 4.899999999999995, loss: (3.9600992, -0.03509308)
[21030] time live:53, cumulated reward: 4.299999999999997, loss: (3.9601004, 0.018892577)
[21040] time live:56, cumulated reward: 4.599999999999996, loss: (3.9600992, -0.02321441)
[21050] time live:56, cumulated reward: 4.599999999999996, loss: (3.9601, -0.010764564)
[21060] time live:45, cumulated reward: 3.5, loss: (3.9601004, 0.052526664)
[21070] time live:60, cumulated reward: 4.999999999999995, loss: (3.9600995, -0.0477321)
[21080] time live:55, cumulated reward: 4.4999999999999964, loss: (3.9600995, -0.012159295)
[21090] time live:51, cumulated reward: 4.099999999999998, loss: (3.9600995, -0.005638852)
[21100] time live:45, cumulated reward: 3.5, loss: (3.9601, 0.0029190911)
[21110] time live:61, cumulated reward: 5.099999999999994, loss:

[21920] time live:50, cumulated reward: 3.9999999999999982, loss: (3.9600997, -0.016538296)
[21930] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600997, -0.010969162)
[21940] time live:41, cumulated reward: 3.1000000000000014, loss: (3.9600997, -0.009767812)
[21950] time live:53, cumulated reward: 4.299999999999997, loss: (3.9600997, -0.007480747)
[21960] time live:50, cumulated reward: 3.9999999999999982, loss: (3.9600997, -0.01491579)
[21970] time live:53, cumulated reward: 4.299999999999997, loss: (3.9600992, -0.050176296)
[21980] time live:69, cumulated reward: 6.8999999999999915, loss: (3.9600997, 0.014210701)
[21990] time live:90, cumulated reward: 8.999999999999984, loss: (3.9600997, -0.03125695)
[22000] time live:52, cumulated reward: 4.1999999999999975, loss: (3.9600997, 0.0065456536)
[MoviePy] >>>> Building video movie/ac_22000.webm
[MoviePy] Writing video movie/ac_22000.webm


 98%|█████████▊| 53/54 [00:00<00:00, 109.37it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_22000.webm 






[22010] time live:51, cumulated reward: 4.099999999999998, loss: (3.9600995, 0.012839224)
[22020] time live:59, cumulated reward: 4.899999999999995, loss: (3.9600992, 0.004014581)
[22030] time live:57, cumulated reward: 4.699999999999996, loss: (3.9600997, 0.020696959)
[22040] time live:58, cumulated reward: 4.799999999999995, loss: (3.9600997, -0.015335577)
[22050] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600997, 0.0051984317)
[22060] time live:53, cumulated reward: 4.299999999999997, loss: (3.9601, -0.013206572)
[22070] time live:58, cumulated reward: 4.799999999999995, loss: (3.9601002, 0.020817164)
[22080] time live:46, cumulated reward: 3.5999999999999996, loss: (3.9601004, -0.02684446)
[22090] time live:56, cumulated reward: 4.599999999999996, loss: (3.9601, -0.0028965813)
[22100] time live:53, cumulated reward: 4.299999999999997, loss: (3.9601004, -0.036416683)
[22110] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600997, -0.07192559)
[22120] t

[22930] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600997, -0.016304892)
[22940] time live:51, cumulated reward: 4.099999999999998, loss: (3.9601, 0.005914071)
[22950] time live:57, cumulated reward: 4.699999999999996, loss: (3.9601002, 0.01764689)
[22960] time live:63, cumulated reward: 5.299999999999994, loss: (3.9600997, 0.005378118)
[22970] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600997, 0.006720809)
[22980] time live:48, cumulated reward: 3.799999999999999, loss: (3.9600995, 0.0049042106)
[22990] time live:61, cumulated reward: 5.099999999999994, loss: (3.9601004, 0.018110339)
[23000] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600997, -0.01321605)
[MoviePy] >>>> Building video movie/ac_23000.webm
[MoviePy] Writing video movie/ac_23000.webm


100%|██████████| 63/63 [00:00<00:00, 86.76it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_23000.webm 






[23010] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600997, 0.019435601)
[23020] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600997, -0.01048654)
[23030] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600997, -0.021126732)
[23040] time live:61, cumulated reward: 5.099999999999994, loss: (3.960101, 0.024280328)
[23050] time live:59, cumulated reward: 4.899999999999995, loss: (3.9601002, -0.0019828668)
[23060] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600995, -0.043685146)
[23070] time live:50, cumulated reward: 3.9999999999999982, loss: (3.9601004, -0.015101471)
[23080] time live:61, cumulated reward: 5.099999999999994, loss: (3.9601, 0.026509989)
[23090] time live:53, cumulated reward: 4.299999999999997, loss: (3.9600997, -0.018769588)
[23100] time live:50, cumulated reward: 3.9999999999999982, loss: (3.9600997, 0.011346169)
[23110] time live:56, cumulated reward: 4.599999999999996, loss: (3.9601, -0.0104697095)
[23120]

[23930] time live:61, cumulated reward: 5.099999999999994, loss: (3.9601, -0.0021348547)
[23940] time live:60, cumulated reward: 4.999999999999995, loss: (3.9600995, 0.024877261)
[23950] time live:59, cumulated reward: 4.899999999999995, loss: (3.9600992, 0.017208325)
[23960] time live:58, cumulated reward: 4.799999999999995, loss: (3.9600997, 0.0053037773)
[23970] time live:59, cumulated reward: 4.899999999999995, loss: (3.9600992, -0.045305043)
[23980] time live:54, cumulated reward: 4.399999999999997, loss: (3.9600997, -0.036832016)
[23990] time live:39, cumulated reward: 2.900000000000002, loss: (3.9601, 0.0007720116)
[24000] time live:54, cumulated reward: 4.399999999999997, loss: (3.9601002, 0.019084083)
[MoviePy] >>>> Building video movie/ac_24000.webm
[MoviePy] Writing video movie/ac_24000.webm


 98%|█████████▊| 55/56 [00:00<00:00, 107.26it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_24000.webm 






[24010] time live:55, cumulated reward: 4.4999999999999964, loss: (3.9601007, 0.018398458)
[24020] time live:46, cumulated reward: 3.5999999999999996, loss: (3.9601, -0.037466027)
[24030] time live:57, cumulated reward: 4.699999999999996, loss: (3.9600995, 0.024818989)
[24040] time live:57, cumulated reward: 4.699999999999996, loss: (3.9600995, -0.030484851)
[24050] time live:53, cumulated reward: 4.299999999999997, loss: (3.9601007, 0.0016858083)
[24060] time live:54, cumulated reward: 4.399999999999997, loss: (3.9600997, -0.0040619285)
[24070] time live:62, cumulated reward: 5.199999999999994, loss: (3.9601002, 0.014314313)
[24080] time live:60, cumulated reward: 4.999999999999995, loss: (3.9600995, -0.024335844)
[24090] time live:59, cumulated reward: 4.899999999999995, loss: (3.9601002, -0.019072678)
[24100] time live:57, cumulated reward: 4.699999999999996, loss: (3.9601004, -0.005190866)
[24110] time live:53, cumulated reward: 4.299999999999997, loss: (3.9601007, 0.038284626)
[24

[24930] time live:52, cumulated reward: 4.1999999999999975, loss: (3.9600997, 0.017057823)
[24940] time live:53, cumulated reward: 4.299999999999997, loss: (3.9600992, 0.0038903435)
[24950] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600995, 0.035259247)
[24960] time live:44, cumulated reward: 3.4000000000000004, loss: (3.9601002, -0.014300087)
[24970] time live:46, cumulated reward: 3.5999999999999996, loss: (3.9601, -0.028824454)
[24980] time live:51, cumulated reward: 4.099999999999998, loss: (3.9601, 0.07340925)
[24990] time live:44, cumulated reward: 3.4000000000000004, loss: (3.9601004, 0.027944023)
[25000] time live:61, cumulated reward: 5.099999999999994, loss: (3.9601, 0.0050844913)
[MoviePy] >>>> Building video movie/ac_25000.webm
[MoviePy] Writing video movie/ac_25000.webm


100%|██████████| 63/63 [00:00<00:00, 104.71it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_25000.webm 






[25010] time live:47, cumulated reward: 3.6999999999999993, loss: (3.9601002, 0.027147252)
[25020] time live:51, cumulated reward: 4.099999999999998, loss: (3.9601, 0.011021221)
[25030] time live:55, cumulated reward: 4.4999999999999964, loss: (3.9600997, 0.01625642)
[25040] time live:42, cumulated reward: 3.200000000000001, loss: (3.9601, 0.040650934)
[25050] time live:54, cumulated reward: 4.399999999999997, loss: (3.9600997, -0.015853917)
[25060] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600997, -0.007640776)
[25070] time live:58, cumulated reward: 4.799999999999995, loss: (3.9600997, -0.017219938)
[25080] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600992, -0.05148711)
[25090] time live:74, cumulated reward: 7.39999999999999, loss: (3.9600992, -0.005339223)
[25100] time live:48, cumulated reward: 3.799999999999999, loss: (3.9600995, -0.03962018)
[25110] time live:49, cumulated reward: 3.8999999999999986, loss: (3.9601004, 0.025947688)
[25120] tim

[25930] time live:57, cumulated reward: 4.699999999999996, loss: (3.9600995, -0.00026410085)
[25940] time live:54, cumulated reward: 4.399999999999997, loss: (3.9601, 0.021722652)
[25950] time live:59, cumulated reward: 4.899999999999995, loss: (3.9600995, -0.0115705915)
[25960] time live:40, cumulated reward: 3.0000000000000018, loss: (3.9601002, 0.022650171)
[25970] time live:48, cumulated reward: 3.799999999999999, loss: (3.9600995, -0.0047634444)
[25980] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600997, -0.014218909)
[25990] time live:55, cumulated reward: 4.4999999999999964, loss: (3.9601004, 0.018806925)
[26000] time live:55, cumulated reward: 4.4999999999999964, loss: (3.9600995, -0.010673505)
[MoviePy] >>>> Building video movie/ac_26000.webm
[MoviePy] Writing video movie/ac_26000.webm


 98%|█████████▊| 56/57 [00:00<00:00, 103.36it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_26000.webm 






[26010] time live:52, cumulated reward: 4.1999999999999975, loss: (3.9600997, -0.017361311)
[26020] time live:53, cumulated reward: 4.299999999999997, loss: (3.9600997, -0.04024844)
[26030] time live:61, cumulated reward: 5.099999999999994, loss: (3.9601, -0.015826179)
[26040] time live:47, cumulated reward: 3.6999999999999993, loss: (3.960099, 0.02058859)
[26050] time live:52, cumulated reward: 4.1999999999999975, loss: (3.9601, -0.028461237)
[26060] time live:54, cumulated reward: 4.399999999999997, loss: (3.9601, 0.016170979)
[26070] time live:44, cumulated reward: 3.4000000000000004, loss: (3.9601002, -0.0068438053)
[26080] time live:44, cumulated reward: 3.4000000000000004, loss: (3.9601002, -0.02373988)
[26090] time live:57, cumulated reward: 4.699999999999996, loss: (3.9600995, -0.031715345)
[26100] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600997, 0.036635883)
[26110] time live:57, cumulated reward: 4.699999999999996, loss: (3.9601002, -0.01116485)
[26120] ti

[26930] time live:61, cumulated reward: 5.099999999999994, loss: (3.960101, -0.016067224)
[26940] time live:58, cumulated reward: 4.799999999999995, loss: (3.9600997, 0.015212092)
[26950] time live:47, cumulated reward: 3.6999999999999993, loss: (3.9600992, 0.022388214)
[26960] time live:52, cumulated reward: 4.1999999999999975, loss: (3.9601, -0.009630955)
[26970] time live:61, cumulated reward: 5.099999999999994, loss: (3.9601004, -0.0041023004)
[26980] time live:47, cumulated reward: 3.6999999999999993, loss: (3.9601002, -0.023099393)
[26990] time live:52, cumulated reward: 4.1999999999999975, loss: (3.9600997, -0.031407263)
[27000] time live:54, cumulated reward: 4.399999999999997, loss: (3.9601002, 0.042462137)
[MoviePy] >>>> Building video movie/ac_27000.webm
[MoviePy] Writing video movie/ac_27000.webm


 98%|█████████▊| 55/56 [00:00<00:00, 105.84it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_27000.webm 

[27010] time live:43, cumulated reward: 3.3000000000000007, loss: (3.9601, 0.04560069)
[27020] time live:59, cumulated reward: 4.899999999999995, loss: (3.9600992, 0.0040040095)
[27030] time live:51, cumulated reward: 4.099999999999998, loss: (3.9600995, -0.06418154)
[27040] time live:56, cumulated reward: 4.599999999999996, loss: (3.9601, -0.016722748)
[27050] time live:49, cumulated reward: 3.8999999999999986, loss: (3.9600992, 0.0073979436)
[27060] time live:67, cumulated reward: 6.699999999999992, loss: (3.960099, -0.041829165)
[27070] time live:61, cumulated reward: 5.099999999999994, loss: (3.9601, -0.017435042)
[27080] time live:58, cumulated reward: 4.799999999999995, loss: (3.9600997, -0.030542703)
[27090] time live:45, cumulated reward: 3.5, loss: (3.9600997, -0.06516429)
[27100] time live:47, cumulated reward: 3.6999999999999993, loss: (3.9601002, -0.024368875)
[27110] time live:59, cumulated reward: 4.8999999

[27920] time live:51, cumulated reward: 4.099999999999998, loss: (3.9600995, -0.06876102)
[27930] time live:44, cumulated reward: 3.4000000000000004, loss: (3.9601002, 0.0042583076)
[27940] time live:58, cumulated reward: 4.799999999999995, loss: (3.9600995, -0.019592877)
[27950] time live:51, cumulated reward: 4.099999999999998, loss: (3.9601, 0.007732092)
[27960] time live:52, cumulated reward: 4.1999999999999975, loss: (3.9600997, -0.06768613)
[27970] time live:45, cumulated reward: 3.5, loss: (3.9601002, -0.03425111)
[27980] time live:50, cumulated reward: 3.9999999999999982, loss: (3.9601, -0.05545683)
[27990] time live:49, cumulated reward: 3.8999999999999986, loss: (3.9600997, -0.03630212)
[28000] time live:57, cumulated reward: 4.699999999999996, loss: (3.9600997, -0.041811675)
[MoviePy] >>>> Building video movie/ac_28000.webm
[MoviePy] Writing video movie/ac_28000.webm


 98%|█████████▊| 58/59 [00:00<00:00, 94.87it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_28000.webm 






[28010] time live:47, cumulated reward: 3.6999999999999993, loss: (3.9601002, -0.017674953)
[28020] time live:57, cumulated reward: 4.699999999999996, loss: (3.9600995, -0.051595252)
[28030] time live:68, cumulated reward: 6.799999999999992, loss: (3.9601, 0.023569345)
[28040] time live:61, cumulated reward: 5.099999999999994, loss: (3.9601, -0.008413283)
[28050] time live:57, cumulated reward: 4.699999999999996, loss: (3.9601002, -0.06425804)
[28060] time live:61, cumulated reward: 5.099999999999994, loss: (3.960101, 0.0024468782)
[28070] time live:50, cumulated reward: 3.9999999999999982, loss: (3.9601004, 0.04428377)
[28080] time live:58, cumulated reward: 4.799999999999995, loss: (3.9600997, -0.0150965)
[28090] time live:57, cumulated reward: 4.699999999999996, loss: (3.9601002, 0.009273529)
[28100] time live:50, cumulated reward: 3.9999999999999982, loss: (3.9600997, -0.034838144)
[28110] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600995, 0.0070964)
[28120] time 

[28930] time live:44, cumulated reward: 3.4000000000000004, loss: (3.9601002, 0.018946735)
[28940] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600997, -0.013027504)
[28950] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600997, 0.008840842)
[28960] time live:56, cumulated reward: 4.599999999999996, loss: (3.9601002, -0.018386738)
[28970] time live:56, cumulated reward: 4.599999999999996, loss: (3.9600992, -0.018921835)
[28980] time live:49, cumulated reward: 3.8999999999999986, loss: (3.9601, -0.025792783)
[28990] time live:52, cumulated reward: 4.1999999999999975, loss: (3.9600997, -0.039965298)
[29000] time live:51, cumulated reward: 4.099999999999998, loss: (3.9601007, -0.0014623568)
[MoviePy] >>>> Building video movie/ac_29000.webm
[MoviePy] Writing video movie/ac_29000.webm


 98%|█████████▊| 52/53 [00:00<00:00, 105.93it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_29000.webm 






[29010] time live:54, cumulated reward: 4.399999999999997, loss: (3.9601, 0.02496866)
[29020] time live:41, cumulated reward: 3.1000000000000014, loss: (3.9601007, -0.01854357)
[29030] time live:48, cumulated reward: 3.799999999999999, loss: (3.9600995, -0.013245225)
[29040] time live:47, cumulated reward: 3.6999999999999993, loss: (3.9601002, -0.022911213)
[29050] time live:59, cumulated reward: 4.899999999999995, loss: (3.9601002, 0.015972655)
[29060] time live:45, cumulated reward: 3.5, loss: (3.9600997, -0.018605232)
[29070] time live:72, cumulated reward: 7.19999999999999, loss: (3.9600997, 8.919504e-05)
[29080] time live:55, cumulated reward: 4.4999999999999964, loss: (3.9601002, -0.03912447)
[29090] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600997, -0.039576452)
[29100] time live:56, cumulated reward: 4.599999999999996, loss: (3.9600995, 0.0075675077)
[29110] time live:61, cumulated reward: 5.099999999999994, loss: (3.9601, -0.024106886)
[29120] time live:43, 

[29930] time live:51, cumulated reward: 4.099999999999998, loss: (3.9600995, -0.020904448)
[29940] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600995, -0.032258518)
[29950] time live:59, cumulated reward: 4.899999999999995, loss: (3.9601002, -0.024703203)
[29960] time live:53, cumulated reward: 4.299999999999997, loss: (3.9601, -0.03423982)
[29970] time live:52, cumulated reward: 4.1999999999999975, loss: (3.9600997, 0.09447226)
[29980] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600995, -0.0024897153)
[29990] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600997, -0.039155383)
[30000] time live:57, cumulated reward: 4.699999999999996, loss: (3.9600995, 0.020440152)
[MoviePy] >>>> Building video movie/ac_30000.webm
[MoviePy] Writing video movie/ac_30000.webm


 98%|█████████▊| 58/59 [00:00<00:00, 99.10it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_30000.webm 






#### 最好的結果

In [283]:
from moviepy.editor import *
clip = VideoFileClip("movie/ac_8000.webm")
display(clip.ipython_display(fps=60, autoplay=1, loop=1))

 99%|█████████▊| 78/79 [00:00<00:00, 997.46it/s]


#### 進入第一個障礙物

In [284]:
from moviepy.editor import *
clip = VideoFileClip("movie/ac_3000.webm")
display(clip.ipython_display(fps=60, autoplay=1, loop=1))

100%|██████████| 68/68 [00:00<00:00, 669.79it/s]


# Result

這次的作業我照著講義中的步驟來做，input的部份我用game.gamestate()來抓state，然後preprocessing的部份使用lab16的作法，也有依stack_num將他們stack起來，處理完後model的一開始加了一層dense layer，將state拉成6400 units，再將它reshape成和本來screen一樣的大小（80*80），結果以單純DQN、Policy gradient、Actor-Critic三個部份來做比較。

將input換成state之後，變得更難train，這次我三個部分都train了先30000個episode，train完才發現要訓練到完全收斂要非常長的時間，每1000次存一次影片。

結果的部分我分別放了三個部份只在30000個episode的訓練下最好的結果，以及三個部份能進到第一個障礙物的影片。

單純DQN的作法，在11000個episode時可以進入第一個障礙物，但無法完全通過，一直到22000 episode時可以完全通過第一個障礙物，但沒有辦法很穩定的越來越進步，訓練完30000個episode時最好可以完全通過兩個障礙物，但也不是很穩定。

Policy gradient 的部分，在2000episode和9000episode的時候就可以進入第一個障礙物，比單純DQN快，不過也沒辦法完全通過，在30000episode中還有幾次的結果也可以進入到第一個障礙物，但都沒有辦法完全通過，也都沒辦法穩定的進步，2000episode可以進入第一個障礙物，但一直到9000episode以前都沒辦法在進到第一個障礙物，相當不穩定，可能是跟policy gradient使用stochestic的policy以及容易找到local minimum有關，感覺要訓練到收斂要非常長的時間。

Actor-critic的部分，大概在3000個episode時可以進入到第一個障礙物，比單純DQN快，但也沒辦法完全通過，8000episode時就可以完全通過第一個障礙物，但後來的episode也不是很穩定，不是卡在第一個障礙物裡就是直接撞上第一個障礙物，感覺要收斂也是需要非常長的時間。