# Deep Q-Learning for Pong

In [1]:
import tensorflow as tf
import cv2
import sys
import pong_fun as game
import random
import time 
import numpy as np
from collections import deque

pygame 1.9.4
Hello from the pygame community. https://www.pygame.org/contribute.html


# Parameters for environment

In [2]:
# Hyper-parameters for the Pong DQN experiment.
# DeepQNetwork's constructor defaults repeat these values; the training loop
# further down reads ACTIONS and OBSERVE from this cell directly.
ACTIONS = 6 # number of valid actions (length of the one-hot action vector)
GAMMA = 0.99 # discount factor applied to the max next-state Q-value in the training target
OBSERVE = 500. # timesteps of purely random play before training starts
EXPLORE = 500. # timesteps over which epsilon is annealed from INITIAL_EPSILON to FINAL_EPSILON
FINAL_EPSILON =  0.05 # final (smallest) value of epsilon
INITIAL_EPSILON = 1 # starting value of epsilon (1 = always act randomly)
REPLAY_MEMORY = 50000 # maximum number of past transitions kept in the replay memory
BATCH = 32 # number of transitions sampled per training minibatch

# Define the DQN agent

In [3]:
# Seed NumPy's global RNG and TensorFlow's graph-level RNG.
# NOTE(review): DeepQNetwork.__init__ calls tf.reset_default_graph() before
# building the network; a graph-level seed set here presumably does not carry
# over to the fresh default graph -- confirm against the TensorFlow docs.
# NOTE(review): the agent draws exploration actions and replay minibatches
# from the stdlib `random` module, which is not seeded here.
np.random.seed(0)
tf.set_random_seed(0)
# Deep Q Network off-policy
class DeepQNetwork:
    """Deep Q-Network agent with a separate, periodically synced target network.

    Two convolutional networks with identical architecture are built in the
    default TensorFlow graph: ``eval_net`` (trained, reads ``self.s``) and
    ``target_net`` (reads ``self.s_next``, only updated by copying the
    ``eval_net`` variables every ``replace_target_iter`` timesteps).
    Transitions are kept in a bounded replay memory and sampled uniformly
    for training.
    """

    def __init__(
            self,
            ACTIONS=6,
            GAMMA=0.99,
            OBSERVE=500.,  # timesteps to observe before training
            EXPLORE=500.,  # timesteps over which to anneal epsilon
            FINAL_EPSILON=0.05,  # final value of epsilon
            INITIAL_EPSILON=1,  # starting value of epsilon
            REPLAY_MEMORY=50000,  # number of previous transitions to remember
            BATCH=32,  # size of minibatch
            input_size=(84, 84, 4),
            learning_rate=1e-6,
            replace_target_iter=200,
    ):
        """Build the graph, create the session and initialise all variables.

        Args:
            ACTIONS: number of discrete actions (width of the Q-value output).
            GAMMA: discount factor used in the Q-learning target.
            OBSERVE: timesteps of purely random play before training starts.
            EXPLORE: timesteps over which epsilon is annealed.
            FINAL_EPSILON: lower bound of epsilon.
            INITIAL_EPSILON: starting value of epsilon.
            REPLAY_MEMORY: maximum number of stored transitions.
            BATCH: minibatch size used by ``learn``.
            input_size: (height, width, channels) of one stacked state.
            learning_rate: learning rate handed to the Adam optimizer.
            replace_target_iter: the target network is overwritten with the
                eval network whenever ``time_step`` is a multiple of this.
        """
        tf.reset_default_graph()
        self.n_actions = ACTIONS
        self.gamma = GAMMA
        self.observe = OBSERVE
        self.explore = EXPLORE
        self.final_epsilon = FINAL_EPSILON
        self.initial_epsilon = INITIAL_EPSILON
        self.memory_size = REPLAY_MEMORY
        self.batch_size = BATCH
        # Stored as a tuple so the (immutable) default is never shared/mutated.
        self.input_size = tuple(input_size)
        self.lr = learning_rate
        self.replace_target_iter = replace_target_iter

        # Number of calls to choose_action so far.
        self.time_step = 0
        self.epsilon = INITIAL_EPSILON
        # Replay memory of (s, a, r, s_next, terminal) tuples.
        self.memory = deque()

        # Builds both eval_net and target_net plus the loss / train ops.
        self._build_net()

        t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_net')
        e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='eval_net')

        # Both networks create their variables in the same order, so zipping
        # the two collections pairs each target variable with its eval twin.
        with tf.variable_scope('soft_replacement'):
            self.target_replace_op = [tf.assign(t, e) for t, e in zip(t_params, e_params)]

        self.sess = tf.Session()

        self.sess.run(tf.global_variables_initializer())
        print("---------- Class for the Agent is Ready ------------")

    def _build_q_network(self, scope, frames, w_initializer, b_initializer):
        """Create one Q-network (three conv layers + two dense layers).

        Args:
            scope: variable scope that owns every variable of this network.
            frames: input tensor of shape [batch, height, width, channels].
            w_initializer: initializer for all weight variables.
            b_initializer: initializer for all bias variables.

        Returns:
            Tuple ``(fc_out, q_values)``: the 512-unit hidden activation and
            the ``[batch, n_actions]`` Q-value tensor.
        """
        # (scope, weight name, bias name, kernel size, output filters, stride)
        conv_specs = (
            ('conv1', 'w1', 'b1', 8, 32, 4),
            ('conv2', 'w2', 'b2', 4, 64, 2),
            ('conv3', 'w3', 'b3', 3, 64, 1),
        )
        with tf.variable_scope(scope):
            hidden = frames
            in_channels = self.input_size[2]
            for layer, w_name, b_name, kernel, filters, stride in conv_specs:
                with tf.variable_scope(layer):
                    w = tf.get_variable(w_name, [kernel, kernel, in_channels, filters],
                                        initializer=w_initializer)
                    b = tf.get_variable(b_name, [filters], initializer=b_initializer)
                    conv = tf.nn.conv2d(hidden, w, strides=[1, stride, stride, 1],
                                        padding="VALID")
                    hidden = tf.nn.relu(tf.nn.bias_add(conv, b))
                in_channels = filters

            # Flatten size derived from the static conv output shape instead
            # of a hard-coded 7*7*64 (which is only right for 84x84 inputs).
            flat_dim = int(np.prod(hidden.get_shape().as_list()[1:]))
            fc1_input = tf.reshape(hidden, [-1, flat_dim])

            with tf.variable_scope('fully_connected_layer'):
                w4 = tf.get_variable('w4', [flat_dim, 512], initializer=w_initializer)
                b4 = tf.get_variable('b4', [1, 512], initializer=b_initializer)
                fc_out = tf.nn.relu(tf.matmul(fc1_input, w4) + b4)
            with tf.variable_scope("output_layer___1"):
                w5 = tf.get_variable('w5', [512, self.n_actions], initializer=w_initializer)
                b5 = tf.get_variable('b5', [1, self.n_actions], initializer=b_initializer)
                q_values = tf.matmul(fc_out, w5) + b5
        return fc_out, q_values

    # Building the Deep Q-Network
    def _build_net(self):
        """Create placeholders, both Q-networks, the loss and the train op."""
        # ------------------ all inputs ------------------------
        state_shape = [None, self.input_size[0], self.input_size[1], self.input_size[2]]
        self.s = tf.placeholder(tf.float32, state_shape, name='s')
        self.s_next = tf.placeholder(tf.float32, state_shape, name='s_')
        # Reward placeholder; not consumed by any op built in this class.
        self.r = tf.placeholder(tf.float32, [None, ], name='r')
        # One-hot action taken; sized from the instance, not the module global.
        self.a = tf.placeholder(tf.float32, [None, self.n_actions], name='a')
        # Q-learning target computed outside the graph in learn().
        self.y = tf.placeholder(tf.float32, [None])

        # Gaussian weight initializer, small constant bias.
        w_initializer = tf.random_normal_initializer(0., 0.01)
        b_initializer = tf.constant_initializer(0.01)

        # ------------------ build evaluate_net ------------------
        self.fc_out, self.q_eval = self._build_q_network(
            'eval_net', self.s, w_initializer, b_initializer)
        print("Building of the Evaluation network is Done _Dola_Ram")

        # ------------------ build target_net ------------------
        # Kept on its own attribute so the eval network's fc_out is not lost.
        self.fc_out_next, self.q_next = self._build_q_network(
            'target_net', self.s_next, w_initializer, b_initializer)
        print("Building of the Target network is Done _Dola_Ram")

        # ---------- loss and training ---------------
        with tf.variable_scope('q_eval'):
            # Q-value of the action actually taken (a is one-hot).
            self.q_eval_wrt_a = tf.reduce_sum(tf.multiply(self.q_eval, self.a),
                                              reduction_indices=1)

        with tf.variable_scope('loss'):
            # Mean squared error: mean((y - q_eval_wrt_a)^2)
            self.cost = tf.reduce_mean(tf.square(self.y - self.q_eval_wrt_a))

        with tf.variable_scope('train'):
            self.train_step = tf.train.AdamOptimizer(self.lr).minimize(self.cost)

    def choose_action(self, s_t):
        """Pick an action epsilon-greedily and advance the timestep counter.

        Args:
            s_t: state batch fed to ``self.s``; the caller in this file passes
                a single state of shape (1, height, width, channels).

        Returns:
            Tuple ``(a_t, epsilon, q_eval_t)``: the one-hot action vector, the
            epsilon value after annealing, and the eval-network Q-values.
        """
        self.time_step += 1
        q_eval_t = self.sess.run(self.q_eval, feed_dict={self.s: s_t})
        a_t = np.zeros([self.n_actions])
        # Act randomly with probability epsilon, and always while observing.
        if random.random() <= self.epsilon or self.time_step <= self.observe:
            action_index = random.randrange(self.n_actions)
        else:
            action_index = np.argmax(q_eval_t)
        a_t[action_index] = 1

        # Linearly anneal epsilon once the observe phase is over; clamp so
        # float round-off can never push it below final_epsilon.
        if self.epsilon > self.final_epsilon and self.time_step > self.observe:
            decay = (self.initial_epsilon - self.final_epsilon) / self.explore
            self.epsilon = max(self.final_epsilon, self.epsilon - decay)

        return a_t, self.epsilon, q_eval_t

    def learn(self):
        """Run one gradient step on a minibatch sampled from replay memory.

        Returns:
            The eval-network Q-values for the sampled states, fetched in the
            same session call as the training step, or ``None`` when no
            training happened (still observing, or fewer stored transitions
            than ``batch_size``).
        """
        # Only train once the observe phase is over.
        if self.time_step <= self.observe:
            return None
        # random.sample would raise if the memory is smaller than the batch.
        if len(self.memory) < self.batch_size:
            return None

        # Periodically overwrite the target network with the eval network.
        if self.time_step % self.replace_target_iter == 0:
            self.sess.run(self.target_replace_op)

        minibatch = random.sample(self.memory, self.batch_size)
        s_j_batch = [d[0] for d in minibatch]
        a_batch = [d[1] for d in minibatch]
        s_j1_batch = [d[3] for d in minibatch]

        # Next-state Q-values come from the target network.
        readout_j1_batch = self.sess.run(self.q_next, feed_dict={self.s_next: s_j1_batch})
        # Terminal transitions bootstrap nothing: the target is the reward.
        y_batch = [
            reward if terminal else reward + self.gamma * np.max(next_q)
            for (_, _, reward, _, terminal), next_q in zip(minibatch, readout_j1_batch)
        ]

        _, q_eval = self.sess.run([self.train_step, self.q_eval], feed_dict={
            self.y: y_batch,
            self.a: a_batch,
            self.s: s_j_batch})
        return q_eval

    def store_transition(self, s_t, a_t, r_t, s_t1, terminal):
        """Append a transition, evicting the oldest once memory is full."""
        self.memory.append((s_t, a_t, r_t, s_t1, terminal))
        # Bound by this instance's capacity rather than the module global.
        if len(self.memory) > self.memory_size:
            self.memory.popleft()
            

# Training the agent by playing with the environment

In [4]:
def _to_binary_frame(frame):
    """Resize a raw game frame to 84x84, grayscale it and binarize it.

    Pixels with a gray value above 1 become 255, everything else 0.
    """
    gray = cv2.cvtColor(cv2.resize(frame, (84, 84)), cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, 1, 255, cv2.THRESH_BINARY)
    return binary


RL = DeepQNetwork()
# Open a game state to communicate with the emulator.
game_state = game.GameState()

# Take one "do nothing" step (action index 0) to obtain the first frame, then
# repeat that frame four times to form the initial 84x84x4 state.
do_nothing = np.zeros(ACTIONS)
do_nothing[0] = 1
x_t, r_0, terminal, bar1_score, bar2_score = game_state.frame_step(do_nothing)
x_t = _to_binary_frame(x_t)
s_t = np.stack([x_t] * 4, axis=2)

t = 0
tick = time.time()
while True:
    # Epsilon-greedy action for the current state (batch of one).
    a_t, epsilon, q_eval_t = RL.choose_action(s_t.reshape(1, 84, 84, 4))

    # Apply the action and observe the next frame and reward.
    x_t1_col, r_t, terminal, bar1_score, bar2_score = game_state.frame_step(a_t)
    x_t1 = _to_binary_frame(x_t1_col).reshape(84, 84, 1)
    # Newest frame goes first; the oldest of the previous four is dropped.
    s_t1 = np.concatenate((x_t1, s_t[:, :, :3]), axis=2)

    RL.store_transition(s_t, a_t, r_t, s_t1, terminal)

    # Train only after the observation phase.
    if t > OBSERVE:
        q_eval = RL.learn()

    # Advance to the next state.
    s_t, t = s_t1, t + 1

    # Log whenever a point was scored by either side.
    if r_t != 0:
        print("TIMESTEP", t, "/ e", round(epsilon, 3), "/ Agent_score", bar1_score,
              "/ bar2_score", bar2_score, "/reward", r_t,
              "/ Q_max %e" % np.max(q_eval_t))

    # Stop once the agent leads by more than 18 points.
    if bar1_score - bar2_score > 18:
        print("Game_Ends_in Time:", int(time.time() - tick))
        break

Building of the Evaluation network is Done _Dola_Ram
Building of the Target network is Done _Dola_Ram
---------- Class for the Agent is Ready ------------
TIMESTEP 129 / e 1 / Agent_score 0 / bar2_score 1 /reward -1 / Q_max 1.000920e-02
TIMESTEP 175 / e 1 / Agent_score 0 / bar2_score 2 /reward -1 / Q_max 1.444070e-02
TIMESTEP 221 / e 1 / Agent_score 0 / bar2_score 3 /reward -1 / Q_max 1.126540e-02
TIMESTEP 267 / e 1 / Agent_score 0 / bar2_score 4 /reward -1 / Q_max 9.906007e-03
TIMESTEP 313 / e 1 / Agent_score 0 / bar2_score 5 /reward -1 / Q_max 1.034196e-02
TIMESTEP 443 / e 1 / Agent_score 1 / bar2_score 5 /reward 1 / Q_max 1.206354e-02
TIMESTEP 572 / e 0.863 / Agent_score 1 / bar2_score 6 /reward -1 / Q_max 1.193446e-02
TIMESTEP 618 / e 0.776 / Agent_score 1 / bar2_score 7 /reward -1 / Q_max 9.211323e-03
TIMESTEP 664 / e 0.688 / Agent_score 1 / bar2_score 8 /reward -1 / Q_max 8.338717e-03
TIMESTEP 710 / e 0.601 / Agent_score 1 / bar2_score 9 /reward -1 / Q_max 8.879194e-03
TIMESTEP 7

TIMESTEP 5407 / e 0.05 / Agent_score 0 / bar2_score 8 /reward -1 / Q_max -1.203814e-01
TIMESTEP 5453 / e 0.05 / Agent_score 0 / bar2_score 9 /reward -1 / Q_max -1.338299e-01
TIMESTEP 5584 / e 0.05 / Agent_score 1 / bar2_score 9 /reward 1 / Q_max -1.284733e-01
TIMESTEP 5713 / e 0.05 / Agent_score 1 / bar2_score 10 /reward -1 / Q_max -1.009747e-01
TIMESTEP 5759 / e 0.05 / Agent_score 1 / bar2_score 11 /reward -1 / Q_max -9.495607e-02
TIMESTEP 5805 / e 0.05 / Agent_score 1 / bar2_score 12 /reward -1 / Q_max -9.607455e-02
TIMESTEP 5851 / e 0.05 / Agent_score 1 / bar2_score 13 /reward -1 / Q_max -1.105399e-01
TIMESTEP 5897 / e 0.05 / Agent_score 1 / bar2_score 14 /reward -1 / Q_max -1.048823e-01
TIMESTEP 5943 / e 0.05 / Agent_score 1 / bar2_score 15 /reward -1 / Q_max -1.078494e-01
TIMESTEP 5989 / e 0.05 / Agent_score 1 / bar2_score 16 /reward -1 / Q_max -1.073801e-01
TIMESTEP 6118 / e 0.05 / Agent_score 2 / bar2_score 16 /reward 1 / Q_max -1.015213e-01
TIMESTEP 6247 / e 0.05 / Agent_score 

TIMESTEP 12068 / e 0.05 / Agent_score 0 / bar2_score 10 /reward -1 / Q_max -3.207856e-01
TIMESTEP 12114 / e 0.05 / Agent_score 0 / bar2_score 11 /reward -1 / Q_max -3.163238e-01
TIMESTEP 12160 / e 0.05 / Agent_score 0 / bar2_score 12 /reward -1 / Q_max -3.328490e-01
TIMESTEP 12206 / e 0.05 / Agent_score 0 / bar2_score 13 /reward -1 / Q_max -3.137125e-01
TIMESTEP 12252 / e 0.05 / Agent_score 0 / bar2_score 14 /reward -1 / Q_max -3.245226e-01
TIMESTEP 12298 / e 0.05 / Agent_score 0 / bar2_score 15 /reward -1 / Q_max -3.244805e-01
TIMESTEP 12344 / e 0.05 / Agent_score 0 / bar2_score 16 /reward -1 / Q_max -3.298649e-01
TIMESTEP 12390 / e 0.05 / Agent_score 0 / bar2_score 17 /reward -1 / Q_max -3.104714e-01
TIMESTEP 12436 / e 0.05 / Agent_score 0 / bar2_score 18 /reward -1 / Q_max -3.309621e-01
TIMESTEP 12565 / e 0.05 / Agent_score 1 / bar2_score 18 /reward 1 / Q_max -2.591646e-01
TIMESTEP 12694 / e 0.05 / Agent_score 1 / bar2_score 19 /reward -1 / Q_max -2.484440e-01
TIMESTEP 12740 / e 0.0

TIMESTEP 19513 / e 0.05 / Agent_score 1 / bar2_score 5 /reward -1 / Q_max -1.137809e-01
TIMESTEP 19559 / e 0.05 / Agent_score 1 / bar2_score 6 /reward -1 / Q_max -2.161792e-01
TIMESTEP 19605 / e 0.05 / Agent_score 1 / bar2_score 7 /reward -1 / Q_max -1.344917e-01
TIMESTEP 19651 / e 0.05 / Agent_score 1 / bar2_score 8 /reward -1 / Q_max -1.924247e-01
TIMESTEP 19697 / e 0.05 / Agent_score 1 / bar2_score 9 /reward -1 / Q_max -1.803094e-01
TIMESTEP 19743 / e 0.05 / Agent_score 1 / bar2_score 10 /reward -1 / Q_max -2.306733e-01
TIMESTEP 19789 / e 0.05 / Agent_score 1 / bar2_score 11 /reward -1 / Q_max -2.014295e-01
TIMESTEP 19835 / e 0.05 / Agent_score 1 / bar2_score 12 /reward -1 / Q_max -2.070021e-01
TIMESTEP 19881 / e 0.05 / Agent_score 1 / bar2_score 13 /reward -1 / Q_max -2.640725e-01
TIMESTEP 19927 / e 0.05 / Agent_score 1 / bar2_score 14 /reward -1 / Q_max -1.894532e-01
TIMESTEP 19973 / e 0.05 / Agent_score 1 / bar2_score 15 /reward -1 / Q_max -3.489287e-01
TIMESTEP 20019 / e 0.05 / 

TIMESTEP 24973 / e 0.05 / Agent_score 1 / bar2_score 15 /reward -1 / Q_max -1.835733e-01
TIMESTEP 25019 / e 0.05 / Agent_score 1 / bar2_score 16 /reward -1 / Q_max -3.033331e-01
TIMESTEP 25065 / e 0.05 / Agent_score 1 / bar2_score 17 /reward -1 / Q_max -1.747872e-01
TIMESTEP 25111 / e 0.05 / Agent_score 1 / bar2_score 18 /reward -1 / Q_max -3.377889e-01
TIMESTEP 25157 / e 0.05 / Agent_score 1 / bar2_score 19 /reward -1 / Q_max -3.290761e-01
TIMESTEP 25203 / e 0.05 / Agent_score 0 / bar2_score 0 /reward -1 / Q_max -4.224736e-01
TIMESTEP 25249 / e 0.05 / Agent_score 0 / bar2_score 1 /reward -1 / Q_max -8.349987e-02
TIMESTEP 25295 / e 0.05 / Agent_score 0 / bar2_score 2 /reward -1 / Q_max -4.963767e-01
TIMESTEP 25341 / e 0.05 / Agent_score 0 / bar2_score 3 /reward -1 / Q_max -1.725494e-01
TIMESTEP 25387 / e 0.05 / Agent_score 0 / bar2_score 4 /reward -1 / Q_max -3.048983e-01
TIMESTEP 25433 / e 0.05 / Agent_score 0 / bar2_score 5 /reward -1 / Q_max -2.141088e-01
TIMESTEP 25479 / e 0.05 / A

TIMESTEP 31270 / e 0.05 / Agent_score 0 / bar2_score 4 /reward -1 / Q_max -2.739826e-01
TIMESTEP 31316 / e 0.05 / Agent_score 0 / bar2_score 5 /reward -1 / Q_max -6.378633e-01
TIMESTEP 31362 / e 0.05 / Agent_score 0 / bar2_score 6 /reward -1 / Q_max -2.850676e-01
TIMESTEP 31408 / e 0.05 / Agent_score 0 / bar2_score 7 /reward -1 / Q_max -6.799243e-01
TIMESTEP 31454 / e 0.05 / Agent_score 0 / bar2_score 8 /reward -1 / Q_max -3.533559e-01
TIMESTEP 31500 / e 0.05 / Agent_score 0 / bar2_score 9 /reward -1 / Q_max -6.561002e-01
TIMESTEP 31546 / e 0.05 / Agent_score 0 / bar2_score 10 /reward -1 / Q_max -3.251054e-01
TIMESTEP 31592 / e 0.05 / Agent_score 0 / bar2_score 11 /reward -1 / Q_max -6.095849e-01
TIMESTEP 31638 / e 0.05 / Agent_score 0 / bar2_score 12 /reward -1 / Q_max -4.684647e-01
TIMESTEP 31684 / e 0.05 / Agent_score 0 / bar2_score 13 /reward -1 / Q_max -6.956145e-01
TIMESTEP 31730 / e 0.05 / Agent_score 0 / bar2_score 14 /reward -1 / Q_max -5.465092e-01
TIMESTEP 31776 / e 0.05 / A

TIMESTEP 37394 / e 0.05 / Agent_score 0 / bar2_score 10 /reward -1 / Q_max -1.032698e+00
TIMESTEP 37440 / e 0.05 / Agent_score 0 / bar2_score 11 /reward -1 / Q_max -6.813637e-01
TIMESTEP 37486 / e 0.05 / Agent_score 0 / bar2_score 12 /reward -1 / Q_max -1.181959e+00
TIMESTEP 37532 / e 0.05 / Agent_score 0 / bar2_score 13 /reward -1 / Q_max -5.513628e-01
TIMESTEP 37578 / e 0.05 / Agent_score 0 / bar2_score 14 /reward -1 / Q_max -1.175695e+00
TIMESTEP 37624 / e 0.05 / Agent_score 0 / bar2_score 15 /reward -1 / Q_max -7.564521e-01
TIMESTEP 37670 / e 0.05 / Agent_score 0 / bar2_score 16 /reward -1 / Q_max -1.112310e+00
TIMESTEP 37716 / e 0.05 / Agent_score 0 / bar2_score 17 /reward -1 / Q_max -6.272386e-01
TIMESTEP 37762 / e 0.05 / Agent_score 0 / bar2_score 18 /reward -1 / Q_max -1.155878e+00
TIMESTEP 37893 / e 0.05 / Agent_score 1 / bar2_score 18 /reward 1 / Q_max 3.176709e-01
TIMESTEP 38022 / e 0.05 / Agent_score 1 / bar2_score 19 /reward -1 / Q_max -1.250706e-01
TIMESTEP 38068 / e 0.05

TIMESTEP 42348 / e 0.05 / Agent_score 0 / bar2_score 1 /reward -1 / Q_max -1.077037e+00
TIMESTEP 42394 / e 0.05 / Agent_score 0 / bar2_score 2 /reward -1 / Q_max -1.029859e+00
TIMESTEP 42440 / e 0.05 / Agent_score 0 / bar2_score 3 /reward -1 / Q_max -1.118694e+00
TIMESTEP 42486 / e 0.05 / Agent_score 0 / bar2_score 4 /reward -1 / Q_max -8.261477e-01
TIMESTEP 42702 / e 0.05 / Agent_score 0 / bar2_score 5 /reward -1 / Q_max -3.089266e-01
TIMESTEP 42748 / e 0.05 / Agent_score 0 / bar2_score 6 /reward -1 / Q_max -6.615102e-01
TIMESTEP 42794 / e 0.05 / Agent_score 0 / bar2_score 7 /reward -1 / Q_max -9.761305e-01
TIMESTEP 42840 / e 0.05 / Agent_score 0 / bar2_score 8 /reward -1 / Q_max -5.609281e-01
TIMESTEP 42886 / e 0.05 / Agent_score 0 / bar2_score 9 /reward -1 / Q_max -8.982022e-01
TIMESTEP 42932 / e 0.05 / Agent_score 0 / bar2_score 10 /reward -1 / Q_max -8.113582e-01
TIMESTEP 42978 / e 0.05 / Agent_score 0 / bar2_score 11 /reward -1 / Q_max -1.052505e+00
TIMESTEP 43024 / e 0.05 / Agen

TIMESTEP 48633 / e 0.05 / Agent_score 2 / bar2_score 4 /reward -1 / Q_max -5.882078e-01
TIMESTEP 48679 / e 0.05 / Agent_score 2 / bar2_score 5 /reward -1 / Q_max -6.762708e-01
TIMESTEP 48725 / e 0.05 / Agent_score 2 / bar2_score 6 /reward -1 / Q_max -7.025185e-01
TIMESTEP 48771 / e 0.05 / Agent_score 2 / bar2_score 7 /reward -1 / Q_max -6.144991e-01
TIMESTEP 48817 / e 0.05 / Agent_score 2 / bar2_score 8 /reward -1 / Q_max -8.050451e-01
TIMESTEP 48863 / e 0.05 / Agent_score 2 / bar2_score 9 /reward -1 / Q_max -7.504292e-01
TIMESTEP 48992 / e 0.05 / Agent_score 3 / bar2_score 9 /reward 1 / Q_max 9.680071e-01
TIMESTEP 49121 / e 0.05 / Agent_score 3 / bar2_score 10 /reward -1 / Q_max 1.215398e-01
TIMESTEP 49167 / e 0.05 / Agent_score 3 / bar2_score 11 /reward -1 / Q_max -1.651888e-01
TIMESTEP 49213 / e 0.05 / Agent_score 3 / bar2_score 12 /reward -1 / Q_max -7.565029e-01
TIMESTEP 49259 / e 0.05 / Agent_score 3 / bar2_score 13 /reward -1 / Q_max -6.748058e-01
TIMESTEP 49305 / e 0.05 / Agent

TIMESTEP 56457 / e 0.05 / Agent_score 8 / bar2_score 19 /reward -1 / Q_max 3.157627e-01
TIMESTEP 56503 / e 0.05 / Agent_score 0 / bar2_score 0 /reward -1 / Q_max -8.246157e-02
TIMESTEP 56549 / e 0.05 / Agent_score 0 / bar2_score 1 /reward -1 / Q_max -4.691329e-01
TIMESTEP 56595 / e 0.05 / Agent_score 0 / bar2_score 2 /reward -1 / Q_max -5.721096e-01
TIMESTEP 56724 / e 0.05 / Agent_score 1 / bar2_score 2 /reward 1 / Q_max 1.329244e+00
TIMESTEP 56853 / e 0.05 / Agent_score 1 / bar2_score 3 /reward -1 / Q_max 2.196211e-01
TIMESTEP 56982 / e 0.05 / Agent_score 2 / bar2_score 3 /reward 1 / Q_max 1.581147e+00
TIMESTEP 57111 / e 0.05 / Agent_score 2 / bar2_score 4 /reward -1 / Q_max 1.320098e-01
TIMESTEP 57242 / e 0.05 / Agent_score 3 / bar2_score 4 /reward 1 / Q_max 9.963456e-01
TIMESTEP 57371 / e 0.05 / Agent_score 3 / bar2_score 5 /reward -1 / Q_max 1.277133e-01
TIMESTEP 57417 / e 0.05 / Agent_score 3 / bar2_score 6 /reward -1 / Q_max 2.480666e-01
TIMESTEP 57463 / e 0.05 / Agent_score 3 / 

TIMESTEP 65945 / e 0.05 / Agent_score 2 / bar2_score 5 /reward -1 / Q_max -2.263055e-01
TIMESTEP 65991 / e 0.05 / Agent_score 2 / bar2_score 6 /reward -1 / Q_max 4.299182e-01
TIMESTEP 66037 / e 0.05 / Agent_score 2 / bar2_score 7 /reward -1 / Q_max -3.925439e-02
TIMESTEP 66167 / e 0.05 / Agent_score 3 / bar2_score 7 /reward 1 / Q_max 1.129209e+00
TIMESTEP 66296 / e 0.05 / Agent_score 3 / bar2_score 8 /reward -1 / Q_max -1.845510e-01
TIMESTEP 66342 / e 0.05 / Agent_score 3 / bar2_score 9 /reward -1 / Q_max 3.889720e-01
TIMESTEP 66388 / e 0.05 / Agent_score 3 / bar2_score 10 /reward -1 / Q_max 2.952285e-02
TIMESTEP 66434 / e 0.05 / Agent_score 3 / bar2_score 11 /reward -1 / Q_max -4.378226e-01
TIMESTEP 66480 / e 0.05 / Agent_score 3 / bar2_score 12 /reward -1 / Q_max -1.471216e-01
TIMESTEP 66526 / e 0.05 / Agent_score 3 / bar2_score 13 /reward -1 / Q_max -1.003834e-01
TIMESTEP 66572 / e 0.05 / Agent_score 3 / bar2_score 14 /reward -1 / Q_max -2.290557e-01
TIMESTEP 66701 / e 0.05 / Agent_

TIMESTEP 74851 / e 0.05 / Agent_score 3 / bar2_score 14 /reward 1 / Q_max 1.681492e+00
TIMESTEP 74980 / e 0.05 / Agent_score 3 / bar2_score 15 /reward -1 / Q_max -5.205078e-01
TIMESTEP 75026 / e 0.05 / Agent_score 3 / bar2_score 16 /reward -1 / Q_max 1.710579e-02
TIMESTEP 75072 / e 0.05 / Agent_score 3 / bar2_score 17 /reward -1 / Q_max -2.209115e-01
TIMESTEP 75204 / e 0.05 / Agent_score 4 / bar2_score 17 /reward 1 / Q_max 1.040880e+00
TIMESTEP 75333 / e 0.05 / Agent_score 4 / bar2_score 18 /reward -1 / Q_max -2.648706e-01
TIMESTEP 75379 / e 0.05 / Agent_score 4 / bar2_score 19 /reward -1 / Q_max 4.933261e-01
TIMESTEP 75592 / e 0.05 / Agent_score 0 / bar2_score 0 /reward -1 / Q_max 2.946561e-01
TIMESTEP 75638 / e 0.05 / Agent_score 0 / bar2_score 1 /reward -1 / Q_max -3.406262e-01
TIMESTEP 75684 / e 0.05 / Agent_score 0 / bar2_score 2 /reward -1 / Q_max -4.210626e-01
TIMESTEP 75813 / e 0.05 / Agent_score 1 / bar2_score 2 /reward 1 / Q_max 1.808447e+00
TIMESTEP 75942 / e 0.05 / Agent_sc

TIMESTEP 84412 / e 0.05 / Agent_score 10 / bar2_score 19 /reward -1 / Q_max -6.772198e-01
TIMESTEP 84458 / e 0.05 / Agent_score 0 / bar2_score 0 /reward -1 / Q_max -7.153870e-01
TIMESTEP 84589 / e 0.05 / Agent_score 1 / bar2_score 0 /reward 1 / Q_max 1.480276e+00
TIMESTEP 84718 / e 0.05 / Agent_score 1 / bar2_score 1 /reward -1 / Q_max -1.536987e-01
TIMESTEP 84847 / e 0.05 / Agent_score 2 / bar2_score 1 /reward 1 / Q_max 1.972880e+00
TIMESTEP 84976 / e 0.05 / Agent_score 2 / bar2_score 2 /reward -1 / Q_max -5.457131e-01
TIMESTEP 85105 / e 0.05 / Agent_score 3 / bar2_score 2 /reward 1 / Q_max 2.148997e+00
TIMESTEP 85234 / e 0.05 / Agent_score 3 / bar2_score 3 /reward -1 / Q_max -6.566997e-01
TIMESTEP 85363 / e 0.05 / Agent_score 4 / bar2_score 3 /reward 1 / Q_max 1.971979e+00
TIMESTEP 85492 / e 0.05 / Agent_score 4 / bar2_score 4 /reward -1 / Q_max -3.894031e-01
TIMESTEP 85621 / e 0.05 / Agent_score 5 / bar2_score 4 /reward 1 / Q_max 1.932473e+00
TIMESTEP 85750 / e 0.05 / Agent_score 5 

TIMESTEP 95308 / e 0.05 / Agent_score 12 / bar2_score 13 /reward 1 / Q_max 1.879825e+00
TIMESTEP 95437 / e 0.05 / Agent_score 12 / bar2_score 14 /reward -1 / Q_max -4.107215e-01
TIMESTEP 95566 / e 0.05 / Agent_score 13 / bar2_score 14 /reward 1 / Q_max 1.613766e+00
TIMESTEP 95695 / e 0.05 / Agent_score 13 / bar2_score 15 /reward -1 / Q_max -9.079273e-01
TIMESTEP 95741 / e 0.05 / Agent_score 13 / bar2_score 16 /reward -1 / Q_max -7.260762e-01
TIMESTEP 95787 / e 0.05 / Agent_score 13 / bar2_score 17 /reward -1 / Q_max -4.832457e-01
TIMESTEP 95916 / e 0.05 / Agent_score 14 / bar2_score 17 /reward 1 / Q_max 1.775807e+00
TIMESTEP 96045 / e 0.05 / Agent_score 14 / bar2_score 18 /reward -1 / Q_max -3.343923e-01
TIMESTEP 96174 / e 0.05 / Agent_score 15 / bar2_score 18 /reward 1 / Q_max 1.951383e+00
TIMESTEP 96303 / e 0.05 / Agent_score 15 / bar2_score 19 /reward -1 / Q_max -3.795527e-01
TIMESTEP 96432 / e 0.05 / Agent_score 16 / bar2_score 19 /reward 1 / Q_max 1.795214e+00
TIMESTEP 96561 / e 0

TIMESTEP 106319 / e 0.05 / Agent_score 6 / bar2_score 6 /reward -1 / Q_max -1.544377e-01
TIMESTEP 106450 / e 0.05 / Agent_score 7 / bar2_score 6 /reward 1 / Q_max 1.843225e+00
TIMESTEP 106579 / e 0.05 / Agent_score 7 / bar2_score 7 /reward -1 / Q_max -1.073043e-01
TIMESTEP 106708 / e 0.05 / Agent_score 8 / bar2_score 7 /reward 1 / Q_max 2.242076e+00
TIMESTEP 106837 / e 0.05 / Agent_score 8 / bar2_score 8 /reward -1 / Q_max -1.069733e-01
TIMESTEP 106966 / e 0.05 / Agent_score 9 / bar2_score 8 /reward 1 / Q_max 2.182819e+00
TIMESTEP 107095 / e 0.05 / Agent_score 9 / bar2_score 9 /reward -1 / Q_max -2.062116e-01
TIMESTEP 107225 / e 0.05 / Agent_score 10 / bar2_score 9 /reward 1 / Q_max 1.924087e+00
TIMESTEP 107354 / e 0.05 / Agent_score 10 / bar2_score 10 /reward -1 / Q_max -1.889418e-01
TIMESTEP 107483 / e 0.05 / Agent_score 11 / bar2_score 10 /reward 1 / Q_max 2.068300e+00
TIMESTEP 107612 / e 0.05 / Agent_score 11 / bar2_score 11 /reward -1 / Q_max -2.373076e-01
TIMESTEP 107741 / e 0.05

TIMESTEP 117418 / e 0.05 / Agent_score 15 / bar2_score 18 /reward -1 / Q_max -5.778929e-01
TIMESTEP 117464 / e 0.05 / Agent_score 15 / bar2_score 19 /reward -1 / Q_max -6.141624e-01
TIMESTEP 117593 / e 0.05 / Agent_score 16 / bar2_score 19 /reward 1 / Q_max 1.918370e+00
TIMESTEP 117722 / e 0.05 / Agent_score 0 / bar2_score 0 /reward -1 / Q_max -3.764207e-01
TIMESTEP 117851 / e 0.05 / Agent_score 1 / bar2_score 0 /reward 1 / Q_max 2.183607e+00
TIMESTEP 117980 / e 0.05 / Agent_score 1 / bar2_score 1 /reward -1 / Q_max -2.290487e-01
TIMESTEP 118109 / e 0.05 / Agent_score 2 / bar2_score 1 /reward 1 / Q_max 2.117594e+00
TIMESTEP 118238 / e 0.05 / Agent_score 2 / bar2_score 2 /reward -1 / Q_max -1.236838e-01
TIMESTEP 118367 / e 0.05 / Agent_score 3 / bar2_score 2 /reward 1 / Q_max 2.191201e+00
TIMESTEP 118496 / e 0.05 / Agent_score 3 / bar2_score 3 /reward -1 / Q_max -2.057054e-01
TIMESTEP 118625 / e 0.05 / Agent_score 4 / bar2_score 3 /reward 1 / Q_max 2.057275e+00
TIMESTEP 118754 / e 0.05 

TIMESTEP 129352 / e 0.05 / Agent_score 6 / bar2_score 6 /reward -1 / Q_max -1.051234e-01
TIMESTEP 129482 / e 0.05 / Agent_score 7 / bar2_score 6 /reward 1 / Q_max 1.724409e+00
TIMESTEP 129611 / e 0.05 / Agent_score 7 / bar2_score 7 /reward -1 / Q_max -1.467677e-01
TIMESTEP 129740 / e 0.05 / Agent_score 8 / bar2_score 7 /reward 1 / Q_max 2.080734e+00
TIMESTEP 129869 / e 0.05 / Agent_score 8 / bar2_score 8 /reward -1 / Q_max -5.696573e-02
TIMESTEP 129998 / e 0.05 / Agent_score 9 / bar2_score 8 /reward 1 / Q_max 2.019067e+00
TIMESTEP 130127 / e 0.05 / Agent_score 9 / bar2_score 9 /reward -1 / Q_max -1.588348e-01
TIMESTEP 130256 / e 0.05 / Agent_score 10 / bar2_score 9 /reward 1 / Q_max 1.948339e+00
TIMESTEP 130385 / e 0.05 / Agent_score 10 / bar2_score 10 /reward -1 / Q_max -1.753244e-01
TIMESTEP 130514 / e 0.05 / Agent_score 11 / bar2_score 10 /reward 1 / Q_max 1.774350e+00
TIMESTEP 130643 / e 0.05 / Agent_score 11 / bar2_score 11 /reward -1 / Q_max -3.276246e-01
TIMESTEP 130772 / e 0.05

TIMESTEP 141374 / e 0.05 / Agent_score 14 / bar2_score 13 /reward 1 / Q_max 1.525876e+00
TIMESTEP 141503 / e 0.05 / Agent_score 14 / bar2_score 14 /reward -1 / Q_max -2.069619e-01
TIMESTEP 141633 / e 0.05 / Agent_score 15 / bar2_score 14 /reward 1 / Q_max 1.586172e+00
TIMESTEP 141762 / e 0.05 / Agent_score 15 / bar2_score 15 /reward -1 / Q_max -1.987020e-01
TIMESTEP 141894 / e 0.05 / Agent_score 16 / bar2_score 15 /reward 1 / Q_max 1.303999e+00
TIMESTEP 142023 / e 0.05 / Agent_score 16 / bar2_score 16 /reward -1 / Q_max -2.386514e-01
TIMESTEP 142154 / e 0.05 / Agent_score 17 / bar2_score 16 /reward 1 / Q_max 1.470966e+00
TIMESTEP 142283 / e 0.05 / Agent_score 17 / bar2_score 17 /reward -1 / Q_max -1.933719e-01
TIMESTEP 142412 / e 0.05 / Agent_score 18 / bar2_score 17 /reward 1 / Q_max 1.696257e+00
TIMESTEP 142541 / e 0.05 / Agent_score 18 / bar2_score 18 /reward -1 / Q_max -2.449031e-01
TIMESTEP 142670 / e 0.05 / Agent_score 19 / bar2_score 18 /reward 1 / Q_max 1.655874e+00
TIMESTEP 14

TIMESTEP 153119 / e 0.05 / Agent_score 2 / bar2_score 2 /reward 1 / Q_max 1.469285e+00
TIMESTEP 153248 / e 0.05 / Agent_score 2 / bar2_score 3 /reward -1 / Q_max -7.084010e-02
TIMESTEP 153377 / e 0.05 / Agent_score 3 / bar2_score 3 /reward 1 / Q_max 1.588158e+00
TIMESTEP 153506 / e 0.05 / Agent_score 3 / bar2_score 4 /reward -1 / Q_max -1.173121e-01
TIMESTEP 153638 / e 0.05 / Agent_score 4 / bar2_score 4 /reward 1 / Q_max 1.228403e+00
TIMESTEP 153767 / e 0.05 / Agent_score 4 / bar2_score 5 /reward -1 / Q_max -1.328523e-01
TIMESTEP 153898 / e 0.05 / Agent_score 5 / bar2_score 5 /reward 1 / Q_max 1.489686e+00
TIMESTEP 154027 / e 0.05 / Agent_score 5 / bar2_score 6 /reward -1 / Q_max -1.047464e-01
TIMESTEP 154156 / e 0.05 / Agent_score 6 / bar2_score 6 /reward 1 / Q_max 1.604224e+00
TIMESTEP 154285 / e 0.05 / Agent_score 6 / bar2_score 7 /reward -1 / Q_max -1.019667e-01
TIMESTEP 154414 / e 0.05 / Agent_score 7 / bar2_score 7 /reward 1 / Q_max 1.620031e+00
TIMESTEP 154543 / e 0.05 / Agent_

TIMESTEP 165155 / e 0.05 / Agent_score 9 / bar2_score 10 /reward -1 / Q_max -2.035967e-01
TIMESTEP 165285 / e 0.05 / Agent_score 10 / bar2_score 10 /reward 1 / Q_max 1.377574e+00
TIMESTEP 165414 / e 0.05 / Agent_score 10 / bar2_score 11 /reward -1 / Q_max -2.590698e-01
TIMESTEP 165544 / e 0.05 / Agent_score 11 / bar2_score 11 /reward 1 / Q_max 1.291903e+00
TIMESTEP 165673 / e 0.05 / Agent_score 11 / bar2_score 12 /reward -1 / Q_max -3.186636e-01
TIMESTEP 165804 / e 0.05 / Agent_score 12 / bar2_score 12 /reward 1 / Q_max 1.303915e+00
TIMESTEP 165933 / e 0.05 / Agent_score 12 / bar2_score 13 /reward -1 / Q_max -3.050946e-01
TIMESTEP 166063 / e 0.05 / Agent_score 13 / bar2_score 13 /reward 1 / Q_max 1.366758e+00
TIMESTEP 166192 / e 0.05 / Agent_score 13 / bar2_score 14 /reward -1 / Q_max -1.928939e-01
TIMESTEP 166322 / e 0.05 / Agent_score 14 / bar2_score 14 /reward 1 / Q_max 1.420093e+00
TIMESTEP 166451 / e 0.05 / Agent_score 14 / bar2_score 15 /reward -1 / Q_max -1.913416e-01
TIMESTEP 1

TIMESTEP 176899 / e 0.05 / Agent_score 17 / bar2_score 17 /reward -1 / Q_max -2.367919e-01
TIMESTEP 177028 / e 0.05 / Agent_score 18 / bar2_score 17 /reward 1 / Q_max 1.322184e+00
TIMESTEP 177157 / e 0.05 / Agent_score 18 / bar2_score 18 /reward -1 / Q_max -2.740586e-01
TIMESTEP 177286 / e 0.05 / Agent_score 19 / bar2_score 18 /reward 1 / Q_max 1.315819e+00
TIMESTEP 177415 / e 0.05 / Agent_score 19 / bar2_score 19 /reward -1 / Q_max -3.026814e-01
TIMESTEP 177544 / e 0.05 / Agent_score 0 / bar2_score 0 /reward 1 / Q_max 1.217429e+00
TIMESTEP 177673 / e 0.05 / Agent_score 0 / bar2_score 1 /reward -1 / Q_max -1.800943e-01
TIMESTEP 177804 / e 0.05 / Agent_score 1 / bar2_score 1 /reward 1 / Q_max 1.336175e+00
TIMESTEP 177933 / e 0.05 / Agent_score 1 / bar2_score 2 /reward -1 / Q_max -1.981848e-01
TIMESTEP 178063 / e 0.05 / Agent_score 2 / bar2_score 2 /reward 1 / Q_max 1.274050e+00
TIMESTEP 178192 / e 0.05 / Agent_score 2 / bar2_score 3 /reward -1 / Q_max -1.838222e-01
TIMESTEP 178323 / e 0

TIMESTEP 188780 / e 0.05 / Agent_score 4 / bar2_score 6 /reward 1 / Q_max 1.245993e+00
TIMESTEP 188909 / e 0.05 / Agent_score 4 / bar2_score 7 /reward -1 / Q_max -3.653760e-01
TIMESTEP 189040 / e 0.05 / Agent_score 5 / bar2_score 7 /reward 1 / Q_max 1.233884e+00
TIMESTEP 189421 / e 0.05 / Agent_score 6 / bar2_score 7 /reward 1 / Q_max 1.169508e+00
TIMESTEP 189550 / e 0.05 / Agent_score 6 / bar2_score 8 /reward -1 / Q_max -2.978325e-01
TIMESTEP 189681 / e 0.05 / Agent_score 7 / bar2_score 8 /reward 1 / Q_max 1.238707e+00
TIMESTEP 189810 / e 0.05 / Agent_score 7 / bar2_score 9 /reward -1 / Q_max -3.126276e-01
TIMESTEP 189941 / e 0.05 / Agent_score 8 / bar2_score 9 /reward 1 / Q_max 1.263908e+00
TIMESTEP 190070 / e 0.05 / Agent_score 8 / bar2_score 10 /reward -1 / Q_max -3.110633e-01
TIMESTEP 190201 / e 0.05 / Agent_score 9 / bar2_score 10 /reward 1 / Q_max 1.290137e+00
TIMESTEP 190330 / e 0.05 / Agent_score 9 / bar2_score 11 /reward -1 / Q_max -1.856660e-01
TIMESTEP 190461 / e 0.05 / Age

TIMESTEP 201848 / e 0.05 / Agent_score 14 / bar2_score 12 /reward -1 / Q_max -3.947440e-01
TIMESTEP 201979 / e 0.05 / Agent_score 15 / bar2_score 12 /reward 1 / Q_max 1.250855e+00
TIMESTEP 202108 / e 0.05 / Agent_score 15 / bar2_score 13 /reward -1 / Q_max -3.671338e-01
TIMESTEP 202237 / e 0.05 / Agent_score 16 / bar2_score 13 /reward 1 / Q_max 1.235310e+00
TIMESTEP 202366 / e 0.05 / Agent_score 16 / bar2_score 14 /reward -1 / Q_max -3.740446e-01
TIMESTEP 202495 / e 0.05 / Agent_score 17 / bar2_score 14 /reward 1 / Q_max 1.206323e+00
TIMESTEP 202624 / e 0.05 / Agent_score 17 / bar2_score 15 /reward -1 / Q_max -3.092873e-01
TIMESTEP 202753 / e 0.05 / Agent_score 18 / bar2_score 15 /reward 1 / Q_max 1.241654e+00
TIMESTEP 202882 / e 0.05 / Agent_score 18 / bar2_score 16 /reward -1 / Q_max -3.702586e-01
TIMESTEP 203013 / e 0.05 / Agent_score 19 / bar2_score 16 /reward 1 / Q_max 1.231604e+00
TIMESTEP 203142 / e 0.05 / Agent_score 19 / bar2_score 17 /reward -1 / Q_max -3.966119e-01
TIMESTEP 

TIMESTEP 215915 / e 0.05 / Agent_score 3 / bar2_score 1 /reward 1 / Q_max 1.112023e+00
TIMESTEP 216296 / e 0.05 / Agent_score 4 / bar2_score 1 /reward 1 / Q_max 1.172038e+00
TIMESTEP 216425 / e 0.05 / Agent_score 4 / bar2_score 2 /reward -1 / Q_max 1.052511e-01
TIMESTEP 216557 / e 0.05 / Agent_score 5 / bar2_score 2 /reward 1 / Q_max 1.203870e+00
TIMESTEP 216686 / e 0.05 / Agent_score 5 / bar2_score 3 /reward -1 / Q_max 2.104497e-01
TIMESTEP 216815 / e 0.05 / Agent_score 6 / bar2_score 3 /reward 1 / Q_max 1.210877e+00
TIMESTEP 216944 / e 0.05 / Agent_score 6 / bar2_score 4 /reward -1 / Q_max -4.556108e-02
TIMESTEP 217073 / e 0.05 / Agent_score 7 / bar2_score 4 /reward 1 / Q_max 1.333022e+00
TIMESTEP 217202 / e 0.05 / Agent_score 7 / bar2_score 5 /reward -1 / Q_max -2.501012e-01
TIMESTEP 217332 / e 0.05 / Agent_score 8 / bar2_score 5 /reward 1 / Q_max 1.202731e+00
TIMESTEP 217715 / e 0.05 / Agent_score 9 / bar2_score 5 /reward 1 / Q_max 1.148233e+00
TIMESTEP 218098 / e 0.05 / Agent_scor

TIMESTEP 235119 / e 0.05 / Agent_score 1 / bar2_score 2 /reward 1 / Q_max 1.544180e+00
TIMESTEP 235248 / e 0.05 / Agent_score 1 / bar2_score 3 /reward -1 / Q_max -3.198918e-01
TIMESTEP 235377 / e 0.05 / Agent_score 2 / bar2_score 3 /reward 1 / Q_max 1.303283e+00
TIMESTEP 235760 / e 0.05 / Agent_score 3 / bar2_score 3 /reward 1 / Q_max 1.364343e+00
TIMESTEP 235889 / e 0.05 / Agent_score 3 / bar2_score 4 /reward -1 / Q_max -2.393885e-02
TIMESTEP 236018 / e 0.05 / Agent_score 4 / bar2_score 4 /reward 1 / Q_max 1.545796e+00
TIMESTEP 236147 / e 0.05 / Agent_score 4 / bar2_score 5 /reward -1 / Q_max -1.220307e-01
TIMESTEP 236276 / e 0.05 / Agent_score 5 / bar2_score 5 /reward 1 / Q_max 1.400246e+00
TIMESTEP 236659 / e 0.05 / Agent_score 6 / bar2_score 5 /reward 1 / Q_max 1.443515e+00
TIMESTEP 236788 / e 0.05 / Agent_score 6 / bar2_score 6 /reward -1 / Q_max 4.576284e-01
TIMESTEP 236918 / e 0.05 / Agent_score 7 / bar2_score 6 /reward 1 / Q_max 1.475548e+00
TIMESTEP 237215 / e 0.05 / Agent_sco

TIMESTEP 252565 / e 0.05 / Agent_score 13 / bar2_score 15 /reward -1 / Q_max -2.900866e-01
TIMESTEP 252611 / e 0.05 / Agent_score 13 / bar2_score 16 /reward -1 / Q_max 4.002863e-01
TIMESTEP 252657 / e 0.05 / Agent_score 13 / bar2_score 17 /reward -1 / Q_max -6.734802e-01
TIMESTEP 252703 / e 0.05 / Agent_score 13 / bar2_score 18 /reward -1 / Q_max 1.344859e-01
TIMESTEP 252749 / e 0.05 / Agent_score 13 / bar2_score 19 /reward -1 / Q_max -5.100574e-01
TIMESTEP 252795 / e 0.05 / Agent_score 0 / bar2_score 0 /reward -1 / Q_max -6.975428e-02
TIMESTEP 252841 / e 0.05 / Agent_score 0 / bar2_score 1 /reward -1 / Q_max -3.425572e-01
TIMESTEP 252970 / e 0.05 / Agent_score 1 / bar2_score 1 /reward 1 / Q_max 1.637216e+00
TIMESTEP 253351 / e 0.05 / Agent_score 2 / bar2_score 1 /reward 1 / Q_max 1.615626e+00
TIMESTEP 253480 / e 0.05 / Agent_score 2 / bar2_score 2 /reward -1 / Q_max 3.823766e-01
TIMESTEP 253611 / e 0.05 / Agent_score 3 / bar2_score 2 /reward 1 / Q_max 1.659211e+00
TIMESTEP 253995 / e 

TIMESTEP 268336 / e 0.05 / Agent_score 3 / bar2_score 8 /reward 1 / Q_max 1.644511e+00
TIMESTEP 268717 / e 0.05 / Agent_score 4 / bar2_score 8 /reward 1 / Q_max 1.703745e+00
TIMESTEP 268846 / e 0.05 / Agent_score 4 / bar2_score 9 /reward -1 / Q_max -2.824275e-01
TIMESTEP 268975 / e 0.05 / Agent_score 5 / bar2_score 9 /reward 1 / Q_max 1.551964e+00
TIMESTEP 269358 / e 0.05 / Agent_score 6 / bar2_score 9 /reward 1 / Q_max 1.523509e+00
TIMESTEP 269655 / e 0.05 / Agent_score 6 / bar2_score 10 /reward -1 / Q_max 4.684383e-01
TIMESTEP 269701 / e 0.05 / Agent_score 6 / bar2_score 11 /reward -1 / Q_max -5.752181e-01
TIMESTEP 269832 / e 0.05 / Agent_score 7 / bar2_score 11 /reward 1 / Q_max 1.503311e+00
TIMESTEP 270215 / e 0.05 / Agent_score 8 / bar2_score 11 /reward 1 / Q_max 1.405236e+00
TIMESTEP 270512 / e 0.05 / Agent_score 8 / bar2_score 12 /reward -1 / Q_max 3.123428e-01
TIMESTEP 270558 / e 0.05 / Agent_score 8 / bar2_score 13 /reward -1 / Q_max -6.180794e-01
TIMESTEP 270604 / e 0.05 / Ag

TIMESTEP 287090 / e 0.05 / Agent_score 5 / bar2_score 5 /reward -1 / Q_max -4.636542e-01
TIMESTEP 287219 / e 0.05 / Agent_score 6 / bar2_score 5 /reward 1 / Q_max 1.584458e+00
TIMESTEP 287600 / e 0.05 / Agent_score 7 / bar2_score 5 /reward 1 / Q_max 1.726965e+00
TIMESTEP 287898 / e 0.05 / Agent_score 7 / bar2_score 6 /reward -1 / Q_max 3.824939e-01
TIMESTEP 287944 / e 0.05 / Agent_score 7 / bar2_score 7 /reward -1 / Q_max -5.733711e-01
TIMESTEP 288073 / e 0.05 / Agent_score 8 / bar2_score 7 /reward 1 / Q_max 1.452009e+00
TIMESTEP 288454 / e 0.05 / Agent_score 9 / bar2_score 7 /reward 1 / Q_max 1.645199e+00
TIMESTEP 288835 / e 0.05 / Agent_score 10 / bar2_score 7 /reward 1 / Q_max 1.670816e+00
TIMESTEP 289216 / e 0.05 / Agent_score 11 / bar2_score 7 /reward 1 / Q_max 1.791390e+00
TIMESTEP 289514 / e 0.05 / Agent_score 11 / bar2_score 8 /reward -1 / Q_max 4.374401e-01
TIMESTEP 289560 / e 0.05 / Agent_score 11 / bar2_score 9 /reward -1 / Q_max -6.370921e-01
TIMESTEP 289689 / e 0.05 / Agen

TIMESTEP 316472 / e 0.05 / Agent_score 4 / bar2_score 0 /reward 1 / Q_max 1.576433e+00
TIMESTEP 316853 / e 0.05 / Agent_score 5 / bar2_score 0 /reward 1 / Q_max 1.558639e+00
TIMESTEP 317234 / e 0.05 / Agent_score 6 / bar2_score 0 /reward 1 / Q_max 1.573164e+00
TIMESTEP 317615 / e 0.05 / Agent_score 7 / bar2_score 0 /reward 1 / Q_max 1.573079e+00
TIMESTEP 317744 / e 0.05 / Agent_score 7 / bar2_score 1 /reward -1 / Q_max -4.580358e-01
TIMESTEP 317873 / e 0.05 / Agent_score 8 / bar2_score 1 /reward 1 / Q_max 1.505062e+00
TIMESTEP 318254 / e 0.05 / Agent_score 9 / bar2_score 1 /reward 1 / Q_max 1.554004e+00
TIMESTEP 318635 / e 0.05 / Agent_score 10 / bar2_score 1 /reward 1 / Q_max 1.539757e+00
TIMESTEP 319018 / e 0.05 / Agent_score 11 / bar2_score 1 /reward 1 / Q_max 1.513789e+00
TIMESTEP 319399 / e 0.05 / Agent_score 12 / bar2_score 1 /reward 1 / Q_max 1.505475e+00
TIMESTEP 319780 / e 0.05 / Agent_score 13 / bar2_score 1 /reward 1 / Q_max 1.579964e+00
TIMESTEP 320161 / e 0.05 / Agent_scor

TIMESTEP 348281 / e 0.05 / Agent_score 8 / bar2_score 0 /reward 1 / Q_max 1.337619e+00
TIMESTEP 348662 / e 0.05 / Agent_score 9 / bar2_score 0 /reward 1 / Q_max 1.364239e+00
TIMESTEP 349045 / e 0.05 / Agent_score 10 / bar2_score 0 /reward 1 / Q_max 1.436374e+00
TIMESTEP 349428 / e 0.05 / Agent_score 11 / bar2_score 0 /reward 1 / Q_max 1.304887e+00
TIMESTEP 349809 / e 0.05 / Agent_score 12 / bar2_score 0 /reward 1 / Q_max 1.316579e+00
TIMESTEP 350192 / e 0.05 / Agent_score 13 / bar2_score 0 /reward 1 / Q_max 1.359765e+00
TIMESTEP 350573 / e 0.05 / Agent_score 14 / bar2_score 0 /reward 1 / Q_max 1.345344e+00
TIMESTEP 350955 / e 0.05 / Agent_score 15 / bar2_score 0 /reward 1 / Q_max 1.324572e+00
TIMESTEP 351336 / e 0.05 / Agent_score 16 / bar2_score 0 /reward 1 / Q_max 1.264039e+00
TIMESTEP 351717 / e 0.05 / Agent_score 17 / bar2_score 0 /reward 1 / Q_max 1.298624e+00
TIMESTEP 352099 / e 0.05 / Agent_score 18 / bar2_score 0 /reward 1 / Q_max 1.292305e+00
TIMESTEP 352480 / e 0.05 / Agent_s