In [1]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
import numpy as np
from threading import Thread, Lock
from multiprocessing import cpu_count
import copy
# default use float32 in conda env
# tf.keras.backend.set_floatx('float64')

In [2]:
# set visible GPU
gpu_number = 0

# set seed number
seed = 2021

gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        tf.config.experimental.set_visible_devices(gpus[gpu_number], 'GPU')
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

1 Physical GPUs, 1 Logical GPUs


In [3]:
gamma = 0.99
update_interval = 5
actor_lr = 0.0005
critic_lr = 0.001
save_model_episode = 500
show_gif_every_episode = 100

In [4]:
import os
os.environ["SDL_VIDEODRIVER"] = "dummy"  # this line disable pop-out window
from ple.games.flappybird import FlappyBird
from ple import PLE

pygame 1.9.6
Hello from the pygame community. https://www.pygame.org/contribute.html
couldn't import doomish
Couldn't import doom


In [5]:
CUR_EPISODE = 0

class Actor:
    def __init__(self, state_dim, action_dim):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.model = self.create_model()
        self.opt = tf.keras.optimizers.Adam(actor_lr)
        self.entropy_beta = 0.01

    def create_model(self):
        return tf.keras.Sequential([
            Input((self.state_dim,)),
            Dense(64, activation='relu'),
            Dense(8, activation='relu'),
            Dense(self.action_dim, activation='softmax')
        ])
    
    def compute_loss(self, actions, logits, advantages):
        ce_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
        actions = tf.cast(actions, tf.int32)
        policy_loss = ce_loss(actions, logits, sample_weight=tf.stop_gradient(advantages))

        entropy_loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
        entropy = entropy_loss(logits, logits)

        # ppt page48 solution to pitfall:exploration
        return policy_loss - self.entropy_beta * entropy 

    def TA_state(self, game):
        state = copy.deepcopy(game.getGameState())
        
        state['next_next_pipe_bottom_y'] -= state['player_y']
        state['next_next_pipe_top_y'] -= state['player_y']
        state['next_pipe_bottom_y'] -= state['player_y']
        state['next_pipe_top_y'] -= state['player_y']
        relative_state = list(state.values())


        # return the state in tensor type, with batch dimension
        relative_state = tf.convert_to_tensor(relative_state, dtype=tf.float32)
        relative_state = tf.expand_dims(relative_state, axis=0)
        
        return relative_state
    
    def train(self, states, actions, advantages):
        with tf.GradientTape() as tape:
            logits = self.model(states, training=True)
            loss = self.compute_loss(actions, logits, advantages)
        grads = tape.gradient(loss, self.model.trainable_variables)
        self.opt.apply_gradients(zip(grads, self.model.trainable_variables))
        return loss

In [6]:
class Critic:
    def __init__(self, state_dim):
        self.state_dim = state_dim
        self.model = self.create_model()
        self.opt = tf.keras.optimizers.Adam(critic_lr)
    
    def create_model(self):
        return tf.keras.Sequential([
            Input((self.state_dim,)),
            Dense(64, activation='relu'),
            Dense(64, activation='relu'),
            Dense(8, activation='relu'),
            Dense(1, activation='linear')
        ])

    def compute_loss(self, v_pred, td_targets):
        # ppt page47 update fV_pi
        mse = tf.keras.losses.MeanSquaredError()
        return mse(td_targets, v_pred)

    def train(self, states, td_targets):
        with tf.GradientTape() as tape:
            v_pred = self.model(states, training=True)
            assert v_pred.shape == td_targets.shape
            loss = self.compute_loss(v_pred, tf.stop_gradient(td_targets))
        grads = tape.gradient(loss, self.model.trainable_variables)
        self.opt.apply_gradients(zip(grads, self.model.trainable_variables))
        return loss


In [7]:
class Agent:
    def __init__(self):
        game = FlappyBird()
        env = PLE(game, fps=30, display_screen=False, rng=seed)  # game environment interface
        env.reset_game()

        self.state_dim = len(self.TA_state(game)[0])
        self.action_dim = len(env.getActionSet()) # number of actions

        self.global_actor = Actor(self.state_dim, self.action_dim)
        self.global_critic = Critic(self.state_dim)
        self.num_workers = cpu_count() # 16 for R7-5800X

    def TA_state(self, game):
        state = copy.deepcopy(game.getGameState())
        
        state['next_next_pipe_bottom_y'] -= state['player_y']
        state['next_next_pipe_top_y'] -= state['player_y']
        state['next_pipe_bottom_y'] -= state['player_y']
        state['next_pipe_top_y'] -= state['player_y']
        relative_state = list(state.values())


        # return the state in tensor type, with batch dimension
        relative_state = tf.convert_to_tensor(relative_state, dtype=tf.float32)
        relative_state = tf.expand_dims(relative_state, axis=0)
        
        return relative_state

    def train(self, max_episodes=20000):
        workers = []

        for _ in range(self.num_workers):
            game = FlappyBird()
            env = PLE(game, fps=30, display_screen=False, rng=seed)  # game environment interface
            env.reset_game()

            workers.append(WorkerAgent(game, env, self.global_actor, self.global_critic, max_episodes))
            
        for worker in workers:
            worker.start()
        
        for worker in workers:
            worker.join()

In [8]:
import moviepy.editor as mpy


def make_anim(images, fps=60, true_image=False):
    duration = len(images) / fps

    def make_frame(t):
        try:
            x = images[int(len(images) / duration * t)]
        except:
            x = images[-1]

        if true_image:
            return x.astype(np.uint8)
        else:
            return ((x + 1) / 2 * 255).astype(np.uint8)

    clip = mpy.VideoClip(make_frame, duration=duration)
    clip.fps = fps
    return clip

In [9]:
class WorkerAgent(Thread):
    def __init__(self, game, env, global_actor, global_critic, max_episodes):
        Thread.__init__(self)
        self.lock = Lock()
        self.game = game
        self.env = env
        self.state_dim = len(self.TA_state(self.game)[0])
        self.action_dim = len(self.env.getActionSet())

        self.max_episodes = max_episodes
        self.global_actor = global_actor
        self.global_critic = global_critic
        self.actor = Actor(self.state_dim, self.action_dim)
        self.critic = Critic(self.state_dim)

        self.actor.model.set_weights(self.global_actor.model.get_weights())
        self.critic.model.set_weights(self.global_critic.model.get_weights())

    def n_step_td_target(self, rewards, next_v_value, done):
        td_targets = np.zeros_like(rewards)
        cumulative = 0
        if not done:
            cumulative = next_v_value # estimate of fVpi(t+1)

        for k in reversed(range(0, len(rewards))):
            cumulative = gamma * cumulative + rewards[k] # ppt page 47 紅字, estimate fQpi
            td_targets[k] = cumulative
        return td_targets

    def list_to_batch(self, list):
        batch = list[0]
        for elem in list[1:]:
            batch = np.append(batch, elem, axis=0)
        return batch

    def TA_state(self, game):
        state = copy.deepcopy(game.getGameState())
        
        state['next_next_pipe_bottom_y'] -= state['player_y']
        state['next_next_pipe_top_y'] -= state['player_y']
        state['next_pipe_bottom_y'] -= state['player_y']
        state['next_pipe_top_y'] -= state['player_y']
        relative_state = list(state.values())


        # return the state in tensor type, with batch dimension
        relative_state = tf.convert_to_tensor(relative_state, dtype=tf.float32)
        relative_state = tf.expand_dims(relative_state, axis=0)
        
        return relative_state

    def train(self):
        global CUR_EPISODE

        while CUR_EPISODE < self.max_episodes:
            state_batch = []
            action_batch = []
            reward_batch = []
            episode_reward, done = 0, False
            
            # Reset the environment
            self.env.reset_game()
            state = self.TA_state(self.game)
            
            if CUR_EPISODE % show_gif_every_episode == 0:
            # record frame
                frames = [self.env.getScreenRGB()]
            
            while not done:
                probs = self.actor.model.predict(
                    np.reshape(state, [1, self.state_dim]))
#                 if(CUR_EPISODE%16==1):
#                     print(probs[0]) #debug
                action = np.random.choice(self.action_dim,p=probs[0])

                reward = self.env.act(self.env.getActionSet()[action])
                done = self.env.game_over()
                
                if CUR_EPISODE % show_gif_every_episode == 0:
                    frames.append(self.env.getScreenRGB())
                
                next_state = self.TA_state(self.game)  # get next state

                state = np.reshape(state, [1, self.state_dim])
                action = np.reshape(action, [1, 1])
                next_state = np.reshape(next_state, [1, self.state_dim])
                reward = np.reshape(reward, [1, 1])
                
                state_batch.append(state)
                action_batch.append(action)
                reward_batch.append(reward)

                if(len(state_batch) >= update_interval or done):
                    states = self.list_to_batch(state_batch)
                    actions = self.list_to_batch(action_batch)
                    rewards = self.list_to_batch(reward_batch)

                    next_v_value = self.critic.model.predict(next_state) # fVpi(t+1)
                    td_targets = self.n_step_td_target(rewards, next_v_value, done)
                    advantages = td_targets - self.critic.model.predict(states)

                    with self.lock:
                        actor_loss = self.global_actor.train(
                            states, actions, advantages)
                        critic_loss = self.global_critic.train(
                            states, td_targets)
                        print("EPOCH",CUR_EPISODE,"===\n",self.global_critic.model.get_weights()[2][2]) #debug
                        self.actor.model.set_weights(
                            self.global_actor.model.get_weights())
                        self.critic.model.set_weights(
                            self.global_critic.model.get_weights())

                    state_batch = []
                    action_batch = []
                    reward_batch = []
                    # td_target_batch = []
                    # advatnage_batch = []

                episode_reward += reward[0][0]
                state = next_state[0]
            
#             if CUR_EPISODE % save_model_episode == 0:
#                 self.global_actor.model.save("models/ep%d"%CUR_EPISODE)

            print('EP{} EpisodeReward={}'.format(CUR_EPISODE, episode_reward))
            CUR_EPISODE += 1
        
            # for every 5000 episode, record an animation
            if CUR_EPISODE % show_gif_every_episode == 0:
                print("len frames:", len(frames))
                clip = make_anim(frames, fps=60, true_image=True).rotate(-90)
                display(clip.ipython_display(fps=60, autoplay=1, loop=1))

    def run(self):
        self.train()

In [10]:
agent = Agent()

In [11]:
agent.global_critic.model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 64)                576       
_________________________________________________________________
dense_4 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_5 (Dense)              (None, 8)                 520       
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 9         
Total params: 5,265
Trainable params: 5,265
Non-trainable params: 0
_________________________________________________________________


In [None]:
agent.train()





EPOCHEPOCH  0 ===
 [-0.05494748  0.02549103  0.17751871 -0.12631275  0.20204242  0.14903022
  0.11221592 -0.12057868 -0.19445285 -0.19517826 -0.09749343  0.06050913
 -0.10373036 -0.05824564 -0.18168919  0.05289306 -0.21156687  0.09330501
 -0.11786419  0.17279778 -0.1348725  -0.11327261 -0.08971746  0.13296129
  0.21517392  0.12396677 -0.02943406  0.10683323  0.08933668 -0.06840609
  0.21426545 -0.02961157  0.07317729 -0.15623146 -0.07100523  0.07948665
  0.20227097 -0.2068994  -0.13898747 -0.13960798  0.05290847  0.10206147
  0.18471535  0.11596467  0.15171696  0.0909241   0.00385311  0.04667293
  0.01530503 -0.11290989  0.13922758 -0.21212621  0.166036    0.02870829
 -0.02212478  0.09766181  0.15775444  0.11842634 -0.01552063 -0.10443769
  0.03859685 -0.16334778 -0.11117987  0.02915496]EPOCH0
EPOCH 0 ===
 [-0.05494748  0.02549103  0.17751871 -0.12631275  0.20204242  0.14903022
  0.11221592 -0.12057868 -0.19445285 -0.19517826 -0.09749343  0.06050913
 -0.10373036 -0.05824564 -0.18168919

EPOCH 0 ===
 [-0.05494748  0.02549103  0.17751871 -0.12631275  0.20204242  0.14903022
  0.11221592 -0.12057868 -0.19445285 -0.19517826 -0.09749343  0.06050913
 -0.10373036 -0.05824564 -0.18168919  0.05289306 -0.21156687  0.09330501
 -0.11786419  0.17279778 -0.1348725  -0.11327261 -0.08971746  0.13296129
  0.21517392  0.12396677 -0.02943406  0.10683323  0.08933668 -0.06840609
  0.21426545 -0.02961157  0.07317729 -0.15623146 -0.07100523  0.07948665
  0.20227097 -0.2068994  -0.13898747 -0.13960798  0.05290847  0.10206147
  0.18471535  0.11596467  0.15171696  0.0909241   0.00385311  0.04667293
  0.01530503 -0.11290989  0.13922758 -0.21212621  0.166036    0.02870829
 -0.02212478  0.09766181  0.15775444  0.11842634 -0.01552063 -0.10443769
  0.03859685 -0.16334778 -0.11117987  0.02915496]
EPOCH 0 ===
 [-0.05494748  0.02549103  0.17751871 -0.12631275  0.20204242  0.14903022
  0.11221592 -0.12057868 -0.19445285 -0.19517826 -0.09749343  0.06050913
 -0.10373036 -0.05824564 -0.18168919  0.05289306

EPOCH 0 ===
 [-0.05494748  0.02549103  0.17751871 -0.12631275  0.20204242  0.14903022
  0.11221592 -0.12057868 -0.19445285 -0.19517826 -0.09749343  0.06050913
 -0.10373036 -0.05824564 -0.18168919  0.05289306 -0.21156687  0.09330501
 -0.11786419  0.17279778 -0.1348725  -0.11327261 -0.08971746  0.13296129
  0.21517392  0.12396677 -0.02943406  0.10683323  0.08933668 -0.06840609
  0.21426545 -0.02961157  0.07317729 -0.15623146 -0.07100523  0.07948665
  0.20227097 -0.2068994  -0.13898747 -0.13960798  0.05290847  0.10206147
  0.18471535  0.11596467  0.15171696  0.0909241   0.00385311  0.04667293
  0.01530503 -0.11290989  0.13922758 -0.21212621  0.166036    0.02870829
 -0.02212478  0.09766181  0.15775444  0.11842634 -0.01552063 -0.10443769
  0.03859685 -0.16334778 -0.11117987  0.02915496]
EPOCH 0 ===
 [-0.05494748  0.02549103  0.17751871 -0.12631275  0.20204242  0.14903022
  0.11221592 -0.12057868 -0.19445285 -0.19517826 -0.09749343  0.06050913
 -0.10373036 -0.05824564 -0.18168919  0.05289306

EPOCH 0 ===
 [-0.05494748  0.02549103  0.17751871 -0.12631275  0.20204242  0.14903022
  0.11221592 -0.12057868 -0.19445285 -0.19517826 -0.09749343  0.06050913
 -0.10373036 -0.05824564 -0.18168919  0.05289306 -0.21156687  0.09330501
 -0.11786419  0.17279778 -0.1348725  -0.11327261 -0.08971746  0.13296129
  0.21517392  0.12396677 -0.02943406  0.10683323  0.08933668 -0.06840609
  0.21426545 -0.02961157  0.07317729 -0.15623146 -0.07100523  0.07948665
  0.20227097 -0.2068994  -0.13898747 -0.13960798  0.05290847  0.10206147
  0.18471535  0.11596467  0.15171696  0.0909241   0.00385311  0.04667293
  0.01530503 -0.11290989  0.13922758 -0.21212621  0.166036    0.02870829
 -0.02212478  0.09766181  0.15775444  0.11842634 -0.01552063 -0.10443769
  0.03859685 -0.16334778 -0.11117987  0.02915496]
EPOCH 0 ===
 [-0.05494748  0.02549103  0.17751871 -0.12631275  0.20204242  0.14903022
  0.11221592 -0.12057868 -0.19445285 -0.19517826 -0.09749343  0.06050913
 -0.10373036 -0.05824564 -0.18168919  0.05289306

EPOCH 0 ===
 [-0.05494748  0.02549103  0.17751871 -0.12631275  0.20204242  0.14903022
  0.11221592 -0.12057868 -0.19445285 -0.19517826 -0.09749343  0.06050913
 -0.10373036 -0.05824564 -0.18168919  0.05289306 -0.21156687  0.09330501
 -0.11786419  0.17279778 -0.1348725  -0.11327261 -0.08971746  0.13296129
  0.21517392  0.12396677 -0.02943406  0.10683323  0.08933668 -0.06840609
  0.21426545 -0.02961157  0.07317729 -0.15623146 -0.07100523  0.07948665
  0.20227097 -0.2068994  -0.13898747 -0.13960798  0.05290847  0.10206147
  0.18471535  0.11596467  0.15171696  0.0909241   0.00385311  0.04667293
  0.01530503 -0.11290989  0.13922758 -0.21212621  0.166036    0.02870829
 -0.02212478  0.09766181  0.15775444  0.11842634 -0.01552063 -0.10443769
  0.03859685 -0.16334778 -0.11117987  0.02915496]
EPOCH 0 ===
 [-0.05494748  0.02549103  0.17751871 -0.12631275  0.20204242  0.14903022
  0.11221592 -0.12057868 -0.19445285 -0.19517826 -0.09749343  0.06050913
 -0.10373036 -0.05824564 -0.18168919  0.05289306

EPOCH 0 ===
 [-0.05494748  0.02549103  0.17751871 -0.12631275  0.20204242  0.14903022
  0.11221592 -0.12057868 -0.19445285 -0.19517826 -0.09749343  0.06050913
 -0.10373036 -0.05824564 -0.18168919  0.05289306 -0.21156687  0.09330501
 -0.11786419  0.17279778 -0.1348725  -0.11327261 -0.08971746  0.13296129
  0.21517392  0.12396677 -0.02943406  0.10683323  0.08933668 -0.06840609
  0.21426545 -0.02961157  0.07317729 -0.15623146 -0.07100523  0.07948665
  0.20227097 -0.2068994  -0.13898747 -0.13960798  0.05290847  0.10206147
  0.18471535  0.11596467  0.15171696  0.0909241   0.00385311  0.04667293
  0.01530503 -0.11290989  0.13922758 -0.21212621  0.166036    0.02870829
 -0.02212478  0.09766181  0.15775444  0.11842634 -0.01552063 -0.10443769
  0.03859685 -0.16334778 -0.11117987  0.02915496]
EPOCH 0 ===
 [-0.05494748  0.02549103  0.17751871 -0.12631275  0.20204242  0.14903022
  0.11221592 -0.12057868 -0.19445285 -0.19517826 -0.09749343  0.06050913
 -0.10373036 -0.05824564 -0.18168919  0.05289306

EPOCH 0 ===
 [-0.05494748  0.02549103  0.17751871 -0.12631275  0.20204242  0.14903022
  0.11221592 -0.12057868 -0.19445285 -0.19517826 -0.09749343  0.06050913
 -0.10373036 -0.05824564 -0.18168919  0.05289306 -0.21156687  0.09330501
 -0.11786419  0.17279778 -0.1348725  -0.11327261 -0.08971746  0.13296129
  0.21517392  0.12396677 -0.02943406  0.10683323  0.08933668 -0.06840609
  0.21426545 -0.02961157  0.07317729 -0.15623146 -0.07100523  0.07948665
  0.20227097 -0.2068994  -0.13898747 -0.13960798  0.05290847  0.10206147
  0.18471535  0.11596467  0.15171696  0.0909241   0.00385311  0.04667293
  0.01530503 -0.11290989  0.13922758 -0.21212621  0.166036    0.02870829
 -0.02212478  0.09766181  0.15775444  0.11842634 -0.01552063 -0.10443769
  0.03859685 -0.16334778 -0.11117987  0.02915496]
EPOCH 0 ===
 [-0.05494748  0.02549103  0.17751871 -0.12631275  0.20204242  0.14903022
  0.11221592 -0.12057868 -0.19445285 -0.19517826 -0.09749343  0.06050913
 -0.10373036 -0.05824564 -0.18168919  0.05289306

  0.03859685 -0.16334778 -0.11117987  0.02915496] 2
 EP2 EpisodeReward=-5.0
===
 EPOCH 2 ===
EP3 EpisodeReward=-5.0 
[-0.05494748  0.02549103  0.17751871 -0.12631275  0.20204242  0.14903022
  0.11221592 -0.12057868 -0.19445285 -0.19517826 -0.09749343  0.06050913
 -0.10373036 -0.05824564 -0.18168919  0.05289306 -0.21156687  0.09330501
 -0.11786419  0.17279778 -0.1348725  -0.11327261 -0.08971746  0.13296129
  0.21517392  0.12396677 -0.02943406  0.10683323  0.08933668 -0.06840609
  0.21426545 -0.02961157  0.07317729 -0.15623146 -0.07100523  0.07948665
  0.20227097 -0.2068994  -0.13898747 -0.13960798  0.05290847  0.10206147
  0.18471535  0.11596467  0.15171696  0.0909241   0.00385311  0.04667293
  0.01530503 -0.11290989  0.13922758 -0.21212621  0.166036    0.02870829
 -0.02212478  0.09766181  0.15775444  0.11842634 -0.01552063 -0.10443769
  0.03859685 -0.16334778 -0.11117987  0.02915496]
[-0.05494748  0.02549103  0.17751871 -0.12631275  0.20204242  0.14903022
  0.11221592 -0.12057868 -0.19

EP12 EpisodeReward=-5.0
EPOCH 13 ===
 [-0.05494748  0.02549103  0.17751871 -0.12631275  0.20204242  0.14903022
  0.11221592 -0.12057868 -0.19445285 -0.19517826 -0.09749343  0.06050913
 -0.10373036 -0.05824564 -0.18168919  0.05289306 -0.21156687  0.09330501
 -0.11786419  0.17279778 -0.1348725  -0.11327261 -0.08971746  0.13296129
  0.21517392  0.12396677 -0.02943406  0.10683323  0.08933668 -0.06840609
  0.21426545 -0.02961157  0.07317729 -0.15623146 -0.07100523  0.07948665
  0.20227097 -0.2068994  -0.13898747 -0.13960798  0.05290847  0.10206147
  0.18471535  0.11596467  0.15171696  0.0909241   0.00385311  0.04667293
  0.01530503 -0.11290989  0.13922758 -0.21212621  0.166036    0.02870829
 -0.02212478  0.09766181  0.15775444  0.11842634 -0.01552063 -0.10443769
  0.03859685 -0.16334778 -0.11117987  0.02915496]
EP13 EpisodeReward=-5.0
EPOCH 14 ===
 [-0.05494748  0.02549103  0.17751871 -0.12631275  0.20204242  0.14903022
  0.11221592 -0.12057868 -0.19445285 -0.19517826 -0.09749343  0.0605091

EPOCH 16 ===
 [-0.05494748  0.02549103  0.17751871 -0.12631275  0.20204242  0.14903022
  0.11221592 -0.12057868 -0.19445285 -0.19517826 -0.09749343  0.06050913
 -0.10373036 -0.05824564 -0.18168919  0.05289306 -0.21156687  0.09330501
 -0.11786419  0.17279778 -0.1348725  -0.11327261 -0.08971746  0.13296129
  0.21517392  0.12396677 -0.02943406  0.10683323  0.08933668 -0.06840609
  0.21426545 -0.02961157  0.07317729 -0.15623146 -0.07100523  0.07948665
  0.20227097 -0.2068994  -0.13898747 -0.13960798  0.05290847  0.10206147
  0.18471535  0.11596467  0.15171696  0.0909241   0.00385311  0.04667293
  0.01530503 -0.11290989  0.13922758 -0.21212621  0.166036    0.02870829
 -0.02212478  0.09766181  0.15775444  0.11842634 -0.01552063 -0.10443769
  0.03859685 -0.16334778 -0.11117987  0.02915496]
EPOCH 16 ===
 [-0.05494748  0.02549103  0.17751871 -0.12631275  0.20204242  0.14903022
  0.11221592 -0.12057868 -0.19445285 -0.19517826 -0.09749343  0.06050913
 -0.10373036 -0.05824564 -0.18168919  0.052893

EPOCH 16 ===
 [-0.05494748  0.02549103  0.17751871 -0.12631275  0.20204242  0.14903022
  0.11221592 -0.12057868 -0.19445285 -0.19517826 -0.09749343  0.06050913
 -0.10373036 -0.05824564 -0.18168919  0.05289306 -0.21156687  0.09330501
 -0.11786419  0.17279778 -0.1348725  -0.11327261 -0.08971746  0.13296129
  0.21517392  0.12396677 -0.02943406  0.10683323  0.08933668 -0.06840609
  0.21426545 -0.02961157  0.07317729 -0.15623146 -0.07100523  0.07948665
  0.20227097 -0.2068994  -0.13898747 -0.13960798  0.05290847  0.10206147
  0.18471535  0.11596467  0.15171696  0.0909241   0.00385311  0.04667293
  0.01530503 -0.11290989  0.13922758 -0.21212621  0.166036    0.02870829
 -0.02212478  0.09766181  0.15775444  0.11842634 -0.01552063 -0.10443769
  0.03859685 -0.16334778 -0.11117987  0.02915496]
EPOCH 16 ===
 [-0.05494748  0.02549103  0.17751871 -0.12631275  0.20204242  0.14903022
  0.11221592 -0.12057868 -0.19445285 -0.19517826 -0.09749343  0.06050913
 -0.10373036 -0.05824564 -0.18168919  0.052893

EPOCH 16 ===
 [-0.05494748  0.02549103  0.17751871 -0.12631275  0.20204242  0.14903022
  0.11221592 -0.12057868 -0.19445285 -0.19517826 -0.09749343  0.06050913
 -0.10373036 -0.05824564 -0.18168919  0.05289306 -0.21156687  0.09330501
 -0.11786419  0.17279778 -0.1348725  -0.11327261 -0.08971746  0.13296129
  0.21517392  0.12396677 -0.02943406  0.10683323  0.08933668 -0.06840609
  0.21426545 -0.02961157  0.07317729 -0.15623146 -0.07100523  0.07948665
  0.20227097 -0.2068994  -0.13898747 -0.13960798  0.05290847  0.10206147
  0.18471535  0.11596467  0.15171696  0.0909241   0.00385311  0.04667293
  0.01530503 -0.11290989  0.13922758 -0.21212621  0.166036    0.02870829
 -0.02212478  0.09766181  0.15775444  0.11842634 -0.01552063 -0.10443769
  0.03859685 -0.16334778 -0.11117987  0.02915496]
EPOCH 16 ===
 [-0.05494748  0.02549103  0.17751871 -0.12631275  0.20204242  0.14903022
  0.11221592 -0.12057868 -0.19445285 -0.19517826 -0.09749343  0.06050913
 -0.10373036 -0.05824564 -0.18168919  0.052893

EPOCH 23 ===
 [-0.05494748  0.02549103  0.17751871 -0.12631275  0.20204242  0.14903022
  0.11221592 -0.12057868 -0.19445285 -0.19517826 -0.09749343  0.06050913
 -0.10373036 -0.05824564 -0.18168919  0.05289306 -0.21156687  0.09330501
 -0.11786419  0.17279778 -0.1348725  -0.11327261 -0.08971746  0.13296129
  0.21517392  0.12396677 -0.02943406  0.10683323  0.08933668 -0.06840609
  0.21426545 -0.02961157  0.07317729 -0.15623146 -0.07100523  0.07948665
  0.20227097 -0.2068994  -0.13898747 -0.13960798  0.05290847  0.10206147
  0.18471535  0.11596467  0.15171696  0.0909241   0.00385311  0.04667293
  0.01530503 -0.11290989  0.13922758 -0.21212621  0.166036    0.02870829
 -0.02212478  0.09766181  0.15775444  0.11842634 -0.01552063 -0.10443769
  0.03859685 -0.16334778 -0.11117987  0.02915496]
EPOCH 23 ===
 [-0.05494748  0.02549103  0.17751871 -0.12631275  0.20204242  0.14903022
  0.11221592 -0.12057868 -0.19445285 -0.19517826 -0.09749343  0.06050913
 -0.10373036 -0.05824564 -0.18168919  0.052893

EPOCH 28 ===
 [-0.05494748  0.02549103  0.17751871 -0.12631275  0.20204242  0.14903022
  0.11221592 -0.12057868 -0.19445285 -0.19517826 -0.09749343  0.06050913
 -0.10373036 -0.05824564 -0.18168919  0.05289306 -0.21156687  0.09330501
 -0.11786419  0.17279778 -0.1348725  -0.11327261 -0.08971746  0.13296129
  0.21517392  0.12396677 -0.02943406  0.10683323  0.08933668 -0.06840609
  0.21426545 -0.02961157  0.07317729 -0.15623146 -0.07100523  0.07948665
  0.20227097 -0.2068994  -0.13898747 -0.13960798  0.05290847  0.10206147
  0.18471535  0.11596467  0.15171696  0.0909241   0.00385311  0.04667293
  0.01530503 -0.11290989  0.13922758 -0.21212621  0.166036    0.02870829
 -0.02212478  0.09766181  0.15775444  0.11842634 -0.01552063 -0.10443769
  0.03859685 -0.16334778 -0.11117987  0.02915496]
EP28 EpisodeReward=-5.0
EP29 EpisodeReward=-5.0
EP30 EpisodeReward=-5.0
EPOCH 30 ===
 [-0.05494748  0.02549103  0.17751871 -0.12631275  0.20204242  0.14903022
  0.11221592 -0.12057868 -0.19445285 -0.1951782

 [-0.05494748  0.02549103  0.17751871 -0.12631275  0.20204242  0.14903022
  0.11221592 -0.12057868 -0.19445285 -0.19517826 -0.09749343  0.06050913
 -0.10373036 -0.05824564 -0.18168919  0.05289306 -0.21156687  0.09330501
 -0.11786419  0.17279778 -0.1348725  -0.11327261 -0.08971746  0.13296129
  0.21517392  0.12396677 -0.02943406  0.10683323  0.08933668 -0.06840609
  0.21426545 -0.02961157  0.07317729 -0.15623146 -0.07100523  0.07948665
  0.20227097 -0.2068994  -0.13898747 -0.13960798  0.05290847  0.10206147
  0.18471535  0.11596467  0.15171696  0.0909241   0.00385311  0.04667293
  0.01530503 -0.11290989  0.13922758 -0.21212621  0.166036    0.02870829
 -0.02212478  0.09766181  0.15775444  0.11842634 -0.01552063 -0.10443769
  0.03859685 -0.16334778 -0.11117987  0.02915496]
EPOCH 32 ===
 [-0.05494748  0.02549103  0.17751871 -0.12631275  0.20204242  0.14903022
  0.11221592 -0.12057868 -0.19445285 -0.19517826 -0.09749343  0.06050913
 -0.10373036 -0.05824564 -0.18168919  0.05289306 -0.2115668

  0.03859685 -0.16334778 -0.11117987  0.02915496]
EPOCH 32 ===
 [-0.05494748  0.02549103  0.17751871 -0.12631275  0.20204242  0.14903022
  0.11221592 -0.12057868 -0.19445285 -0.19517826 -0.09749343  0.06050913
 -0.10373036 -0.05824564 -0.18168919  0.05289306 -0.21156687  0.09330501
 -0.11786419  0.17279778 -0.1348725  -0.11327261 -0.08971746  0.13296129
  0.21517392  0.12396677 -0.02943406  0.10683323  0.08933668 -0.06840609
  0.21426545 -0.02961157  0.07317729 -0.15623146 -0.07100523  0.07948665
  0.20227097 -0.2068994  -0.13898747 -0.13960798  0.05290847  0.10206147
  0.18471535  0.11596467  0.15171696  0.0909241   0.00385311  0.04667293
  0.01530503 -0.11290989  0.13922758 -0.21212621  0.166036    0.02870829
 -0.02212478  0.09766181  0.15775444  0.11842634 -0.01552063 -0.10443769
  0.03859685 -0.16334778 -0.11117987  0.02915496]
EPOCH 32 ===
 [-0.05494748  0.02549103  0.17751871 -0.12631275  0.20204242  0.14903022
  0.11221592 -0.12057868 -0.19445285 -0.19517826 -0.09749343  0.06050

EP32 EpisodeReward=-5.0
EPOCH 33 ===
 [-0.05494748  0.02549103  0.17751871 -0.12631275  0.20204242  0.14903022
  0.11221592 -0.12057868 -0.19445285 -0.19517826 -0.09749343  0.06050913
 -0.10373036 -0.05824564 -0.18168919  0.05289306 -0.21156687  0.09330501
 -0.11786419  0.17279778 -0.1348725  -0.11327261 -0.08971746  0.13296129
  0.21517392  0.12396677 -0.02943406  0.10683323  0.08933668 -0.06840609
  0.21426545 -0.02961157  0.07317729 -0.15623146 -0.07100523  0.07948665
  0.20227097 -0.2068994  -0.13898747 -0.13960798  0.05290847  0.10206147
  0.18471535  0.11596467  0.15171696  0.0909241   0.00385311  0.04667293
  0.01530503 -0.11290989  0.13922758 -0.21212621  0.166036    0.02870829
 -0.02212478  0.09766181  0.15775444  0.11842634 -0.01552063 -0.10443769
  0.03859685 -0.16334778 -0.11117987  0.02915496]
EP33 EpisodeReward=-5.0EPOCH 33 ===
 [-0.05494748  0.02549103  0.17751871 -0.12631275  0.20204242  0.14903022
  0.11221592 -0.12057868 -0.19445285 -0.19517826 -0.09749343  0.06050913

EP36 EpisodeReward=-5.0EP36 EpisodeReward=-5.0

EPOCH 38 ===
 [-0.05494748  0.02549103  0.17751871 -0.12631275  0.20204242  0.14903022
  0.11221592 -0.12057868 -0.19445285 -0.19517826 -0.09749343  0.06050913
 -0.10373036 -0.05824564 -0.18168919  0.05289306 -0.21156687  0.09330501
 -0.11786419  0.17279778 -0.1348725  -0.11327261 -0.08971746  0.13296129
  0.21517392  0.12396677 -0.02943406  0.10683323  0.08933668 -0.06840609
  0.21426545 -0.02961157  0.07317729 -0.15623146 -0.07100523  0.07948665
  0.20227097 -0.2068994  -0.13898747 -0.13960798  0.05290847  0.10206147
  0.18471535  0.11596467  0.15171696  0.0909241   0.00385311  0.04667293
  0.01530503 -0.11290989  0.13922758 -0.21212621  0.166036    0.02870829
 -0.02212478  0.09766181  0.15775444  0.11842634 -0.01552063 -0.10443769
  0.03859685 -0.16334778 -0.11117987  0.02915496]
EP38 EpisodeReward=-5.0
EPOCH 39 ===
 [-0.05494748  0.02549103  0.17751871 -0.12631275  0.20204242  0.14903022
  0.11221592 -0.12057868 -0.19445285 -0.1951782

EPOCH 46 ===
 [-0.05494748  0.02549103  0.17751871 -0.12631275  0.20204242  0.14903022
  0.11221592 -0.12057868 -0.19445285 -0.19517826 -0.09749343  0.06050913
 -0.10373036 -0.05824564 -0.18168919  0.05289306 -0.21156687  0.09330501
 -0.11786419  0.17279778 -0.1348725  -0.11327261 -0.08971746  0.13296129
  0.21517392  0.12396677 -0.02943406  0.10683323  0.08933668 -0.06840609
  0.21426545 -0.02961157  0.07317729 -0.15623146 -0.07100523  0.07948665
  0.20227097 -0.2068994  -0.13898747 -0.13960798  0.05290847  0.10206147
  0.18471535  0.11596467  0.15171696  0.0909241   0.00385311  0.04667293
  0.01530503 -0.11290989  0.13922758 -0.21212621  0.166036    0.02870829
 -0.02212478  0.09766181  0.15775444  0.11842634 -0.01552063 -0.10443769
  0.03859685 -0.16334778 -0.11117987  0.02915496]
EP47 EpisodeReward=-5.0
EPOCH 48 ===
 [-0.05494748  0.02549103  0.17751871 -0.12631275  0.20204242  0.14903022
  0.11221592 -0.12057868 -0.19445285 -0.19517826 -0.09749343  0.06050913
 -0.10373036 -0.058245

EPOCH 48 ===
 [-0.05494748  0.02549103  0.17751871 -0.12631275  0.20204242  0.14903022
  0.11221592 -0.12057868 -0.19445285 -0.19517826 -0.09749343  0.06050913
 -0.10373036 -0.05824564 -0.18168919  0.05289306 -0.21156687  0.09330501
 -0.11786419  0.17279778 -0.1348725  -0.11327261 -0.08971746  0.13296129
  0.21517392  0.12396677 -0.02943406  0.10683323  0.08933668 -0.06840609
  0.21426545 -0.02961157  0.07317729 -0.15623146 -0.07100523  0.07948665
  0.20227097 -0.2068994  -0.13898747 -0.13960798  0.05290847  0.10206147
  0.18471535  0.11596467  0.15171696  0.0909241   0.00385311  0.04667293
  0.01530503 -0.11290989  0.13922758 -0.21212621  0.166036    0.02870829
 -0.02212478  0.09766181  0.15775444  0.11842634 -0.01552063 -0.10443769
  0.03859685 -0.16334778 -0.11117987  0.02915496]
EPOCH 48 ===
 [-0.05494748  0.02549103  0.17751871 -0.12631275  0.20204242  0.14903022
  0.11221592 -0.12057868 -0.19445285 -0.19517826 -0.09749343  0.06050913
 -0.10373036 -0.05824564 -0.18168919  0.052893

EPOCH 48 ===
 [-0.05494748  0.02549103  0.17751871 -0.12631275  0.20204242  0.14903022
  0.11221592 -0.12057868 -0.19445285 -0.19517826 -0.09749343  0.06050913
 -0.10373036 -0.05824564 -0.18168919  0.05289306 -0.21156687  0.09330501
 -0.11786419  0.17279778 -0.1348725  -0.11327261 -0.08971746  0.13296129
  0.21517392  0.12396677 -0.02943406  0.10683323  0.08933668 -0.06840609
  0.21426545 -0.02961157  0.07317729 -0.15623146 -0.07100523  0.07948665
  0.20227097 -0.2068994  -0.13898747 -0.13960798  0.05290847  0.10206147
  0.18471535  0.11596467  0.15171696  0.0909241   0.00385311  0.04667293
  0.01530503 -0.11290989  0.13922758 -0.21212621  0.166036    0.02870829
 -0.02212478  0.09766181  0.15775444  0.11842634 -0.01552063 -0.10443769
  0.03859685 -0.16334778 -0.11117987  0.02915496]
EPOCH 48 ===
 [-0.05494748  0.02549103  0.17751871 -0.12631275  0.20204242  0.14903022
  0.11221592 -0.12057868 -0.19445285 -0.19517826 -0.09749343  0.06050913
 -0.10373036 -0.05824564 -0.18168919  0.052893

EPOCH 48 ===
 [-0.05494748  0.02549103  0.17751871 -0.12631275  0.20204242  0.14903022
  0.11221592 -0.12057868 -0.19445285 -0.19517826 -0.09749343  0.06050913
 -0.10373036 -0.05824564 -0.18168919  0.05289306 -0.21156687  0.09330501
 -0.11786419  0.17279778 -0.1348725  -0.11327261 -0.08971746  0.13296129
  0.21517392  0.12396677 -0.02943406  0.10683323  0.08933668 -0.06840609
  0.21426545 -0.02961157  0.07317729 -0.15623146 -0.07100523  0.07948665
  0.20227097 -0.2068994  -0.13898747 -0.13960798  0.05290847  0.10206147
  0.18471535  0.11596467  0.15171696  0.0909241   0.00385311  0.04667293
  0.01530503 -0.11290989  0.13922758 -0.21212621  0.166036    0.02870829
 -0.02212478  0.09766181  0.15775444  0.11842634 -0.01552063 -0.10443769
  0.03859685 -0.16334778 -0.11117987  0.02915496]
EP48 EpisodeReward=-5.0EP48 EpisodeReward=-5.0

EPOCH 50 ===
 [-0.05494748  0.02549103  0.17751871 -0.12631275  0.20204242  0.14903022
  0.11221592 -0.12057868 -0.19445285 -0.19517826 -0.09749343  0.0605091

EPOCH 55 ===
 [-0.05494748  0.02549103  0.17751871 -0.12631275  0.20204242  0.14903022
  0.11221592 -0.12057868 -0.19445285 -0.19517826 -0.09749343  0.06050913
 -0.10373036 -0.05824564 -0.18168919  0.05289306 -0.21156687  0.09330501
 -0.11786419  0.17279778 -0.1348725  -0.11327261 -0.08971746  0.13296129
  0.21517392  0.12396677 -0.02943406  0.10683323  0.08933668 -0.06840609
  0.21426545 -0.02961157  0.07317729 -0.15623146 -0.07100523  0.07948665
  0.20227097 -0.2068994  -0.13898747 -0.13960798  0.05290847  0.10206147
  0.18471535  0.11596467  0.15171696  0.0909241   0.00385311  0.04667293
  0.01530503 -0.11290989  0.13922758 -0.21212621  0.166036    0.02870829
 -0.02212478  0.09766181  0.15775444  0.11842634 -0.01552063 -0.10443769
  0.03859685 -0.16334778 -0.11117987  0.02915496]
EP55 EpisodeReward=-5.0
EPOCH 56 ===
 [-0.05494748  0.02549103  0.17751871 -0.12631275  0.20204242  0.14903022
  0.11221592 -0.12057868 -0.19445285 -0.19517826 -0.09749343  0.06050913
 -0.10373036 -0.058245

EPOCH 62 ===
 [-0.05494748  0.02549103  0.17751871 -0.12631275  0.20204242  0.14903022
  0.11221592 -0.12057868 -0.19445285 -0.19517826 -0.09749343  0.06050913
 -0.10373036 -0.05824564 -0.18168919  0.05289306 -0.21156687  0.09330501
 -0.11786419  0.17279778 -0.1348725  -0.11327261 -0.08971746  0.13296129
  0.21517392  0.12396677 -0.02943406  0.10683323  0.08933668 -0.06840609
  0.21426545 -0.02961157  0.07317729 -0.15623146 -0.07100523  0.07948665
  0.20227097 -0.2068994  -0.13898747 -0.13960798  0.05290847  0.10206147
  0.18471535  0.11596467  0.15171696  0.0909241   0.00385311  0.04667293
  0.01530503 -0.11290989  0.13922758 -0.21212621  0.166036    0.02870829
 -0.02212478  0.09766181  0.15775444  0.11842634 -0.01552063 -0.10443769
  0.03859685 -0.16334778 -0.11117987  0.02915496]
EPOCH 62 ===
 [-0.05494748  0.02549103  0.17751871 -0.12631275  0.20204242  0.14903022
  0.11221592 -0.12057868 -0.19445285 -0.19517826 -0.09749343  0.06050913
 -0.10373036 -0.05824564 -0.18168919  0.052893