In [49]:
import tensorflow as tf
from tensorflow.keras import layers, initializers, models, optimizers
from scipy.special import softmax

import numpy as np

import gym

tf.random.set_seed(42)
np.random.seed(42)

# Objective: Create a DDPG algorithm with a GPT as the Actor network.   

In [50]:
#Ornstein-Uhlenbeck Noise 
class OUActionNoise(object):
    def __init__(self, mean, sigma=0.5, theta=0.2, dt=0.4, x0=None):
        self.mean = mean
        self.sigma = sigma
        self.theta = theta
        self.dt = dt
        self.x0 = x0
        self.reset()
    
    #--------------------------------------------------------------------------------
    #Method that enables to write classes where the instances behave like functions and can be called like a function.    
    def __call__(self):
        x = self.x_prev + self.theta * (self.mean - self.x_prev) * self.dt + self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mean.shape)
        self.x_prev = x
        
        return x
    
    #--------------------------------------------------------------------------------
    def reset(self):
        self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mean)
        return

In [51]:
%%script false --no-raise-error

a = np.zeros(15)
b = OUActionNoise(a)
a += b()
a

In [52]:
#Replay Buffer 
class ReplayBuffer(object):
    def __init__(self, size, batch_size, block_size):
        '''
        Args:
            size (integer): The size of the replay buffer.              
            batch_size (integer): The batch size.
            block_size (integer): 
        '''
        self.buffer = []
        self.batch_size = batch_size
        self.max_size = size
        self.block_size = block_size
        
    #--------------------------------------------------------------------------------    
    def append(self, steps):
        '''
        Args:
            state (Numpy array): The state.              
            action (integer): The action.
            reward (float): The reward.
            done (boolen): True if the next state is a terminal state and False otherwise. Is transformed to integer so tha True = 1, False = 0
            next_state (Numpy array): The next state.           
        '''
        if self.size == self.max_size: del self.buffer[:len(steps)]
        for step in steps: self.buffer.append(step)

    #--------------------------------------------------------------------------------
    def clear(self):
        self.buffer.clear()
    
    #--------------------------------------------------------------------------------    
    def getBlock(self):
        '''
        Returns:
            A list of transition tuples including state, action, reward, terminal, and next_state
        '''
        # gets the random indexes of the block start
        options = np.arange(self.size - self.block_size)
        prob_diff = 1e-4
        probs = softmax(np.arange(1-prob_diff, 1, (prob_diff)/len(options))[:len(options)])
        
        idxs = np.random.choice(options, size=(self.batch_size,), replace=False, p=probs)
        get_data = lambda i, idx: [mem[i] for mem in self.buffer[idx:idx+self.block_size]]
        
        # generate the batch by stacking the blocks
        states = np.array(np.stack([get_data(0, idx) for idx in idxs], axis=0))
        actions = np.array(np.stack([get_data(1, idx) for idx in idxs], axis=0))
        rewards = np.array(np.stack([get_data(2, idx) for idx in idxs], axis=0))
        next_states = np.array(np.stack([get_data(3, idx) for idx in idxs], axis=0))
        done = np.array(np.stack([get_data(4, idx) for idx in idxs], axis=0))
        positions = np.array(np.stack([get_data(5, idx) for idx in idxs], axis=0))
        new_positions = np.array(np.stack([get_data(6, idx) for idx in idxs], axis=0))

        return  states, actions, rewards, next_states, done, positions, new_positions
    
    #--------------------------------------------------------------------------------  
    @property  
    def size(self):
        '''
        Returns:
            Number of elements in the buffer
        '''
        return len(self.buffer)
    
    #--------------------------------------------------------------------------------
    @property 
    def hasMinLength(self):
        '''
        Returns:
            Boolean indicating if the memory have the minimum number of elements or not
        '''
        return (self.size >= (self.batch_size + self.block_size))
    
    #--------------------------------------------------------------------------------
    @property  
    def data(self):
        '''
        Returns:
            List with all the elements in the buffer
        '''
        return self.buffer

In [53]:
gpt_kernel_initializer = lambda: initializers.RandomNormal(mean=0.0, stddev=0.1)
gpt_bias_initializer = lambda: initializers.Zeros()

In [54]:
# Individual Head of self-attention
class Head(layers.Layer):
    """ one head of self-attention """
    def __init__(self, batch_size, block_size, state_dim, head_size, dropout):
        super().__init__()
        self.batch_size = batch_size
        self.block_size = block_size
        self.state_dim = state_dim
        # key, query and value layers
        self.key = layers.Dense(units=head_size, use_bias=False, kernel_initializer=gpt_kernel_initializer())
        self.query = layers.Dense(units=head_size, use_bias=False, kernel_initializer=gpt_kernel_initializer())
        self.value = layers.Dense(units=head_size, use_bias=False, kernel_initializer=gpt_kernel_initializer())
        # dropout layer
        self.dropout = layers.Dropout(dropout)

    def call(self, x, training=False):
        B, T, C = x.shape
        if(B is None): B = self.batch_size 
        if(T is None): T = self.block_size
        if(C is None): C = self.state_dim
        k = self.key(x)   # (B,T,C)
        q = self.query(x) # (B,T,C)
        # compute attention scores ("affinities") - C**-0.5 is for normalization
        wei =  tf.matmul(q, tf.transpose(k, perm=[0, 2, 1]))  * tf.math.rsqrt(tf.cast(C, tf.float32)) # (B, T, C) @ (B, C, T) -> (B, T, T)
        wei = tf.where(tf.linalg.band_part(tf.ones((T, T)), -1, 0) == 0, tf.constant(float("-inf"), shape=(B, T, T)), wei) # (B, T, T)
        wei = tf.nn.softmax(wei, axis=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,C)
        out = tf.matmul(wei, v) # (B, T, T) @ (B, T, C) -> (B, T, C)
        return out

In [55]:
# Layer with multiple self-attention Heads for data communication 
class MultiHeadAttention(layers.Layer):
    """ multiple heads of self-attention in parallel """
    def __init__(self, batch_size, block_size, state_dim, num_heads, head_size, dropout):
        super().__init__()
        self.heads = [Head(batch_size, block_size, state_dim, head_size, dropout) for _ in range(num_heads)]
        # this linear layer is used to 'merge' the multiple heads acquired knowledge
        self.proj = layers.Dense(units=state_dim, kernel_initializer=gpt_kernel_initializer(), bias_initializer=gpt_bias_initializer())
        self.dropout = layers.Dropout(dropout)

    def call(self, x):
        # concatenate the heads outputs in the C dimension
        out =  tf.concat([h(x) for h in self.heads], axis=-1)
        # apply the projection and the dropout
        out = self.dropout(self.proj(out))
        return out

In [56]:
#Simple feed forward for data computation
class FeedForward(layers.Layer):
    def __init__(self, state_dim, dropout, last_resize=True, spread_dim=None):
        super().__init__()
        last_layer = [
            layers.Dense(state_dim, kernel_initializer=gpt_kernel_initializer(), bias_initializer=gpt_bias_initializer()), 
            layers.Dropout(dropout)
        ] if last_resize else []
        
        self.net = models.Sequential([
            layers.Dense(spread_dim if spread_dim is not None else 4 * state_dim, kernel_initializer=gpt_kernel_initializer(), bias_initializer=gpt_bias_initializer()),
            layers.Dropout(dropout),
            *last_layer
        ])

    def call(self, x):
        return self.net(x)

In [57]:
# Block containing a multi head attention module and a feed forward linear computation
class Block(layers.Layer):
    """ Transformer block: communication followed by computation """
    def __init__(self, batch_size, block_size, state_dim, num_heads, dropout, last_resize, spread_dim):
        super().__init__()
        head_size = state_dim // num_heads # each head gets a portion of the embeddings so different relations can be learned
        self.sa = MultiHeadAttention(batch_size, block_size, state_dim, num_heads, head_size, dropout)
        self.ffwd = FeedForward(state_dim, dropout, last_resize, spread_dim)
        self.ln1 = layers.LayerNormalization()

    def call(self, x):
        # Multi head attention with layer norm
        x = x + self.sa(self.ln1(x))
        # feed forward with layer norm
        x = self.ffwd(x)
        return x

In [58]:
f_value = lambda : initializers.RandomNormal(mean=0.0, stddev=0.1)

In [59]:
class GPTModel(models.Model):
    def __init__(self, n_layer, batch_size, block_size, embedding_dim, out_dim, num_heads, dropout, ffw):
        super().__init__()
        self.block_size = block_size
        self.state_embedding = layers.Dense(units=embedding_dim, kernel_initializer=gpt_kernel_initializer(), bias_initializer=gpt_bias_initializer())
        self.position_embedding = layers.Dense(units=embedding_dim, kernel_initializer=gpt_kernel_initializer(), bias_initializer=gpt_bias_initializer())
        self.blocks = models.Sequential([
            Block(
                batch_size, 
                block_size, 
                embedding_dim, 
                num_heads, 
                dropout, 
                last_resize = (i != n_layer - 1 ),  
                spread_dim = out_dim if (i == n_layer - 1 ) else None,
            )for i in range(n_layer)
        ])
        self.ffw = ffw

    def call(self, inp, training=False):
        inputs, positions = inp[0], inp[1]
        B, T, C = inputs.shape
        if(T is None): T = self.block_size
        pos_emb = self.position_embedding(positions)
        x = self.state_embedding(inputs) + pos_emb
        x = self.blocks(x)
        logits = self.ffw(x)
        return logits
    
    def generate(self, states, positions):
        # crop idx to the last block_size tokens
        idx_cond = states[:, -self.block_size:, :]
        pos_cond = positions[:, -self.block_size:, :]
        # get the predictions
        actions = self([idx_cond, pos_cond])
        # focus only on the last time step
        return actions
    
    def build(self, input_shape):
        states, positions = input_shape
        self.position_embedding.build(positions)
        self.state_embedding.build(states)
        super(GPTModel, self).build((2, None, None, None))

In [60]:
class Actor(object):
    def __init__(self, n_layer, batch_size, block_size, state_dim, action_dim, position_dim, embedding_dim, num_heads, dropout, action_range, lr, tau):
        #Network dimensions
        self.inp_dim = state_dim
        self.action_dim = action_dim
        ffw = lambda: models.Sequential([
            layers.Dense(128,  activation='relu'),
            layers.BatchNormalization(),
            layers.Dropout(dropout),
            layers.Dense(action_dim, activation='tanh'),
            layers.Lambda(lambda i: i * action_range, dtype='float64'),
        ]) 

        #Parameter that coordinates the soft updates on the target weights
        self.tau = tau

        #Generates the optimization function - used in the agent to generate gradients
        self.optimizer = optimizers.Adam(lr)

        #Generates the actor model
        self.model = GPTModel(
            n_layer=n_layer,
            batch_size=batch_size, 
            block_size=block_size, 
            embedding_dim=embedding_dim, 
            out_dim=128,
            num_heads=num_heads,
            dropout=dropout,
            ffw = ffw(),
        )
        self.model.build(((None, None, state_dim), (None, None, position_dim)))

        #Generates the actor target model
        self.target_model = GPTModel(
            n_layer=n_layer,
            batch_size=batch_size, 
            block_size=block_size, 
            embedding_dim=embedding_dim, 
            out_dim=128,
            num_heads=num_heads,
            dropout=dropout,
            ffw = ffw(),
        )
        self.target_model.build(((None, None, state_dim), (None, None, position_dim)))

        #Set the weights to be the same in the begining
        self.target_model.set_weights(self.model.get_weights())
    
    #--------------------------------------------------------------------
    def predict(self, states, positions):
        return self.model.generate(states, positions)
    
    #--------------------------------------------------------------------
    def target_predict(self, states, positions):
        return self.target_model.generate(states, positions)
    
    #--------------------------------------------------------------------
    def act(self, states, positions):
        action = self.predict(states, positions)
        # Gets the last action only
        action = action[0, -1, :]
        return action

    #--------------------------------------------------------------------
    def transferWeights(self):
        weights = self.model.get_weights()
        target_weights = self.target_model.get_weights()
        new_weights = []
        
        for i in range(len(weights)):
            new_weights.append((self.tau * weights[i]) + ((1.0 - self.tau) * target_weights[i]))
        
        self.target_model.set_weights(new_weights)
        
    #--------------------------------------------------------------------
    def saveModel(self, path):
        self.model.save(path + '_actor_model.h5')
        self.target_model.save(path + '_actor_target_model.h5')
    
    #--------------------------------------------------------------------
    def loadModel(self, path):
        self.target_model = models.load_model(path)
        self.model = models.load_model(path)

In [61]:
class Critic(object):
    def __init__(self, n_layer, batch_size, block_size, state_dim, action_dim, position_dim, embedding_dim, out_dim, num_heads, dropout, lr, tau):
        #Network dimensions
        self.inp_dim = state_dim + action_dim
        ffw = lambda: models.Sequential([
                layers.Dense(128,  activation='relu'),
                layers.BatchNormalization(),
                layers.Dropout(dropout),
                layers.Dense(out_dim, activation='linear'),
            ]) 

        #Parameter that coordinates the soft updates on the target weights
        self.tau = tau

        #Generates the optimization function - used in the agent to generate gradients
        self.optimizer = optimizers.Adam(lr)

        #Generates the actor model
        self.model = GPTModel(
            n_layer=n_layer,
            batch_size=batch_size, 
            block_size=block_size, 
            embedding_dim=embedding_dim, 
            out_dim=128,
            num_heads=num_heads,
            dropout=dropout,
            ffw = ffw(),
        )
        self.model.build(((None, None, self.inp_dim), (None, None, position_dim)))

        #Generates the actor target model
        self.target_model = GPTModel(
            n_layer=n_layer,
            batch_size=batch_size, 
            block_size=block_size, 
            embedding_dim=embedding_dim, 
            out_dim=128,
            num_heads=num_heads,
            dropout=dropout,
            ffw = ffw(),
        )
        self.target_model.build(((None, None, self.inp_dim), (None, None, position_dim)))

        #Set the weights to be the same in the begining
        self.target_model.set_weights(self.model.get_weights())
    
    #--------------------------------------------------------------------
    def predict(self, states, actions, positions):
        states = tf.cast(states, tf.float32) 
        actions = tf.cast(actions, tf.float32) 
        positions = tf.cast(positions, tf.float32)
        inp = tf.concat([states, actions], 2)
        return self.model.generate(inp, positions)
    
    #--------------------------------------------------------------------
    def target_predict(self, states, actions, positions):
        states = tf.cast(states, tf.float32) 
        actions = tf.cast(actions, tf.float32)
        positions = tf.cast(positions, tf.float32)
        inp = tf.concat([states, actions], 2)
        return self.target_model.generate(inp, positions)
    
    #--------------------------------------------------------------------
    def transferWeights(self):
        weights = self.model.get_weights()
        target_weights = self.target_model.get_weights()
        new_weights = []
        
        for i in range(len(weights)):
            new_weights.append((self.tau * weights[i]) + ((1.0 - self.tau) * target_weights[i]))
        
        self.target_model.set_weights(new_weights)
        
    #--------------------------------------------------------------------
    def saveModel(self, path):
        self.model.save(path + '_critic_model.h5')
        self.target_model.save(path + '_critic_target_model.h5')
    
    #--------------------------------------------------------------------
    def loadModel(self, path):
        self.target_model = models.load_model(path)
        self.model = models.load_model(path)

In [68]:
class DDPG_GPT_Agent(object):
    def __init__(self, a_n_layers, c_n_layers, batch_size, block_size, state_dim, action_dim, a_n_heads, c_n_heads, 
        dropout, action_min, action_max, memory_size, gamma, a_lr, c_lr, tau, epsilon, epsilon_decay, 
        epsilon_min, position_dim, a_embedding_dim, c_embedding_dim
        ):
        self.batch_size = batch_size
        self.block_size = block_size
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.action_min = action_min
        self.action_max = action_max
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.position_dim = position_dim

        #Creates the Replay Buffer
        self.memory = ReplayBuffer(memory_size, batch_size, block_size)

        #Creates the noise generator
        self.ou_noise = OUActionNoise(mean=np.zeros(action_dim))

        #Creates the actor
        self.actor = Actor(
            n_layer=a_n_layers,
            batch_size=batch_size, 
            block_size=block_size, 
            state_dim=state_dim, 
            action_dim=action_dim, 
            position_dim=position_dim,
            embedding_dim=a_embedding_dim,
            num_heads=a_n_heads, 
            dropout=dropout, 
            action_range=action_max, 
            lr=a_lr, 
            tau=tau,
        )
        
        #Creates the critic
        self.critic = Critic(
            n_layer=c_n_layers,
            batch_size=batch_size, 
            block_size=block_size, 
            state_dim=state_dim, 
            action_dim=action_dim, 
            position_dim=position_dim,
            embedding_dim=c_embedding_dim,
            out_dim=1,
            num_heads=c_n_heads, 
            dropout=dropout, 
            lr=c_lr, 
            tau=tau,
        )
    
    #--------------------------------------------------------------------     
    def act(self, env):
        step = 1
        done = False

        states = env.reset().reshape(1, 1, -1)
        positions = self.int_to_bin(step).reshape(1, 1, -1)

        while not done:
            env.render()
            # Generating actions for a given group of states 
            action = self.policy(states, positions, explore=False)
            # Apply the action in the environment
            new_state, reward, done, info = env.step(action)
            step += 1
            # Append the new state to the states history
            states = tf.concat((states, new_state.reshape(1, 1, -1)), axis=1)
            positions = tf.concat((positions, self.int_to_bin(step).reshape(1, 1, -1)), axis=1)
        
        return

    #-------------------------------------------------------------------- 
    def policy(self, states, positions, explore=True):
        """ Generates an action from a group of states and add exploration """
        # gets the action
        action = self.actor.act(states, positions)
        # takes the exploration with the epsilon probability
        if explore and np.random.rand() < self.epsilon: action += self.ou_noise()
        # clip the action to be between min and max values
        action = np.clip(action, a_min=self.action_min, a_max=self.action_max)
        action[np.isnan(action)] = 0

        return action
    
    #-------------------------------------------------------------------- 
    def int_to_bin(self, num):
        bin_r = np.array([b for b in np.binary_repr(num, width=self.position_dim)])
        bin_arr = np.zeros(self.position_dim)
        bin_arr[np.where(bin_r == '1')] = 1
        return bin_arr
    
    #--------------------------------------------------------------------
    def record_memories(self, steps):
        mem_fix = 0.8
        steps = np.array(steps, dtype=object)
        if (np.random.rand() >= mem_fix):
            avg = np.average(steps[:, :-1], axis=0, weights=np.absolute(steps[:, 2])) # take the weighted average of the numerical data with the weights being the rewards absolute value
            done = np.any(steps[:, -1]) # check if the mean state is a done state - done states are now added separately to avoid information loss
            avg[4] = np.floor(avg[4]) # make the average step into an integer
            steps = np.array([np.concatenate([avg, [done]])], dtype=object)
        
        step_bin = np.array([self.int_to_bin(int(st)) for st in steps[:, 4]])
        new_step_bin = np.array([self.int_to_bin(int(st + 1)) for st in steps[:, 4]])
        steps = np.delete(steps, 4, 1)
        steps = np.array([[*step, bin_r, n_bin_r] for step, bin_r, n_bin_r in zip(steps, step_bin, new_step_bin)], dtype=object) 
        self.memory.append(steps)
        return
    
    #--------------------------------------------------------------------
    def learn(self, memory_steps):
        """ Append an experience to the memory and replay memory if possible """
        self.record_memories(memory_steps)
        if self.memory.hasMinLength: self.replay_memory()
        return
        
    #--------------------------------------------------------------------    
    def replay_memory(self):
        """ Replay a batch of memories """

        # Get sample block from the replay buffer
        states, actions, rewards, next_states, done, positions, next_positions = self.memory.getBlock()
        
        states = tf.convert_to_tensor(states, dtype='float32')
        actions = tf.convert_to_tensor(actions, dtype='float32')
        next_states = tf.convert_to_tensor(next_states, dtype='float32')
        positions = tf.convert_to_tensor(positions, dtype='float32')
        next_positions = tf.convert_to_tensor(next_positions, dtype='float32')

        rewards = tf.expand_dims(tf.convert_to_tensor(rewards, dtype='float32'), axis=-1)

        done = tf.expand_dims(tf.convert_to_tensor(done, dtype='float32'), axis=-1)

        #Train the critic
        with tf.GradientTape() as tape:
            # Compute the critic target values
            target_actions = self.actor.target_predict(next_states, next_positions)
            predicted_return = self.critic.target_predict(next_states, target_actions, next_positions)
            gamma_values = lambda i: tf.expand_dims(tf.repeat([[self.gamma**(k - i) for k in range(i, rewards.shape[1])]], repeats=self.batch_size, axis=0), axis=-1)
            weighted_next_rewards = lambda i: tf.math.reduce_sum(rewards[:, i:, :] * gamma_values(i), axis=1)
            last_return_weight = lambda i: self.gamma ** (rewards.shape[1] - i)
            last_return = predicted_return[:, -1, :]
            state_done = lambda i: 1 - done[:, i, :]
            y = tf.stack([(weighted_next_rewards(i) + (last_return_weight(i) * last_return * state_done(i))) for i in range(rewards.shape[1])], axis=1)
            #y = rewards + self.gamma * self.critic.target_predict(next_states, target_actions, next_positions) * (1 - done) # y = (B, T, 1)
            # Predict the expected rewsteps += 1ard associated with taking the target predicted action in the state
            critic_value = self.critic.predict(states, actions, positions)
            # Compute the critic loss  
            critic_loss = tf.math.reduce_mean(tf.math.square(y - critic_value))
            print(critic_loss)
            
        critic_grad = tape.gradient(critic_loss, self.critic.model.trainable_variables)
        self.critic.optimizer.apply_gradients(zip(critic_grad, self.critic.model.trainable_variables))
        
        #Train the actor
        with tf.GradientTape() as tape:
            acts = self.actor.predict(states, positions)
            critic_grads = self.critic.predict(states, acts, positions)
            #Used -mean as we want to maximize the value given by the critic for our actions
            actor_loss = -tf.math.reduce_mean(critic_grads)

        actor_grad = tape.gradient(actor_loss, self.actor.model.trainable_variables)
        self.actor.optimizer.apply_gradients(zip(actor_grad, self.actor.model.trainable_variables))
            
        #Update the model weights
        self.actor.transferWeights()
        self.critic.transferWeights() 
            
        #Decay the epsilon value
        self.epsilon = max(self.epsilon_min, self.epsilon*self.epsilon_decay)

    def print_data(self, verbose, episode, step, score):
        if verbose:
            print("\r                                                                                                     ", end="")
            print("\rEpisode: "+str(episode+1)+"\t Step: "+str(step)+"\tReward: "+str(score) ,end="")
        return

    #--------------------------------------------------------------------     
    def train(self, env, num_episodes, step_per_train, verbose, verbose_num, end_on_complete=False, complete_num=1, complete_value=float('inf'), act_after_batch=False):
        scores_history = []
        steps_history = []
        complete = 0
        print("BEGIN\n")
        
        for episode in range(num_episodes):
            done = False
            score, step = 0, 1
            state = env.reset()
            states = state.reshape(1, 1, -1)
            positions = self.int_to_bin(step).reshape(1, 1, -1)
            
            while not done:
                done_step = []
                memory_steps = []
                while not done and (step % step_per_train != 0):
                    action = self.policy(states, positions)
                    self.print_data(verbose, episode, step, score)
                    new_state, reward, done, _ = env.step(action)
                    if (done): done_step.append([state, action, reward, new_state, step, done])
                    else: memory_steps.append([state, action, reward, new_state, step, done])
                    state = new_state
                    step += 1
                    states = tf.concat((states, new_state.reshape(1, 1, -1)), axis=1)
                    positions = tf.concat((positions, self.int_to_bin(step).reshape(1, 1, -1)), axis=1)
                    score += reward
                
                step += 1
                if len(memory_steps) > 0: self.learn(memory_steps)
                if len(done_step) > 0: self.learn(done_step)

            scores_history.append(score)
            steps_history.append(step)
            
            #If the score is bigger or equal than the complete score it add one to the completed number
            if(score >= complete_value):
                complete += 1
                #If the flag is true the agent ends the trainig on the firs complete episode
                if end_on_complete and complete >= complete_num: break
            
            #These information are printed after each verbose_num episodes
            if((episode+1)%verbose_num == 0):
                print("\r                                                                                                          ", end="")
                print("\rEpisodes: ", episode+1, "/", num_episodes, 
                      "\n\tTotal reward: ", np.mean(scores_history[-verbose_num:]), 
                      "\n\tNum. steps: ", np.mean(steps_history[-verbose_num:]), 
                      *["\n\tCompleted: ", complete] if complete_value != float('inf') else '', 
                      "\n--------------------------",
                    )
                
                #If the flag is true the agent act and render the episode after each verbose_num episodes
                if act_after_batch: self.act(env)
                
                #Set the number of completed episodes on the batch to zero
                complete = 0

        print("\nFINISHED")
        
        return scores_history, steps_history
    #--------------------------------------------------------------------     
    def save(self, path):
        self.actor.saveModel(path)
        self.critic.saveModel(path)
    
    #--------------------------------------------------------------------
    def load(self, a_path, c_path):
        self.actor.loadModel(a_path)
        self.critic.loadModel(c_path)

In [69]:
env = gym.make("LunarLander-v2", continuous=True, max_episode_steps=500)
batch_size = 3
block_size = 4
position_dim = 14
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
action_min = env.action_space.low
action_max = env.action_space.high
dropout = 0.05
memory_size = 500000
gamma = 0.99
epsilon = 1
epsilon_decay = 0.999
epsilon_min = 0.4
tau = 5e-4

# Actor hyperparameter
a_n_layer = 2
a_num_heads = 2
a_embedding_dim = 8
a_learning_rate = 3e-4

# Critic hyperparameter
c_n_layer = 2
c_num_heads = 3
c_embedding_dim = 12
c_learning_rate = 5e-4

agent = DDPG_GPT_Agent(
    a_n_layers = a_n_layer,
    c_n_layers = c_n_layer, 
    batch_size = batch_size, 
    block_size=block_size, 
    state_dim=state_dim, 
    action_dim=action_dim, 
    a_embedding_dim=a_embedding_dim,
    c_embedding_dim=c_embedding_dim,
    a_n_heads=a_num_heads, 
    c_n_heads=c_num_heads,
    dropout=dropout, 
    action_min=action_min, 
    action_max=action_max, 
    memory_size=memory_size, 
    gamma=gamma, 
    a_lr=a_learning_rate, 
    c_lr=c_learning_rate, 
    tau=tau, 
    epsilon=epsilon, 
    epsilon_decay=epsilon_decay, 
    epsilon_min=epsilon_min,
    position_dim=position_dim,
)

  logger.warn(


In [70]:
num_episodes = 5000
step_per_train = 10
verbose = True
verbose_num = 10
act_after_batch = True

scores, steps = agent.train(
    env=env, 
    num_episodes=num_episodes,
    step_per_train=step_per_train,
    verbose=verbose, 
    verbose_num=verbose_num,  
    act_after_batch=act_after_batch,
)
env.close()

BEGIN

Episode: 1	 Step: 19	Reward: -8.162201341488432                                                      tf.Tensor(1.7861376, shape=(), dtype=float32)
Episode: 1	 Step: 29	Reward: 8.008537579551858                                                       tf.Tensor(12.110185, shape=(), dtype=float32)
Episode: 1	 Step: 39	Reward: -5.3936105993790875                                                     tf.Tensor(19.121191, shape=(), dtype=float32)
Episode: 1	 Step: 49	Reward: -14.376468916995094                                                     tf.Tensor(19.901138, shape=(), dtype=float32)
Episode: 1	 Step: 59	Reward: -25.105435869162182                                                     tf.Tensor(18.692211, shape=(), dtype=float32)
Episode: 1	 Step: 69	Reward: -2.1131489593504766                                                     tf.Tensor(14.969699, shape=(), dtype=float32)
Episode: 1	 Step: 79	Reward: 18.48317133915806tf.Tensor(17.02842, shape=(), dtype=float32)           
Episode: 

KeyboardInterrupt: 

In [None]:
env = gym.make("LunarLander-v2", continuous=True)
agent.act(env)
env.close()

In [None]:
import matplotlib.pyplot as plt
plt.plot(scores, np.arange(len(scores)))

NameError: name 'scores' is not defined

In [None]:
env = gym.make("LunarLander-v2", continuous=True)
step = 1
done = False
states = env.reset().reshape(1, 1, -1)
positions = agent.int_to_bin(step).reshape(1, 1, -1)

while not done:
    env.render()
    # Generating actions for a given group of states 
    # gets the action
    action = agent.actor.target_predict(states, positions)
    # Gets the last action only
    action = action[0, -1, :]
    # clip the action to be between min and max values
    action = np.clip(action, a_min=action_min, a_max=action_max)
    action[np.isnan(action)] = 0
    # Apply the action in the environment
    new_state, reward, done, info = env.step(action)
    step += 1
    # Append the new state to the states history
    states = tf.concat((states, new_state.reshape(1, 1, -1)), axis=1)
    positions = tf.concat((positions, agent.int_to_bin(step).reshape(1, 1, -1)), axis=1)

env.close()