In [53]:
import tensorflow as tf
from tensorflow.keras import layers, initializers, models, optimizers
from scipy.special import softmax

import numpy as np

import gym

tf.random.set_seed(42)
np.random.seed(42)

# Objective: Create a DDPG algorithm with a GPT as the Actor network.   

In [54]:
#Ornstein-Uhlenbeck Noise 
class OUActionNoise(object):
    """Ornstein-Uhlenbeck process used as temporally correlated exploration noise.

    Args:
        mean: Array the process reverts to; its shape also fixes the noise shape.
        sigma: Scale of the Gaussian perturbation added at every step.
        theta: Mean-reversion rate.
        dt: Discretisation time step.
        x0: Optional starting value; zeros shaped like ``mean`` are used when omitted.
    """
    def __init__(self, mean, sigma=0.5, theta=0.2, dt=0.4, x0=None):
        self.mean, self.sigma, self.theta = mean, sigma, theta
        self.dt, self.x0 = dt, x0
        self.reset()

    #--------------------------------------------------------------------------------
    def __call__(self):
        """Advance the process by one step and return the new sample."""
        # Pull towards the mean, proportional to the current distance from it
        drift = self.theta * (self.mean - self.x_prev) * self.dt
        # Gaussian kick scaled by sqrt(dt)
        diffusion = self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mean.shape)
        self.x_prev = self.x_prev + drift + diffusion

        return self.x_prev

    #--------------------------------------------------------------------------------
    def reset(self):
        """Restart the process at ``x0`` (or at zeros when no ``x0`` was given)."""
        if self.x0 is None:
            self.x_prev = np.zeros_like(self.mean)
        else:
            self.x_prev = self.x0
        return

In [55]:
%%script false --no-raise-error
# Disabled scratch cell: the `%%script false` magic hands this cell body to the
# `false` command instead of the Python kernel, so none of the lines below run.

a = np.zeros(4)
b = OUActionNoise(a)
a += b()
a

In [56]:
#Replay Buffer 
class ReplayBuffer(object):
    """Episode-organised replay memory.

    ``self.buffer`` is a list of episodes and every episode is a list of steps.
    The last entry is always the episode currently being recorded: a new empty
    episode is opened as soon as a step whose element at index 4 (the done flag)
    is truthy has been appended.
    """
    def __init__(self, size, batch_size):
        '''
        Args:
            size (integer): Maximum number of episodes kept in the buffer.
            batch_size (integer): Number of episodes returned by ``getEpisodes``.
        '''
        self.buffer = [[]]
        self.batch_size = batch_size
        self.max_size = size

    #--------------------------------------------------
    def append(self, steps):
        '''
        Adds a sequence of steps to the episode being recorded.

        Args:
            steps: Sequence of steps; ``steps[-1][4]`` is read as the done flag.
        '''
        # Nothing to record (also avoids indexing steps[-1] on empty input)
        if len(steps) == 0: return
        # Drop the oldest episode once the buffer is full
        if self.size >= self.max_size: del self.buffer[0]
        # Keep the "last entry is the open episode" invariant even when max_size is 1
        if not self.buffer: self.buffer.append([])
        for step in steps: self.buffer[-1].append(step)
        # if done create new episode entry
        if (steps[-1][4]): self.buffer.append([])

    #--------------------------------------------------
    def clear(self):
        '''
        Empties the buffer in place and re-opens a blank episode, so that
        ``append`` keeps working after a clear.
        '''
        self.buffer.clear()
        self.buffer.append([])

    #--------------------------------------------------
    def getEpisodes(self):
        '''
        Samples ``batch_size`` finished episodes with replacement.

        The open (last) episode is never sampled. Newer episodes get a slightly
        higher probability: the softmax of values rising linearly from
        ``1 - prob_diff`` (oldest) towards 1 (newest).

        Returns:
            List with the sampled episodes.

        Raises:
            ValueError: If the buffer holds no finished episode yet.
        '''
        finished = self.size - 1
        if finished < 1:
            raise ValueError("The replay buffer holds no finished episode to sample from.")

        prob_diff = 1e-2
        probs = softmax(np.linspace(1 - prob_diff, 1, num=finished, endpoint=False))
        episodes = np.random.choice(finished, size=(self.batch_size,), replace=True, p=probs)
        return [self.buffer[episode] for episode in episodes]

    #--------------------------------------------------
    @property
    def size(self):
        '''
        Returns:
            Number of episodes in the buffer (including the open one)
        '''
        return len(self.buffer)

    #--------------------------------------------------
    @property
    def hasMinLength(self):
        '''
        Returns:
            Boolean indicating if the memory holds at least 8 episode entries
        '''
        return (self.size >= 8)

    #--------------------------------------------------
    @property
    def data(self):
        '''
        Returns:
            List with all the episodes in the buffer
        '''
        return self.buffer

In [57]:
def gpt_kernel_initializer():
    """Return a fresh RandomNormal(mean=0.0, stddev=0.05) kernel initializer."""
    return initializers.RandomNormal(mean=0.0, stddev=0.05)


def gpt_bias_initializer():
    """Return a fresh all-zeros bias initializer."""
    return initializers.Zeros()

In [58]:
# Individual Head of self-attention
class Head(layers.Layer):
    """ One causal head of self-attention.

    The attention pattern is computed from the position embeddings only: the key
    projection is compared against itself (there is no separate query projection).
    The resulting weights mix the value projection of the state embeddings.

    Args:
        batch_size: Fallback batch size used when the static batch dimension is None.
        block_size: Fallback sequence length used when the static time dimension is None.
        state_dim: Width of the incoming embeddings.
        head_size: Output width of the key and value projections.
        dropout: Dropout rate applied to the attention weights.
    """
    def __init__(self, batch_size, block_size, state_dim, head_size, dropout):
        super().__init__()
        self.batch_size = batch_size
        self.block_size = block_size
        self.state_dim = state_dim
        self.head_size = head_size

        self.ln_st = layers.LayerNormalization()
        self.ln_pos = layers.LayerNormalization()
        # key, query and value layers
        self.key = layers.Dense(units=head_size, use_bias=False, kernel_initializer=gpt_kernel_initializer())
        #self.query = layers.Dense(units=head_size, use_bias=False, kernel_initializer=gpt_kernel_initializer())
        self.value = layers.Dense(units=head_size, use_bias=False, kernel_initializer=gpt_kernel_initializer())
        # dropout layer
        self.dropout = layers.Dropout(dropout)

    #--------------------------------------------------
    def call(self, inp, training=False):
        """
        Args:
            inp: Pair ``(state_embeddings, position_embeddings)``; the state
                embeddings must be rank 3 (unpacked as B, T, C).
            training: Forwarded to the dropout layer.

        Returns:
            Tensor with the attention-weighted values, last dimension ``head_size``.
        """
        x, pos_emb = inp[0], inp[1]
        B, T, _ = x.shape
        # Static dimensions can be None; fall back to the configured sizes for the mask shape
        if(B is None): B = self.batch_size
        if(T is None): T = self.block_size
        pos = self.ln_pos(pos_emb)
        k = self.key(pos)   # (B,T,H) with H = head_size
        #q = self.query(pos) # (B,T,H)
        # compute attention scores ("affinities"). The scores are dot products of
        # head_size-dimensional keys, so they are normalised by head_size**-0.5
        # (not by the embedding width, which differs when there are several heads).
        wei =  tf.matmul(k, tf.transpose(k, perm=[0, 2, 1]))  * tf.math.rsqrt(tf.cast(self.head_size, tf.float32)) # (B, T, H) @ (B, H, T) -> (B, T, T)
        # causal mask: positions above the diagonal are set to -inf before the softmax
        wei = tf.where(tf.linalg.band_part(tf.ones((T, T)), -1, 0) == 0, tf.constant(float("-inf"), shape=(B, T, T)), wei) # (B, T, T)
        wei = tf.nn.softmax(wei, axis=-1) # (B, T, T)
        wei = self.dropout(wei, training=training)
        # perform the weighted aggregation of the values
        x = self.ln_st(x)
        v = self.value(x) # (B,T,H)
        out = tf.matmul(wei, v) # (B, T, T) @ (B, T, H) -> (B, T, H)
        return out

    def build(self, input_shape):
        """Build both layer norms from the ``(state_shape, position_shape)`` pair."""
        st_emb, pos_emb = input_shape
        self.ln_st.build(st_emb)
        self.ln_pos.build(pos_emb)
        super(Head, self).build((2, None, None, None))

In [59]:
# Layer with multiple self-attention Heads for data communication 
class MultiHeadAttention(layers.Layer):
    """ Several self-attention heads run side by side, merged by a linear projection """
    def __init__(self, batch_size, block_size, state_dim, num_heads, head_size, dropout):
        super().__init__()

        # Every head is built up-front with the same (state, position) embedding shapes
        emb_shape = (None, None, state_dim)
        self.heads = []
        for _ in range(num_heads):
            self.heads.append(self._make_head(batch_size, block_size, state_dim, head_size, dropout, emb_shape))

        # linear layer that 'merges' what the individual heads extracted
        self.proj = layers.Dense(units=state_dim, kernel_initializer=gpt_kernel_initializer(), bias_initializer=gpt_bias_initializer())
        self.dropout = layers.Dropout(dropout)

    #--------------------------------------------------
    @staticmethod
    def _make_head(batch_size, block_size, state_dim, head_size, dropout, emb_shape):
        """Create one attention head and build it for the given embedding shape."""
        new_head = Head(batch_size, block_size, state_dim, head_size, dropout)
        new_head.build((emb_shape, emb_shape))
        return new_head

    #--------------------------------------------------
    def call(self, inp, training=False):
        """Run every head on ``inp`` and project the concatenated outputs."""
        per_head = []
        for head in self.heads:
            per_head.append(head(inp))
        # heads are joined along the channel (last) dimension
        merged = tf.concat(per_head, axis=-1)
        projected = self.proj(merged)
        return self.dropout(projected)

    def build(self, input_shape):
        """The heads are already built in ``__init__``; only mark this layer as built."""
        super(MultiHeadAttention, self).build((2, None, None, None))

In [60]:
#Simple feed forward for data computation
class FeedForward(layers.Layer):
    """Position-wise feed-forward stack.

    Args:
        state_dim: Width the optional resize layer maps back to.
        dropout: Dropout rate used after every Dense layer.
        last_resize: When True, a Dense(state_dim) + Dropout pair closes the stack.
        spread_dim: Width of the first Dense layer; ``4 * state_dim`` when None.
    """
    def __init__(self, state_dim, dropout, last_resize=True, spread_dim=None):
        super().__init__()
        # The optional resize pair is instantiated first, matching the layer creation order
        if last_resize:
            resize_layers = [
                layers.Dense(state_dim, kernel_initializer=gpt_kernel_initializer(), bias_initializer=gpt_bias_initializer()),
                layers.Dropout(dropout),
            ]
        else:
            resize_layers = []

        hidden_units = 4 * state_dim if spread_dim is None else spread_dim
        spread_layers = [
            layers.Dense(hidden_units, kernel_initializer=gpt_kernel_initializer(), bias_initializer=gpt_bias_initializer()),
            layers.Dropout(dropout),
        ]

        self.net = models.Sequential(spread_layers + resize_layers)

    #--------------------------------------------------
    def call(self, x):
        """Apply the stack to ``x``."""
        result = self.net(x)
        return result

In [61]:
# Block containing a multi head attention module and a feed forward linear computation
class Block(layers.Layer):
    """ Transformer block: attention (communication) followed by a feed-forward stack (computation) """
    def __init__(self, batch_size, block_size, emb_dim, num_heads, dropout, last_resize, spread_dim):
        super().__init__()
        self.last_resize = last_resize
        # the embedding is split evenly between the heads so each one can learn different relations
        self.sa = MultiHeadAttention(batch_size, block_size, emb_dim, num_heads, emb_dim // num_heads, dropout)
        self.ffwd = FeedForward(emb_dim, dropout, last_resize, spread_dim)

    #--------------------------------------------------
    def call(self, inp, training=False):
        """
        Args:
            inp: Pair ``(state_embeddings, position_embeddings)``.

        Returns:
            The block output; the feed-forward residual connection is only used
            when ``last_resize`` is True.
        """
        st_emb, pos_emb = inp[0], inp[1]
        # residual sum of both embeddings plus the attention output
        attended = st_emb + pos_emb + self.sa([st_emb, pos_emb])
        transformed = self.ffwd(attended)
        if self.last_resize:
            return attended + transformed
        return transformed

    def build(self, input_shape):
        """Build the sub-layers from the ``(state_shape, position_shape)`` pair."""
        st_shape, _pos_shape = input_shape
        self.sa.build(input_shape)
        self.ffwd.build(st_shape)
        super(Block, self).build((2, None, None, None))

In [62]:
def f_value():
    """Return a fresh RandomNormal(mean=0.0, stddev=0.05) initializer for the output heads."""
    return initializers.RandomNormal(mean=0.0, stddev=0.05)

In [63]:
class GPTModel(models.Model):
    """GPT-style sequence model over (state, position) inputs.

    Args:
        n_layer: Number of transformer blocks.
        batch_size: Forwarded to every block.
        block_size: Maximum number of time steps fed to the model by ``generate``.
        embedding_dim: Width of the state and position embeddings.
        out_dim: Width of the last block's feed-forward layer.
        num_heads: Attention heads per block.
        dropout: Dropout rate forwarded to every block.
        ffw: Model applied to the output of the last block.
    """
    def __init__(self, n_layer, batch_size, block_size, embedding_dim, out_dim, num_heads, dropout, ffw):
        super().__init__()
        self.block_size = block_size
        self.state_embedding = layers.Dense(units=embedding_dim, kernel_initializer=gpt_kernel_initializer(), bias_initializer=gpt_bias_initializer())
        self.position_embedding = layers.Dense(units=embedding_dim, kernel_initializer=gpt_kernel_initializer(), bias_initializer=gpt_bias_initializer())
        self.embedding_dim = embedding_dim

        emb_shape = (None, None, embedding_dim)
        final_idx = n_layer - 1
        self.blocks = []
        for idx in range(n_layer):
            is_final = (idx == final_idx)
            # only the final block skips the resize layer and uses the custom width
            new_block = Block(batch_size, block_size, embedding_dim, num_heads, dropout,
                last_resize = not is_final,
                spread_dim = out_dim if is_final else None,
            )
            new_block.build((emb_shape, emb_shape))
            self.blocks.append(new_block)

        self.ffw = ffw

    #--------------------------------------------------
    def call(self, inp, training=False):
        """Embed both inputs, run the blocks in order and apply the output head."""
        inputs, positions = inp[0], inp[1]
        # the states input is expected to be rank 3; unpacking fails otherwise
        _batch, _steps, _features = inputs.shape
        pos_emb = self.position_embedding(positions)
        hidden = self.state_embedding(inputs)
        for block in self.blocks:
            hidden = block((hidden, pos_emb))
        return self.ffw(hidden)

    #--------------------------------------------------
    def generate(self, states, positions):
        """Run the model on the last ``block_size`` time steps and return the output for all of them."""
        # keep only the most recent block_size steps of both sequences
        recent = slice(-self.block_size, None)
        return self([states[:, recent, :], positions[:, recent, :]])

    #--------------------------------------------------
    def build(self, input_shape):
        """Build both embedding layers from the ``(states_shape, positions_shape)`` pair."""
        states_shape, positions_shape = input_shape
        self.position_embedding.build(positions_shape)
        self.state_embedding.build(states_shape)
        super(GPTModel, self).build((2, None, None, None))

In [64]:
class Actor(object):
    """DDPG actor: an online GPT policy network plus its soft-updated target copy.

    Args:
        n_layer: Number of transformer blocks.
        batch_size: Forwarded to the GPT model.
        block_size: Maximum number of time steps fed to the model.
        state_dim: Size of one state vector.
        action_dim: Size of one action vector.
        position_dim: Size of one position encoding.
        embedding_dim: Width of the GPT embeddings.
        num_heads: Attention heads per block.
        dropout: Dropout rate.
        action_range: Factor the tanh output is multiplied by.
        lr: Learning rate of the Adam optimizer.
        tau: Interpolation factor of the soft target update.
    """
    def __init__(self, n_layer, batch_size, block_size, state_dim, action_dim, position_dim, embedding_dim, num_heads, dropout, action_range, lr, tau):
        #Network dimensions
        self.inp_dim = state_dim
        self.action_dim = action_dim
        # Factory for the output head; called once per network so they do not share layers
        ffw = lambda: models.Sequential([
            layers.Dense(32,  activation='relu', kernel_initializer=f_value(), bias_initializer=f_value()),
            layers.BatchNormalization(),
            layers.Dropout(dropout),
            layers.Dense(action_dim, activation='tanh', kernel_initializer=f_value(), bias_initializer=f_value()),
            layers.Lambda(lambda i: i * action_range, dtype='float64'),
        ])

        #Parameter that coordinates the soft updates on the target weights
        self.tau = tau

        #Generates the optimization function - used in the agent to generate gradients
        self.optimizer = optimizers.Adam(lr)

        #Generates the actor model
        self.model = GPTModel(
            n_layer=n_layer,
            batch_size=batch_size,
            block_size=block_size,
            embedding_dim=embedding_dim,
            out_dim=128,
            num_heads=num_heads,
            dropout=dropout,
            ffw = ffw(),
        )
        self.model.build(((None, None, state_dim), (None, None, position_dim)))

        #Generates the actor target model
        self.target_model = GPTModel(
            n_layer=n_layer,
            batch_size=batch_size,
            block_size=block_size,
            embedding_dim=embedding_dim,
            out_dim=128,
            num_heads=num_heads,
            dropout=dropout,
            ffw = ffw(),
        )
        self.target_model.build(((None, None, state_dim), (None, None, position_dim)))

        #Set the weights to be the same in the begining
        self.target_model.set_weights(self.model.get_weights())

    #--------------------------------------------------------------------
    def predict(self, states, positions):
        """Actions of the online network for every time step of the cropped sequence."""
        return self.model.generate(states, positions)

    #--------------------------------------------------------------------
    def target_predict(self, states, positions):
        """Actions of the target network for every time step of the cropped sequence."""
        return self.target_model.generate(states, positions)

    #--------------------------------------------------------------------
    def act(self, states, positions):
        """Action of the online network for the last time step of the first batch entry."""
        action = self.predict(states, positions)
        # Gets the last action only
        action = action[0, -1, :]
        return action

    #--------------------------------------------------------------------
    def transferWeights(self):
        """Soft update: target <- tau * online + (1 - tau) * target."""
        weights = self.model.get_weights()
        target_weights = self.target_model.get_weights()
        new_weights = [
            (self.tau * online) + ((1.0 - self.tau) * target)
            for online, target in zip(weights, target_weights)
        ]

        self.target_model.set_weights(new_weights)

    #--------------------------------------------------------------------
    def saveModel(self, path):
        """Save the weights of both networks.

        Only the weights are written: ``GPTModel`` is a subclassed Keras model and
        whole-model HDF5 saving (``Model.save`` to ``.h5``) is not supported for
        subclassed models.

        Args:
            path: Prefix; writes ``<path>_actor_model.h5`` and ``<path>_actor_target_model.h5``.
        """
        # NOTE(review): newer Keras releases may require a '.weights.h5' suffix - confirm against the installed version
        self.model.save_weights(path + '_actor_model.h5')
        self.target_model.save_weights(path + '_actor_target_model.h5')

    #--------------------------------------------------------------------
    def loadModel(self, path, target_path=None):
        """Load previously saved weights into the already-built networks.

        Args:
            path: Weights file for the online network.
            target_path: Weights file for the target network; when None the
                target network is loaded from ``path`` as well.
        """
        self.target_model.load_weights(path if target_path is None else target_path)
        self.model.load_weights(path)

In [65]:
class Critic(object):
    """DDPG critic: an online GPT value network plus its soft-updated target copy.

    The networks receive the states and actions concatenated along the last axis.

    Args:
        n_layer: Number of transformer blocks.
        batch_size: Forwarded to the GPT model.
        block_size: Maximum number of time steps fed to the model.
        state_dim: Size of one state vector.
        action_dim: Size of one action vector.
        position_dim: Size of one position encoding.
        embedding_dim: Width of the GPT embeddings.
        out_dim: Width of the final (linear) output layer.
        num_heads: Attention heads per block.
        dropout: Dropout rate.
        lr: Learning rate of the Adam optimizer.
        tau: Interpolation factor of the soft target update.
    """
    def __init__(self, n_layer, batch_size, block_size, state_dim, action_dim, position_dim, embedding_dim, out_dim, num_heads, dropout, lr, tau):
        #Network dimensions
        self.inp_dim = state_dim + action_dim
        # Factory for the output head; called once per network so they do not share layers
        ffw = lambda: models.Sequential([
                layers.Dense(32,  activation='relu', kernel_initializer=f_value(), bias_initializer=f_value()),
                layers.BatchNormalization(),
                layers.Dropout(dropout),
                layers.Dense(out_dim, activation='linear', kernel_initializer=f_value(), bias_initializer=f_value()),
            ])

        #Parameter that coordinates the soft updates on the target weights
        self.tau = tau

        #Generates the optimization function - used in the agent to generate gradients
        self.optimizer = optimizers.Adam(lr)

        #Generates the critic model
        self.model = GPTModel(
            n_layer=n_layer,
            batch_size=batch_size,
            block_size=block_size,
            embedding_dim=embedding_dim,
            out_dim=64,
            num_heads=num_heads,
            dropout=dropout,
            ffw = ffw(),
        )
        self.model.build(((None, None, self.inp_dim), (None, None, position_dim)))

        #Generates the critic target model
        self.target_model = GPTModel(
            n_layer=n_layer,
            batch_size=batch_size,
            block_size=block_size,
            embedding_dim=embedding_dim,
            out_dim=64,
            num_heads=num_heads,
            dropout=dropout,
            ffw = ffw(),
        )
        self.target_model.build(((None, None, self.inp_dim), (None, None, position_dim)))

        #Set the weights to be the same in the begining
        self.target_model.set_weights(self.model.get_weights())

    #--------------------------------------------------------------------
    @staticmethod
    def _join_inputs(states, actions, positions):
        """Cast everything to float32 and concatenate states and actions on axis 2."""
        states = tf.cast(states, tf.float32)
        actions = tf.cast(actions, tf.float32)
        positions = tf.cast(positions, tf.float32)
        return tf.concat([states, actions], 2), positions

    #--------------------------------------------------------------------
    def predict(self, states, actions, positions):
        """Values of the online network for every time step of the cropped sequence."""
        inp, positions = self._join_inputs(states, actions, positions)
        return self.model.generate(inp, positions)

    #--------------------------------------------------------------------
    def target_predict(self, states, actions, positions):
        """Values of the target network for every time step of the cropped sequence."""
        inp, positions = self._join_inputs(states, actions, positions)
        return self.target_model.generate(inp, positions)

    #--------------------------------------------------------------------
    def transferWeights(self):
        """Soft update: target <- tau * online + (1 - tau) * target."""
        weights = self.model.get_weights()
        target_weights = self.target_model.get_weights()
        new_weights = [
            (self.tau * online) + ((1.0 - self.tau) * target)
            for online, target in zip(weights, target_weights)
        ]

        self.target_model.set_weights(new_weights)

    #--------------------------------------------------------------------
    def saveModel(self, path):
        """Save the weights of both networks.

        Only the weights are written: ``GPTModel`` is a subclassed Keras model and
        whole-model HDF5 saving (``Model.save`` to ``.h5``) is not supported for
        subclassed models.

        Args:
            path: Prefix; writes ``<path>_critic_model.h5`` and ``<path>_critic_target_model.h5``.
        """
        # NOTE(review): newer Keras releases may require a '.weights.h5' suffix - confirm against the installed version
        self.model.save_weights(path + '_critic_model.h5')
        self.target_model.save_weights(path + '_critic_target_model.h5')

    #--------------------------------------------------------------------
    def loadModel(self, path, target_path=None):
        """Load previously saved weights into the already-built networks.

        Args:
            path: Weights file for the online network.
            target_path: Weights file for the target network; when None the
                target network is loaded from ``path`` as well.
        """
        self.target_model.load_weights(path if target_path is None else target_path)
        self.model.load_weights(path)

In [66]:
def calc_closest_factors(c: int):
    """Split ``c`` into the two factors that are closest to each other.

    Args:
        c: Integer (or integral float) to factorise.

    Returns:
        ``[small, large]`` with ``small * large == c`` and ``small <= large``, where
        ``small`` is the largest divisor of ``c`` not exceeding ``sqrt(c)``.
        For ``c <= 1`` the pair ``[c, 1]`` is returned.

    Raises:
        TypeError: If ``c`` is not an integral value.
    """
    if c//1 != c: raise TypeError("c must be an integer.")
    # Nothing to split below 2
    if c <= 1: return [c, 1]

    n = int(c)
    # floor(sqrt(n)), corrected for floating point rounding
    root = int(n ** 0.5)
    while root * root > n: root -= 1
    while (root + 1) * (root + 1) <= n: root += 1

    # Walk down from the square root: the first divisor found is the closest pair.
    # This needs at most sqrt(n) iterations instead of scanning up to n.
    for small in range(root, 0, -1):
        if n % small == 0:
            return [small, n // small]

In [67]:
class DDPG_GPT_Agent(object):
    """DDPG agent whose actor and critic are GPT-style sequence models.

    Transitions are stored per episode in a ``ReplayBuffer``; training samples
    whole episodes and cuts them into windows of ``block_size`` consecutive steps.
    ``batch_size`` is split by ``calc_closest_factors`` into the number of episodes
    sampled per update and the number of windows taken from each episode.
    """
    def __init__(self, a_n_layers, c_n_layers, batch_size, block_size, state_dim, action_dim, a_n_heads, c_n_heads,
        dropout, action_min, action_max, memory_size, gamma, a_lr, c_lr, tau, epsilon, epsilon_decay,
        epsilon_min, position_dim, a_embedding_dim, c_embedding_dim, gamma_grow, gamma_max
        ):

        self.block_size = block_size
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.action_min = action_min
        self.action_max = action_max
        self.gamma_grow = gamma_grow
        self.gamma_max = gamma_max
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.position_dim = position_dim

        self.episode_batch_size, self.steps_batch_size = calc_closest_factors(batch_size)

        #Creates the Replay Buffer
        self.memory = ReplayBuffer(memory_size, self.episode_batch_size)

        #Creates the noise generator
        self.ou_noise = OUActionNoise(mean=np.zeros(action_dim))

        #Creates the actor
        self.actor = Actor(
            n_layer=a_n_layers,
            batch_size=batch_size,
            block_size=block_size,
            state_dim=state_dim,
            action_dim=action_dim,
            position_dim=position_dim,
            embedding_dim=a_embedding_dim,
            num_heads=a_n_heads,
            dropout=dropout,
            action_range=action_max,
            lr=a_lr,
            tau=tau,
        )

        #Creates the critic
        self.critic = Critic(
            n_layer=c_n_layers,
            batch_size=batch_size,
            block_size=block_size,
            state_dim=state_dim,
            action_dim=action_dim,
            position_dim=position_dim,
            embedding_dim=c_embedding_dim,
            out_dim=1,
            num_heads=c_n_heads,
            dropout=dropout,
            lr=c_lr,
            tau=tau,
        )

    #--------------------------------------------------------------------
    def act(self, env):
        """ Runs and renders one episode without exploration noise """
        step = 1
        done = False

        states = env.reset().reshape(1, 1, -1)
        positions = self.int_to_bin(step).reshape(1, 1, -1)

        while not done:
            env.render()
            # Generating actions for a given group of states
            action = self.policy(states, positions, explore=False)
            # Apply the action in the environment
            new_state, reward, done, info = env.step(action)
            step += 1
            # Append the new state to the states history
            states = tf.concat((states, new_state.reshape(1, 1, -1)), axis=1)
            positions = tf.concat((positions, self.int_to_bin(step).reshape(1, 1, -1)), axis=1)

        return

    #--------------------------------------------------------------------
    def policy(self, states, positions, explore=True):
        """ Generates an action from a group of states and add exploration """
        # gets the action
        action = self.actor.act(states, positions)
        # takes the exploration with the epsilon probability
        if explore and np.random.rand() < self.epsilon: action += self.ou_noise()
        # clip the action to be between min and max values
        action = np.clip(action, a_min=self.action_min, a_max=self.action_max)
        # NaN entries are replaced by a zero action component
        action[np.isnan(action)] = 0

        return action

    #--------------------------------------------------------------------
    def int_to_bin(self, num):
        """ Encodes ``num`` as its Gray code bits (least significant bit first) in a
        float array of length ``position_dim``. Indexing fails when the code needs
        more than ``position_dim`` bits. """
        gray = bin((num>>1) ^ num)[2:][::-1]
        bin_r = np.array([b for b in gray])
        bin_arr = np.zeros(self.position_dim)
        bin_arr[np.where(bin_r == '1')] = 1
        return bin_arr

    #--------------------------------------------------------------------
    def record_memories(self, steps):
        """ Converts raw steps ``[state, action, reward, new_state, step, done]`` into
        ``[state, action, reward, new_state, done, position, next_position]`` rows and
        stores them in the replay buffer """
        # With mem_fix = 1 the averaging branch below can never trigger (rand() < 1)
        mem_fix = 1
        steps = np.array(steps, dtype=object)

        if (len(steps) > 1) and (np.random.rand() >= mem_fix):
            # Takes the weighted average of the numerical data with the weights being the rewards absolute value
            avg = np.average(steps[:, :-1], axis=0, weights=np.absolute(steps[:, 2]))
            # Check if the mean state is a done state - done states are now added separately to avoid information loss
            done = np.any(steps[:, -1])
            # Make the average step into an integer
            avg[4] = np.floor(avg[4])
            # creates the new average steps representation
            steps = np.array([np.concatenate([avg, [done]])], dtype=object)

        step_bin = np.array([self.int_to_bin(int(st)) for st in steps[:, 4]])
        new_step_bin = np.array([self.int_to_bin(int(st + 1)) for st in steps[:, 4]])
        # the integer step column is replaced by the two position encodings appended at the end
        steps = np.delete(steps, 4, 1)
        steps = np.array([[*step, bin_r, n_bin_r] for step, bin_r, n_bin_r in zip(steps, step_bin, new_step_bin)], dtype=object)
        self.memory.append(steps)
        return

    #--------------------------------------------------------------------
    def learn(self, memory_steps):
        """ Append an experience to the memory and replay memory if possible """
        self.record_memories(memory_steps)
        if self.memory.hasMinLength: self.replay_memory()
        return

    #--------------------------------------------------
    def episode_to_batch(self, episode):
        """ Cuts an episode into windows of ``block_size`` consecutive steps.

        Returns:
            Generator yielding one array per step field (in stored field order),
            each indexed as (window, time, ...).
        """
        get_memory_element = lambda i, batch: np.array([[mem[i] for mem in block] for block in batch])

        if len(episode) > (self.block_size + self.steps_batch_size):
            # long episode: random window ends plus the window ending at the last step
            steps_idxs = np.random.choice(np.arange(self.block_size, len(episode)), size=self.steps_batch_size-1, replace=False)
            steps_idxs = np.append(steps_idxs, len(episode))
        else: steps_idxs = np.arange(self.block_size, len(episode))

        batch = np.array([episode[i-self.block_size:i] for i in steps_idxs])
        return (np.array(get_memory_element(el, batch)) for el in range(len(episode[0])))

    #--------------------------------------------------
    def null_step(self, step):
        """ Returns a copy of ``step`` with its reward (index 2) set to zero.

        The copy matters: ``step`` is the object stored in the replay buffer, so
        zeroing it in place would erase the real reward of that transition.
        """
        padded_step = step.copy()
        padded_step[2] = 0
        return padded_step

    #--------------------------------------------------
    def episode_pad(self, episode):
        """ Extends a short episode to ``block_size + 1`` steps with zero-reward copies of its last step """
        return np.concatenate((episode, [self.null_step(episode[-1]) for _ in range(self.block_size - len(episode) + 1)]))

    #--------------------------------------------------
    def get_episodes_batches(self, episodes):
        """ Builds the float32 training tensors from a list of sampled episodes.

        Returns:
            Tuple ``(states, actions, next_states, positions, next_positions, rewards, done)``;
            rewards and done get a trailing axis of size 1.
        """
        states = None
        actions = None
        next_states = None
        positions = None
        next_positions = None
        rewards = None
        done = None

        select_value = lambda final_value, aux_value: aux_value if final_value is None else np.concatenate((final_value, aux_value))
        to_tensor = lambda value: tf.convert_to_tensor(value, dtype='float32')

        for episode in episodes:
            if len(episode) <= self.block_size: episode = self.episode_pad(episode)
            st_aux, ac_aux, rw_aux, nst_aux, dn_aux, ps_aux, nps_aux = self.episode_to_batch(episode)
            states = select_value(states, st_aux)
            actions = select_value(actions, ac_aux)
            next_states = select_value(next_states, nst_aux)
            positions = select_value(positions, ps_aux)
            next_positions = select_value(next_positions, nps_aux)
            rewards = select_value(rewards, rw_aux)
            done = select_value(done, dn_aux)

        states = to_tensor(states)
        actions = to_tensor(actions)
        next_states = to_tensor(next_states)
        positions = to_tensor(positions)
        next_positions = to_tensor(next_positions)
        rewards = to_tensor(tf.expand_dims(tf.cast(rewards, dtype='float32'), axis=-1))
        done = to_tensor(tf.expand_dims(tf.cast(done, dtype='float32'), axis=-1))

        return states, actions, next_states, positions, next_positions, rewards, done

    #--------------------------------------------------------------------
    def replay_memory(self):
        """ Replay a batch of memories """

        # Get sample block from the replay buffer
        episodes = self.memory.getEpisodes()
        states, actions, next_states, positions, next_positions, rewards, done = self.get_episodes_batches(episodes)

        #Train the critic
        with tf.GradientTape() as tape:
            # Compute the actor target actions
            target_actions = self.actor.target_predict(next_states, next_positions)
            # Compute the critic target values - TODO: Use real return instead
            predicted_return = self.critic.target_predict(next_states, target_actions, next_positions)
            # The return for the last block element
            last_return = predicted_return[:, -1, :]

            # Compute the gamma tensor based on the block step
            gamma_values = lambda i: tf.expand_dims(tf.repeat([[self.gamma**(k - i) for k in range(i, rewards.shape[1])]], repeats=rewards.shape[0], axis=0), axis=-1)
            # Compute the gamma weighted reward for a given block step
            weighted_next_rewards = lambda i: tf.math.reduce_sum(rewards[:, i:, :] * gamma_values(i), axis=1)
            # The gamma weight for the last return bootstrap
            last_return_weight = lambda i: self.gamma ** (rewards.shape[1] - i)
            # Compute the done value for a block step
            state_done = lambda i: 1 - done[:, i, :]

            # Compute the return target values
            y = tf.stack([(weighted_next_rewards(i) + (last_return_weight(i) * last_return * state_done(i))) for i in range(rewards.shape[1])], axis=1)
            #y = rewards + self.gamma * self.critic.target_predict(next_states, target_actions, next_positions) * (1 - done) # y = (B, T, 1)

            # Predict the expected reward associated with taking the target predicted action in the state
            critic_value = self.critic.predict(states, actions, positions)
            # Compute the critic loss
            critic_loss = tf.math.reduce_mean(tf.math.square(y - critic_value))

        critic_grad = tape.gradient(critic_loss, self.critic.model.trainable_variables)
        self.critic.optimizer.apply_gradients(zip(critic_grad, self.critic.model.trainable_variables))

        #Train the actor
        with tf.GradientTape() as tape:
            acts = self.actor.predict(states, positions)
            critic_grads = self.critic.predict(states, acts, positions)
            #Used -mean as we want to maximize the value given by the critic for our actions
            actor_loss = -tf.math.reduce_mean(critic_grads)

        actor_grad = tape.gradient(actor_loss, self.actor.model.trainable_variables)
        self.actor.optimizer.apply_gradients(zip(actor_grad, self.actor.model.trainable_variables))

        #Update the model weights
        self.actor.transferWeights()
        self.critic.transferWeights()

    #--------------------------------------------------
    def print_data(self, verbose, episode, step, score):
        """ Rewrites the current console line with the running episode statistics """
        if verbose:
            print("\r                                                                                                     ", end="")
            print("\rEpisode: "+str(episode+1)+"\t Step: "+str(step)+"\tReward: "+str(score) ,end="")
        return

    #--------------------------------------------------------------------
    def train(self, env, num_episodes, step_per_train, verbose, verbose_num, end_on_complete=False, complete_num=1, complete_value=float('inf'), act_after_batch=False):
        """ Trains the agent on ``env``.

        Args:
            env: Environment whose ``reset()`` returns the observation and whose
                ``step()`` returns ``(state, reward, done, info)``.
            num_episodes: Maximum number of episodes to run.
            step_per_train: Transitions are collected until the step counter is a
                multiple of this value, then one learning update runs. Must be >= 2.
            verbose: Print per-step progress.
            verbose_num: Print a summary every ``verbose_num`` episodes.
            end_on_complete: Stop once ``complete_num`` episodes reached ``complete_value``.
            complete_num: Number of completed episodes required to stop early.
            complete_value: Score from which an episode counts as completed.
            act_after_batch: Render one episode after every summary.

        Returns:
            Tuple ``(scores_history, steps_history)`` with one entry per episode.

        Raises:
            ValueError: If ``step_per_train`` is smaller than 2.
        """
        # With step_per_train == 1 the collection loop below never runs (step % 1 is
        # always 0), so no environment step is taken and the episode never ends.
        if step_per_train < 2:
            raise ValueError("step_per_train must be at least 2.")

        scores_history = []
        steps_history = []
        complete = 0
        print("BEGIN\n")

        for episode in range(num_episodes):
            done = False
            score, step = 0, 1
            state = env.reset()
            states = state.reshape(1, 1, -1)
            positions = self.int_to_bin(step).reshape(1, 1, -1)

            while not done:
                done_step = []
                memory_steps = []

                while not done and (step % step_per_train != 0):
                    action = self.policy(states, positions)
                    self.print_data(verbose, episode, step, score)
                    new_state, reward, done, _ = env.step(action)
                    memory_steps.append([state, action, reward, new_state, step, int(done)])
                    state = new_state
                    step += 1
                    states = tf.concat((states, new_state.reshape(1, 1, -1)), axis=1)
                    positions = tf.concat((positions, self.int_to_bin(step).reshape(1, 1, -1)), axis=1)
                    score += reward

                # NOTE(review): this extra increment moves the counter past the multiple of
                # step_per_train without an environment step, so the recorded step indices
                # are not consecutive and differ from the positions used when acting - confirm intent.
                step += 1
                if len(memory_steps) > 0: self.learn(memory_steps)
                self.epsilon = max(self.epsilon_min, self.epsilon*self.epsilon_decay)
                self.gamma = min(self.gamma_max, self.gamma*self.gamma_grow)

            scores_history.append(score)
            steps_history.append(step)

            #If the score is bigger or equal than the complete score it add one to the completed number
            if(score >= complete_value):
                complete += 1
                #If the flag is true the agent ends the trainig on the firs complete episode
                if end_on_complete and complete >= complete_num: break

            #These information are printed after each verbose_num episodes
            if((episode+1)%verbose_num == 0):
                print("\r                                                                                                          ", end="")
                print("\rEpisodes: ", episode+1, "/", num_episodes,
                      "\n\tTotal reward: ", round(np.mean(scores_history[-verbose_num:]), 2), '+/-', round(np.std(scores_history[-verbose_num:]), 2),
                      "\n\tNum. steps: ", round(np.mean(steps_history[-verbose_num:]), 2), '+/-', round(np.std(steps_history[-verbose_num:]), 2),
                      *["\n\tCompleted: ", complete] if complete_value != float('inf') else '',
                      "\n--------------------------",
                    )

                #If the flag is true the agent act and render the episode after each verbose_num episodes
                if act_after_batch: self.act(env)

                #Set the number of completed episodes on the batch to zero
                complete = 0

        print("\nFINISHED")

        return scores_history, steps_history
    #--------------------------------------------------------------------
    def save(self, path):
        """ Saves the actor and critic networks using ``path`` as file prefix """
        self.actor.saveModel(path)
        self.critic.saveModel(path)

    #--------------------------------------------------------------------
    def load(self, a_path, c_path):
        """ Loads the actor from ``a_path`` and the critic from ``c_path`` """
        self.actor.loadModel(a_path)
        self.critic.loadModel(c_path)

In [68]:
# Environment: continuous LunarLander capped at 500 steps per episode
env = gym.make("LunarLander-v2", continuous=True, max_episode_steps=500)

# Batch / sequence geometry
batch_size, block_size, position_dim = 128, 64, 11

# Dimensions and action bounds taken from the environment
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
action_min, action_max = env.action_space.low, env.action_space.high

dropout = 0.05
memory_size = 300

# Discount factor schedule: starts at gamma, multiplied by gamma_grow up to gamma_max
gamma, gamma_grow, gamma_max = 0.7, 1+3e-3, 0.99

# Exploration schedule: starts at epsilon, multiplied by epsilon_decay down to epsilon_min
epsilon, epsilon_decay, epsilon_min = 1, 0.9999, 0.3

# Soft target update factor
tau =  5e-4

# Actor hyperparameter
a_n_layer, a_num_heads, a_embedding_dim = 1, 1, 4
a_learning_rate = 6e-4

# Critic hyperparameter
c_n_layer, c_num_heads, c_embedding_dim = 1, 1, 8
c_learning_rate = 1e-3

agent_kwargs = {
    "a_n_layers": a_n_layer,
    "c_n_layers": c_n_layer,
    "batch_size": batch_size,
    "block_size": block_size,
    "state_dim": state_dim,
    "action_dim": action_dim,
    "a_embedding_dim": a_embedding_dim,
    "c_embedding_dim": c_embedding_dim,
    "a_n_heads": a_num_heads,
    "c_n_heads": c_num_heads,
    "dropout": dropout,
    "action_min": action_min,
    "action_max": action_max,
    "memory_size": memory_size,
    "gamma": gamma,
    "a_lr": a_learning_rate,
    "c_lr": c_learning_rate,
    "tau": tau,
    "epsilon": epsilon,
    "epsilon_decay": epsilon_decay,
    "epsilon_min": epsilon_min,
    "position_dim": position_dim,
    "gamma_grow": gamma_grow,
    "gamma_max": gamma_max,
}
agent = DDPG_GPT_Agent(**agent_kwargs)

  logger.warn(


In [69]:
# Training run settings
num_episodes, step_per_train = 5000, 2
verbose, verbose_num = True, 10
act_after_batch = True

# Returns the per-episode score and step-count histories
scores, steps = agent.train(
    env,
    num_episodes,
    step_per_train,
    verbose,
    verbose_num,
    act_after_batch=act_after_batch,
)
env.close()

BEGIN

Episodes:  10 / 5000                                                                                      
	Total reward:  -432.16 +/- 216.02 
	Num. steps:  195.8 +/- 96.17 
--------------------------
Episodes:  20 / 5000                                                                                      
	Total reward:  -479.34 +/- 245.41 
	Num. steps:  145.8 +/- 28.53 
--------------------------
Episodes:  30 / 5000                                                                                      
	Total reward:  -436.65 +/- 122.73 
	Num. steps:  132.2 +/- 13.57 
--------------------------
Episodes:  40 / 5000                                                                                      
	Total reward:  -474.15 +/- 153.71 
	Num. steps:  163.0 +/- 34.89 
--------------------------
Episodes:  50 / 5000                                                                                      
	Total reward:  -534.63 +/- 263.44 
	Num. steps:  165.8 +/- 74.37 
---------------

In [None]:
# Render one episode with the trained policy; the environment is closed even if
# the rollout raises, so the render window is not left open.
env = gym.make("LunarLander-v2", continuous=True,  max_episode_steps=500)
try:
    agent.act(env)
finally:
    env.close()

In [None]:
import matplotlib.pyplot as plt

# Learning curve: episode index on the x-axis, episode score on the y-axis
# (the arguments were previously passed in the opposite order).
plt.plot(np.arange(len(scores)), scores)
plt.xlabel("Episode")
plt.ylabel("Total reward")
plt.title("Training score per episode");

: 

In [None]:
# Render one episode driven by the actor's *target* network (no exploration noise).
# The environment is closed in `finally` so the render window is released on errors too.
env = gym.make("LunarLander-v2", continuous=True,  max_episode_steps=500)
try:
    step = 1
    done = False
    states = env.reset().reshape(1, 1, -1)
    positions = agent.int_to_bin(step).reshape(1, 1, -1)

    while not done:
        env.render()
        # Generating actions for a given group of states
        # gets the action
        action = agent.actor.target_predict(states, positions)
        # Gets the last action only
        action = action[0, -1, :]
        # clip the action to be between min and max values
        action = np.clip(action, a_min=action_min, a_max=action_max)
        action[np.isnan(action)] = 0
        # Apply the action in the environment
        new_state, reward, done, info = env.step(action)
        step += 1
        # Append the new state to the states history
        states = tf.concat((states, new_state.reshape(1, 1, -1)), axis=1)
        positions = tf.concat((positions, agent.int_to_bin(step).reshape(1, 1, -1)), axis=1)
finally:
    env.close()

: 