# Required libraries 

In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import gym
import time
import os
import matplotlib.pyplot as plt

from tensorflow import GradientTape, random_uniform_initializer, square, subtract, sqrt
from tensorflow.math import reduce_mean, square
from tensorflow.keras.layers import Input, Dense, Concatenate, Lambda, BatchNormalization, Conv2D, MaxPooling2D, UpSampling2D, Flatten, Dropout, GlobalAveragePooling2D, Reshape, Normalization
from tensorflow.keras.models import Model, Sequential, load_model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import backend as K
from tensorflow.keras.preprocessing .image import smart_resize
from tensorflow.keras.utils import plot_model
from tensorflow.random import set_seed

set_seed(42)
np.random.seed(42)

  import imp
  'nearest': pil_image.NEAREST,
  'bilinear': pil_image.BILINEAR,
  'bicubic': pil_image.BICUBIC,
  if hasattr(pil_image, 'HAMMING'):
  if hasattr(pil_image, 'BOX'):
  if hasattr(pil_image, 'LANCZOS'):


# Action noise generator

In [2]:
# Ornstein-Uhlenbeck Noise 
class OUActionNoise(object):
    def __init__(self, mean, sigma=0.5, theta=0.2, dt=0.1, x0=None):
        self.mean = mean
        self.sigma = sigma
        self.theta = theta
        self.dt = dt
        self.x0 = x0
        self.reset()
    
    #--------------------------------------------------------------------------------
    # Method that enables to write classes where the instances behave like functions and can be called like a function.    
    def __call__(self):
        x = self.x_prev + self.theta * (self.mean - self.x_prev) * self.dt + self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mean.shape)
        self.x_prev = x
        
        return x
    
    #--------------------------------------------------------------------------------
    def reset(self):
        self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mean)

# Replay Buffer

In [3]:
# Replay Buffer 
class ReplayBuffer(object):
    def __init__(self, size, minibatch_size = None):
        '''
        Args:
            size (integer): The size of the replay buffer.              
            minibatch_size (integer): The sample size.
        '''
        self.buffer = []
        self.minibatch_size = minibatch_size
        self.rand_generator = np.random.RandomState()
        self.max_size = size
        
    #--------------------------------------------------------------------------------    
    def append(self, state, action, reward, next_state, sensorial_data, embedding, done):
        '''
        Args:
            state (Numpy array): The state.              
            action (integer): The action.
            reward (float): The reward.
            done (boolen): True if the next state is a terminal state and False otherwise.
                           Is transformed to integer so tha True = 1, False = 0
            next_state (Numpy array): The next state.           
        '''
        # Has a 80% chance of registering the memory
        if self.hasMin and np.random.uniform() > 0.8: return
        if self.size() == self.max_size: del self.buffer[0]
        self.buffer.append([state, action, reward, next_state, sensorial_data, embedding, int(done)])
        
    #--------------------------------------------------------------------------------    
    def sample(self):
        '''
        Returns:
            A list of transition tuples including state, action, reward, terminal, and next_state
        '''
        idxs = self.rand_generator.choice(np.arange(len(self.buffer)), size=self.minibatch_size)
        return [self.buffer[idx] for idx in idxs]
    
    #--------------------------------------------------------------------------------    
    def size(self):
        '''
        Returns:
            Number of elements in the buffer
        '''
        return len(self.buffer)
    
    #--------------------------------------------------------------------------------
    @property
    def hasMin(self):
        '''
        Returns:
            Boolean indicating if the memory have the minimum number of elements or not
        '''
        return (self.size() >= self.minibatch_size)
    
    #--------------------------------------------------------------------------------
    def empties(self):
        self.buffer.clear()
    
    #--------------------------------------------------------------------------------
    def getEpisode(self):
        '''
        Returns:
            List with all the elements in the buffer
        '''
        return self.buffer
        

# Layers creation organizers

In [4]:
class ConvLayerData(object):
    def __init__(self, num_channels, kernel_dim, strides, pooling_dim, droppout):
        self.num_channels = num_channels
        self.kernel_dim = kernel_dim
        self.strides = strides
        self.pooling_dim = pooling_dim
        self.droppout = droppout

        
#==============================================================

class DenseLayerData(object):
    def __init__(self, dim, droppout, normalize=True):
        self.dim = dim
        self.droppout = droppout
        self.normalize = normalize

# Actor

In [5]:
class Actor(object):
    def __init__(self, state_dim, sensory_dim, action_dim, lr, tau):
        
        #Network dimensions
        self.state_dim = state_dim
        self.sensory_dim = sensory_dim
        self.action_dim = action_dim
        
        #Parameter that coordinates the soft updates on the target weights
        self.tau = tau
        
        #Optimizer learning rate
        self.lr = lr
        
        # Encoder layers data
        self.sensory_net_dims = [
            ConvLayerData(8, 2, 2, 0, 0),
            ConvLayerData(16, 3, 2, 2, 0.05),
            ConvLayerData(24, 2, 1, 2, 0),
        ]
        
        self.state_net_dims = [
            DenseLayerData(32, 0.1),
            DenseLayerData(8, 0.1, False),
        ]
        
        self.actor_dims = [
            DenseLayerData(256, 0.05, False),
            DenseLayerData(128, 0.07),
            DenseLayerData(128, 0.07, False),
            DenseLayerData(16, 0),
        ]

        #Generates the optimization function
        self.optimizer = Adam(learning_rate=self.lr)
        
        #Generates the actor model
        self.model = self.buildNetwork()
        
        #Generates the actor target model
        self.target_model = self.buildNetwork()
        
        #Set the weights to be the same in the begining
        self.target_model.set_weights(self.model.get_weights())
    
    #--------------------------------------------------------------------
    
    def sensoryEncoderNet(self):
        return Actor.createSensoryEncoderNet(self.sensory_dim, self.sensory_net_dims)    

    #--------------------------------------------------------------------
    
    def stateNet(self):
        return Actor.createStateNet(self.state_dim, self.state_net_dims)

    #--------------------------------------------------------------------
    
    def buildNetwork(self):
        sensory_encoder = self.sensoryEncoderNet()
        state_net = self.stateNet() 
        
        return Actor.createNetwork(self.sensory_net_dims, sensory_encoder, state_net, self.actor_dims, self.optimizer, self.action_dim)

    #--------------------------------------------------------------------
    
    def predict(self, states, sensory_data):
        return self.model([sensory_data, states], training=False)
    
    #--------------------------------------------------------------------
    
    def target_predict(self, states, sensory_data):
        return self.target_model([sensory_data, states], training=False)
    
    #--------------------------------------------------------------------
    
    @property
    def embeddingDim(self):
        return self.model.output[-1].shape[-1]
        
    #--------------------------------------------------------------------
    
    def transferWeights(self):
        weights = self.model.get_weights()
        target_weights = self.target_model.get_weights()
        new_weights = []
        
        for i in range(len(weights)):
            new_weights.append((self.tau * weights[i]) + ((1.0 - self.tau) * target_weights[i]))
        
        self.target_model.set_weights(new_weights)
        
    #--------------------------------------------------------------------
    
    def saveModel(self, path):
        self.model.save_weights(path + 'actor.h5')
    
    #--------------------------------------------------------------------
    
    def loadModel(self, path):
        self.model.load_weights(path)
        self.target_model.load_weights(path)
        
    # GENERAL STATIC FUNCTIONS FOR SINGLE NETWORK CREATION -------------------------------------------------
    
    def createSensoryEncoderNet(sensory_data_dim, sensory_net_dims):
        inp = Input(shape=(sensory_data_dim[0], sensory_data_dim[1], 3), name='SensoryEnc_Input')
        net = Normalization(axis=None)(inp)

        for i, layer in enumerate(sensory_net_dims):
            dist_lim = 1 / np.sqrt(layer.num_channels*(layer.kernel_dim**2))
            net = Conv2D(
                layer.num_channels, 
                layer.kernel_dim, 
                layer.strides, 
                padding='same', 
                activation='relu', 
                kernel_initializer=random_uniform_initializer(-dist_lim, dist_lim), 
                bias_initializer=random_uniform_initializer(-dist_lim, dist_lim),
                name = 'SensoryEnc_Dense_' + str(i+1),
            )(net)
            if(layer.pooling_dim > 0): net = MaxPooling2D(layer.pooling_dim, strides=2, name = 'SensoryEnc_MaxPooling_' + str(i+1))(net)
            if(layer.droppout > 0): net = Dropout(layer.droppout, name = 'SensoryEnc_Droppout_' + str(i+1))(net)

        net = Flatten(name = 'SensoryEnc_Flatten')(net)

        model = Model(inputs=[inp], outputs=[net])

        return model

    #--------------------------------------------------------------------
    def createSensoryDecoderNet(sensory_net_dims, sensory_enc):
        net = Reshape(target_shape=sensory_enc.layers[-2].output.shape[1:], name='SensoryDec_Reshape')(sensory_enc.output)

        for i, layer in enumerate(reversed(sensory_net_dims)):
            net = Conv2D(
                layer.num_channels, 
                layer.kernel_dim, 
                layer.strides, 
                padding='same', 
                activation='relu',
                name = 'SensoryDec_Dense_' + str(i+1),
            )(net)
            net = UpSampling2D(layer.pooling_dim, name = 'SensoryDec_UpSampling_' + str(i+1))(net)
            if(layer.droppout > 0): net = Dropout(layer.droppout, name = 'SensoryDec_Droppout_' + str(i+1))(net)

        net = Conv2D(3, 3, strides=1, padding='same', activation='relu', name = 'SensoryDec_LastConv')(net)
        net = UpSampling2D(4, name = 'SensoryDec_Output')(net)

        model = Model(inputs=[sensory_enc.input], outputs=[net])

        return model

    #--------------------------------------------------------------------
    def createStateNet(state_dim, state_net_dims):
        inp = Input(shape=(state_dim, ), name='State_Input')
        net = Lambda(lambda x: x, name='State_Lambda')(inp)

        for i, layer in enumerate(state_net_dims):
            dist = 1 / np.sqrt(layer.dim)
            net = Dense(
                layer.dim,
                activation='relu', 
                dtype='float32', 
                kernel_initializer=random_uniform_initializer(-dist, dist), 
                bias_initializer=random_uniform_initializer(-dist, dist),
                name='State_Dense_' + str(i)
            )(net)
            if(layer.normalize): net = BatchNormalization(dtype='float32', name='State_BatchNorm_' + str(i))(net)
            if(layer.droppout > 0): net = Dropout(layer.droppout, name='State_Droppout_' + str(i))(net)

        model = Model(inputs=[inp], outputs=[net])

        return model

    #--------------------------------------------------------------------
    def createNetwork(sensory_net_dims, sensory_enc, state_net, net_layers, optimizer, action_dim):
        conc = Concatenate(dtype='float32')([sensory_enc.output, state_net.output])
        net = Lambda(lambda x: x, name='Actor_Lambda')(conc)
        
        for i, layer in enumerate(net_layers):
            dist = 1 / np.sqrt(layer.dim)
            net = Dense(
                layer.dim, 
                activation='relu', 
                dtype='float32', 
                kernel_initializer=random_uniform_initializer(-dist, dist), 
                bias_initializer=random_uniform_initializer(-dist, dist),
                name='Actor_Dens_' + str(i)
            )(net)
            if(layer.normalize): net = BatchNormalization(dtype='float32', name='Actor_BatchNorm_' + str(i))(net)
            if(layer.droppout > 0): net = Dropout(layer.droppout, name='Actor_Droppout_' + str(i))(net)

        net = Dense(action_dim, activation='softmax', dtype='float32', name='Actor_Out')(net)

        #dec = Actor.createSensoryDecoderNet(sensory_net_dims, sensory_enc)
        model = Model(inputs=[sensory_enc.input, state_net.input], outputs=[net, sensory_enc.output])

        model.compile(optimizer=optimizer, loss='mse')

        return model

# Testing Actor networks creators

In [6]:
%%script false --no-raise-error
sensory_net_dims = [
    ConvLayerData(20, 4, 2, 2, 0.1),
    ConvLayerData(30, 3, 1, 2, 0.2),
    ConvLayerData(40, 2, 1, 2, 0),
    ConvLayerData(50, 2, 1, 2, 0),
]

state_net_dim = [
    DenseLayerData(32, 0.1),
]

actor_dims = [
    DenseLayerData(512, 0.1),
    DenseLayerData(128, 0.2),
    DenseLayerData(16, 0.1),
]

sensory_dim = [42, 64]
state_dim = 16

optimizer = Adam(learning_rate=0.1)
action_dim = 4
act_range = [1, 1, 1, 1]

In [7]:
%%script false --no-raise-error
ne = Actor.createSensoryEncoderNet(sensory_dim, sensory_net_dims)
nd = Actor.createSensoryDecoderNet(sensory_net_dims, ne)
ns = Actor.createStateNet(state_dim, state_net_dim)

In [8]:
%%script false --no-raise-error
ne.summary()

In [9]:
%%script false --no-raise-error
nf = Actor.createNetwork(sensory_net_dims, ne, ns, actor_dims, optimizer, action_dim, act_range)

In [10]:
%%script false --no-raise-error
plot_model(nf, show_shapes=True)

In [11]:
%%script false --no-raise-error
act = Actor(state_dim, sensory_dim, action_dim, act_range, 0.1, 0.1)

In [12]:
%%script false --no-raise-error
plot_model(act.model, show_shapes=True)

In [13]:
%%script false --no-raise-error
states = [np.random.rand(1, state_dim)]
sensory = [np.random.rand(*sensory_dim, 3).reshape((-1, 256, 256, 3, 1))]
pred = act.predict(states, sensory)
print(len(pred))
print(pred[0][0].shape)
print(pred[1][0].shape)

# Critic

In [14]:
class Critic(object):
    def __init__(self, state_dim, embedding_dim, action_dim, out_dim, lr, tau):
        #Network dimensions
        self.state_dim = state_dim
        self.embedding_dim = embedding_dim
        self.action_dim = action_dim
        self.out_dim = out_dim
        
        self.embedding_net_dims = [
            DenseLayerData(256, 0.1),
            DenseLayerData(256, 0.2, False),
            DenseLayerData(64, 0),
        ]
        self.state_net_dims = [
            DenseLayerData(32, 0.1, False),
            DenseLayerData(8, 0.1),
        ]
        self.action_net_dims = [
            DenseLayerData(16, 0.1),
        ]
        
        self.critic_net_dims = [
            DenseLayerData(128, 0.1),
            DenseLayerData(32, 0.1),
            DenseLayerData(8, 0, False),
        ]
        #Optimizer learning rate
        self.lr = lr

        #Define the critic optimizer
        self.optimizer = Adam(learning_rate=self.lr)

        #Parameter that coordinates the soft updates on the target weights
        self.tau = tau

        #Generate the critic network
        self.model = self.buildNetwork()

        #Generate the critic target network
        self.target_model = self.buildNetwork()
        
        #Set the weights to be the same in the begining
        self.target_model.set_weights(self.model.get_weights())
    
    #--------------------------------------------------------------------
    
    def stateNet(self):
        return Critic.createGenericNet('State', self.state_dim, self.state_net_dims)
    
    #--------------------------------------------------------------------
    
    def actionNet(self):
        return Critic.createGenericNet('Action', self.action_dim, self.action_net_dims)
    
    #--------------------------------------------------------------------
    
    def embeddingNet(self):
        return Critic.createGenericNet('Emb', self.embedding_dim, self.embedding_net_dims)

    #--------------------------------------------------------------------
    
    def buildNetwork(self):
        state = self.stateNet()
        action = self.actionNet()
        emb = self.embeddingNet()
        
        return Critic.createNetwork(state, action, emb, self.critic_net_dims, self.out_dim, self.optimizer)
        
    #--------------------------------------------------------------------
    
    def predict(self, states, actions, embeddings):
        return self.model([states, actions, embeddings], training=False)
    
    #--------------------------------------------------------------------
    
    def target_predict(self, states, actions, embeddings):
        return self.target_model([states, actions, embeddings], training=False)
    
    #--------------------------------------------------------------------
    
    def transferWeights(self):
        weights = self.model.get_weights()
        target_weights = self.target_model.get_weights()
        new_weights = []
        
        for i in range(len(weights)):
            new_weights.append((self.tau * weights[i]) + ((1.0 - self.tau) * target_weights[i]))
        
        self.target_model.set_weights(new_weights)
        
    #--------------------------------------------------------------------
    
    def saveModel(self, path):
        self.model.save_weights(path + 'critic.h5')
    
    #--------------------------------------------------------------------
    
    def loadModel(self, path):
        self.model.load_weights(path)
        self.target_model.load_weights(path)
        
    # =================== GENERAL STATIC FUNCTIONS FOR SINGLE NETWORK CREATION ==========================
    
    def createGenericNet(net_type, data_dim, net_dims, normalize_input=False):
        inp = Input(shape=(data_dim, ), name='C_'+net_type+'_Input')
        if(normalize_input): net = Normalization(axis=None)(inp)
        else: net = Lambda(lambda x: x)(inp)

        for i, layer in enumerate(net_dims):
            dist = 1 / np.sqrt(layer.dim)
            net = Dense(
                layer.dim,
                activation='relu', 
                dtype='float32', 
                kernel_initializer=random_uniform_initializer(-dist, dist), 
                bias_initializer=random_uniform_initializer(-dist, dist),
                name='C_'+net_type+'_Dense_' + str(i)
            )(net)
            if(layer.normalize): net = BatchNormalization(dtype='float32', name='C_'+net_type+'_BatchNorm_' + str(i))(net)
            if(layer.droppout > 0): net = Dropout(layer.droppout, name='C_'+net_type+'_Droppout_' + str(i))(net)

        model = Model(inputs=[inp], outputs=[net])

        return model
    
    # =============================================
    
    def createNetwork(state_net, action_net, emb_net, critic_net_dims, out_dim, optimizer):
        #Concatenate state and action networks ---
        net = Concatenate(dtype='float32')([action_net.output, state_net.output])
        dim = round(action_net.output.shape[-1]*state_net.output.shape[-1])
        dist =  1 / np.sqrt(dim)
        net = Dense(
            dim, 
            activation='relu', 
            dtype='float32', 
            kernel_initializer=random_uniform_initializer(-dist, dist), 
            bias_initializer=random_uniform_initializer(-dist, dist),
            name='Critic_ActState_Dense',
        )(net)
        
        # Concatenates state-action and embedding networks
        net = Concatenate(dtype='float32')([net, emb_net.output])
        dim = round(np.sqrt((0.3*min(net.shape[-1], emb_net.output.shape[-1])) + (0.7*max(net.shape[-1], emb_net.output.shape[-1]))))
        dist =  1 / np.sqrt(dim)
        net = Dense(
            dim, 
            activation='relu', 
            dtype='float32', 
            kernel_initializer=random_uniform_initializer(-dist, dist), 
            bias_initializer=random_uniform_initializer(-dist, dist),
            name='Critic_ActStateEmb_Dense',
        )(net)
        
        for i, layer in enumerate(critic_net_dims):
            dist = 1 / np.sqrt(layer.dim)
            net = Dense(
                layer.dim,
                activation='relu', 
                dtype='float32', 
                kernel_initializer=random_uniform_initializer(-dist, dist), 
                bias_initializer=random_uniform_initializer(-dist, dist),
                name='C_Dense' + str(i)
            )(net)
            if(layer.normalize): net = BatchNormalization(dtype='float32', name='C_BatchNorm_' + str(i))(net)
            if(layer.droppout > 0): net = Dropout(layer.droppout, name='C_Droppout_' + str(i))(net)
        
        net = Dense(
            out_dim, 
            activation='relu', 
            dtype='float32',  
            kernel_initializer=random_uniform_initializer(-dist, dist), 
            bias_initializer=random_uniform_initializer(-dist, dist),
        )(net)
        
        model = Model(inputs=[state_net.input, action_net.input, emb_net.input], outputs=[net])

        model.compile(optimizer=optimizer, loss='mse')
        
        return model

# Testing Critic networks creators

In [15]:
%%script false --no-raise-error
emb_net_dims = [
    DenseLayerData(20, 0.1),
    DenseLayerData(30, 0.2),
   DenseLayerData(40, 0),
]
state_net_dims = [
    DenseLayerData(32, 0.1),
]
action_net_dims = [
    DenseLayerData(2042, 0.1),
    DenseLayerData(512, 0.1),
    DenseLayerData(128, 0),
]
critic_net_dims = [
    DenseLayerData(2042, 0.1),
    DenseLayerData(512, 0.1),
    DenseLayerData(128, 0),
]

emb_dim = 192
state_dim = 16

optimizer = Adam(learning_rate=0.1)
action_dim = 4

In [16]:
%%script false --no-raise-error
ns = Critic.createGenericNet('State', state_dim, state_net_dims)
na = Critic.createGenericNet('Action', action_dim, action_net_dims)
ne = Critic.createGenericNet('Emb', emb_dim, emb_net_dims)

In [17]:
%%script false --no-raise-error
nf = Critic.createNetwork(ns, na, ne, critic_net_dims, 1, optimizer)

In [18]:
%%script false --no-raise-error
plot_model(nf, show_shapes=True)

In [19]:
%%script false --no-raise-error
crt = Critic(state_dim, emb_dim, action_dim, 1, 0.1, 0.1)

In [20]:
%%script false --no-raise-error
plot_model(crt.model, show_shapes=True)

In [21]:
%%script false --no-raise-error
states = [np.random.rand(1, state_dim)]
actions = [np.random.rand(1, action_dim)]
emb = [np.random.rand(1, emb_dim)]
pred = crt.predict(states, actions, emb)
print(len(pred))
print(pred.shape)

# DDPG Agent

In [22]:
class DDPGAgent(object):
    def __init__(self, state_dim, num_actions, sensory_dim, memory_size, batch_size, gamma, a_lr, c_lr, tau, epsilon, epsilon_decay, epsilon_min):
        self.state_dim = state_dim
        self.num_actions = num_actions
        self.sensory_dim = sensory_dim
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.a_lr = a_lr
        self.c_lr = c_lr
        self.tau = tau
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min

        #Creates the Replay Buffer
        self.memory = ReplayBuffer(self.memory_size, self.batch_size)

        #Creates the actor
        self.actor = Actor(
            state_dim = self.state_dim,
            sensory_dim = self.sensory_dim,
            action_dim = self.num_actions, 
            lr = self.a_lr, 
            tau = self.tau,
        )

        #Creates the critic
        self.critic = Critic(
            state_dim = self.state_dim, 
            action_dim = self.num_actions, 
            embedding_dim = self.actor.embeddingDim,
            out_dim = 1,
            lr = self.c_lr, 
            tau = self.tau,
        )

    #-------------------------------------------------------------------- 
    
    def policy(self, state, sensorial_data, explore=True):
        state = state[np.newaxis, :]
        sensorial_data = sensorial_data[np.newaxis, :]
        pred = self.actor.predict(state, sensorial_data)
        
        action = np.argmax(pred[0][0])
        #Takes the exploration with the epsilon probability
        if explore and np.random.uniform() < self.epsilon: action = np.random.choice(np.where(np.arange(self.num_actions) != action)[0])
        
        return pred[0][0], action, pred[1]
    
    #--------------------------------------------------------------------
    
    def learn(self, state, action, reward, next_state, sensorial_data, embedding, done):
        self.memory.append(state, action, reward, next_state, sensorial_data, embedding, done)
        if self.memory.hasMin: self.replay_memory()
        
    #--------------------------------------------------------------------    
    
    def replay_memory(self):
        # Get sample experiences from the replay buffer
        experiences = self.memory.sample()
        
        #Get each term of the esxperiences
        states = np.array([exp[0] for exp in experiences])
        actions = np.array([exp[1] for exp in experiences])
        rewards = np.array([exp[2] for exp in experiences])
        next_states = np.array([exp[3] for exp in experiences])
        sensorial_data = np.array([exp[4] for exp in experiences])
        embeddings = np.array([exp[5] for exp in experiences])
        done = np.array([int(exp[6]) for exp in experiences])
        
        #Change the dimensions of the rewards and done arrays
        rewards = rewards[:, np.newaxis]
        done = done[:, np.newaxis]
        
        #Train the critic
        with GradientTape() as tape:
            #Compute the critic target values
            target_actions, sensorial_embeddings  = self.actor.target_predict(next_states, sensorial_data)
            y = rewards + self.gamma * self.critic.target_predict(next_states, target_actions, sensorial_embeddings) * (1 - done)
            #Compute the q_value of each next_state, next_action pair
            critic_value = self.critic.predict(states, actions, np.squeeze(embeddings))
            #Compute the critic loss 
            critic_loss = reduce_mean(square(y - critic_value))

        critic_grad = tape.gradient(critic_loss, self.critic.model.trainable_variables)
        self.critic.optimizer.apply_gradients(zip(critic_grad, self.critic.model.trainable_variables))
        
        #Train the actor
        with GradientTape() as tape:
            acts, embs = self.actor.predict(states, sensorial_data)
            critic_grads = self.critic.predict(states, acts, embs)
            #Used -mean as we want to maximize the value given by the critic for our actions
            actor_loss = -reduce_mean(critic_grads)
            
        actor_grad = tape.gradient(actor_loss, self.actor.model.trainable_variables)
        self.actor.optimizer.apply_gradients(zip(actor_grad, self.actor.model.trainable_variables))
        
        #Update the model weights
        self.actor.transferWeights()
        self.critic.transferWeights() 
        
        #Decay the epsilon value
        if self.epsilon > self.epsilon_min: self.epsilon *= self.epsilon_decay
        #If its reach the minimum value it stops
        else: self.epsilon = self.epsilon_min

        return
    
    #--------------------------------------------------------------------
    
    def act(self, env, verbose = False):
        observation = env.reset()
        done = False
        
        while not done:
            env.render(mode='human')
            time.sleep(0.02)
            sensory = self.getSensorialData(env.render(mode = 'rgb_array'))
            action = self.policy(observation, sensory, explore=False)[1]
            if verbose: print(action)
            new_observation, reward, done, info = env.step(action)
            observation = new_observation

        return
        
    #--------------------------------------------------------------------     
    def train(self, env, num_episodes, verbose, verbose_batch, end_on_complete, complete_num, complete_value, act_after_batch):
        scores_history = []
        steps_history = []

        total = 0
        
        #If the complete_num is smaller than 1 ist interpreted as a percentage else its a number of episodes
        if complete_num < 1: complete_num = int(complete_num*verbose_batch) if int(complete_num*verbose_batch) != 0 else 1
        
        #Begin the training
        print("BEGIN\n")
        
        #Number of completed episodes per batch
        complete = 0
        
        #Iterate on each episode
        for episode in range(num_episodes):
            done = False
            score = 0
            steps = 0
            observation = env.reset()
            
            while not done:
                sensory_data = env.render(mode='rgb_array')
                sensory_data = self.getSensorialData(sensory_data)
                
                probs, action, embedding = self.policy(observation, sensory_data)
                new_observation, reward, done, _ = env.step(action)
                self.learn(observation, probs, reward, new_observation, sensory_data, embedding, done)

                if verbose and total > 100:
                    print("\r                                                                                                              ", end="")
                    print("\rEpisode: "+str(episode+1)+"\t Step: "+str(steps)+"\tReward: "+str(score) ,end="")
                    
                observation = new_observation
                score += reward
                steps += 1
                total += 1

            scores_history.append(score)
            steps_history.append(steps)
            
            #If the score is bigger or equal than the complete score it add one to the completed number
            if(score >= complete_value):
                complete += 1
                #If the flag is true the agent ends the trainig after completing a number of episodes
                if end_on_complete and complete >= complete_num:
                    break
            
            #These information are printed after each verbose_batch episodes
            if((episode+1)%verbose_batch == 0):
                print("\r                                                                                                          ", end="")
                print("\rEpisodes: ", episode+1, "/", num_episodes
                      , "\n\tTotal reward: ", np.mean(scores_history[-verbose_batch:])
                      , "\n\tNum. steps: ", np.mean(steps_history[-verbose_batch:])
                      , "\n\tCompleted: ", complete, "\n--------------------------")
                
                #If the flag is true the agent act and render the episode after each verbose_batch episodes
                if act_after_batch: self.act(env)
                
                #Set the number of completed episodes on the batch to zero
                complete = 0

        print("\nFINISHED")
        return scores_history, steps_history
    
    #--------------------------------------------------------------------   
    
    def getSensorialData(self, sensorial_data):
        return smart_resize(sensorial_data, self.sensory_dim)
    
    #------------------------------------------------------------------   
    
    def save(self, path):
        self.actor.saveModel(path)
        self.critic.saveModel(path)
    
    #--------------------------------------------------------------------
    
    def load(self, a_path, c_path, ae_path):
        self.actor.loadModel(a_path)
        self.critic.loadModel(c_path)
    
    #--------------------------------------------------------------------   

Future exception was never retrieved
future: <Future finished exception=BrokenPipeError(32, 'Broken pipe')>
Traceback (most recent call last):
  File "/usr/lib/python3.10/asyncio/unix_events.py", line 676, in write
    n = os.write(self._fileno, data)
BrokenPipeError: [Errno 32] Broken pipe


# Testing Agent networks creators

In [23]:
name = "LunarLander-v2"
env = gym.make(name)

state_dim = env.observation_space.shape[0]
num_actions = env.action_space.n
sensory_dim = [64, 64]

memory_size = 5000
batch_size = 64
gamma = 0.99
a_lr = 1e-3
c_lr = 2e-3
tau = 5e-3
epsilon = 0.7
epsilon_decay = 0.99999
epsilon_min = 0.3

In [24]:
agent = DDPGAgent(state_dim, num_actions, sensory_dim, memory_size, batch_size, gamma, a_lr, c_lr, tau, epsilon, epsilon_decay, epsilon_min)

2022-12-03 20:18:40.540753: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-12-03 20:18:40.569076: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudnn.so.8'; dlerror: libcudnn.so.8: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /home/user/.local/lib/python3.10/site-packages/cv2/../../lib64:
2022-12-03 20:18:40.569097: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1850] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2022-12-03 20:18:40.569535: I tensorflow/core/platform/cpu_feature_guard.cc:151] T

In [25]:
#plot_model(agent.actor.model, show_shapes=True)

In [26]:
num_episodes = 3000
verbose = True
verbose_batch = 60
end_on_complete = True
complete_num = 0.5
complete_value = 300
act_after_batch = True

scores, steps = agent.train(env, num_episodes, verbose, verbose_batch, end_on_complete, complete_num, complete_value, act_after_batch)

BEGIN

Episodes:  60 / 3000                                                                                          
	Total reward:  -166.67492396202832 
	Num. steps:  111.93333333333334 
	Completed:  0 
--------------------------
Episodes:  120 / 3000                                                                                         
	Total reward:  -219.98692788512804 
	Num. steps:  104.21666666666667 
	Completed:  0 
--------------------------
Episodes:  180 / 3000                                                                                         
	Total reward:  -264.7998017978248 
	Num. steps:  112.41666666666667 
	Completed:  0 
--------------------------
Episodes:  240 / 3000                                                                                         
	Total reward:  -276.61943236779064 
	Num. steps:  119.3 
	Completed:  0 
--------------------------
Episodes:  300 / 3000                                                                                      