### ***Deep Deterministic Policy Gradients***

In [1]:
import os
import time

import gym
import numpy as np
import tensorflow as tf
from tensorflow import random_uniform_initializer
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Input, Dense, Concatenate, Lambda, BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

#### **Exploration**
For continuous action spaces, exploration is done by adding noise to the action itself. In the DDPG paper, the authors use an Ornstein-Uhlenbeck process to generate this noise. It is a temporally correlated, mean-reverting type of noise related to Brownian motion (the motion of particles in a fluid colliding with other particles at random).

In [2]:
#Ornstein-Uhlenbeck Noise 
class OUActionNoise(object):
    '''
    Ornstein-Uhlenbeck process used as temporally correlated exploration noise.

    Every call advances the process by one step:
        x_next = x_prev + theta * (mean - x_prev) * dt + sigma * sqrt(dt) * N(0, 1)

    Args:
        mean (Numpy array): Value the process is pulled towards; its shape is the shape of every sample.
        sigma (float): Scale of the Gaussian perturbation added at each step.
        theta (float): Strength of the pull towards the mean.
        dt (float): Time step of the discretization.
        x0 (Numpy array): Initial state of the process. Zeros with the shape of mean when None.
    '''
    def __init__(self, mean, sigma=0.5, theta=0.2, dt=0.1, x0=None):
        #Private copies: the caller may keep modifying the arrays it passed in
        self.mean = np.array(mean, dtype=float)
        self.sigma = sigma
        self.theta = theta
        self.dt = dt
        self.x0 = None if x0 is None else np.array(x0, dtype=float)
        self.reset()
    
    #--------------------------------------------------------------------------------
    #Method that enables to write classes where the instances behave like functions and can be called like a function.    
    def __call__(self):
        '''
        Returns:
            Numpy array with the next sample of the process (same shape as mean)
        '''
        gaussian = np.random.normal(size=self.mean.shape)
        drift = self.theta * (self.mean - self.x_prev) * self.dt
        diffusion = self.sigma * np.sqrt(self.dt) * gaussian
        x = self.x_prev + drift + diffusion
        self.x_prev = x
        
        #Return a copy so in-place edits made by the caller cannot corrupt the process state
        return np.array(x)
    
    #--------------------------------------------------------------------------------
    def reset(self):
        '''
        Puts the process back in its initial state (x0, or zeros when x0 was not given)
        '''
        self.x_prev = np.array(self.x0) if self.x0 is not None else np.zeros_like(self.mean)

In [3]:
#Quick check: draw one noise sample for a 15-dimensional action and add it to a zero action
a = np.zeros(15)
#The process gets its own mean array: `a` is updated below and must not double as the noise mean
b = OUActionNoise(np.zeros_like(a))
#Non in-place update, so re-running the cell always starts from the zero action defined above
a = a + b()
a

array([ 0.15987125, -0.03246613, -0.16896425, -0.03457285,  0.0023084 ,
        0.11552847, -0.1229943 , -0.08057529, -0.11437638, -0.05009399,
        0.1078746 ,  0.16073033,  0.08267794, -0.3372407 ,  0.16908056])

#### **Replay Buffer**
As in Deep Q-learning (and many other RL algorithms), DDPG uses a replay buffer: transitions are stored as they are experienced and later sampled at random to update the neural network parameters.

In [4]:
#Replay Buffer 
class ReplayBuffer(object):
    '''
    Fixed-capacity store of transitions with uniform random sampling.

    Once the capacity is reached, every new transition overwrites the oldest one.
    '''
    def __init__(self, size, minibatch_size = None):
        '''
        Args:
            size (integer): The size of the replay buffer (maximum number of transitions kept).
            minibatch_size (integer): The default sample size.
        '''
        self.buffer = []
        self.minibatch_size = minibatch_size
        self.rand_generator = np.random.RandomState()
        self.max_size = size
        #Position of the oldest transition, i.e. the slot overwritten next once the buffer is full
        self._oldest = 0
        
    #--------------------------------------------------------------------------------    
    def append(self, state, action, reward, next_state, done):
        '''
        Args:
            state (Numpy array): The state.              
            action (Numpy array): The action.
            reward (float): The reward.
            next_state (Numpy array): The next state.
            done (boolean): True if the next state is a terminal state and False otherwise.
                            Is stored as an integer so that True = 1, False = 0
        '''
        transition = [state, action, reward, next_state, int(done)]
        if self.size() == self.max_size:
            #Full buffer: overwrite the oldest slot in O(1) instead of shifting the whole list
            self.buffer[self._oldest] = transition
            self._oldest = (self._oldest + 1) % self.max_size
        else:
            self.buffer.append(transition)
    
    #--------------------------------------------------------------------------------    
    def sample(self, minibatch_size=None):
        '''
        Args:
            minibatch_size (integer): Sample size for this call. Uses the size given
                                      to the constructor when None.
        Returns:
            A list of transitions [state, action, reward, next_state, done],
            drawn uniformly at random with replacement
        Raises:
            ValueError: If no sample size is available or the buffer is empty.
        '''
        batch_size = self._batch_size_or_raise(minibatch_size)
        if not self.buffer:
            raise ValueError("Cannot sample from an empty replay buffer")
        #randint draws the indices directly, without building an index array of the buffer length
        idxs = self.rand_generator.randint(len(self.buffer), size=batch_size)
        return [self.buffer[idx] for idx in idxs]
    
    #--------------------------------------------------------------------------------    
    def size(self):
        '''
        Returns:
            Number of elements in the buffer
        '''
        return len(self.buffer)
    
    #--------------------------------------------------------------------------------
    def isMin(self):
        '''
        Returns:
            Boolean indicating if the memory have the minimum number of elements or not
        Raises:
            ValueError: If no minibatch size was given to the constructor.
        '''
        return self.size() >= self._batch_size_or_raise(None)
    
    #--------------------------------------------------------------------------------
    def empties(self):
        '''
        Removes every transition from the buffer
        '''
        self.buffer.clear()
        self._oldest = 0
    
    #--------------------------------------------------------------------------------
    def getEpisode(self):
        '''
        Returns:
            New list with all the elements in the buffer, from the oldest to the newest
        '''
        return self.buffer[self._oldest:] + self.buffer[:self._oldest]
    
    #--------------------------------------------------------------------------------
    def _batch_size_or_raise(self, minibatch_size):
        '''
        Returns:
            The given sample size, or the constructor default when it is None
        Raises:
            ValueError: If both are None.
        '''
        batch_size = self.minibatch_size if minibatch_size is None else minibatch_size
        if batch_size is None:
            raise ValueError("No minibatch size was given to the ReplayBuffer constructor or to the call")
        return batch_size
        

#### **Actor (Policy) & Critic (Value)**
DDPG uses four neural networks: a Q network, a deterministic policy network, a target Q network, and a target policy network.

The Q network and the policy network are very much like those of a simple Advantage Actor-Critic, but in DDPG the Actor maps states directly to actions (the output of the network is the action itself) instead of outputting a probability distribution over a discrete action space.

The target networks are time-delayed copies of their original networks that slowly track the learned networks.

In [5]:
class Actor(object):
    '''
    Deterministic policy network together with its slowly-updated target copy.

    Args:
        inp_dim (integer): Size of the state vector fed to the network.
        fc1_dim (integer): Units of the first hidden layer.
        fc2_dim (integer): Units of the second hidden layer.
        fc3_dim (integer): Units of the third hidden layer.
        out_dim (integer): Size of the action vector.
        act_range (float or Numpy array): Factor multiplying the tanh output to scale the actions.
        lr (float): Learning rate of the Adam optimizer.
        tau (float): Coefficient of the soft updates on the target weights.
    '''
    def __init__(self, inp_dim, fc1_dim, fc2_dim, fc3_dim, out_dim, act_range, lr, tau):
        #Network dimensions
        self.inp_dim = inp_dim
        self.fc1_dim = fc1_dim
        self.fc2_dim = fc2_dim
        self.fc3_dim = fc3_dim
        self.out_dim = out_dim
        #Range of the action space
        self.act_range = act_range
        #Parameter that coordinates the soft updates on the target weights
        self.tau = tau
        #Optimizer learning rate
        self.lr = lr
        #Generates the optimization function
        self.optimizer = Adam(learning_rate=self.lr)
        #Generates the actor model
        self.model = self.buildNetwork()
        #Generates the actor target model
        self.target_model = self.buildNetwork()
        #Set the weights to be the same in the beginning
        self.target_model.set_weights(self.model.get_weights())
        
    #--------------------------------------------------------------------
    @staticmethod
    def _uniform_dense(units, activation, bound):
        '''
        Returns:
            float64 Dense layer whose kernel and bias are drawn uniformly from [-bound, bound]
        '''
        return Dense(
            units,
            activation=activation,
            kernel_initializer=random_uniform_initializer(-bound, bound),
            bias_initializer=random_uniform_initializer(-bound, bound),
            dtype='float64',
        )
    
    #--------------------------------------------------------------------
    def buildNetwork(self):
        '''
        Returns:
            Keras model mapping a batch of states to a batch of actions: three
            ReLU + BatchNormalization hidden blocks followed by a tanh layer
            multiplied by act_range
        '''
        inp = Input(shape=(self.inp_dim,))
        
        hidden = inp
        for units in (self.fc1_dim, self.fc2_dim, self.fc3_dim):
            #Hidden layers are initialized within +-1/sqrt(number of units of the layer)
            bound = 1 / np.sqrt(units)
            hidden = self._uniform_dense(units, 'relu', bound)(hidden)
            hidden = BatchNormalization(dtype='float64')(hidden)
        
        #Small initial weights on the output layer keep the first actions close to zero
        out = self._uniform_dense(self.out_dim, 'tanh', 0.003)(hidden)
        lamb = Lambda(lambda i: i * self.act_range, dtype='float64')(out)
        
        return Model(inputs=[inp], outputs=[lamb])
    
    #--------------------------------------------------------------------
    def predict(self, states):
        '''
        Returns:
            Actions given by the learned network for a batch of states (inference mode)
        '''
        return self.model([states], training=False)
    
    #--------------------------------------------------------------------
    def target_predict(self, states):
        '''
        Returns:
            Actions given by the target network for a batch of states (inference mode)
        '''
        return self.target_model([states], training=False)
    
    #--------------------------------------------------------------------
    def transferWeights(self):
        '''
        Soft update: target = tau * learned + (1 - tau) * target, for every weight array
        '''
        new_weights = [
            (self.tau * weight) + ((1.0 - self.tau) * target_weight)
            for weight, target_weight in zip(self.model.get_weights(), self.target_model.get_weights())
        ]
        
        self.target_model.set_weights(new_weights)
        
    #--------------------------------------------------------------------
    def saveModel(self, path):
        '''
        Saves the weights of the learned network in the file path + '_actor.h5'
        '''
        self.model.save_weights(path + '_actor.h5')
    
    #--------------------------------------------------------------------
    def loadModel(self, path):
        '''
        Loads the weights stored in the file path (full file name, suffix included)
        into the learned network and copies them to the target network
        '''
        self.model.load_weights(path)
        #Without this the target network would keep its random initial weights
        self.target_model.set_weights(self.model.get_weights())

In [6]:
class Critic(object):
    '''
    Q network (state, action) -> value together with its slowly-updated target copy.

    Args:
        state_inp_dim (integer): Size of the state vector.
        state_fc1_dim (integer): Units of the first hidden layer of the state branch.
        state_fc2_dim (integer): Units of the second hidden layer of the state branch.
        action_inp_dim (integer): Size of the action vector.
        action_fc1_dim (integer): Units of the hidden layer of the action branch.
        conc_fc1_dim (integer): Units of the first hidden layer after the concatenation.
        conc_fc2_dim (integer): Units of the second hidden layer after the concatenation.
        out_dim (integer): Size of the output (value) layer.
        lr (float): Learning rate of the Adam optimizer.
        tau (float): Coefficient of the soft updates on the target weights.
    '''
    def __init__(self, state_inp_dim, state_fc1_dim, state_fc2_dim, action_inp_dim, action_fc1_dim, conc_fc1_dim, conc_fc2_dim, out_dim, lr, tau):
        #Network dimensions
        self.state_inp_dim = state_inp_dim
        self.state_fc1_dim = state_fc1_dim
        self.state_fc2_dim = state_fc2_dim
        self.action_inp_dim = action_inp_dim
        self.action_fc1_dim = action_fc1_dim
        self.conc_fc1_dim = conc_fc1_dim
        self.conc_fc2_dim = conc_fc2_dim
        self.out_dim = out_dim
        #Optimizer learning rate
        self.lr = lr
        #Define the critic optimizer (`learning_rate` is the supported keyword, `lr` is a deprecated alias)
        self.optimizer = Adam(learning_rate=self.lr)
        #Parameter that coordinates the soft updates on the target weights
        self.tau = tau
        #Generate the critic network
        self.model = self.buildNetwork()
        #Generate the critic target network
        self.target_model = self.buildNetwork()
        #Set the weights to be the same in the beginning
        self.target_model.set_weights(self.model.get_weights())

    #--------------------------------------------------------------------
    @staticmethod
    def _uniform_dense(units, activation, bound):
        '''
        Returns:
            float64 Dense layer whose kernel and bias are drawn uniformly from [-bound, bound]
        '''
        return Dense(
            units,
            activation=activation,
            kernel_initializer=random_uniform_initializer(-bound, bound),
            bias_initializer=random_uniform_initializer(-bound, bound),
            dtype='float64',
        )
    
    #--------------------------------------------------------------------
    def _hidden_block(self, tensor, units):
        '''
        Returns:
            tensor after a ReLU Dense layer, initialized within +-1/sqrt(units),
            followed by BatchNormalization
        '''
        dense = self._uniform_dense(units, 'relu', 1 / np.sqrt(units))(tensor)
        return BatchNormalization(dtype='float64')(dense)
    
    #--------------------------------------------------------------------
    def buildNetwork(self):
        '''
        Returns:
            Keras model with inputs [states, actions]: each input goes through its
            own branch, the branches are concatenated and two more hidden blocks
            lead to a linear output layer
        '''
        #State input network ---------
        s_inp = Input(shape=(self.state_inp_dim, ))
        s_hidden = self._hidden_block(s_inp, self.state_fc1_dim)
        s_hidden = self._hidden_block(s_hidden, self.state_fc2_dim)
        
        #Action input network ---------
        a_inp = Input(shape=(self.action_inp_dim, ))
        a_hidden = self._hidden_block(a_inp, self.action_fc1_dim)
        
        #Concatenate the two networks ---
        c_inp = Concatenate(dtype='float64')([s_hidden, a_hidden])
        
        #Creates the output network
        c_hidden = self._hidden_block(c_inp, self.conc_fc1_dim)
        c_hidden = self._hidden_block(c_hidden, self.conc_fc2_dim)
        
        #Small initial weights on the output layer keep the first value estimates close to zero
        out = self._uniform_dense(self.out_dim, 'linear', 0.003)(c_hidden)
        
        return Model(inputs=[s_inp, a_inp], outputs=[out])
    
    #--------------------------------------------------------------------
    def predict(self, states, actions):
        '''
        Returns:
            Values given by the learned network for a batch of (state, action) pairs (inference mode)
        '''
        return self.model([states, actions], training=False)
    
    #--------------------------------------------------------------------
    def target_predict(self, states, actions):
        '''
        Returns:
            Values given by the target network for a batch of (state, action) pairs (inference mode)
        '''
        return self.target_model([states, actions], training=False)
    
    #--------------------------------------------------------------------
    def transferWeights(self):
        '''
        Soft update: target = tau * learned + (1 - tau) * target, for every weight array
        '''
        new_weights = [
            (self.tau * weight) + ((1.0 - self.tau) * target_weight)
            for weight, target_weight in zip(self.model.get_weights(), self.target_model.get_weights())
        ]
        
        self.target_model.set_weights(new_weights)
        
    #--------------------------------------------------------------------
    def saveModel(self, path):
        '''
        Saves the weights of the learned network in the file path + '_critic.h5'
        '''
        self.model.save_weights(path + '_critic.h5')
    
    #--------------------------------------------------------------------
    def loadModel(self, path):
        '''
        Loads the weights stored in the file path (full file name, suffix included)
        into the learned network and copies them to the target network
        '''
        self.model.load_weights(path)
        #Without this the target network would keep its random initial weights
        self.target_model.set_weights(self.model.get_weights())

In [7]:
class DDPGAgent(object):
    '''
    DDPG agent: an actor, a critic, their target networks, a replay buffer and
    Ornstein-Uhlenbeck exploration noise.

    Args:
        state_dim (integer): Size of the state vector.
        action_dim (integer): Size of the action vector.
        action_min (float or Numpy array): Lower bound used to clip the actions.
        action_max (float or Numpy array): Upper bound used to clip the actions; also
                                           the factor that scales the actor output.
        memory_size (integer): Capacity of the replay buffer.
        batch_size (integer): Number of transitions sampled at each update.
        gamma (float): Discount factor of the future rewards.
        a_lr (float): Actor learning rate.
        c_lr (float): Critic learning rate.
        tau (float): Coefficient of the soft updates on the target weights.
        epsilon (float): Initial probability of adding noise to an action.
        epsilon_decay (float): Factor multiplying epsilon after each update.
        epsilon_min (float): Lower bound of epsilon.
    '''
    def __init__(self, state_dim, action_dim, action_min, action_max, memory_size, batch_size, gamma, a_lr, c_lr, tau, epsilon, epsilon_decay, epsilon_min):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.action_min = action_min
        self.action_max = action_max
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.a_lr = a_lr
        self.c_lr = c_lr
        self.tau = tau
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        #Creates the Replay Buffer
        self.memory = ReplayBuffer(self.memory_size, self.batch_size)
        #Creates the actor
        self.actor = Actor(
            inp_dim=self.state_dim, 
            fc1_dim=512, 
            fc2_dim=256, 
            fc3_dim=64, 
            out_dim=self.action_dim, 
            act_range=self.action_max, 
            lr=self.a_lr, 
            tau=self.tau,
        )
        #Creates the critic
        self.critic = Critic(
            state_inp_dim=self.state_dim, 
            state_fc1_dim=512, 
            state_fc2_dim=256,
            action_inp_dim=self.action_dim, 
            action_fc1_dim=32,
            conc_fc1_dim=512, 
            conc_fc2_dim=256, 
            out_dim=1,
            lr=self.c_lr, 
            tau=self.tau,
        )
        
        #Creates the noise generator
        self.ou_noise = OUActionNoise(mean=np.zeros(action_dim))
        
    #-------------------------------------------------------------------- 
    def policy(self, state, explore=True):
        '''
        Args:
            state (Numpy array): A single state (no batch dimension).
            explore (boolean): When True, noise is added to the action with probability epsilon.
        Returns:
            Action chosen by the actor, clipped to [action_min, action_max]
        '''
        state = state[np.newaxis, :]
        action = self.actor.predict(state)[0]
        #Takes the exploration with the epsilon probability
        if explore and np.random.rand() < self.epsilon:
            action = action + self.ou_noise()
            
        return np.clip(action, a_min=self.action_min, a_max=self.action_max)
    
    #--------------------------------------------------------------------
    def learn(self, state, action, reward, next_state, done):
        '''
        Stores the transition and, once the buffer holds at least one minibatch,
        runs one update of the networks
        '''
        self.memory.append(state, action, reward, next_state, done)
        
        if self.memory.isMin():
            self.replay_memory()
        
    #--------------------------------------------------------------------    
    def replay_memory(self):
        '''
        Samples a minibatch and performs one critic update, one actor update,
        one soft update of both target networks and one epsilon decay step
        '''
        # Get sample experiences from the replay buffer
        experiences = self.memory.sample()
        
        #Get each term of the experiences
        states = np.array([exp[0] for exp in experiences])
        actions = np.array([exp[1] for exp in experiences])
        next_states = np.array([exp[3] for exp in experiences])
        #Rewards and terminal flags become float64 columns, so the arithmetic with the
        #float64 network outputs below does not rely on implicit type conversion
        rewards = np.array([exp[2] for exp in experiences], dtype=np.float64)[:, np.newaxis]
        done = np.array([exp[4] for exp in experiences], dtype=np.float64)[:, np.newaxis]
        
        #NOTE(review): Actor/Critic predict call their models with training=False, so the
        #BatchNormalization layers use their moving statistics here too - confirm this is intended
        
        #Compute the critic target values. They only involve the target networks, whose
        #variables are not trained by gradient, so they are computed outside the tape
        target_actions = self.actor.target_predict(next_states)
        y = rewards + self.gamma * self.critic.target_predict(next_states, target_actions) * (1.0 - done)
        
        #Train the critic
        with tf.GradientTape() as tape:
            #Compute the q_value of each state, action pair
            critic_value = self.critic.predict(states, actions)
            #Compute the critic loss 
            critic_loss = tf.math.reduce_mean(tf.math.square(y - critic_value))

        critic_grad = tape.gradient(critic_loss, self.critic.model.trainable_variables)
        self.critic.optimizer.apply_gradients(zip(critic_grad, self.critic.model.trainable_variables))
        
        #Train the actor
        with tf.GradientTape() as tape:
            acts = self.actor.predict(states)
            q_values = self.critic.predict(states, acts)
            #Used -mean as we want to maximize the value given by the critic for our actions
            actor_loss = -tf.math.reduce_mean(q_values)
            
        actor_grad = tape.gradient(actor_loss, self.actor.model.trainable_variables)
        self.actor.optimizer.apply_gradients(zip(actor_grad, self.actor.model.trainable_variables))
        
        #Update the model weights
        self.actor.transferWeights()
        self.critic.transferWeights() 
        
        #Decay the epsilon value
        self._decay_epsilon()
   
    #--------------------------------------------------------------------
    def _decay_epsilon(self):
        '''
        Multiplies epsilon by epsilon_decay without ever letting it go below epsilon_min
        '''
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)
   
    #--------------------------------------------------------------------     
    def act(self, env):
        '''
        Runs and renders one episode without exploration, then closes the environment
        '''
        #Reset the environment
        observation = env.reset()
        done = False
        
        try:
            while not done:
                env.render()
                time.sleep(0.02)
                action = self.policy(observation, explore=False)
                new_observation, reward, done, _ = env.step(action)
                observation = new_observation
        finally:
            #Close the environment even when the episode is interrupted by an error
            env.close()
        
    #--------------------------------------------------------------------     
    def train(self, env, num_episodes, verbose, verbose_num, end_on_complete, complete_num, complete_value, act_after_batch):
        '''
        Args:
            env: Environment with reset() returning an observation and step(action)
                 returning (observation, reward, done, info).
            num_episodes (integer): Maximum number of episodes.
            verbose (boolean): When True, prints the progress at every step.
            verbose_num (integer): A summary is printed after each verbose_num episodes.
            end_on_complete (boolean): When True, training stops once complete_num
                                       episodes were completed inside the current batch.
            complete_num (integer): Number of completed episodes needed to stop.
            complete_value (float): Score from which an episode counts as completed.
            act_after_batch (boolean): When True, a rendered episode is run after each summary.
        Returns:
            Tuple (scores_history, steps_history) with one entry per episode
        '''
        scores_history = []
        steps_history = []

        print("BEGIN\n")
        complete = 0
        
        for episode in range(num_episodes):
            done = False
            score = 0
            steps = 0
            observation = env.reset()
            #Start each episode from the initial noise state, so the temporally
            #correlated noise does not carry over from the previous episode
            self.ou_noise.reset()
            
            while not done:
                action = self.policy(observation)
                
                if verbose:
                    print("\r                                                                                                     ", end="")
                    print("\rEpisode: "+str(episode+1)+"\t Step: "+str(steps)+"\tReward: "+str(score) ,end="")
                    
                new_observation, reward, done, _ = env.step(action)
                self.learn(observation, action, reward, new_observation, done)
                observation = new_observation
                score += reward
                steps += 1

            scores_history.append(score)
            steps_history.append(steps)
            
            #If the score is bigger or equal than the complete score it add one to the completed number
            if(score >= complete_value):
                complete += 1
                #If the flag is true the agent ends the training once enough episodes were completed
                if end_on_complete and complete >= complete_num:
                    break
            
            #These information are printed after each verbose_num episodes
            if((episode+1)%verbose_num == 0):
                print("\r                                                                                                          ", end="")
                print("\rEpisodes: ", episode+1, "/", num_episodes
                      , "\n\tTotal reward: ", np.mean(scores_history[-verbose_num:])
                      , "\n\tNum. steps: ", np.mean(steps_history[-verbose_num:])
                      , "\n\tCompleted: ", complete, "\n--------------------------")
                
                #If the flag is true the agent act and render the episode after each verbose_num episodes
                if act_after_batch:
                    self.act(env)
                
                #Set the number of completed episodes on the batch to zero
                complete = 0

        print("\nFINISHED")
        
        return scores_history, steps_history
    #--------------------------------------------------------------------     
    def save(self, path):
        '''
        Saves the actor and critic weights in path + '_actor.h5' and path + '_critic.h5'
        '''
        self.actor.saveModel(path)
        self.critic.saveModel(path)
    
    #--------------------------------------------------------------------
    def load(self, a_path, c_path):
        '''
        Loads the actor weights from the file a_path and the critic weights from the file c_path
        '''
        self.actor.loadModel(a_path)
        self.critic.loadModel(c_path)

In [8]:
#Environment ---------------------------------------------------------
name = "BipedalWalker-v3"
env = gym.make(name)

#Problem dimensions and action bounds, read from the environment spaces
state_dim, action_dim = env.observation_space.shape[0], env.action_space.shape[0]
action_min, action_max = env.action_space.low, env.action_space.high

#Replay buffer -------------------------------------------------------
memory_size = 1_000_000
batch_size = 128

#Learning ------------------------------------------------------------
gamma = 0.99
a_lr = 0.0002
c_lr = 0.0005
tau = 0.005

#Exploration: probability of adding noise to an action and its decay --
epsilon = 1
epsilon_decay = 0.9999
epsilon_min = 0.5

agent = DDPGAgent(
    state_dim=state_dim,
    action_dim=action_dim,
    action_min=action_min,
    action_max=action_max,
    memory_size=memory_size,
    batch_size=batch_size,
    gamma=gamma,
    a_lr=a_lr,
    c_lr=c_lr,
    tau=tau,
    epsilon=epsilon,
    epsilon_decay=epsilon_decay,
    epsilon_min=epsilon_min,
)

  logger.warn(
2022-06-22 14:20:28.946930: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-06-22 14:20:30.481806: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudnn.so.8'; dlerror: libcudnn.so.8: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /home/user/.local/lib/python3.10/site-packages/cv2/../../lib64:
2022-06-22 14:20:30.481865: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1850] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2022-06-22 14:20:30.510121: I tensorflow/core/platform/cpu_feature_

In [9]:
#Folder with the saved weights, resolved against the current working directory.
#The trailing '' keeps the final path separator, so the file names below are
#<folder>/_actor.h5 and <folder>/_critic.h5
nets_path = os.path.join(os.path.abspath(''), 'networks', '11.1_DDPG_BipedalWalker', '')
agent.load(nets_path+"_actor.h5", nets_path+"_critic.h5")

In [10]:
#Arguments of agent.train (the call itself is left commented out below)
#Maximum number of training episodes
num_episodes = 3000
#Print the episode, step and accumulated reward at every step
verbose = True
#A summary (mean reward, mean steps, completed episodes) is printed after each verbose_num episodes
verbose_num = 100
#Stop training once complete_num episodes reached complete_value within the current batch of verbose_num episodes
end_on_complete = True
complete_num = 1
#Score from which an episode counts as completed
complete_value = 300
#Run and render one episode without exploration after each summary
act_after_batch = True

#agent.train(env, num_episodes, verbose, verbose_num, end_on_complete, complete_num, complete_value, act_after_batch)

In [10]:
#Run and render one episode with the current actor; act() calls the policy with explore=False and closes the environment at the end
agent.act(env)

In [12]:
#agent.save('/home/gustavo/PROG/RL_networks/11.1_DDPG_'+name+'/')