In [47]:
import tensorflow as tf
import tensorflow_probability as tfp
import numpy as np

#Simulation configuration
MAX_EPISODE = 1000


In [48]:
class DDPG():

    def __init__(self, GAMMA=0.99, EFFORT=1, CLR = 1e-3, ALR = 1e-4, BATCH = 8, TAU = 0.005,
    STD = 0.2, DT = 1e-3, THETA = 0.15):
        super(DDPG, self).__init__()
        self.num_states = 1
        self.num_actions = 1
        self.gamma = GAMMA
        self.max_effort = EFFORT
        self.CRITIC_LR = CLR
        self.ACTOR_LR = ALR
        self.BATCH = BATCH
        self.TAU = TAU
        self.THETA = THETA
        self.DT = DT
        self.NMEAN = np.zeros(self.num_actions)
        self.STD = np.float(STD)*np.ones(self.num_actions)

    def Actor(self):

        initializer = tf.random_uniform_initializer(minval=-0.003, maxval=0.003)

        input = tf.keras.layers.Input(shape=(None,self.num_states))
        
        hidden = tf.keras.layers.Dense(400, 
        activation=tf.keras.layers.ReLU(), 
        kernel_initializer= initializer)(input)
        
        hidden = tf.keras.layers.Dense(300, 
        activation=tf.keras.layers.ReLU(), 
        kernel_initializer= initializer)(hidden)

        outputs = tf.keras.layers.Dense(
            self.num_actions, activation="tanh")(hidden)
        
        outputs = tf.keras.layers.experimental.preprocessing.Rescaling(self.max_effort)(outputs)    
        
        return tf.keras.Model(input, outputs)

    def Critic(self):

        initializer = tf.random_uniform_initializer(minval=-0.003, maxval=0.003)

        # State as input
        state_input = tf.keras.layers.Input(shape=(self.num_states) )
        
        state_out_critic = tf.keras.layers.Dense(400, 
        activation=tf.keras.layers.ReLU(),
         kernel_initializer= initializer)(state_input)
        
        state_out_critic = tf.keras.layers.Dense(300, 
        activation=tf.keras.layers.ReLU(),
         kernel_initializer= initializer)(state_out_critic)

        # Action as input
        action_input = tf.keras.layers.Input(shape=(self.num_actions))

        action_out_critic = tf.keras.layers.Dense(300,
        activation=tf.keras.layers.ReLU(),
         kernel_initializer= initializer)(action_input)

        # Concatening 2 networks
        concat = tf.keras.layers.Concatenate()(
            [state_out_critic, action_out_critic]
            )

        out = tf.keras.layers.Dense(128, 
            activation=tf.keras.layers.ReLU(),
            kernel_initializer= initializer)(concat)
        
        # Predicted Q(s,a)
        outputs = tf.keras.layers.Dense(1)(out)

        return tf.keras.Model([state_input, action_input], outputs)

    def initializer(self):
        global n

        n = 0
        actor_model = self.Actor()
        critic_model = self.Critic()

        target_actor = self.Actor()
        target_critic = self.Critic()

        critic_lr = self.CRITIC_LR
        actor_lr = self.ACTOR_LR

        critic_optimizer = tf.keras.optimizers.Adam(critic_lr )
        actor_optimizer = tf.keras.optimizers.Adam(actor_lr)


        models = [critic_model, actor_model, target_critic, target_actor]
        optimizers = [critic_optimizer, actor_optimizer]

        return models, optimizers 

    def update_target(self, target_weights, weights):
            for (a, b) in zip(target_weights, weights):
                a.assign(b * self.TAU + a * (1 - self.TAU))   
     
    def update(self, models, optimizers, state, action, reward, next_state, next_action, batch):

        state = np.array(state, dtype = np.float32)

        if batch == 1:
            state = state.reshape(len(state),self.num_states)
        else:
            state = state.reshape(1,4)
        state = tf.convert_to_tensor(state, dtype=tf.float32)

        next_state = np.array(next_state)
        if batch == 1:
            next_state = next_state.reshape(len(next_state),self.num_states)
        else:
            next_state = next_state.reshape(1,4)

        next_state = tf.convert_to_tensor(next_state, dtype=tf.float32)
        
        action = np.float32(action)
        if batch == 1:
            action = action.reshape(len(action),self.num_actions)
        else:
            action = tf.expand_dims(action, 0)
        action = tf.convert_to_tensor(action, dtype=tf.float32)

        next_action = np.float32(next_action)
        if batch == 1:
            next_action = next_action.reshape(len(next_action),self.num_actions)
        else:
            next_action = tf.expand_dims(next_action, 0)
        next_action = tf.convert_to_tensor(next_action, dtype=tf.float32)
        
        
        reward = tf.expand_dims(np.float32(reward), 0)
        reward = tf.convert_to_tensor(reward, dtype=tf.float32)

        with tf.GradientTape() as gradient:
                        
            target_actions = models[3](next_state, training = True)

            critic_q = models[0]([state, action], training=True)  # Q(s,a)

            y = reward + self.gamma * \
                models[2]([next_state, target_actions],
                             training=True)  
            
            critic_loss = tf.keras.losses.mean_squared_error(y_true=y, y_pred=critic_q)
            
            critic_grad = gradient.gradient(
                critic_loss, models[0].trainable_variables)
            
            critic_grad = [(tf.clip_by_value(grad, -1,1)) for grad in critic_grad]

            optimizers[0].apply_gradients(
                zip(critic_grad, models[0].trainable_variables))

        with tf.GradientTape() as gradient:

            actions = models[1](state, training=True)

            critic_q = models[0]([state, actions], training=True)

            actor_loss = tf.math.reduce_mean(-critic_q)

            actor_grad = gradient.gradient(
                actor_loss, models[1].trainable_variables)
            
            actor_grad = [(tf.clip_by_value(grad, -1,1)) for grad in actor_grad]
            
            optimizers[1].apply_gradients(
                zip(actor_grad, models[1].trainable_variables))

        self.update_target(models[3].variables, models[1].variables)
        self.update_target(models[2].variables, models[0].variables)

        return critic_loss, actor_loss

    def UONoise(self, bn):
        global n
        n = bn + self.THETA *(self.NMEAN - bn)*self.DT + self.STD + np.sqrt(self.DT) * np.random.normal(size=self.NMEAN.shape)

        return n

    def make_action(self, actor_model, state):
        global n
        #s_nn = np.array(state, dtype=np.float32)
        #s_nn = tf.expand_dims(s_nn, 0) #prepare state for NN 
        n = self.UONoise(n)

        return np.float32(actor_model(state) + n)

    def train(self, models, optimizers, total_trajectory):

        num_sample = len(total_trajectory)
        total_rew, atl, ctl = [], [], []

        if num_sample > 4:
            if num_sample < self.BATCH:
                print('Training without batch beacuse num of samples < batch size')
                for i in range(num_sample):
                    print('Learning from sample ', i+1)

                    closs, aloss = self.update(models, optimizers, 
                                            total_trajectory[i][0],
                                            total_trajectory[i][1], 
                                            total_trajectory[i][2],
                                            total_trajectory[i][3], 
                                            total_trajectory[i][4],
                                            0)

            else:
                for mb in range(num_sample//self.BATCH):
                    mini_state0 = []
                    mini_action0 = []
                    mini_state1 = []
                    mini_action1 = []
                    mini_reward = []
                    for sample in range(self.BATCH):
                        mini_state0.append(total_trajectory[mb*self.BATCH:self.BATCH*(mb+1)+1][sample][0])
                        mini_action0.append(total_trajectory[mb*self.BATCH:self.BATCH*(mb+1)+1][sample][1])
                        mini_reward.append(total_trajectory[mb*self.BATCH:self.BATCH*(mb+1)+1][sample][2])
                        mini_state1.append(total_trajectory[mb*self.BATCH:self.BATCH*(mb+1)+1][sample][3])
                        mini_action1.append(total_trajectory[mb*self.BATCH:self.BATCH*(mb+1)+1][sample][4])
                        
                    closs, aloss = self.update(models, optimizers,
                                                mini_state0,
                                                mini_action0, 
                                                mini_reward,
                                                mini_state1, 
                                                mini_action1,
                                                1)
                    
                    total_rew.append(sum(mini_reward)/len(mini_reward))
                    atl.append(aloss)
                    ctl.append(closs)
                    
            avg_reward = np.mean(total_rew)
            avg_atl = np.mean(atl)
            avg_ctl = np.mean(ctl)
            print("Avg reward{} , Avg actor loss is {}, Avg critic loss is {}".format(avg_reward, avg_atl, avg_ctl))
        else:
            print('[WARNING] Few samples acquired. Skipping training phase. ')
        return avg_reward

In [None]:
import gym
from stable_baselines.common import make_vec_env

envs = make_vec_env('Pendulum-v1')

print('[INFO] Global variables initialization...')

total_trajectory, tr = [], []

agents = DDPG()
agents.num_states = env.observation_space.shape[0]
agents.num_actions = env.action_space.shape[0]
agents.max_effort = 2
agents.gamma = 0.98
agents.std = 0.1
agents.ALC = 0.001
agents.CLR = 0.002
agents.BATCH = 16
models, optimizers = agents.initializer()


s0 = [0]*agents.num_states
a0 = np.array(0, dtype = np.float32)
r = np.array(0, dtype = np.float32)
G_t = np.array(0, dtype = np.float32)
i, j, t, started, avg_rew, episode, step, done = 0 , 0, 0, 0, 0, 0, 0, False
states, actions, traj, total_trajectory = [], [], [], []


while episode < MAX_EPISODE:
    state = envs.reset()
    while done == False:
        action = agents.make_action(models[1], state)
        next_state, reward, done, _ = envs.step(action)
        next_action = agents.make_action(models[1], next_state)
        total_trajectory.append([state, action, reward, next_state, next_action])
        envs.render()
        step += 1
        if done:
            print('[INFO] Simulation is finish. Learning for episode ', episode+1, ' /', MAX_EPISODE)
            agents.train(models, optimizers, total_trajectory)
            done = False
            episode += 1

[INFO] Global variables initialization...
[INFO] Simulation is finish. Learning for episode  1  / 1000
Avg reward-3.740112543106079 , Avg actor loss is 0.10846234112977982, Avg critic loss is 21.854019165039062
[INFO] Simulation is finish. Learning for episode  2  / 1000
Avg reward-5.199722766876221 , Avg actor loss is 0.3142763376235962, Avg critic loss is 15.242389678955078
[INFO] Simulation is finish. Learning for episode  3  / 1000
Avg reward-6.1664719581604 , Avg actor loss is 0.16486269235610962, Avg critic loss is 17.281787872314453
[INFO] Simulation is finish. Learning for episode  4  / 1000
Avg reward-6.754498481750488 , Avg actor loss is 0.0770951583981514, Avg critic loss is 15.352983474731445
[INFO] Simulation is finish. Learning for episode  5  / 1000
Avg reward-7.026749134063721 , Avg actor loss is 0.07093824446201324, Avg critic loss is 16.000022888183594
[INFO] Simulation is finish. Learning for episode  6  / 1000
Avg reward-7.100833415985107 , Avg actor loss is 0.06894