In [14]:
import tensorflow as tf
import numpy as np

class Buffer:
    """Fixed-capacity experience replay buffer that also drives the DDPG update.

    Transitions ``(s, a, r, s')`` are kept in four parallel NumPy arrays and are
    overwritten oldest-first once ``buffer_capacity`` records have been stored.

    Note:
        ``update`` reads the networks and optimizers (``actor_model``,
        ``critic_model``, ``target_actor``, ``target_critic``,
        ``actor_optimizer``, ``critic_optimizer``) from module-level names, so
        those must exist before the first call to ``learn`` / ``update``.
    """

    def __init__(self, agent, buffer_capacity=100000, batch_size=64):
        """Allocate storage sized from ``agent.num_states`` / ``agent.num_actions``.

        Args:
            agent: Object exposing ``num_states``, ``num_actions`` and ``GAMMA``.
            buffer_capacity: Maximum number of transitions kept at once.
            batch_size: Number of transitions sampled by each ``learn`` call.
        """
        # Kept so `update` uses this agent's discount factor instead of
        # whatever object happens to be bound to a global named `agent`.
        self.agent = agent

        # Maximum number of "experiences" to store
        self.buffer_capacity = buffer_capacity
        # Number of tuples to train on per update
        self.batch_size = batch_size

        # Counts how many times record() has been called
        self.buffer_counter = 0

        # Rather than a list of tuples (the textbook replay buffer), keep one
        # array per tuple element so a batch can be gathered by fancy indexing.
        self.state_buffer = np.zeros((self.buffer_capacity, agent.num_states))
        self.action_buffer = np.zeros((self.buffer_capacity, agent.num_actions))
        self.reward_buffer = np.zeros((self.buffer_capacity, 1))
        self.next_state_buffer = np.zeros((self.buffer_capacity, agent.num_states))

    def record(self, obs_tuple):
        """Store one transition, overwriting the oldest slot once the buffer is full.

        Args:
            obs_tuple: Indexable ``(state, action, reward, next_state)``.
        """
        # The modulo wraps the write position back to 0 after
        # `buffer_capacity` records, replacing old entries.
        index = self.buffer_counter % self.buffer_capacity

        self.state_buffer[index] = obs_tuple[0]
        self.action_buffer[index] = obs_tuple[1]
        self.reward_buffer[index] = obs_tuple[2]
        self.next_state_buffer[index] = obs_tuple[3]

        self.buffer_counter += 1

    # Eager execution is on by default in TensorFlow 2. Decorating with
    # tf.function lets TensorFlow build a static graph out of this method,
    # which speeds up code made of many small TensorFlow operations.
    @tf.function
    def update(
        self,
        state_batch,
        action_batch,
        reward_batch,
        next_state_batch,
    ):
        """Run one critic step followed by one actor step on the given batch.

        The critic is regressed onto ``r + GAMMA * Q_target(s', mu_target(s'))``;
        the actor is then updated to maximise the critic's value of its own
        actions. Networks and optimizers are read from module-level names.
        """
        with tf.GradientTape() as tape:
            target_actions = target_actor(next_state_batch, training=True)
            # Discount comes from the agent given to __init__, not a global.
            y = reward_batch + self.agent.GAMMA * target_critic(
                [next_state_batch, target_actions], training=True
            )
            critic_value = critic_model([state_batch, action_batch], training=True)
            critic_loss = tf.math.reduce_mean(tf.math.square(y - critic_value))

        critic_grad = tape.gradient(critic_loss, critic_model.trainable_variables)
        critic_optimizer.apply_gradients(
            zip(critic_grad, critic_model.trainable_variables)
        )

        with tf.GradientTape() as tape:
            actions = actor_model(state_batch, training=True)
            critic_value = critic_model([state_batch, actions], training=True)
            # Negated because the optimizer minimises, while we want to
            # maximise the value the critic assigns to the actor's actions.
            actor_loss = -tf.math.reduce_mean(critic_value)

        actor_grad = tape.gradient(actor_loss, actor_model.trainable_variables)
        actor_optimizer.apply_gradients(
            zip(actor_grad, actor_model.trainable_variables)
        )

    def learn(self):
        """Sample a random batch (with replacement) and apply ``update`` to it.

        Does nothing when no transition has been recorded yet.
        """
        # Only the filled part of the arrays may be sampled
        record_range = min(self.buffer_counter, self.buffer_capacity)
        if record_range == 0:
            # np.random.choice(0, n) raises ValueError for a positive n;
            # with nothing stored there is nothing to learn from.
            return

        # Randomly sample indices
        batch_indices = np.random.choice(record_range, self.batch_size)

        # Convert to tensors
        state_batch = tf.convert_to_tensor(self.state_buffer[batch_indices])
        action_batch = tf.convert_to_tensor(self.action_buffer[batch_indices])
        reward_batch = tf.convert_to_tensor(self.reward_buffer[batch_indices])
        # Rewards enter the TD target arithmetic directly, so cast them to
        # float32 explicitly before they are added to the critic output.
        reward_batch = tf.cast(reward_batch, dtype=tf.float32)
        next_state_batch = tf.convert_to_tensor(self.next_state_buffer[batch_indices])

        self.update(state_batch, action_batch, reward_batch, next_state_batch)

@tf.function
def update_target(target_weights, weights, tau):
    """Blend each source variable into its paired target variable in place.

    Pairs are formed positionally with ``zip``. Every target variable is
    assigned ``source * tau + target * (1 - tau)``.

    Args:
        target_weights: Iterable of variables to be updated (must support ``assign``).
        weights: Iterable of source variables, in the same order.
        tau: Blend factor applied to the source variables.
    """
    keep_fraction = 1 - tau
    for target_var, source_var in zip(target_weights, weights):
        blended = source_var * tau + target_var * keep_fraction
        target_var.assign(blended)



In [15]:
import tensorflow as tf
import tensorflow_probability as tfp


class DDPG():
    """Hyperparameters and network factories for a DDPG agent.

    Holds the discount factor, action bounds, learning rates and target-update
    rate, and builds the Keras actor / critic models that use them.
    """

    def __init__(self, GAMMA=0.99, UB=1, LB=-1, CLR=1e-3, ALR=1e-3, BATCH=8, TAU=1e-3,
                 num_states=3, num_actions=1):
        """Store hyperparameters and create the exploration-noise distribution.

        Args:
            GAMMA: Discount factor.
            UB: Upper action bound; also the scale applied to the actor's tanh output.
            LB: Lower action bound used when clipping noisy actions.
            CLR: Critic learning rate.
            ALR: Actor learning rate.
            BATCH: Stored as ``self.BATCH``; not read elsewhere in this class.
            TAU: Stored as ``self.TAU`` for use by the caller's target update.
            num_states: Size of the observation vector fed to the networks.
            num_actions: Size of the action vector produced by the actor.
        """
        super().__init__()
        self.num_states = num_states
        self.num_actions = num_actions
        self.GAMMA = GAMMA
        self.UPPER_BOUND = UB
        self.LOWER_BOUND = LB
        self.CRITIC_LR = CLR
        self.ACTOR_LR = ALR
        self.BATCH = BATCH
        self.TAU = TAU

        # Zero-mean Gaussian exploration noise with scale 0.2
        self.noise = tfp.distributions.Normal(0, 0.2)

    def Actor(self):
        """Build the policy network: state -> action scaled by ``UPPER_BOUND``."""
        # Initialize the last layer's weights between -3e-3 and 3e-3 so the
        # tanh starts near zero instead of saturated.
        last_init = tf.random_uniform_initializer(minval=-0.003, maxval=0.003)

        inputs = tf.keras.layers.Input(shape=(self.num_states,))
        out = tf.keras.layers.Dense(256, activation="relu")(inputs)
        out = tf.keras.layers.Dense(256, activation="relu")(out)
        outputs = tf.keras.layers.Dense(
            self.num_actions, activation="tanh", kernel_initializer=last_init
        )(out)

        # tanh yields values in [-1, 1]; scaling by UPPER_BOUND stretches that
        # to [-UPPER_BOUND, UPPER_BOUND] (LOWER_BOUND is not used here).
        outputs = outputs * self.UPPER_BOUND
        model = tf.keras.Model(inputs, outputs)
        return model

    def Critic(self):
        """Build the Q network: (state, action) -> single scalar value."""
        # State branch. The shape is a one-element tuple, matching Actor().
        state_input = tf.keras.layers.Input(shape=(self.num_states,))
        state_out = tf.keras.layers.Dense(16, activation="relu")(state_input)
        state_out = tf.keras.layers.Dense(32, activation="relu")(state_out)

        # Action branch
        action_input = tf.keras.layers.Input(shape=(self.num_actions,))
        action_out = tf.keras.layers.Dense(32, activation="relu")(action_input)

        # Both branches pass through separate layers before concatenation
        concat = tf.keras.layers.Concatenate()([state_out, action_out])

        out = tf.keras.layers.Dense(256, activation="relu")(concat)
        out = tf.keras.layers.Dense(256, activation="relu")(out)
        outputs = tf.keras.layers.Dense(1)(out)

        # Outputs a single value for the given state-action pair
        model = tf.keras.Model([state_input, action_input], outputs)

        return model

    def make_action(self, actor_model, state):
        """Return a noisy, clipped action for ``state``.

        Args:
            actor_model: Callable policy network, e.g. the result of ``Actor()``.
            state: Input passed straight to ``actor_model``
                (presumably a batch of one observation; confirm against caller).

        Returns:
            A one-element list holding the squeezed, clipped action array.
        """
        sampled_actions = tf.squeeze(actor_model(state))
        # Add exploration noise to the deterministic policy output
        sampled_actions = sampled_actions.numpy() + self.noise.sample()

        # Keep the action inside the configured bounds
        legal_action = np.clip(sampled_actions, self.LOWER_BOUND, self.UPPER_BOUND)

        return [np.squeeze(legal_action)]

    def initialize(self):
        """Create networks and optimizers, with targets synced to the online nets.

        Returns:
            ``[actor, critic, target_actor, target_critic, critic_optimizer,
            actor_optimizer]``.
        """
        actor = self.Actor()
        target_actor = self.Actor()

        critic = self.Critic()
        target_critic = self.Critic()

        critic_optimizer = tf.keras.optimizers.Adam(self.CRITIC_LR)
        actor_optimizer = tf.keras.optimizers.Adam(self.ACTOR_LR)

        # Targets start as exact copies of the online networks
        target_actor.set_weights(actor.get_weights())
        target_critic.set_weights(critic.get_weights())

        return [actor, critic, target_actor, target_critic, critic_optimizer, actor_optimizer]

In [None]:

import gym
import tensorflow as tf
from tensorflow.keras import layers
import numpy as np
import matplotlib.pyplot as plt
import tensorflow_probability as tfp

problem = "Pendulum-v1"
env = gym.make(problem)

# Read the action limits from the environment so the policy can use the full
# action range; the DDPG constructor defaults of +/-1 would otherwise apply.
upper_bound = float(env.action_space.high[0])
lower_bound = float(env.action_space.low[0])

agent = DDPG(ALR=1e-3, CLR=2e-3, TAU=5e-3, GAMMA=0.99, UB=upper_bound, LB=lower_bound)
# Network input/output sizes follow the environment rather than literals.
agent.num_states = env.observation_space.shape[0]
agent.num_actions = env.action_space.shape[0]

actor_model, critic_model, target_actor, target_critic, critic_optimizer, actor_optimizer = agent.initialize()

MAX_EPISODES = 100

buffer = Buffer(agent, 50000, 64)
# To store reward history of each episode
ep_reward_list = []
# To store average reward history of last few episodes
avg_reward_list = []

try:
    for ep in range(MAX_EPISODES):
        # NOTE: uses the Gym API where reset() returns only the observation
        # and step() returns a 4-tuple.
        prev_state = env.reset()
        episodic_reward = 0
        done = False

        while not done:
            tf_prev_state = tf.expand_dims(tf.convert_to_tensor(prev_state), 0)
            action = agent.make_action(actor_model, tf_prev_state)
            state, reward, done, info = env.step(action)

            buffer.record((prev_state, action, reward, state))
            episodic_reward += reward

            # One gradient step, then soft-update both target networks
            buffer.learn()
            update_target(target_actor.variables, actor_model.variables, agent.TAU)
            update_target(target_critic.variables, critic_model.variables, agent.TAU)

            env.render()

            prev_state = state

        ep_reward_list.append(episodic_reward)

        # Mean of last 40 episodes
        avg_reward = np.mean(ep_reward_list[-40:])
        print("Episode * {} * Avg Reward is ==> {}".format(ep, avg_reward))
        avg_reward_list.append(avg_reward)
finally:
    # Release the environment (and its render window) even if training is
    # interrupted or raises.
    env.close()

# Save the weights
actor_model.save_weights("pendulum_actor.h5")
critic_model.save_weights("pendulum_critic.h5")

target_actor.save_weights("pendulum_target_actor.h5")
target_critic.save_weights("pendulum_target_critic.h5")


2022-04-12 10:58:02.521211: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
2022-04-12 10:58:02.540562: I tensorflow/core/platform/profile_utils/cpu_utils.cc:112] CPU Frequency: 2894575000 Hz


Episode * 0 * Avg Reward is ==> -1336.3557892203871
Episode * 1 * Avg Reward is ==> -1248.1354799454991
Episode * 2 * Avg Reward is ==> -1340.7260758240084
Episode * 3 * Avg Reward is ==> -1476.042522766396
Episode * 4 * Avg Reward is ==> -1512.976164193194
Episode * 5 * Avg Reward is ==> -1519.0062679339353
Episode * 6 * Avg Reward is ==> -1536.1039331802583
Episode * 7 * Avg Reward is ==> -1545.7042825017527
Episode * 8 * Avg Reward is ==> -1568.6608825766318
Episode * 9 * Avg Reward is ==> -1534.5671781572357
Episode * 10 * Avg Reward is ==> -1485.3845146111116
Episode * 11 * Avg Reward is ==> -1463.8840392278828
Episode * 12 * Avg Reward is ==> -1429.3302558609807
Episode * 13 * Avg Reward is ==> -1392.7451813296934
Episode * 14 * Avg Reward is ==> -1351.2503106193856
Episode * 15 * Avg Reward is ==> -1305.8073438585066
Episode * 16 * Avg Reward is ==> -1265.744528198065
Episode * 17 * Avg Reward is ==> -1251.605503428689
Episode * 18 * Avg Reward is ==> -1212.292705128938
Episode 