In [19]:
import tensorflow as tf
import tensorflow_probability as tfp


class DDPG():
    """Deep Deterministic Policy Gradient agent with a NumPy replay buffer.

    The agent does not own its networks: `initialize()` builds the actor,
    critic, their target copies and the two optimizers and returns them; the
    caller passes them back into `make_action()` and `learn()`.

    NOTE: the methods use NumPy through the module-level name `np`, so
    `numpy` must be imported as `np` before they are called.
    """

    def __init__(self, GAMMA=0.99, UB=1, LB=-1, CLR=1e-3, ALR=1e-3, BATCH=8, TAU=1e-3):
        """Store the hyper-parameters.

        Args:
            GAMMA: Discount factor used in the critic target.
            UB: Upper action bound; also the scale applied to the actor's
                tanh output.
            LB: Lower action bound used when clipping noisy actions.
            CLR: Learning rate of the critic optimizer.
            ALR: Learning rate of the actor optimizer.
            BATCH: Mini-batch size sampled from the replay buffer.
            TAU: Soft-update rate of the target networks.
        """
        self.num_states = 3
        self.num_actions = 1
        self.GAMMA = GAMMA
        self.UPPER_BOUND = UB
        self.LOWER_BOUND = LB
        self.CRITIC_LR = CLR
        self.ACTOR_LR = ALR
        self.BATCH = BATCH
        self.TAU = TAU
        self.buffer_capacity = int(100000)

        # Gaussian exploration noise added to every action in make_action().
        self.noise = tfp.distributions.Normal(0, 0.2)

    def getActor(self):
        """Build the policy network: state -> action in [-UPPER_BOUND, UPPER_BOUND]."""
        # Initialize the last layer's weights between -3e-3 and 3e-3 so the
        # initial tanh output stays close to zero.
        last_init = tf.random_uniform_initializer(minval=-0.003, maxval=0.003)

        inputs = tf.keras.layers.Input(shape=(self.num_states,))
        out = tf.keras.layers.Dense(256, activation="relu")(inputs)
        out = tf.keras.layers.Dense(256, activation="relu")(out)
        outputs = tf.keras.layers.Dense(
            self.num_actions, activation="tanh", kernel_initializer=last_init
        )(out)

        # tanh yields values in [-1, 1]; rescale to the configured upper bound.
        outputs = outputs * self.UPPER_BOUND
        return tf.keras.Model(inputs, outputs)

    def getCritic(self):
        """Build the Q-network: (state, action) -> single scalar value."""
        # State branch. The shape must be a tuple, hence the trailing comma.
        state_input = tf.keras.layers.Input(shape=(self.num_states,))
        state_out = tf.keras.layers.Dense(16, activation="relu")(state_input)
        state_out = tf.keras.layers.Dense(32, activation="relu")(state_out)

        # Action branch.
        action_input = tf.keras.layers.Input(shape=(self.num_actions,))
        action_out = tf.keras.layers.Dense(32, activation="relu")(action_input)

        # Both inputs go through separate layers before being concatenated.
        concat = tf.keras.layers.Concatenate()([state_out, action_out])

        out = tf.keras.layers.Dense(256, activation="relu")(concat)
        out = tf.keras.layers.Dense(256, activation="relu")(out)
        outputs = tf.keras.layers.Dense(1)(out)

        # Outputs a single value for the given state-action pair.
        return tf.keras.Model([state_input, action_input], outputs)

    def make_action(self, actor_model, state):
        """Return a noisy action for `state`, clipped to the legal range.

        Args:
            actor_model: Actor network as returned by `initialize()`.
            state: Tensor the actor is called on (the caller in this file
                passes a single observation with a leading batch axis).

        Returns:
            A one-element list holding the clipped action.
        """
        policy_action = tf.squeeze(actor_model(state)).numpy()
        # Add exploration noise, then keep the result inside the bounds.
        noisy_action = policy_action + self.noise.sample().numpy()
        legal_action = np.clip(noisy_action, self.LOWER_BOUND, self.UPPER_BOUND)

        return [np.squeeze(legal_action)]

    def initialize(self):
        """Create networks, optimizers and the replay buffer.

        Returns:
            `[actor, critic, target_actor, target_critic, critic_optimizer,
            actor_optimizer]`.
        """
        actor = self.getActor()
        target_actor = self.getActor()

        critic = self.getCritic()
        target_critic = self.getCritic()

        critic_optimizer = tf.keras.optimizers.Adam(self.CRITIC_LR)
        actor_optimizer = tf.keras.optimizers.Adam(self.ACTOR_LR)

        # Targets start as exact copies of the online networks.
        target_actor.set_weights(actor.get_weights())
        target_critic.set_weights(critic.get_weights())

        self.getBuffer()

        return [actor, critic, target_actor, target_critic, critic_optimizer, actor_optimizer]

    def getBuffer(self):
        """(Re)allocate an empty replay buffer and latch the batch size."""
        # BATCH is read here, not in __init__, so it can still be changed on
        # the instance before initialize() is called.
        self.batch_size = self.BATCH
        self.buffer_counter = 0

        # float32 matches the networks' dtype and halves the memory footprint
        # compared with NumPy's float64 default.
        self.state_buffer = np.zeros((self.buffer_capacity, self.num_states), dtype=np.float32)
        self.action_buffer = np.zeros((self.buffer_capacity, self.num_actions), dtype=np.float32)
        self.reward_buffer = np.zeros((self.buffer_capacity, 1), dtype=np.float32)
        self.next_state_buffer = np.zeros((self.buffer_capacity, self.num_states), dtype=np.float32)

    def record(self, obs_tuple):
        """Store one `(state, action, reward, next_state)` transition.

        Once `buffer_capacity` transitions have been written, the index wraps
        around and the oldest records are overwritten.
        """
        index = self.buffer_counter % self.buffer_capacity

        self.state_buffer[index] = obs_tuple[0]
        self.action_buffer[index] = obs_tuple[1]
        self.reward_buffer[index] = obs_tuple[2]
        self.next_state_buffer[index] = obs_tuple[3]

        self.buffer_counter += 1

    @tf.function
    def update(self, actor_model, critic_model, target_actor, target_critic,
               actor_optimizer, critic_optimizer,
               state_batch, action_batch, reward_batch, next_state_batch):
        """Run one gradient step on the critic, then one on the actor."""
        # Critic: minimise the squared error against the bootstrapped target
        # y = r + GAMMA * Q_target(s', mu_target(s')).
        with tf.GradientTape() as tape:
            target_actions = target_actor(next_state_batch, training=True)
            y = reward_batch + self.GAMMA * target_critic(
                [next_state_batch, target_actions], training=True
            )
            critic_value = critic_model([state_batch, action_batch], training=True)
            critic_loss = tf.math.reduce_mean(tf.math.square(y - critic_value))

        critic_grad = tape.gradient(critic_loss, critic_model.trainable_variables)
        critic_optimizer.apply_gradients(
            zip(critic_grad, critic_model.trainable_variables)
        )

        # Actor: maximise the critic's value of the actor's own actions
        # (implemented as minimising the negated mean).
        with tf.GradientTape() as tape:
            actions = actor_model(state_batch, training=True)
            critic_value = critic_model([state_batch, actions], training=True)
            actor_loss = -tf.math.reduce_mean(critic_value)

        actor_grad = tape.gradient(actor_loss, actor_model.trainable_variables)
        actor_optimizer.apply_gradients(
            zip(actor_grad, actor_model.trainable_variables)
        )

    @tf.function
    def update_target(self, target_weights, weights):
        """Soft-update: target <- TAU * online + (1 - TAU) * target."""
        # Use this instance's TAU rather than a module-level `agent` global,
        # so the class works regardless of the variable name it is bound to.
        tau = self.TAU
        for (a, b) in zip(target_weights, weights):
            a.assign(b * tau + a * (1 - tau))

    def learn(self, actor_model, critic_model, target_actor, target_critic,
               actor_optimizer, critic_optimizer):
        """Sample a mini-batch, update both networks and their targets."""
        # Only the filled part of the buffer may be sampled.
        record_range = min(self.buffer_counter, self.buffer_capacity)
        if record_range == 0:
            # Nothing recorded yet: np.random.choice would raise on an empty range.
            return
        # Randomly sample indices (with replacement).
        batch_indices = np.random.choice(record_range, self.batch_size)

        # Convert to float32 tensors so every input matches the networks' dtype.
        state_batch = tf.cast(
            tf.convert_to_tensor(self.state_buffer[batch_indices]), dtype=tf.float32)
        action_batch = tf.cast(
            tf.convert_to_tensor(self.action_buffer[batch_indices]), dtype=tf.float32)
        reward_batch = tf.cast(
            tf.convert_to_tensor(self.reward_buffer[batch_indices]), dtype=tf.float32)
        next_state_batch = tf.cast(
            tf.convert_to_tensor(self.next_state_buffer[batch_indices]), dtype=tf.float32)

        self.update(actor_model, critic_model, target_actor, target_critic,
                    actor_optimizer, critic_optimizer,
                    state_batch, action_batch, reward_batch, next_state_batch)

        self.update_target(target_actor.variables, actor_model.variables)
        self.update_target(target_critic.variables, critic_model.variables)

In [22]:
import gym
import tensorflow as tf
from tensorflow.keras import layers
import numpy as np
import matplotlib.pyplot as plt
import tensorflow_probability as tfp

# Train a DDPG agent on Pendulum, rendering every step, until either
# MAX_EPISODES have been played or the average reward passes the goal.
problem = "Pendulum-v1"
env = gym.make(problem)

agent = DDPG(ALR = 1e-3, CLR = 2e-3, TAU = 5e-3, GAMMA = 0.99)
# Read the dimensions from the environment instead of hard-coding them
# (Pendulum-v1: 3 observation components, 1 action component).
agent.num_states = env.observation_space.shape[0]
agent.num_actions = env.action_space.shape[0]
# Must be set before initialize(), which latches BATCH into the replay buffer.
agent.BATCH = 64
actor_model, critic_model, target_actor, target_critic, critic_optimizer, actor_optimizer = agent.initialize()


MAX_EPISODES = 100
episodes_rewards= []

try:
    for ep in range(MAX_EPISODES):
        prev_state = env.reset()
        episodic_reward = 0
        while True:
            # The actor expects a batch axis in front of the observation.
            tf_prev_state = tf.expand_dims(tf.convert_to_tensor(prev_state), 0)
            action = agent.make_action(actor_model, tf_prev_state)
            state, reward, done, info = env.step(action)

            agent.record((prev_state, action, reward, state))
            episodic_reward += reward

            # One learning step per environment step.
            agent.learn(actor_model, critic_model, target_actor, target_critic, critic_optimizer, actor_optimizer)
            env.render()

            if done:
                break

            prev_state = state

        episodes_rewards.append(episodic_reward)

        # NOTE(review): this is the mean over ALL episodes so far, so poor
        # early episodes keep weighing on the stopping criterion below;
        # confirm whether a recent-episodes window was intended.
        avg_reward = np.mean(episodes_rewards)
        print(f"For episode {ep} the avg reward is {avg_reward}")

        if avg_reward > -200:
            print('Goal is reached! saving results..')
            actor_model.save_weights("pendulum_actor.h5")
            critic_model.save_weights("pendulum_critic.h5")

            target_actor.save_weights("pendulum_target_actor.h5")
            target_critic.save_weights("pendulum_target_critic.h5")
            break
finally:
    # Release the environment (and its render window) even when training is
    # interrupted, e.g. by a KeyboardInterrupt.
    env.close()

ciao
ciao
For episode 0 the avg reward is -896.6578066442131
For episode 1 the avg reward is -1385.5627190535301
For episode 2 the avg reward is -1430.3574781079706
For episode 3 the avg reward is -1433.8430598517084
For episode 4 the avg reward is -1462.0648625267274
For episode 5 the avg reward is -1502.9833273173447
For episode 6 the avg reward is -1493.8186049164694
For episode 7 the avg reward is -1511.346387143422
For episode 8 the avg reward is -1497.7901046176353
For episode 9 the avg reward is -1462.3002784888645
For episode 10 the avg reward is -1419.6076071682926
For episode 11 the avg reward is -1373.545467218374
For episode 12 the avg reward is -1334.8697902019474
For episode 13 the avg reward is -1301.9797043256092
For episode 14 the avg reward is -1268.4357765581397
For episode 15 the avg reward is -1220.1605097894521
For episode 16 the avg reward is -1216.4263069285457
For episode 17 the avg reward is -1169.6552000582992
For episode 18 the avg reward is -1135.4865480984

KeyboardInterrupt: 