<a href="https://colab.research.google.com/github/dude123studios/AdvancedReinforcementLearning/blob/main/PPO_TF2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow.keras.layers import *
import numpy as np
import gym
tfd = tfp.distributions

In [None]:
# Create the environment and read the sizes the networks need from it.
env = gym.make('Pendulum-v0')
# Length of one observation vector (network input size).
state_shape = env.observation_space.shape[0]
# Length of one action vector. This must come from the ACTION space; reading
# it from the observation space makes the policy emit one output per
# observation feature instead of one per action dimension.
action_shape = env.action_space.shape[0]
# [low, high] limits used to clip sampled actions before stepping the env.
action_bound = [env.action_space.low, env.action_space.high]

In [None]:
def policy_nw(state_shape, action_shape, action_scale=2.0):
    """Build the Gaussian policy network.

    Args:
        state_shape: Number of features in one state vector.
        action_shape: Number of action dimensions.
        action_scale: Factor applied to the tanh output, so the mean lies in
            [-action_scale, action_scale]. Defaults to 2.0, the value that
            was previously hard-coded.

    Returns:
        A `tf.keras.Model` mapping a batch of states to `[mu, sigma]`, each
        with `action_shape` columns. `sigma` goes through softplus and is
        therefore non-negative.
    """
    inputs = Input(shape=(state_shape,))

    # Single hidden layer shared by both output heads.
    hidden = Dense(128, 'relu')(inputs)
    mu = action_scale * Dense(action_shape, 'tanh', name='mu')(hidden)
    sigma = Dense(action_shape, tf.keras.activations.softplus, name='sigma')(hidden)

    return tf.keras.Model(inputs=inputs, outputs=[mu, sigma])

In [None]:
def value_nw(state_shape):
    """Build the state-value network.

    Args:
        state_shape: Number of features in one state vector.

    Returns:
        A `tf.keras.models.Sequential` model mapping a batch of states to a
        single value per state (output shape `(batch, 1)`).
    """
    return tf.keras.models.Sequential([
        # Declaring the input makes the model build its weights immediately
        # and validates the feature count, matching how policy_nw is built.
        Input(shape=(state_shape,)),
        Dense(128, 'relu'),
        Dense(1),
    ])

In [None]:
# Two identically-shaped policy networks (the one being trained and the
# snapshot it is compared against) plus the state-value network.
pi, oldpi = (
    policy_nw(state_shape, action_shape),
    policy_nw(state_shape, action_shape),
)
v = value_nw(state_shape)

In [None]:
# Separate Adam optimizers for the policy and the value network; the value
# network uses the larger learning rate.
pi_optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
v_optimizer = tf.keras.optimizers.Adam(learning_rate=2e-3)

In [None]:
# Hyperparameters

# Rollout: number of episodes, steps per episode, and how many steps are
# collected between calls to the training function.
num_episodes = 2000
num_timesteps = 200
batch_size = 32

# Discount factor applied when accumulating returns.
gamma = 0.9

# Policy objective: clipping range for the probability ratio.
epsilon = 0.2

# KL penalty: `beta` is the initial penalty coefficient and `zeta` the
# threshold the mean KL is compared against when adapting it.
beta = 0.2
zeta = 0.3

In [None]:
def update_oldpi():
    """Overwrite each trainable variable of `oldpi` with the matching one from `pi`."""
    targets = oldpi.trainable_variables
    sources = pi.trainable_variables
    for target, source in zip(targets, sources):
        target.assign(source)

In [None]:
def policy(state):
    """Sample an action for a single state from the current policy `pi`.

    The state gets a leading batch axis, `pi` yields the mean and standard
    deviation of a Normal distribution, one sample is drawn and the result
    is clipped to `action_bound`.

    Returns:
        The clipped action as a NumPy array.
    """
    batched = tf.convert_to_tensor(state[np.newaxis, :])
    mean, std = pi(batched)
    # Index 0 drops the batch axis added above.
    action_dist = tfd.Normal(mean[0], std[0])
    sampled = tf.squeeze(action_dist.sample(1), axis=0).numpy()
    low, high = action_bound[0], action_bound[1]
    return np.clip(sampled, low, high)

In [None]:
def value(state):
    """Return the value network's estimate for `state` as a scalar.

    A state with fewer than two dimensions gets a leading batch axis first.
    Only the first row of the network output is returned.
    """
    batch = state if state.ndim >= 2 else state[np.newaxis, :]
    estimates = v(tf.convert_to_tensor(batch)).numpy()
    return estimates[0, 0]

In [None]:
#Train function
@tf.function
def train_networks(state, action, reward, _beta):
    """Run one PPO update of the policy and value networks on a batch.

    Args:
        state: Batch of states fed to `pi`, `oldpi` and `v`.
        action: Batch of actions whose probability is evaluated under both
            the current and the snapshot policy.
        reward: Regression target for the value network; the advantage is
            computed as `reward - v(state)`.
        _beta: Current coefficient of the KL penalty (Python float or
            scalar tensor).

    Returns:
        The KL coefficient after adaptation: doubled when the mean KL is
        above `1.5 * zeta`, halved when it is below `zeta / 1.5`, otherwise
        unchanged.
    """
    # Accept a Python float or a tensor and use one tensor type throughout.
    _beta = tf.cast(_beta, tf.float32)

    # Snapshot the current policy into oldpi before changing pi.
    update_oldpi()

    # Advantage is computed once, outside any tape, so it is a constant
    # with respect to the policy gradient.
    advantage_const = reward - v(state)

    # oldpi does not change during the update loop below, so its
    # distribution and action probabilities are evaluated a single time.
    mu_, sigma_ = oldpi(state)
    dist_ = tfd.Normal(mu_, sigma_)
    oldpi_prob = dist_.prob(action)

    # Several gradient steps per batch: on the first step pi equals oldpi
    # (ratio ~ 1), and repeated steps let the two policies move apart.
    for _ in range(10):
        with tf.GradientTape() as tape:
            # Probability of the taken actions under the current policy.
            mu, sigma = pi(state)
            dist = tfd.Normal(mu, sigma)
            pi_prob = dist.prob(action)

            # KL(pi || oldpi), used as the penalty term.
            # NOTE(review): the PPO paper's penalty is written as
            # KL(old || new) - confirm this direction is intended.
            kl_div = tfd.kl_divergence(dist, dist_)

            # Small constant in the denominator avoids division by zero.
            ratio = pi_prob / (oldpi_prob + 1e-5)

            # Clipped surrogate: BOTH terms are weighted by the advantage,
            # so the minimum compares like with like.
            unclipped = ratio * advantage_const
            clipped = tf.clip_by_value(ratio, 1 - epsilon, 1 + epsilon) * advantage_const
            surrogate = tf.minimum(unclipped, clipped)

            # Maximise surrogate minus KL penalty -> minimise the negative.
            pi_loss = -tf.reduce_mean(surrogate - _beta * kl_div)

        grads = tape.gradient(pi_loss, pi.trainable_variables)
        pi_optimizer.apply_gradients(zip(grads, pi.trainable_variables))

    # Adapt beta from the KL measured on the last policy step.
    mean_kl = tf.reduce_mean(kl_div)

    if mean_kl > 1.5 * zeta:
        _beta *= 2.0
    elif mean_kl < zeta/1.5:
        _beta *= 0.5

    # Train the value network by regressing v(state) onto `reward`.
    with tf.GradientTape() as tape:
        advantage = reward - v(state)
        v_loss = tf.reduce_mean(tf.square(advantage))

    grads = tape.gradient(v_loss, v.trainable_variables)
    v_optimizer.apply_gradients(zip(grads, v.trainable_variables))

    return _beta

In [None]:
# Main training loop: collect transitions with the current policy and call
# train_networks every `batch_size` steps and on the last step of an episode.
for episode in range(num_episodes):

    # Fresh environment state, empty transition buffers, zero return.
    state = env.reset()
    buf_states, buf_actions, buf_rewards = [], [], []
    episode_return = 0

    for step in range(num_timesteps):

        # Act in the environment.
        action = policy(state)
        next_state, reward, done, _ = env.step(action)

        # Record the transition.
        buf_states.append(state)
        buf_actions.append(action)
        buf_rewards.append(reward)

        episode_return += reward
        state = next_state

        # Keep collecting until a batch is full or the episode is over.
        batch_full = (step + 1) % batch_size == 0
        last_step = step == num_timesteps - 1
        if not (batch_full or last_step):
            continue

        # Discounted returns, accumulated backwards from the value
        # estimate of the state reached after the last buffered step.
        running = value(state)
        returns = []
        for r in reversed(buf_rewards):
            running = r + gamma * running
            returns.append(running)
        returns.reverse()

        # Stack the buffers into (batch, features) tensors for training.
        states_t = tf.convert_to_tensor(np.vstack(buf_states))
        actions_t = tf.convert_to_tensor(np.vstack(buf_actions))
        returns_t = tf.convert_to_tensor(
            np.array(returns, np.float32)[:, np.newaxis]
        )

        # Update the networks; the call returns the adapted KL coefficient.
        beta = train_networks(states_t, actions_t, returns_t, beta)

        # Start the next batch with empty buffers.
        buf_states, buf_actions, buf_rewards = [], [], []

    if episode % 10 == 0:
        print('Episode: {}, Return: {}'.format(episode, episode_return))
    