# PPO take 3241
Fingers crossed this works. Basically gonna be a load of set up code followed by networks, losses and training

In [1]:
import gym
import tensorflow as tf
from tensorflow import keras
import keras.backend as K
import numpy as np
import pybullet_envs.bullet.minitaur_gym_env as e


env = e.MinitaurBulletEnv(render=True)
env.reset()

observation, reward, done, info = env.step(env.action_space.sample())

print(f"Observation: \t{observation}")
print(f"Obs.shape: \t{observation.shape}")
print(f"Reward: \t{reward}")
print(f"Action space:\t{env.action_space}")
n_actions = 8

current_dir=C:\Users\chris\mlenv\lib\site-packages\pybullet_envs\bullet
urdf_root=C:\Users\chris\mlenv\lib\site-packages\pybullet_data


  logger.warn(


Observation: 	[ 1.54552142e+00  1.55034906e+00  1.36517043e+00  1.54409485e+00
  1.56431791e+00  1.54432572e+00  1.47025291e+00  1.46639854e+00
  4.80949978e+00  6.23190164e+00 -1.66361467e+01  8.80589769e+00
  7.41079391e+00  3.82691848e+00 -3.29042537e+00 -3.59800362e+00
  4.72611420e+00  3.68064658e+00 -9.32592172e-01  6.66243912e-01
  5.45497898e+00  5.70000000e+00 -2.93578618e-01  1.93627613e-01
 -1.36691707e-03 -6.02722887e-03 -2.55227516e-04  9.99980869e-01]
Obs.shape: 	(28,)
Reward: 	-0.0014916285219211476
Action space:	Box([-1. -1. -1. -1. -1. -1. -1. -1.], [1. 1. 1. 1. 1. 1. 1. 1.], (8,), float32)


## Parameters

In [2]:
gamma = 0.99
lambda_ = 0.95
clipping_value = 0.2
entropy_beta = 0.001

## Actor and Critic Networks

In [3]:
class Actor(keras.Model):


    def __init__(self, dims, outputs, dropout=0.3) -> None:
        super().__init__()
        self.fc1 = keras.layers.Dense(dims, activation="relu")
        self.fc2 = keras.layers.Dense(dims, activation="relu")
        self.fc3 = keras.layers.Dense(dims, activation="relu")
        # Set of mu's and sigma's for gaussian distribution
        self.output_layer_mu = keras.layers.Dense(outputs, activation="tanh", name="mu_output")
        self.output_layer_sigma = keras.layers.Dense(outputs, activation="softplus", name="sigma_output")
        self.dropout=dropout

    def call(self, inputs, training=False):
        x = self.fc1(inputs)
        x = keras.layers.Dropout(self.dropout)(x)
        x = self.fc2(x)
        x = keras.layers.Dropout(self.dropout)(x)
        x = self.fc3(x)
        x = keras.layers.Dropout(self.dropout)(x)
        return self.output_layer_mu(x), self.output_layer_sigma(x)



class Critic(keras.Model):


    def __init__(self, dims, dropout=0.3) -> None:
        super().__init__()
        self.fc1 = keras.layers.Dense(dims, activation="relu")
        self.fc2 = keras.layers.Dense(dims, activation="relu")
        self.fc3 = keras.layers.Dense(dims, activation="relu")
        self.output_layer = keras.layers.Dense(1, activation=None)
        self.dropout=dropout

    def call(self, inputs, training=False):
        x = self.fc1(inputs)
        x = keras.layers.Dropout(self.dropout)(x)
        x = self.fc2(x)
        x = keras.layers.Dropout(self.dropout)(x)
        x = self.fc3(x)
        x = keras.layers.Dropout(self.dropout)(x)
        return self.output_layer(x)


## Losses 

In [4]:
def calc_log_prob(mu_vals, sigma_vals, action_vals):
    sigma_vals = K.clip(sigma_vals, 1e-3, 1e20)
    p1 = - (action_vals - mu_vals)**2/(2*sigma_vals**2)
    p2 = - K.log(2*np.pi*sigma_vals**2)

    return p1 + p2


def calc_gae(values, masks, rewards, gamma, lmbda):
    returns = []
    gae = 0

    for index, value, mask, reward in reversed([[i, *x] for i,x in enumerate(zip(values, masks, rewards))]):
        delta = reward + gamma * values[index + 1] * mask - value
        gae = delta + gamma*lmbda*mask*gae
        returns.insert(0, gae + value)

    returns = np.array(returns)
    adv = returns - values[:-1]
    return returns, (adv - np.mean(adv))/(np.std(adv) + 1e-6)

### PPO loss calculations

In [5]:
def get_ppo_loss_mu(y_true, y_pred):
    mu_vals = y_pred

    actions = y_true[:,0:n_actions]
    # mu_vals = y_true[:,n_actions: 2*n_actions]
    sigma_vals = y_true[:,2*n_actions: 3*n_actions]
    log_prob_old = y_true[:,3*n_actions: 4*n_actions]
    advantages = y_true[:,4*n_actions: 4*n_actions+1]

    log_prob_new = calc_log_prob(mu_vals, sigma_vals, actions)

    ratio = K.exp(log_prob_new - log_prob_old)
    p1 = ratio * advantages
    p2 = K.clip(ratio, 1-clipping_value, 1+clipping_value)*advantages

    actor_loss = K.mean(K.min((p1, p2), axis=0), axis=0)
    
    return -actor_loss


def get_ppo_loss_sigma(y_true, y_pred):
    sigma_vals = y_pred

    actions = y_true[:,0:n_actions]
    mu_vals = y_true[:,n_actions: 2*n_actions]
    # sigma_vals = y_true[:,2*n_actions: 3*n_actions]
    log_prob_old = y_true[:,3*n_actions: 4*n_actions]
    advantages = y_true[:,4*n_actions: 4*n_actions+1]

    log_prob_new = calc_log_prob(mu_vals, sigma_vals, actions)

    ratio = K.exp(log_prob_new - log_prob_old)
    p1 = ratio * advantages
    p2 = K.clip(ratio, 1-clipping_value, 1+clipping_value)*advantages

    actor_loss = K.mean(K.min((p1, p2), axis=0), axis=0)

    #entropy = K.sum(K.sqrt(2*np.pi*np.e*K.pow(sigma_vals, 2)), axis=0)

    return -actor_loss# + entropy*entropy_beta*K.mean(K.exp(log_prob_new)*log_prob_old, axis=0)



In [6]:
def test_loss(actions, mu_vals, sigma_vals, log_prob_old, advantages):
    log_prob_new = calc_log_prob(mu_vals, sigma_vals, actions)
    print("Log prob new")
    print(log_prob_new)
    print("Log prob old")
    print(log_prob_old)
    ratio = K.exp(log_prob_new - log_prob_old)
    print("Ratio")
    print(ratio)
    p1 = ratio * advantages
    # print(p1)
    p2 = K.clip(ratio, 1-clipping_value, 1+clipping_value)*advantages
    # print(p2)
    actor_loss = K.mean(K.min((p1, p2), axis=0), axis=0)
    # print(actor_loss)
    return -actor_loss

## Running Sim
### Setting up

In [7]:
total_episodes = 500
episode_count = 0
ppo_steps = 64

sigma_start = 0.8
sigma_end = 0.05

epochs = 8
curr_epoch = 0
batch_size = 32

actor_dims = 128
critic_dims = 128


import datetime
# Load the TensorBoard notebook extension
%reload_ext tensorboard
log_dir_actor = "logs/fit/actor/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir_critic = "logs/fit/critic" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback_actor = tf.keras.callbacks.TensorBoard(log_dir=log_dir_actor, histogram_freq=1)
tensorboard_callback_critic = tf.keras.callbacks.TensorBoard(log_dir=log_dir_critic, histogram_freq=1)

file_writer = tf.summary.create_file_writer(log_dir_actor)


actor = Actor(actor_dims, n_actions)
actor.compile(optimizer=keras.optimizers.Adam(learning_rate=1e-4), 
              loss={"output_1":get_ppo_loss_mu, "output_2":get_ppo_loss_sigma})

# Note loss doesnt have any effect here as it will never be trained
actor_old = actor#Actor(actor_dims, n_actions)
#actor_old.compile(optimizer=keras.optimizers.Adam(learning_rate=1e-4), loss="mse")

critic = Critic(critic_dims)
critic.compile(optimizer=keras.optimizers.Adam(learning_rate=1e-4), loss="mse")

### Running

In [8]:
state = env.reset().reshape((1, 28))

while episode_count < total_episodes:

    states_trace = []
    actions_trace = []
    values_trace = []
    masks = []
    rewards_trace = []
    log_prob_old_trace = []
    mu_trace = []
    sigma_trace = []
    sigma_trace2 = []


    for _ in range(ppo_steps):
        
        mu_vals, sigma_vals = [x[0] for x in actor.predict(state, steps=1)]
        mu_vals_old, sigma_vals_old = [x[0] for x in actor_old.predict(state, steps=1)]
        
        # sigma_trace2.append(sigma_vals)
        # sigma_vals2 = np.array([((sigma_start-sigma_end)/total_episodes)*episode_count+sigma_start for _ in range(n_actions)])
        
        actions = [np.clip(np.random.normal(mu, sigma), -1, 1) for mu, sigma in zip(mu_vals, sigma_vals)]
        log_prob_old = calc_log_prob(mu_vals_old, sigma_vals_old, actions)


        value = critic.predict(state, steps=1)

        observation, reward, done, info = env.step(actions)
        state = observation.reshape((1, 28))

        states_trace.append(state)
        actions_trace.append(actions)
        values_trace.append(value)
        masks.append(not done)
        rewards_trace.append(reward)
        log_prob_old_trace.append(log_prob_old)
        mu_trace.append(mu_vals)
        sigma_trace.append(sigma_vals)

        if done:
            state = env.reset().reshape((1, 28))
            episode_count += 1
            print(f"Episode Count: {episode_count}")
    
    values_trace.append(critic.predict(state, steps=1))

    states_trace = np.array(states_trace)
    actions_trace = np.array(actions_trace)
    values_trace = np.array(values_trace)
    masks = np.array(masks)
    rewards_trace = np.array(rewards_trace)
    log_prob_old_trace = np.array(log_prob_old_trace)
    mu_trace = np.array(mu_trace)
    # sigma_trace = np.array(sigma_trace)
    sigma_trace = np.array(sigma_trace, dtype=np.float64)

    discounted_rewards, advantages = calc_gae(
        values_trace, 
        masks, 
        rewards_trace, 
        gamma, 
        lambda_
    )

    weights = actor.get_weights()
    actor_old.set_weights(weights)


    advantages = np.squeeze(advantages, axis=2)
    # discounted_rewards = np.squeeze(discounted_rewards, axis=2)
    values_trace = np.squeeze(values_trace, axis=2)

    concated_values = np.concatenate((
        actions_trace,
        mu_trace,
        sigma_trace,
        log_prob_old_trace,
        advantages,
        # discounted_rewards,
        # values_trace[:-1],
    ), axis=1)


    # Log all the things
    with file_writer.as_default():
        tf.summary.scalar("Avg Reward", np.mean(rewards_trace), step=curr_epoch)
        tf.summary.scalar("Sigma", np.mean(np.mean(sigma_trace)), step=curr_epoch)


    

    actor.fit(states_trace, 
              concated_values, 
              initial_epoch=curr_epoch,
              epochs=curr_epoch + epochs, 
              batch_size=batch_size, 
              verbose=1, 
              callbacks=[tensorboard_callback_actor]
              )
              
    critic.fit(states_trace, 
               discounted_rewards, 
               initial_epoch=curr_epoch,
               epochs=curr_epoch + epochs, 
               batch_size=batch_size, 
               verbose=0, 
               callbacks=[tensorboard_callback_critic]
               )

    curr_epoch += epochs

    actor.save("actor", save_format="tf")
    critic.save("critic", save_format="tf")




Episode Count: 1
Log prob new
tf.Tensor(
[[-1.63576925e+00 -1.28979376e+00 -7.55353822e-01 -1.38785237e+00
  -8.92077730e-01 -1.63182408e+00 -2.15029342e+00 -1.02085603e+00]
 [-4.19374213e+00 -2.55939832e+00  9.56604995e-01 -2.10170751e+00
  -2.25027467e+00 -5.23159412e-01 -2.26802279e+00 -7.51706812e-01]
 [-2.44889372e+00 -2.89371584e+00  2.45344760e+00 -2.44746139e+00
  -2.19054178e+00 -1.94355308e+00 -2.40122890e+00 -4.35700211e-01]
 [-4.50455212e+00 -3.73521950e+00  5.08423995e+00 -4.31385344e+00
  -2.28650984e-01 -2.87707324e+00 -9.75794657e-01 -3.44186778e+00]
 [-3.54380296e+00 -4.96816582e+00  5.17421186e+00 -6.80759265e-01
  -2.05546230e+00 -4.00675184e+00 -2.49077723e+00 -1.18449575e+00]
 [-4.02863610e+00 -2.88590103e+00  5.54792854e+00 -2.55215347e+00
   1.88438694e+00 -3.01843233e+00 -1.38100766e+00 -2.24410881e+00]
 [-3.67664241e+00 -1.37777870e+00 -9.56015286e-01 -1.53443455e+00
   1.27056362e-01 -2.89431418e+00 -2.08249109e+00 -1.72723079e-01]
 [-3.66617261e+00 -3.3655935