# MINITAUR environment using Actor-Critic with PPO

Here is the basic start-up and exploritory code. Just for reference

In [1]:
import gym
import tensorflow as tf
from tensorflow import keras
import keras.backend as K
import numpy as np
import pybullet_envs.bullet.minitaur_gym_env as e


env = e.MinitaurBulletEnv(render=True)
env.reset()

observation, reward, done, info = env.step(env.action_space.sample())

print(f"Observation: \t{observation}")
print(f"Obs.shape: \t{observation.shape}")
print(f"Reward: \t{reward}")
print(f"Action space:\t{env.action_space}")
n_actions = 8

current_dir=C:\Users\chris\mlenv\lib\site-packages\pybullet_envs\bullet
urdf_root=C:\Users\chris\mlenv\lib\site-packages\pybullet_data


  logger.warn(


Observation: 	[ 1.51588651e+00  1.48957588e+00  1.51342599e+00  1.48823333e+00
  1.38942716e+00  1.62405239e+00  1.33288718e+00  1.57031624e+00
  1.71664568e+00  5.01122927e-01  3.04659423e+00  1.62053719e+00
 -1.25211397e+01  1.80980647e+01 -2.43548866e+01  9.61388203e+00
  5.70000000e+00  1.50358762e+00  2.76749294e+00  4.64877048e-01
 -7.91250017e-01  3.43689070e+00 -3.02175092e+00  5.96187700e-01
 -1.31213525e-03  2.20969637e-03 -2.27776516e-04  9.99996672e-01]
Obs.shape: 	(28,)
Reward: 	-0.0021622727035699304
Action space:	Box([-1. -1. -1. -1. -1. -1. -1. -1.], [1. 1. 1. 1. 1. 1. 1. 1.], (8,), float32)


## Parameters

Definition of the parameters of the model. 

In [2]:
gamma = 0.99
lambda_ = 0.95
clipping_value = 0.2
entropy_beta = 0.001
critic_discount = 0.5

## Developing loss functions

Creation of Log-Probability, Generalise Advantage Estimation and PPO loss function for the actor and critic

In [3]:
def calc_log_prob(mu_vals, sigma_vals, action_vals):
    sigma_vals = K.clip(sigma_vals, 1e-3, 1e20)
    p1 = - (action_vals - mu_vals)**2/(2*sigma_vals**2)
    p2 = - K.log(2*np.pi*sigma_vals**2)

    return p1 + p2


def calc_gae(values, masks, rewards, gamma, lmbda):
    returns = []
    gae = 0

    for index, value, mask, reward in reversed([[i, *x] for i,x in enumerate(zip(values, masks, rewards))]):
        delta = reward + gamma * values[index + 1] * mask - value
        gae = delta + gamma*lmbda*mask*gae
        returns.insert(0, gae + values[i])

    returns = np.array(returns)
    adv = returns - values[:-1]
    return returns, (adv - np.mean(adv))/(np.std(adv) + 1e-10)


def get_ppo_loss(log_prob_new, log_prob_old, advantages, discounted_rewards, values, sigma_vals):
    def loss(y_true, y_pred):
        # with open("ypred.txt", "a") as f:
        #     f.write(f"{y_pred}, {y_pred[0]}, {y_pred[1]}")
        # sigma_vals = y_pred[1]
        ratio = K.exp(log_prob_new - log_prob_old)

        p1 = ratio*advantages
        p2 = K.clip(ratio, 1-clipping_value, 1+clipping_value)*advantages

        actor_loss = -K.mean(K.min((p1, p2), axis=0), axis=0)
        critic_loss = K.mean(K.pow(discounted_rewards - values[:-1,:,:], 2), axis=0)

        entropy = K.mean(K.sqrt(2*np.pi*np.e*K.pow(sigma_vals, 2)), axis=0)

        total_loss = critic_discount*critic_loss + actor_loss - entropy*entropy_beta*K.mean(K.exp(log_prob_new)*log_prob_old, axis=0)
        
        

        with open("log.txt", "a") as f:
            f.write("\n".join([str(x) for x in [sigma_vals, ratio, p1, p2, actor_loss, critic_loss, entropy, total_loss]]))
            f.write("\n")
        return total_loss

    return loss

def get_ppo_loss_2(y_true, y_pred):
    # y_pred could be sigma or mu so isn't explicitly used
    # y_true contains all the information required
    
    # print(f"y_pred:\t{y_pred.shape}")
    # print(f"y_true:\t{y_true.shape}")

    # log_prob_new = y_true[0]
    # log_prob_old = y_true[1]
    # advantages = y_true[2]
    # discounted_rewards = y_true[3]
    # values = y_true[4]
    # sigma_vals = y_true[5]

    log_prob_new = y_true[:,0:n_actions]
    log_prob_old = y_true[:,n_actions: 2*n_actions]
    sigma_vals = y_true[:,n_actions*2: n_actions*3]
    advantages = y_true[:,n_actions*3: n_actions*3+1]
    discounted_rewards = y_true[:,n_actions*3 + 1: n_actions*3+2]
    values = y_true[:,n_actions*3 + 2: n_actions*3+3]
    
    # print("\nLoss shape review:")
    # print(f"\tLog prob new:\t{log_prob_new.shape}")
    # print(f"\tLog prob old:\t{log_prob_old.shape}")
    # print(f"\tAdvantages:\t{advantages.shape}")
    # print(f"\tDiscounted rewards:\t{discounted_rewards.shape}")
    # print(f"\tValues:\t{values.shape}")
    # print(f"\tSigma values:\t{sigma_vals.shape}")

    ratio = K.exp(log_prob_new - log_prob_old)

    p1 = ratio*advantages
    p2 = K.clip(ratio, 1-clipping_value, 1+clipping_value)*advantages

    actor_loss = -K.mean(K.min((p1, p2), axis=0), axis=0)
    critic_loss = K.mean(K.pow(discounted_rewards - values, 2), axis=0)

    entropy = K.sum(K.sqrt(2*np.pi*np.e*K.pow(sigma_vals, 2)), axis=0)

    total_loss = critic_discount*critic_loss + actor_loss - entropy*entropy_beta*K.mean(K.exp(log_prob_new)*log_prob_old, axis=0)
    
    # print(f"\tTotal loss:\t{total_loss.shape}")

    return total_loss



In [4]:
import sys
def get_ppo_loss_mu(y_true, y_pred):

    mu_vals = y_pred

    action_vals = y_true[:,0:n_actions]
    # mu_vals = y_true[:,n_actions: 2*n_actions]
    sigma_vals = y_true[:,n_actions*2: n_actions*3]
    log_prob_old = y_true[:,n_actions*3: n_actions*4]
    advantages = y_true[:,n_actions*4: n_actions*4+1]
    discounted_rewards = y_true[:,n_actions*4 + 1: n_actions*4+2]
    values = y_true[:,n_actions*4 + 2: n_actions*4+3]

    log_prob_new = calc_log_prob(mu_vals, sigma_vals, action_vals)

    ratio = K.exp(log_prob_new - log_prob_old)

    p1 = ratio*advantages
    p2 = K.clip(ratio, 1-clipping_value, 1+clipping_value)*advantages

    actor_loss = -K.mean(K.min((p1, p2), axis=0), axis=0)
    critic_loss = K.mean(K.pow(discounted_rewards - values, 2), axis=0)

    entropy = K.sum(K.sqrt(2*np.pi*np.e*K.pow(sigma_vals, 2)), axis=0)
 
    total_loss = critic_discount*critic_loss + actor_loss - entropy*entropy_beta*K.mean(K.exp(log_prob_new)*log_prob_old, axis=0)

    tf.print(action_vals, output_stream=sys.stdout)
    tf.print(mu_vals, output_stream=sys.stdout)
    tf.print(sigma_vals, output_stream=sys.stdout)
    tf.print(log_prob_old, output_stream=sys.stdout)
    tf.print(advantages, output_stream=sys.stdout)
    tf.print(discounted_rewards, output_stream=sys.stdout)
    tf.print(values, output_stream=sys.stdout)
    tf.print(log_prob_new, output_stream=sys.stdout)
    tf.print(ratio, output_stream=sys.stdout)
    tf.print(p1, output_stream=sys.stdout)
    tf.print(p2, output_stream=sys.stdout)
    tf.print(actor_loss, output_stream=sys.stdout)
    tf.print(critic_loss, output_stream=sys.stdout)
    tf.print(entropy, output_stream=sys.stdout)
    tf.print(total_loss, output_stream=sys.stdout)

    return total_loss


def get_ppo_loss_sigma(y_true, y_pred):
    sigma_vals = y_pred

    action_vals = y_true[:,0:n_actions]
    mu_vals = y_true[:,n_actions: 2*n_actions]
    # sigma_vals = y_true[:,n_actions*2: n_actions*3]
    log_prob_old = y_true[:,n_actions*3: n_actions*4]
    advantages = y_true[:,n_actions*4: n_actions*4+1]
    discounted_rewards = y_true[:,n_actions*4 + 1: n_actions*4+2]
    values = y_true[:,n_actions*4 + 2: n_actions*4+3]

    log_prob_new = calc_log_prob(mu_vals, sigma_vals, action_vals)

    ratio = K.exp(log_prob_new - log_prob_old)

    p1 = ratio*advantages
    p2 = K.clip(ratio, 1-clipping_value, 1+clipping_value)*advantages

    actor_loss = -K.mean(K.min((p1, p2), axis=0), axis=0)
    critic_loss = K.mean(K.pow(discounted_rewards - values, 2), axis=0)

    entropy = K.sum(K.sqrt(2*np.pi*np.e*K.pow(sigma_vals, 2)), axis=0)

    total_loss = critic_discount*critic_loss + actor_loss - entropy*entropy_beta*K.mean(K.exp(log_prob_new)*log_prob_old, axis=0)

    return total_loss

## Creating the Actor and Critic Networks

Note the actor network is outputing means and std deviations rather that probabilities like the discrete case, hence the 2 heads.


In [5]:
class Actor(keras.Model):


    def __init__(self, dims, outputs, dropout=0.3) -> None:
        super().__init__()

        # self.input_layer = keras.layers.Input(shape=input_dims)
        self.fc1 = keras.layers.Dense(dims, activation="relu")
        self.fc2 = keras.layers.Dense(dims, activation="relu")
        self.fc3 = keras.layers.Dense(dims, activation="relu")
        # Set of mu's and sigma's for gaussian distribution
        self.output_layer_mu = keras.layers.Dense(outputs, activation="tanh")
        self.output_layer_sigma = keras.layers.Dense(outputs, activation="relu")
        self.dropout=dropout

    def call(self, inputs, training=False):
        # x = self.input_layer(inputs)
        x = self.fc1(inputs)
        x = keras.layers.Dropout(self.dropout)(x)
        x = self.fc2(x)
        x = keras.layers.Dropout(self.dropout)(x)
        x = self.fc3(x)
        x = keras.layers.Dropout(self.dropout)(x)
        return self.output_layer_mu(x), self.output_layer_sigma(x)



class Critic(keras.Model):


    def __init__(self, dims, dropout=0.3) -> None:
        super().__init__()

        # self.input_layer = keras.layers.Input(shape=input_dims)
        self.fc1 = keras.layers.Dense(dims, activation="relu")
        self.fc2 = keras.layers.Dense(dims, activation="relu")
        self.fc3 = keras.layers.Dense(dims, activation="relu")
        self.output_layer = keras.layers.Dense(1, activation=None)
        self.dropout=dropout

    def call(self, inputs, training=False):
        # x = self.input_layer(inputs)
        x = self.fc1(inputs)
        x = keras.layers.Dropout(self.dropout)(x)
        x = self.fc2(x)
        x = keras.layers.Dropout(self.dropout)(x)
        x = self.fc3(x)
        x = keras.layers.Dropout(self.dropout)(x)
        return self.output_layer(x)

## Creating the training loops


In [6]:
total_episodes = 500
episode_count = 0
ppo_steps = 64

epochs = 8
batch_size = 32

actor_dims = 128
critic_dims = 128

actor = Actor(actor_dims, n_actions)
actor.compile(optimizer=keras.optimizers.Adam(learning_rate=1e-4), loss="mse")

# Note loss doesnt have any effect here as it will never be trained
actor_old = Actor(actor_dims, n_actions)
actor_old.compile(optimizer=keras.optimizers.Adam(learning_rate=1e-4), loss="mse")

critic = Critic(critic_dims)
critic.compile(optimizer=keras.optimizers.Adam(learning_rate=1e-4), loss="mse")

In [7]:
while episode_count < total_episodes:
    observation = env.reset()

    # Gather information for PPO
    states, actions_record, values, rewards, masks, action_probability, action_probability_old, sigma_values_stored, mu_vals_stored = [[] for x in range(9)]
    state = observation.reshape((1, 28))
    for i in range(ppo_steps):

        # Get distribution parameters from state
        # print(actor.predict(state, steps=1))
        mu_vals, sigma_vals = [x[0] for x in actor.predict(state, steps=1)]
        mu_vals_old, sigma_vals_old = [x[0] for x in actor_old.predict(state, steps=1)]

        # with open("action.txt", "a") as f:
        #     f.write(f"{i}, {mu_vals}, {sigma_vals}, ")
        # Predict value of state
        q_value = critic.predict(state, steps=1)

        # Sample distributions to get actions and clip to required range
        # Action is only based on the new policy distributions
        actions = [np.random.normal(mu, sigma) for mu, sigma in zip(mu_vals, sigma_vals)]
        actions = np.clip(actions, -1, 1)
        action_probs = [calc_log_prob(mu, sigma, action) for mu, sigma, action in zip(mu_vals, sigma_vals, actions)]
        action_probs_old = [calc_log_prob(mu, sigma, action) for mu, sigma, action in zip(mu_vals_old, sigma_vals_old, actions)]

        # Take actions and observe reward
        # with open("action.txt", "a") as f:
        #     print(actions)
        #     f.write(f"{actions}\n")
        observation, reward, done, info = env.step(actions)

        # Note if the episode is done or not
        mask = not done

        states.append(state)
        actions_record.append(actions)
        values.append(q_value)
        masks.append(mask)
        rewards.append(reward)
        action_probability.append(action_probs)
        action_probability_old.append(action_probs_old)
        sigma_values_stored.append(sigma_vals)
        mu_vals_stored.append(mu_vals)

        state = observation.reshape((1, 28))
        if done: 
            env.reset()
            episode_count += 1

    values.append(critic.predict(state, steps=1))

    # Train model

    states = np.array(states)
    rewards = np.array(rewards)
    values = np.array(values)
    action_probability = np.array(action_probability)
    action_probability_old = np.array(action_probability_old)
    sigma_values_stored = np.array(sigma_values_stored)

    actions_record = np.array(actions_record)
    mu_vals_stored = np.array(mu_vals_stored)
    


    discounted_returns, advantages = calc_gae(values, masks, rewards, gamma, lambda_)

    # Save the weights to the current weights
    
    weights = actor.get_weights()
    actor_old.set_weights(weights)

    actor = Actor(actor_dims, n_actions)
    # Consider SDG for optimiser for both due to small size
    actor.compile(optimizer=keras.optimizers.Adam(learning_rate=1e-4), loss=[get_ppo_loss_mu, get_ppo_loss_sigma])

    # Throw-away prediction to build
    actor.predict(state, steps=1)

    actor.set_weights(weights)

    advantages = np.squeeze(advantages, axis=2)
    discounted_returns = np.squeeze(discounted_returns, axis=2)
    values = np.squeeze(values, axis=2)


    concated_values = np.concatenate((
        actions_record, 
        mu_vals_stored,
        sigma_values_stored, 
        action_probability_old, 
        advantages, 
        discounted_returns, 
        values[:-1,:]
    ), axis=1)
    


    actor.fit(states, concated_values, epochs=epochs, batch_size=batch_size, verbose=1)
    critic.fit(states, rewards, epochs=epochs, batch_size=batch_size, verbose=0)


Epoch 1/8
[[-1 0.973055184 0.881126165 ... 1 -0.744962335 -1]
 [-1 0.569696844 0.685214818 ... 0.771984398 0.887431324 -1]
 [-1 0.974700868 0.382328838 ... 0.59104687 0.818250537 -1]
 ...
 [-0.983664274 0.999141037 0.0301823672 ... 1 0.268858612 -0.981758595]
 [-1 0.999839485 0.236202031 ... 0.996402502 -0.321993262 -1]
 [-1 0.967130899 0.867488 ... 0.929026484 -0.461301863 0.738431215]]
[[[0.33264029 0.998206 -0.978264749 ... 0.999866843 0.841936529 -0.999999464]]

 [[0.700737119 0.992080629 0.0922523364 ... 0.790472686 0.988018513 -0.999998689]]

 [[-0.770729184 -0.760421 0.955170512 ... 0.913155 0.700655043 -0.765928566]]

 ...

 [[-0.22342217 0.999871 -0.996751606 ... 0.984862685 -0.998660803 -0.999998033]]

 [[-0.998745441 0.999999702 1 ... 1 -0.99477762 -0.999910414]]

 [[0.548567355 -0.621183634 0.999971271 ... 0.973299 -0.883749247 0.0897901207]]]
[[4.30115318 0 0 ... 0.340997458 0 0.371017575]
 [1.83169723 0 0 ... 0 0 0.74191606]
 [2.15891123 0 0.126868337 ... 0 0 0.234225094]

ValueError: 0th action nan out of bounds.

# Testing Area
Here is where any testing code should be put. Its unlikely that this would ever get run outside of quick tests

In [None]:
import numpy as np

a = np.array([
    [1,2,3],
    [4,5,6],
    [7,8,9],
    [10,11,12],
    [13,14,15]
])

b = np.array([
    [11,22,0.33],
    [44,55,66],
    [0.77,0.88,0.99],
    [0.10,0.11,0.12],
    [131,0.141,151]
])

print(np.concatenate((a, b), axis=1))

In [None]:
a_ = tf.constant(a)
b_ = tf.constant(b)

tf.tile(a_, 2)