# Graviatar-PPO-RND implementation

PPO-RND test in the Gravitar environment

In [1]:
import gym
from gym.wrappers import AtariPreprocessing
import numpy as np
import tensorflow as tf 
from tensorflow import keras
from collections import deque
import random
from matplotlib import pyplot as plt
from threading import Thread
import math

## Memory

Class used to memorize the trajectory and calculate the advntage

In [2]:
class Memory(object):
    
    STATE = 0
    ACTION = 1
    ACTION_PROB = 2
    EXTRINSIC_REWARD = 3
    INTRINSIC_REWARD = 4
    DONE = 5
    
    def __init__(self, n_trajectories, gamma = 0.4):
        self.trajectories = np.empty(n_trajectories, dtype=object)
        self.gamma = gamma
              
    def collect(self, state, action, action_prob, extrinsic_reward, intrinsic_reward, done, i_episode):
        if (self.trajectories[i_episode] == None):
            self.trajectories[i_episode] = deque(maxlen=N_STEPS)
        self.trajectories[i_episode].append((state, action, action_prob, extrinsic_reward, intrinsic_reward, done))
        
    def calculate_advantages(self, reward_standard_deviation_estimate):
        self.advantages = []
        self.extrinsic_TDerrors = []
        self.intrinsic_TDerrors = [] #list of all the delta, used to uopdate the critic
        
        for trajectory in self.trajectories:
            
            advantage_trajectory = [] #list of advantages for each element in a single trajectory
            e_delta = []
            i_delta = []
            
            e_old_advantage = trajectory[-1][self.EXTRINSIC_REWARD]
            e_delta.append(e_old_advantage)
            
            #normalizing the intrinisc reward before calculating the TDError
            i_old_advantage = np.clip(trajectory[-1][self.INTRINSIC_REWARD] / reward_standard_deviation_estimate, a_min =-5, a_max = 5)
            i_delta.append(i_old_advantage)
            
            advantage_trajectory.append(e_old_advantage + i_old_advantage)

            for i in range(len(trajectory)-2,-1,-1):
                e_delta.append(trajectory[i][self.EXTRINSIC_REWARD] + self.gamma*ppo.return_v_extrinsic_values(trajectory[i+1][self.STATE]) - ppo.return_v_extrinsic_values(trajectory[i][self.STATE]))
                new_advantage = e_delta[-1] + self.gamma*e_old_advantage  
                
                e_old_advantage = new_advantage
                
                normalized_intrinsic_reward = np.clip(trajectory[-1][self.INTRINSIC_REWARD] / reward_standard_deviation_estimate, a_min =-5, a_max = 5)
                i_delta.append(normalized_intrinsic_reward + self.gamma*ppo.return_v_intrinsic_values(trajectory[i+1][self.STATE]) - ppo.return_v_intrinsic_values(trajectory[i][self.STATE]))
                new_advantage = i_delta[-1] + self.gamma*i_old_advantage 
                
                i_old_advantage = new_advantage
                
                advantage_trajectory.append(i_old_advantage[0] + e_old_advantage[0])  
               
            #reverse the list (at pos 0 there is the last advantage/delta)
            e_delta = list(reversed(e_delta))
            i_delta = list(reversed(i_delta))
        
            self.extrinsic_TDerrors.append(e_delta)
            self.intrinsic_TDerrors.append(i_delta)
            
            self.advantages.append(list(reversed(advantage_trajectory)))
            
        #flat all trajectories in a single deque adding the advantages (easier to sample random batches)
        self.flat_trajectories(self.trajectories, self.advantages, self.extrinsic_TDerrors, self.intrinsic_TDerrors)
    
    def flat_trajectories(self, trajectories, advantages, extrinsic_TDerrors, intrinsic_TDerrors):
        
        size = 0
        for trajectory in trajectories:
            size = size + len(trajectory)
        
        self.flatten_trajectories = deque(maxlen=size)
        
        for trajectory, advantage, e_delta, i_delta in zip(trajectories, advantages, extrinsic_TDerrors, intrinsic_TDerrors):
            for i in range(len(trajectory)):
                self.flatten_trajectories.append((trajectory[i][self.STATE], trajectory[i][self.ACTION], trajectory[i][self.ACTION_PROB], trajectory[i][self.EXTRINSIC_REWARD], trajectory[i][self.INTRINSIC_REWARD], advantage[i], e_delta[i], i_delta[i], trajectory[i][self.DONE]))
        
        
    #pick a random batch example from the flatten list of trajectories
    def sample_experiences(self, batch_size):
        if (len(self.flatten_trajectories) >= batch_size):
            indices = np.random.permutation(len(self.flatten_trajectories))[:batch_size]
        else:
            indices = np.random.permutation(len(self.flatten_trajectories))
        batch = [self.flatten_trajectories[index] for index in indices]
        #delete form the memory the used obervations
        for index in sorted(indices, reverse=True):
            del self.flatten_trajectories[index]
        states, actions, actions_prob, e_rewards, i_rewards, advantages, e_TDerrors, i_TDerrors, dones = [np.array([experience[field_index] for experience in batch]) for field_index in range(9)]
        return states, actions, actions_prob, e_rewards, i_rewards, advantages, e_TDerrors, i_TDerrors, dones
        
    def reset(self):
        for trajectory in self.trajectories:
            trajectory.clear()

# RND class

$ s_{t+1} $ is the observed state after the current action $ a_t $ 

Predictor update formula:
$ \theta_{t+1} = \theta_t + \alpha\nabla min(r_t(\theta)\hat{A}_t, clip(r_t(\theta),1-\epsilon,1+\epsilon)\hat{A}_t)$

In [3]:
class RND(object):
    
    input_shape = [84,84,4] 
    n_outputs = 10
    
    N_intrinsic_rewards = 0 #number of intrinsic reward received
    intrinisc_reward_mean = 0.0 #mean of the intrinsic rewards received
    M2 = 0.0 #sum of squares of differences from the current mean
    
    def __init__(self, env, n_normalization_steps):
        self.target = self.create_target()
        self.predictor = self.create_predictor()
        
        self.MSE = tf.keras.losses.mean_squared_error
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
        
        self.initialize_standard_deviation_estimate(env, n_normalization_steps)
        
    #create the NN of the target
    def create_target(self):
        target = keras.Sequential([
            keras.layers.Conv2D(filters=32, kernel_size = (8,8), strides=4, activation="relu", input_shape = self.input_shape),
            keras.layers.Conv2D(filters=64, kernel_size = (4,4), strides=2, activation="relu"),
            keras.layers.Conv2D(filters=64, kernel_size = (3,3), strides=1, activation="relu"),
            keras.layers.Dense(512),
            tf.keras.layers.Flatten(),
            keras.layers.Dense(self.n_outputs) ])
        return target
        
    #create the NN of the predictor
    def create_predictor(self):
        predictor = keras.Sequential([
            keras.layers.Conv2D(filters=32, kernel_size = (8,8), strides=4, activation="relu", input_shape = self.input_shape),
            keras.layers.Conv2D(filters=64, kernel_size = (4,4), strides=2, activation="relu"),
            keras.layers.Conv2D(filters=64, kernel_size = (3,3), strides=1, activation="relu"),
            keras.layers.Dense(512),
            tf.keras.layers.Flatten(),
            keras.layers.Dense(self.n_outputs) ])
        return predictor
    
    def train_predictor(self, observations):
        # extrinsic critic (rewards from the envirnoment)
        target_values = self.target.predict(observations)#tf.reshape(tf.convert_to_tensor(np.asarray(self.alpha*extrinsic_TDerrors).astype('float32')), (len(states), 1))

        with tf.GradientTape() as tape:
            all_values = self.predictor(observations)
            #v_values = tf.reduce_sum(all_values, axis=1, keepdims=True)
            loss = tf.reduce_mean(self.MSE(target_values, all_values))
        grads = tape.gradient(loss, self.predictor.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.predictor.trainable_variables))
        
    def calculate_intrinsic_reward(self, observation):
        f_target = self.target.predict(tf.expand_dims(observation, axis=0))
        f_predictor = self.predictor.predict(tf.expand_dims(observation, axis=0))
        return self.MSE(f_target, f_predictor)
    
    def initialize_standard_deviation_estimate(self, env, n_normalization_steps):
        obsevation = env.reset()
        
        for i_step in range(n_normalization_steps):
            random_action = env.action_space.sample()
            new_obs, reward, done, info = env.step(random_action)
            
            obs1, reward, done, info = env.step(0)
            obs2, reward, done, info = env.step(0)
            obs3, reward, done, info = env.step(0)

            observation = tf.transpose(tf.squeeze(tf.stack([new_obs, obs1, obs2, obs3], axis = 0)), [1,2,0])
            
            self.update_observation_normalization_param(observation)
    
    def update_observation_normalization_param(self, observation):
        return
    
    #Using welford's algorithm
    def update_reward_normalization_param(self, i_reward):
        self.N_intrinsic_rewards = self.N_intrinsic_rewards + 1
        delta = i_reward - self.intrinisc_reward_mean
        intrinisc_reward_mean = self.intrinisc_reward_mean + delta/self.N_intrinsic_rewards # mean_N = mean_{N-1} + (i_t - mean_{N-1}) / N
        self.M2 = self.M2 + delta*(i_reward - self.intrinisc_reward_mean)
        
    def calculate_reward_standard_deviation(self):
        standard_deviation = math.sqrt( self.M2 / (self.N_intrinsic_rewards - 1))
        print("===============================================================")
        print("===============================================================")
        print("STANDARD DEVIATION {}".format(standard_deviation))
        print("===============================================================")
        print("===============================================================")
        return standard_deviation
        

In [4]:
basic_env = gym.make("GravitarNoFrameskip-v4", obs_type = "image")
env = AtariPreprocessing(basic_env)

env.reset()

random_action = env.action_space.sample()
new_obs, reward, done, info = env.step(random_action)
            
obs1, reward, done, info = env.step(0)
obs2, reward, done, info = env.step(0)
obs3, reward, done, info = env.step(0)

observation = tf.transpose(tf.squeeze(tf.stack([new_obs, obs1, obs2, obs3], axis = 0)), [1,2,0])

for obs_dim in np.moveaxis(observation, -1, 0):
    

env.close()

IndentationError: expected an indented block (4252926563.py, line 18)

# PPO class

$ s_{t+1} $ is the observed state after the current action $ a_t $ 

Actor update formula:
$ \theta_{t+1} = \theta_t + \alpha\nabla min(r_t(\theta)\hat{A}_t, clip(r_t(\theta),1-\epsilon,1+\epsilon)\hat{A}_t)$

Critic update formula:
$ w_{t+1} = w_t + \alpha\delta_t\nabla\hat{v}(s_t,w)$

Probability ratio $ r_t(\theta) \doteq $
$ \pi_\theta(a_t | s_t) \over \pi_{\theta_{old}}(a_t | s_t) $

Advantage:
$ \hat{A}_t \doteq \delta_t + (\gamma\lambda)\delta_{t+1} + (\gamma\lambda)^2\delta_{t+2} + ... + (\gamma\lambda)^{T-t+1}\delta_{T-1} = \delta_t + (\gamma\lambda)\hat{A}_{t+1}$

TDerror:
$ \quad \delta_t  \doteq $
$ r_t + \gamma\hat{v}(s_{t+1},w) - \hat{v}(s_t,w) $ $ \qquad $ (if $ s_{t+1} $ is terminal then $ \hat{v}(s_{t+1},w) = 0$)

In [5]:
class PPO(object):
    
    input_shape = [84,84,4] 
    n_outputs = 6 #wrapped_env.action_space.n
    
    def __init__(self, env, n_episodes = 1, train_steps = 100, epsilon = 0.2, alpha = 0.95, n_normalization_steps = 300):
        self.actor = self.create_actor()
        self.intrinsic_critic = self.create_critic()
        self.extrinsic_critic = self.create_critic()
        
        self.MSE = tf.keras.losses.mean_squared_error
        
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
        
        self.epsilon = epsilon
        self.alpha = alpha
        #self.n_outputs = env.action_space.n
        
        self.train_steps = train_steps
        
        self.memory = Memory(n_episodes)
        
        self.rnd = RND(env, n_normalization_steps)
        
    #create the NN of the actor
    # Given the state returns the probability of each action
    def create_actor(self):    
        actor = keras.Sequential([
            keras.layers.Conv2D(filters=32, kernel_size = (8,8), strides=4, activation="relu", input_shape = self.input_shape),
            keras.layers.Conv2D(filters=64, kernel_size = (4,4), strides=2, activation="relu"),
            keras.layers.Conv2D(filters=64, kernel_size = (3,3), strides=1, activation="relu"),
            keras.layers.Dense(512),
            tf.keras.layers.Flatten(),
            keras.layers.Dense(self.n_outputs, activation = 'softmax') ])
        return actor
       
    #create the NN of the critic
    # Given the state returns the value function
    def create_critic(self):
        critic = keras.Sequential([
            keras.layers.Conv2D(filters=32, kernel_size = (8,8), strides=4, activation="relu", input_shape = self.input_shape),
            keras.layers.Conv2D(filters=64, kernel_size = (4,4), strides=2, activation="relu"),
            keras.layers.Conv2D(filters=64, kernel_size = (3,3), strides=1, activation="relu"),
            keras.layers.Dense(512),
            tf.keras.layers.Flatten(),
            keras.layers.Dense(1) ])
        
        return critic
      
    def play_one_step(self, env, observation, i_episode):
        action, action_prob = self.select_action(observation)
        observation, e_reward, done, info = env.step(action)
        
        i_reward = 0
        
        #put in wrapper
        e_reward = float(e_reward)/100.
        
        obs1, r, d, i = env.step(0)
        obs2, r, d, i = env.step(0)
        obs3, r, d, i = env.step(0)

        observation = tf.transpose(tf.squeeze(tf.stack([observation, obs1, obs2, obs3], axis = 0)), [1,2,0])
        
        i_reward = self.rnd.calculate_intrinsic_reward(observation)
        
        self.rnd.update_reward_normalization_param(i_reward)
        
        self.memory.collect(observation, action, action_prob, e_reward, i_reward, done, i_episode)
        
        return observation, action, e_reward, i_reward, done, info
        
    #select the action (returned as a number)
    def select_action(self, observation):
        
        # explanation: tf.expand_dims(observation['pov'], axis=0)
        # since we pass another input of shape (1,) -> we need to tell keras that is one image (it assumes the first dimension to be the batch)
        action_probabilities = self.actor.predict(tf.expand_dims(observation, axis=0))[0]
        
        #choosing an action usign randomly using a "roulette wheel" approach
        r = random.random()
        
        sum_probabilities = 0
        for i in range(len(action_probabilities)):
            sum_probabilities = sum_probabilities + action_probabilities[i]
            
            if (r <= sum_probabilities):
                action = i
                break
        
        return action, action_probabilities[action]
    
    def train(self, batch_size):
        self.memory.calculate_advantages(self.rnd.calculate_reward_standard_deviation())
        
        for i_step in range(self.train_steps):
            done = self.training_step(batch_size)
            if (done):
                break
        
        self.memory.reset()
        
    #training done on the memory (the advantages must be calculated before hand)
    def training_step(self, batch_size):
        #get experiences (parts of a trajectory) from the memory
        experiences = self.memory.sample_experiences(batch_size)
        
        states, actions, actions_prob, extrinsic_rewards, intrinsic_rewards, advantages, extrinsic_TDerrors, intrinsic_TDerrors, dones = experiences
        
        done = False
        if (len(states) != batch_size):
            done = True
        
        #compute the values for the update of the actor
        
        mask = tf.one_hot(actions, self.n_outputs)
        
        states = states/255
        
        with tf.GradientTape() as tape:
            current_actions_prob = self.actor(states)
            
            current_action_prob = tf.reduce_sum(current_actions_prob*mask, axis=1, keepdims=True)
            old_actions_prob = tf.reshape(tf.convert_to_tensor(actions_prob), [len(states), 1])
            probability_ratio = tf.divide(current_action_prob, old_actions_prob)
        
            surrogate_arg_1 = tf.convert_to_tensor([probability_ratio[index]*advantages[index] for index in range(len(advantages))])
            surrogate_arg_2 = tf.convert_to_tensor(np.array([tf.keras.backend.clip(probability_ratio,1-self.epsilon,1+self.epsilon)[index]*advantages[index] for index in range(len(advantages))]).flatten())
            
            L = 0 - tf.minimum( surrogate_arg_1 , surrogate_arg_2 ) 
            loss = tf.reduce_mean(L)

        actor_weights = self.actor.trainable_variables
        grads = tape.gradient(loss, actor_weights)
        self.optimizer.apply_gradients(zip(grads, actor_weights))
        
        #update of the critic. The target is the TD error
        
        # extrinsic critic (rewards from the envirnoment)
        target_v_values = tf.reshape(tf.convert_to_tensor(np.asarray(self.alpha*extrinsic_TDerrors).astype('float32')), (len(states), 1))

        with tf.GradientTape() as tape:
            all_v_values = self.extrinsic_critic(states)
            v_values = tf.reduce_sum(all_v_values*mask, axis=1, keepdims=True)
            loss = tf.reduce_mean(self.MSE(target_v_values, v_values))
        grads = tape.gradient(loss, self.extrinsic_critic.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.extrinsic_critic.trainable_variables))
        
        # intrinsic critic (rewards from the exploration)
        target_v_values = tf.reshape(tf.convert_to_tensor(np.asarray(self.alpha*intrinsic_TDerrors).astype('float32')), (len(states), 1))

        with tf.GradientTape() as tape:
            all_v_values = self.intrinsic_critic(states)
            v_values = tf.reduce_sum(all_v_values*mask, axis=1, keepdims=True)
            loss = tf.reduce_mean(self.MSE(target_v_values, v_values))
        grads = tape.gradient(loss, self.intrinsic_critic.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.intrinsic_critic.trainable_variables))
        
        #since v changed we need to re-calculate the advantages
        #self.memory.calculate_advantages()
        
        self.rnd.train_predictor(states)
        
        return done
    
    def return_v_extrinsic_values(self, observation):
        v_e = self.extrinsic_critic.predict(tf.expand_dims(observation, axis=0))[0]
        return v_e
    
    def return_v_intrinsic_values(self, observation):
        v_i = self.intrinsic_critic.predict(tf.expand_dims(observation, axis=0))[0]
        return v_i

## Training

In [6]:
class collect_trajectory(Thread):
    
    def __init__(self, i_agent = 1):
         
        Thread.__init__(self)   
        self.n_agent = i_agent
        self.rewards = [] 
        
        basic_env = gym.make("GravitarNoFrameskip-v4", obs_type = "image")
        self.wrapped_env = AtariPreprocessing(basic_env)
            
    def run(self):
        print("Starting {}".format(self.n_agent))
        
        observation = self.wrapped_env.reset()
        
        obs1, reward, done, info = self.wrapped_env.step(0)
        obs2, reward, done, info = self.wrapped_env.step(0)
        obs3, reward, done, info = self.wrapped_env.step(0)

        observation = tf.transpose(tf.squeeze(tf.stack([observation, obs1, obs2, obs3], axis = 0)), [1,2,0])
        
        extrinsic_episode_reward = 0.0
        intrinsic_episode_reward = 0.0
        self.n_episodes = 0.0
        self.extrinsic_tot_reward = 0.0
        self.intrinsic_tot_reward = 0.0
        
        for i_step in range(N_STEPS):   
            
            observation, action, extrinsic_reward, intrinsic_reward, done, info = ppo.play_one_step(self.wrapped_env, observation, self.n_agent)

            #wrapped_env.render()
            extrinsic_episode_reward = extrinsic_episode_reward + extrinsic_reward
            intrinsic_episode_reward = intrinsic_episode_reward + intrinsic_reward
            
            #continuing task. if an episode is done we continue until complting the number of steps
            if (done):
                observation = self.wrapped_env.reset()
        
                obs1, reward, done, info = self.wrapped_env.step(0)
                obs2, reward, done, info = self.wrapped_env.step(0)
                obs3, reward, done, info = self.wrapped_env.step(0)

                observation = tf.transpose(tf.squeeze(tf.stack([observation, obs1, obs2, obs3], axis = 0)), [1,2,0])
                
                self.extrinsic_tot_reward = self.extrinsic_tot_reward + extrinsic_episode_reward
                self.intrinsic_tot_reward = self.intrinsic_tot_reward + intrinsic_episode_reward
                self.n_episodes = self.n_episodes + 1
                episode_reward = 0.0
        
        self.wrapped_env.close()
        print("Exiting {} average ex reward: {} in reward: {}".format(self.n_agent, self.extrinsic_tot_reward/self.n_episodes, self.intrinsic_tot_reward/self.n_episodes))
        
    def get_reward_average(self):
        if (self.n_episodes > 0):
            return (self.extrinsic_tot_reward/self.n_episodes, self.intrinsic_tot_reward/self.n_episodes)
        else:
            return (self.extrinsic_tot_reward, self.intrinsic_tot_reward)
    

In [None]:
N_STEPS_NORMALIZATION = 500
N_EPOCHS = 100
N_EPISODES = 5 # in multi-agent this is the number of agents (each agnet collect 1 trajectory)
N_STEPS = 900 # max number of step for each episode

TRAIN_STEPS = 30 # number of max steps done during training. if the number of samples is less than TRAIN_STEPS*BATCH_SIZE will stop early after completing the training on all the samples
BATCH_SIZE = 64

#env used to initialize the parameters inside PPO and RND
env = gym.make("GravitarNoFrameskip-v4", obs_type = "image")
wrapped_env = AtariPreprocessing(env)

ppo = PPO(wrapped_env, n_episodes = N_EPISODES, train_steps = TRAIN_STEPS, n_normalization_steps = N_STEPS_NORMALIZATION)

e_rewards = []
i_rewards = []

for i_epoch in range(N_EPOCHS):
    extrinsic_epoch_reward = 0.0
    intrinsic_epoch_reward = 0.0
    agents = []
    for i_agent in range(N_EPISODES):
        agents.append(collect_trajectory(i_agent = i_agent))
    for agent in agents:
        agent.start()
    for agent in agents:
        agent.join()
        extrinsic_reward_average, intrinsic_reward_average = agent.get_reward_average()
        extrinsic_epoch_reward = extrinsic_epoch_reward + extrinsic_reward_average
        intrinsic_epoch_reward = intrinsic_epoch_reward + intrinsic_reward_average
    e_rewards.append(extrinsic_epoch_reward/N_EPISODES)
    i_rewards.append(intrinsic_epoch_reward/N_EPISODES)
    print("Epoch: {} ended with average extrinsic reward: {} intrinsic reward {} \n".format(i_epoch, extrinsic_epoch_reward/N_EPISODES, intrinsic_epoch_reward/N_EPISODES) )  
    ppo.train(batch_size = 32)
    

Starting 0
Starting 1
Starting 2
Starting 3
Starting 4
Exiting 2 average ex reward: 0.0 in reward: [963.3509]
Exiting 3 average ex reward: 0.0 in reward: [1020.39844]
Exiting 4 average ex reward: 0.0 in reward: [1169.9598]
Exiting 0 average ex reward: 0.0 in reward: [1067.2313]
Exiting 1 average ex reward: 0.0 in reward: [1028.1621]
Epoch: 0 ended with average extrinsic reward: 0.0 intrinsic reward [1049.8206] 

STANDARD DEVIATION 2.3359872053592876


In [None]:
#wrappend_env.reset()
#for i_step in range(N_STEPS_NORMALIZATION):
    
#    action = wrappend_env.action_space.sample() 
#    observation, reward, done, info = wrappend_env.step(action)
    #update observation normalization parameters usig s_t+1
#    t = t +1

## Plot graph

In [None]:
epochs = range(205)#N_EPOCHS)

plt.plot(epochs, i_rewards)

plt.xlabel("Epochs")
plt.ylabel("Rewards")

plt.show()