# Atari-PPO-RND implementation

PPO-RND test in the Atari environment

In [1]:
import gym
from gym.wrappers import AtariPreprocessing
from gym.wrappers import FrameStack
import numpy as np
import tensorflow as tf 
from tensorflow import keras
from keras import backend as K
from collections import deque
import random
from matplotlib import pyplot as plt
from threading import Thread
import math

# change keras setting to use the conv2d NN passing the channel first (as returned from the FrameStack wrapper)
K.set_image_data_format('channels_first')

## Memory

Class used to memorize the trajectory and calculate the advantage

In [2]:
class Memory(object):
    
    STATE = 0
    ACTION = 1
    ACTION_PROB = 2
    EXTRINSIC_REWARD = 3
    INTRINSIC_REWARD = 4
    DONE = 5
    
    def __init__(self, n_trajectories, gamma = 0.98, e_lambda_p = 0.96, i_lambda_p = 0.99):
        self.trajectories = np.empty(n_trajectories, dtype=object)
        self.gamma = gamma
        self.e_lambda_p = e_lambda_p
        self.i_lambda_p = i_lambda_p
              
    def collect(self, state, action, action_prob, extrinsic_reward, intrinsic_reward, done, i_episode):
        if (self.trajectories[i_episode] == None):
            self.trajectories[i_episode] = deque(maxlen=N_STEPS)
        self.trajectories[i_episode].append((state, action, action_prob, extrinsic_reward, intrinsic_reward, done))
        
    def calculate_advantages(self, reward_standard_deviation_estimate):
        advantages = []
        extrinsic_TDerrors = []
        intrinsic_TDerrors = [] #list of all the delta, used to uopdate the critic
        extrinsic_discounts = []
        intrinsic_discounts = []
        
        for trajectory in self.trajectories:
            
            advantage_trajectory = [] #list of advantages for each element in a single trajectory
            e_delta = []
            i_delta = []
            e_G = []
            i_G = []

            v_t = ppo.return_v_extrinsic_values(trajectory[-1][self.STATE])
            e_delta.append(trajectory[-2][self.EXTRINSIC_REWARD] + 
                           self.gamma*v_t - 
                           ppo.return_v_extrinsic_values(trajectory[-2][self.STATE] ))
            e_G.append(trajectory[-2][self.EXTRINSIC_REWARD] + self.gamma*v_t)
            e_old_advantage = e_delta[-1]
            
            v_t = ppo.return_v_intrinsic_values(trajectory[-1][self.STATE])
            #normalizing the intrinisc reward before calculating the advantage
            i_delta.append( (trajectory[-2][self.INTRINSIC_REWARD] / reward_standard_deviation_estimate) + 
                           self.gamma*v_t - 
                           ppo.return_v_intrinsic_values(trajectory[-2][self.STATE] ))
            i_G.append(trajectory[-2][self.INTRINSIC_REWARD] + self.gamma*v_t)
            i_old_advantage = i_delta[-1]
            
            advantage_trajectory.append(e_old_advantage[0] + i_old_advantage[0])
            
            
            for i in range(len(trajectory)-3,-1,-1):    
                e_delta.append(trajectory[i+1][self.EXTRINSIC_REWARD] + 
                               self.gamma*ppo.return_v_extrinsic_values(trajectory[i+1][self.STATE]) -  
                               ppo.return_v_extrinsic_values(trajectory[i][self.STATE]))
                e_G.append(trajectory[i][self.EXTRINSIC_REWARD] + self.gamma*e_G[-1])
                new_advantage = e_delta[-1] + self.gamma*self.e_lambda_p*e_old_advantage                                    
                                                        
                e_old_advantage = new_advantage
                
                normalized_intrinsic_reward = trajectory[i][self.INTRINSIC_REWARD] / reward_standard_deviation_estimate
                i_delta.append(normalized_intrinsic_reward + 
                               self.gamma*ppo.return_v_intrinsic_values(trajectory[i+1][self.STATE]) -  
                               ppo.return_v_intrinsic_values(trajectory[i][self.STATE]))
                i_G.append(normalized_intrinsic_reward + self.gamma*i_G[-1])
                new_advantage = i_delta[-1] + self.gamma*self.i_lambda_p*i_old_advantage                                    
                                                        
                i_old_advantage = new_advantage                                        
                
                advantage_trajectory.append(0.4*i_old_advantage[0] + 0.6*e_old_advantage[0])  
        
                                                                                                                                                                       
            extrinsic_TDerrors.append(e_delta)
            intrinsic_TDerrors.append(i_delta)
            
            extrinsic_discounts.append(e_G)
            intrinsic_discounts.append(i_G)
                           
            advantages.append(advantage_trajectory)
            
        #flat all trajectories in a single deque adding the advantages (easier to sample random batches)
        self.flat_trajectories(self.trajectories, advantages, extrinsic_TDerrors, intrinsic_TDerrors, extrinsic_discounts, intrinsic_discounts)
    
    def flat_trajectories(self, trajectories, advantages, extrinsic_TDerrors, intrinsic_TDerrors, extrinsic_G, intrinsic_G):
        
        size = 0
        for trajectory in trajectories:
            size = size + len(trajectory)
        
        self.flatten_trajectories = deque(maxlen=size)
        
        for trajectory, advantage, e_delta, i_delta, e_discount, i_discount in zip(trajectories, advantages, extrinsic_TDerrors, intrinsic_TDerrors, extrinsic_G, intrinsic_G):
            for i in range(len(trajectory)-2,-1,-1):
                self.flatten_trajectories.append((trajectory[i][self.STATE], 
                                                  trajectory[i][self.ACTION], 
                                                  trajectory[i][self.ACTION_PROB], 
                                                  trajectory[i][self.EXTRINSIC_REWARD], 
                                                  trajectory[i][self.INTRINSIC_REWARD], 
                                                  advantage[len(trajectory)-2-i], 
                                                  e_delta[len(trajectory)-2-i], 
                                                  i_delta[len(trajectory)-2-i], 
                                                  e_discount[len(trajectory)-2-i], 
                                                  i_discount[len(trajectory)-2-i], 
                                                  trajectory[i][self.DONE]))
        
        
    #pick a random batch example from the flatten list of trajectories
    def sample_experiences(self, batch_size):
        if (len(self.flatten_trajectories) >= batch_size):
            indices = np.random.permutation(len(self.flatten_trajectories))[:batch_size]
        else:
            indices = np.random.permutation(len(self.flatten_trajectories))
        batch = [self.flatten_trajectories[index] for index in indices]
        #delete form the memory the used obervations
        for index in sorted(indices, reverse=True):
            del self.flatten_trajectories[index]
        states, actions, actions_prob, e_rewards, i_rewards, advantages, e_TDerrors, i_TDerrors, e_discounts, i_discounts, dones = [np.array([experience[field_index] for experience in batch]) for field_index in range(11)]
        return states, actions, actions_prob, e_rewards, i_rewards, advantages, e_TDerrors, i_TDerrors, e_discounts, i_discounts, dones
        
    def reset(self):
        for trajectory in self.trajectories:
            trajectory.clear()

# RND class

$ s_{t+1} $ is the observed state after the current action $ a_t $ 

Predictor update:
minimize $ \hat{f}(x, \theta) - f(x) $

In [3]:
class RND(object):
    
    input_shape = [1,84,84] 
    n_outputs = 200
    
    N_intrinsic_rewards = 0 #number of intrinsic reward received
    intrinisc_reward_mean = 0.0 #mean of the intrinsic rewards received
    reward_M2 = 0.0 #sum of squares of differences from the current mean
    
    N_observations = 0 #number of observations received
    observations_mean = 0.0 #mean of the observations received
    observation_M2 = 0.0 #sum of squares of differences from the current mean
    
    def __init__(self, env, n_normalization_steps = 40):
        self.target = self.create_target()
        self.predictor = self.create_predictor()
        
        self.MSE = tf.keras.losses.mean_squared_error
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
        
        self.initialize_standard_deviation_estimate(env, n_normalization_steps)
        
    #create the NN of the target
    def create_target(self):
        target = keras.Sequential([
            keras.layers.Conv2D(filters=32, kernel_size = (8,8), strides=4, activation="relu", input_shape = self.input_shape),
            keras.layers.Conv2D(filters=64, kernel_size = (4,4), strides=2, activation="relu"),
            keras.layers.Conv2D(filters=64, kernel_size = (3,3), strides=1, activation="relu"),
            keras.layers.Dense(512),
            tf.keras.layers.Flatten(),
            keras.layers.Dense(self.n_outputs) ])
        return target
        
    #create the NN of the predictor
    def create_predictor(self):
        predictor = keras.Sequential([
            keras.layers.Conv2D(filters=32, kernel_size = (8,8), strides=4, activation="relu", input_shape = self.input_shape),
            keras.layers.Conv2D(filters=64, kernel_size = (4,4), strides=2, activation="relu"),
            keras.layers.Conv2D(filters=64, kernel_size = (3,3), strides=1, activation="relu"),
            keras.layers.Dense(512),
            tf.keras.layers.Flatten(),
            keras.layers.Dense(self.n_outputs) ])
        return predictor
    
    def train_predictor(self, observations):
        # extrinsic critic (rewards from the envirnoment)
        observations = np.array(observations)
        observations = self.normalize_observations(observations)
        # covert shape [BATCH_SIZE, 4, 84, 84] in [BATCH_SIZE,1,84,84]
        observations = [observation[-1,0:observation.shape[1], 0:observation.shape[2]] for observation in observations]
        observations = tf.expand_dims(observations, axis = 1)
        target_values = self.target.predict(observations)
        with tf.GradientTape() as tape:
            all_values = self.predictor(observations)
            loss = tf.reduce_mean(self.MSE(target_values, all_values))
        grads = tape.gradient(loss, self.predictor.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.predictor.trainable_variables))
        
    def calculate_intrinsic_reward(self, observation):
        #passing a (4,84,84) stacked frame from the environment
        observation = np.array(observation)
        #picking the last frame
        observation = observation[-1, 0:observation.shape[1], 0:observation.shape[2]]
        #normalize the last frame
        s = self.calculate_observation_standard_deviation()
        observation = self.normalize_observation(observation, s)
        #calculate intrinsic reward on the last frame
        observation = tf.expand_dims(observation, axis=0)
        f_target = self.target.predict(tf.expand_dims(observation, axis=0))
        f_predictor = self.predictor.predict(tf.expand_dims(observation, axis=0))
        #return self.MSE(f_target, f_predictor)*10000
        return pow(np.linalg.norm(f_predictor - f_target), 2)*100
    
    def initialize_standard_deviation_estimate(self, env, n_normalization_steps):
        observations_mean = np.zeros(env.observation_space.shape ,'float64') #mean of the intrinsic rewards received
        observation_M2 = np.zeros(env.observation_space.shape ,'float64') #sum of squares of differences from the current mean
        
        obsevation = env.reset()
        
        for i_step in range(n_normalization_steps):
            random_action = env.action_space.sample()
            observation, reward, done, info = env.step(random_action)
            for frame in observation:
                self.update_observation_normalization_param(frame)
    
    def update_observation_normalization_param(self, observation):
        #cicle trhough the 4 images that makes up for an observation
        for obs in observation:
            #obs_mean = np.mean([obs_dim], axis=0)
            self.N_observations = self.N_observations + 1
            delta = obs - self.observations_mean
            self.observations_mean = self.observations_mean + delta/self.N_observations # mean_N = mean_{N-1} + (obs_t - mean_{N-1}) / N
            self.observation_M2 = self.observation_M2 + delta*(obs - self.observations_mean)
        
    def calculate_observation_standard_deviation(self):
        standard_deviation = np.sqrt( self.observation_M2 / (self.N_observations - 1))
        return standard_deviation
    
    def normalize_observations(self, observations):

        norm_obs = []
        s = self.calculate_observation_standard_deviation()
        for observation in observations:
            norm_obs.append(self.normalize_observation(observation, s))
        normalized_obs = tf.stack([norm_obs[i] for i in range(len(norm_obs))], 0)

        return normalized_obs
    
    def normalize_observation(self, observation, standard_deviation):
        t = observation - self.observations_mean
        normalized_obs = np.clip(np.divide(t, standard_deviation, out=np.zeros_like(t), where=standard_deviation!=0), a_min =-5, a_max = 5)       
        return normalized_obs
    
    #def normalize_observation(self, observation, standard_deviation):
    #    temp_obs = []
    #    for obs_dim in observation:
    #        t = obs_dim - self.observations_mean
    #        temp_obs.append(np.clip(np.divide(t, standard_deviation, out=np.zeros_like(t), where=standard_deviation!=0), a_min =-5, a_max = 5))
    #    normalized_obs = tf.squeeze(tf.stack([temp_obs[i] for i in range(len(temp_obs))], axis = 0))
    #    return normalized_obs
    
    #Using welford's algorithm
    def update_reward_normalization_param(self, i_reward):
        self.N_intrinsic_rewards = self.N_intrinsic_rewards + 1
        delta = i_reward - self.intrinisc_reward_mean
        self.intrinisc_reward_mean = self.intrinisc_reward_mean + delta/self.N_intrinsic_rewards # mean_N = mean_{N-1} + (i_t - mean_{N-1}) / N
        self.reward_M2 = self.reward_M2 + delta*(i_reward - self.intrinisc_reward_mean)
        
    def calculate_reward_standard_deviation(self):
        standard_deviation = math.sqrt( self.reward_M2 / (self.N_intrinsic_rewards - 1))
        print("===============================================================")
        print("===============================================================")
        print("STANDARD DEVIATION {}".format(standard_deviation))
        print("===============================================================")
        print("===============================================================")
        return standard_deviation
    
    def save(self, path = ".\\saved_weights\\rnd\\"):
        self.target.save_weights(path + 'target_weights.h5')
        self.predictor.save_weights(path + 'predictor_weights.h5')
        
    def load(self, path = ".\\saved_weights\\rnd\\"):
        self.target.load_weights(path + 'target_weights.h5')
        self.predictor.load_weights(path + 'predictor_weights.h5')

In [4]:
#basic_env = gym.make("GravitarNoFrameskip-v4", obs_type = "image")
#wrapped_env = AtariPreprocessing(basic_env)
#env = FrameStack(wrapped_env, 4)

#rnd = RND(env)

#env.reset()

#obs = []
#for _ in range(32):
#    random_action = env.action_space.sample()
#    new_obs, reward, done, info = env.step(random_action)
#    obs.append(new_obs)
    
#array = np.array(obs)
#rnd.train_predictor(array)

#i = rnd.calculate_intrinsic_reward(new_obs)

#print(i)

# PPO class

$ s_{t+1} $ is the observed state after the current action $ a_t $ 

Actor update formula:
$ \theta_{t+1} = \theta_t + \alpha\nabla min(r_t(\theta)\hat{A}_t, clip(r_t(\theta),1-\epsilon,1+\epsilon)\hat{A}_t)$

Critic update formula:
$ w_{t+1} = w_t + \alpha\delta_t\nabla\hat{v}(s_t,w)$

Probability ratio $ r_t(\theta) \doteq $
$ \pi_\theta(a_t | s_t) \over \pi_{\theta_{old}}(a_t | s_t) $

Advantage:
$ \hat{A}_t \doteq \delta_t + (\gamma\lambda)\delta_{t+1} + (\gamma\lambda)^2\delta_{t+2} + ... + (\gamma\lambda)^{T-t+1}\delta_{T-1} = \delta_t + (\gamma\lambda)\hat{A}_{t+1}$

TDerror:
$ \quad \delta_t  \doteq $
$ r_t + \gamma\hat{v}(s_{t+1},w) - \hat{v}(s_t,w) $ $ \qquad $ (if $ s_{t+1} $ is terminal then $ \hat{v}(s_{t+1},w) = 0$)

In [4]:
class PPO(object):
    
    input_shape = [4,84,84] 
    n_outputs = 4 #6 #wrapped_env.action_space.n
    
    next_reward = 0
    
    def __init__(self, env, n_episodes = 1, train_steps = 100, epsilon = 0.2, alpha = 1, gamma = 0.4, e_lambda_par = 1, i_lambda_par = 1, n_normalization_steps = 300, train_predictor_keeping_prob = 0.25):
        self.actor = self.create_actor()
        self.intrinsic_critic = self.create_critic()
        self.extrinsic_critic = self.create_critic()
        
        self.MSE = tf.keras.losses.mean_squared_error
        
        self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate=2e-4)
        self.extrinsic_optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
        self.intrinsic_optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
        
        self.epsilon = epsilon
        self.alpha = alpha
        #self.n_outputs = 6 #env.action_space.n
        
        self.train_steps = train_steps
        self.train_predictor_keeping_prob = train_predictor_keeping_prob
        
        self.memory = Memory(n_episodes, gamma, e_lambda_par, i_lambda_par)
        
        self.rnd = RND(env, n_normalization_steps)
        
    #create the NN of the actor
    # Given the state returns the probability of each action
    def create_actor(self):
        initializer = tf.keras.initializers.GlorotNormal()
        actor = keras.Sequential([
            keras.layers.Conv2D(filters=32, kernel_size = (8,8), strides=4, activation="tanh", input_shape = self.input_shape, kernel_initializer=initializer),
            keras.layers.Conv2D(filters=64, kernel_size = (4,4), strides=2, activation="tanh", kernel_initializer=initializer),
            keras.layers.Conv2D(filters=64, kernel_size = (3,3), strides=1, activation="tanh", kernel_initializer=initializer),
            keras.layers.Dense(512, kernel_initializer=initializer),
            keras.layers.Dropout(0.3),
            tf.keras.layers.Flatten(),
            keras.layers.Dense(self.n_outputs, kernel_initializer=initializer, activation = 'softmax') ])
        return actor
       
    #create the NN of the critic
    # Given the state returns the value function
    def create_critic(self):
        critic = keras.Sequential([
            keras.layers.Conv2D(filters=32, kernel_size = (8,8), strides=4, activation="relu", input_shape = self.input_shape),
            keras.layers.Conv2D(filters=64, kernel_size = (4,4), strides=2, activation="relu"),
            keras.layers.Conv2D(filters=64, kernel_size = (3,3), strides=1, activation="relu"),
            keras.layers.Dense(512),
            tf.keras.layers.Flatten(),
            keras.layers.Dense(1) ])
        
        return critic
      
    def play_one_step(self, env, observation, i_episode):
        action, action_prob = self.select_action(observation)
        past_reward = self.next_reward
        next_observation, self.next_reward, done, info = env.step(action)
        
        #self.next_reward = float(self.next_reward) / 100
        
        #the normalization of the intrinisc reward after before training
        i_reward = self.rnd.calculate_intrinsic_reward(observation)
        
        self.rnd.update_reward_normalization_param(i_reward)
        self.rnd.update_observation_normalization_param(observation)
        
        self.memory.collect(observation, action, action_prob, past_reward, i_reward, done, i_episode)
        if (done):
            self.memory.collect(next_observation, action, action_prob, self.next_reward, i_reward, done, i_episode)
            
        return next_observation, action, past_reward, i_reward, done, info

        
    #select the action (returned as a number)
    def select_action(self, observation):
        
        action_probabilities = self.actor.predict(tf.expand_dims(np.array(observation) / 255, axis=0))[0]
        action = np.random.choice(a = len(action_probabilities), p = action_probabilities)
        
        return action, action_probabilities[action]
    
    def train(self, batch_size):
        self.memory.calculate_advantages(self.rnd.calculate_reward_standard_deviation())
        
        for i_step in range(self.train_steps):
            done = self.training_step(batch_size)
            if (done):
                break
        
        self.memory.reset()
        
    #training done on the memory (the advantages must be calculated before hand)
    def training_step(self, batch_size):
        #get experiences (parts of a trajectory) from the memory
        
        states, actions, actions_prob, extrinsic_rewards, intrinsic_rewards, advantages, extrinsic_TDerrors, intrinsic_TDerrors, extrinsic_discounts, intrinsic_discounts, dones = self.memory.sample_experiences(batch_size)
        
        done = False
        if (len(states) == 0):
            return True
        if (len(states) != batch_size):
            done = True

        #compute the values for the update of the actor
        
        mask = tf.one_hot(actions, self.n_outputs)
        
        states = np.array(states)
        
        og_states = states
        states = states/255
        
        with tf.GradientTape() as tape:
            current_actions_prob = self.actor(states)
            
            current_action_prob = tf.reduce_sum(current_actions_prob*mask, axis=1, keepdims=True)
            old_actions_prob = tf.reshape(tf.convert_to_tensor(actions_prob), [len(states), 1])
            probability_ratio = tf.divide(tf.math.log(current_action_prob + 1e-7), tf.math.log(old_actions_prob + 1e-7 ))
        
            #sobtitute nan values with zero (where given an array of True/false puy the element of the first array (tf.zeros_like(probability_ratio)) in the position where is True, the second (probability_ratio) where is False)
            probability_ratio = tf.where(tf.math.is_nan(probability_ratio), tf.zeros_like(probability_ratio), probability_ratio)
        
            surrogate_arg_1 = tf.convert_to_tensor([probability_ratio[index]*advantages[index] for index in range(len(advantages))])
            surrogate_arg_2 = tf.convert_to_tensor(np.array([tf.keras.backend.clip(probability_ratio,1-self.epsilon,1+self.epsilon)[index]*advantages[index] for index in range(len(advantages))]).flatten())
            
            L = 0 - tf.minimum( surrogate_arg_1 , surrogate_arg_2 ) 
            loss = tf.reduce_mean(L)

        actor_weights = self.actor.trainable_variables
        grads = tape.gradient(loss, actor_weights)
        self.actor_optimizer.apply_gradients(zip(grads, actor_weights))
        
        #update of the critic. The target is the TD error
        
        # extrinsic critic (rewards from the envirnoment)
        target_v_values = tf.reshape(tf.convert_to_tensor(np.asarray(self.alpha*extrinsic_discounts).astype('float32')), (len(states), 1))

        with tf.GradientTape() as tape:
            v_values = self.extrinsic_critic(states)
            loss = tf.reduce_mean(self.MSE(target_v_values, v_values))
        grads = tape.gradient(loss, self.extrinsic_critic.trainable_variables)
        self.extrinsic_optimizer.apply_gradients(zip(grads, self.extrinsic_critic.trainable_variables))
        
        # intrinsic critic (rewards from the exploration)
        target_v_values = tf.reshape(tf.convert_to_tensor(np.asarray(self.alpha*intrinsic_discounts).astype('float32')), (len(states), 1))

        with tf.GradientTape() as tape:
            v_values = self.intrinsic_critic(states)
            loss = tf.reduce_mean(self.MSE(target_v_values, v_values))
        grads = tape.gradient(loss, self.intrinsic_critic.trainable_variables)
        self.intrinsic_optimizer.apply_gradients(zip(grads, self.intrinsic_critic.trainable_variables))
        
        #since v changed we need to re-calculate the advantages
        #self.memory.calculate_advantages()
        
        keep = random.random()
        if (keep <= self.train_predictor_keeping_prob):
            self.rnd.train_predictor(og_states)
        
        return done
    
    def return_v_extrinsic_values(self, observation):
        v_e = self.extrinsic_critic.predict(tf.expand_dims(np.array(observation) / 255, axis=0))[0]
        return v_e
    
    def return_v_intrinsic_values(self, observation):
        v_i = self.intrinsic_critic.predict(tf.expand_dims(np.array(observation) / 255, axis=0))[0]
        return v_i
    
    def save(self, path = ".\\saved_weights\\rnd\\"):
        self.actor.save_weights(path + 'actor_weights.h5')
        self.extrinsic_critic.save_weights(path + 'e_critic_weights.h5')
        self.intrinsic_critic.save_weights(path + 'i_critic_weights.h5')
        self.rnd.save()
        
    def load(self, path = ".\\saved_weights\\rnd\\"):
        self.actor.load_weights(path + 'actor_weights.h5')
        self.extrinsic_critic.load_weights(path + 'e_critic_weights.h5')
        self.intrinsic_critic.load_weights(path + 'i_critic_weights.h5')
        self.rnd.load()

## Training

In [5]:
class collect_trajectory(Thread):
    
    def __init__(self, env, i_agent):
         
        Thread.__init__(self)   
        self.n_agent = i_agent
        self.rewards = [] 
        
        self.env = env
            
    def run(self):
        #print("Starting {}".format(self.n_agent))
        
        observation = self.env.reset()
        
        self.env.step(1)
        past_lives = 5
        
        extrinsic_episode_reward = 0.0
        intrinsic_episode_reward = 0.0
        self.n_episodes = 0.0
        self.extrinsic_tot_reward = 0.0
        self.intrinsic_tot_reward = 0.0
        
        for i_step in range(N_STEPS):   
            observation, action, extrinsic_reward, intrinsic_reward, done, info = ppo.play_one_step(self.env, observation, self.n_agent)

            current_lives = info['ale.lives']
    
            if (current_lives <= past_lives):
                past_lives = current_lives
                observation, reward, done, info = self.env.step(1)
            
            
            extrinsic_episode_reward = extrinsic_episode_reward + extrinsic_reward
            intrinsic_episode_reward = intrinsic_episode_reward + intrinsic_reward
            
            #continuing task. if an episode is done we continue until complting the number of steps
            if (done):
                observation = self.env.reset()
                
                past_lives = 5
                
                self.extrinsic_tot_reward = self.extrinsic_tot_reward + extrinsic_episode_reward
                self.intrinsic_tot_reward = self.intrinsic_tot_reward + intrinsic_episode_reward
                self.n_episodes = self.n_episodes + 1
                episode_reward = 0.0
                
        if (self.n_episodes == 0):
            self.extrinsic_tot_reward = extrinsic_episode_reward
            self.intrinsic_tot_reward = intrinsic_episode_reward
        
        self.env.close()
        
        if (self.n_episodes > 0):
            print("Exiting {} after {} episodes. Average ex reward: {} in reward: {}".format(self.n_agent, self.n_episodes, self.extrinsic_tot_reward/self.n_episodes, self.intrinsic_tot_reward/self.n_episodes))
        else:
            print("Exiting {} after {} episodes. Average ex reward: {} in reward: {}".format(self.n_agent, self.n_episodes, self.extrinsic_tot_reward, self.intrinsic_tot_reward))
    
    
    def get_reward_average(self):
        if (self.n_episodes > 0):
            return (self.extrinsic_tot_reward/self.n_episodes, self.intrinsic_tot_reward/self.n_episodes)
        else:
            return (self.extrinsic_tot_reward, self.intrinsic_tot_reward)
    

In [None]:
N_STEPS_NORMALIZATION = 50
N_EPOCHS = 100
N_EPISODES = 2 # in multi-agent this is the number of agents (each agnet collect 1 trajectory)
N_STEPS = 2000 # max number of step for each episode

TRAIN_STEPS = 300 # number of max steps done during training. if the number of samples is less than TRAIN_STEPS*BATCH_SIZE will stop early after completing the training on all the samples
BATCH_SIZE = 64

environment = "BreakoutNoFrameskip-v4" #"GravitarNoFrameskip-v4", "PongNoFrameskip-v4", "AirRaidNoFrameskip-v4"

#env used to initialize the parameters inside PPO and RND
norm_env = gym.make(environment, obs_type = "image")
norm_wrapped_env = AtariPreprocessing(norm_env)
norm_stack_env = FrameStack(norm_wrapped_env, 4)

ppo = PPO(norm_stack_env, n_episodes = N_EPISODES, train_steps = TRAIN_STEPS, n_normalization_steps = N_STEPS_NORMALIZATION, i_lambda_par = 2.25, train_predictor_keeping_prob = 0.6)

e_rewards = []
i_rewards = []

envs = []

for i_env in range(N_EPISODES):
        basic_env = gym.make(environment, obs_type = "image")
        wrapped_env = AtariPreprocessing(basic_env)
        envs.append(FrameStack(wrapped_env, 4))

highest_average_reward = 0
        
for i_epoch in range(N_EPOCHS):
    extrinsic_epoch_reward = 0.0
    intrinsic_epoch_reward = 0.0
    agents = []
    for i_agent in range(N_EPISODES):
        agents.append(collect_trajectory(env = envs[i_agent], i_agent = i_agent))
    for agent in agents:
        agent.start()
    for agent in agents:
        agent.join()
        extrinsic_reward_average, intrinsic_reward_average = agent.get_reward_average()
        extrinsic_epoch_reward = extrinsic_epoch_reward + extrinsic_reward_average
        intrinsic_epoch_reward = intrinsic_epoch_reward + intrinsic_reward_average
    e_rewards.append(extrinsic_epoch_reward/N_EPISODES)
    i_rewards.append(intrinsic_epoch_reward/N_EPISODES)
    print("Epoch: {} ended with average extrinsic reward: {} intrinsic reward {} \n".format(i_epoch, extrinsic_epoch_reward/N_EPISODES, intrinsic_epoch_reward/N_EPISODES) )  
    
    if (highest_average_reward <= e_rewards[-1]):
        highest_average_reward = e_rewards[-1]
        ppo.save()
    
    ppo.train(batch_size = BATCH_SIZE)
    
    
for i_env in range(N_EPISODES):
    envs[i_env].close()

Exiting 0 after 22.0 episodes. Average ex reward: 5.818181818181818 in reward: 6490.062065049034
Exiting 1 after 24.0 episodes. Average ex reward: 7.75 in reward: 6484.155351956982
Epoch: 0 ended with average extrinsic reward: 6.784090909090909 intrinsic reward 6487.108708503008 

STANDARD DEVIATION 2.037980205385447
Exiting 1 after 15.0 episodes. Average ex reward: 11.666666666666666 in reward: 337.3550634486901
Exiting 0 after 15.0 episodes. Average ex reward: 10.0 in reward: 358.93819310097274
Epoch: 1 ended with average extrinsic reward: 10.833333333333332 intrinsic reward 348.1466282748314 

STANDARD DEVIATION 2.908934785908415
Exiting 1 after 17.0 episodes. Average ex reward: 7.294117647058823 in reward: 31.005947957958462
Exiting 0 after 16.0 episodes. Average ex reward: 8.75 in reward: 28.413653557468265
Epoch: 2 ended with average extrinsic reward: 8.022058823529411 intrinsic reward 29.709800757713364 

STANDARD DEVIATION 2.719347251335083
Exiting 0 after 16.0 episodes. Averag

In [8]:
basic_env = gym.make("BreakoutNoFrameskip-v4", obs_type = "image")
wrapped_env = AtariPreprocessing(basic_env)
stack_env = FrameStack(wrapped_env, 4)

ppo = PPO(env = stack_env)
ppo.load()

observation = stack_env.reset()
observation, reward, done, info = stack_env.step(1)
print(observation)
print("Starting demo")
for i_step in range(200):   
    
    action, action_prob = ppo.select_action(observation)
    
    observation, reward, done, info = stack_env.step(action)
    #rand_action = stack_env.action_space.sample()
    #observation, reward, done, info = stack_env.step(rand_action)
    print("selected action {} with prob {} got reward {}".format(action, action_prob, reward))

    stack_env.render()
            
    if (done):
        break
        
stack_env.close()

<gym.wrappers.frame_stack.LazyFrames object at 0x000001E405E99770>
Starting demo
selected action 2 with prob 0.9751538634300232 got reward 0.0




selected action 2 with prob 0.9750579595565796 got reward 0.0
selected action 2 with prob 0.9753133654594421 got reward 0.0
selected action 2 with prob 0.9747897982597351 got reward 0.0
selected action 2 with prob 0.974651575088501 got reward 0.0
selected action 2 with prob 0.9752400517463684 got reward 0.0
selected action 2 with prob 0.9750487804412842 got reward 0.0
selected action 2 with prob 0.9751133918762207 got reward 0.0
selected action 2 with prob 0.9750691056251526 got reward 0.0
selected action 2 with prob 0.9751991033554077 got reward 0.0
selected action 2 with prob 0.9753149747848511 got reward 0.0
selected action 1 with prob 0.024170784279704094 got reward 0.0
selected action 2 with prob 0.9749774932861328 got reward 0.0
selected action 2 with prob 0.9750741720199585 got reward 0.0
selected action 2 with prob 0.9751831293106079 got reward 0.0
selected action 2 with prob 0.9749694466590881 got reward 0.0
selected action 2 with prob 0.9750814437866211 got reward 0.0
selecte

selected action 2 with prob 0.9751361608505249 got reward 0.0
selected action 2 with prob 0.9751361608505249 got reward 0.0
selected action 2 with prob 0.9751361608505249 got reward 0.0
selected action 2 with prob 0.9751361608505249 got reward 0.0
selected action 2 with prob 0.9751361608505249 got reward 0.0
selected action 2 with prob 0.9751361608505249 got reward 0.0
selected action 1 with prob 0.024104425683617592 got reward 0.0
selected action 2 with prob 0.9751887917518616 got reward 0.0
selected action 2 with prob 0.9750794768333435 got reward 0.0
selected action 2 with prob 0.9753044247627258 got reward 0.0
selected action 2 with prob 0.9748517274856567 got reward 0.0
selected action 2 with prob 0.9749240279197693 got reward 0.0
selected action 2 with prob 0.9752804040908813 got reward 0.0
selected action 2 with prob 0.9750686287879944 got reward 0.0
selected action 2 with prob 0.9751332402229309 got reward 0.0
selected action 2 with prob 0.975088894367218 got reward 0.0
selecte

## Plot graph

In [None]:
epochs = range(205)#N_EPOCHS)

plt.plot(epochs, i_rewards)

plt.xlabel("Epochs")
plt.ylabel("Rewards")

plt.show()