# Atari-PPO-RND implementation

PPO-RND test in the Atari environment

In [2]:
import gym
from gym.wrappers import AtariPreprocessing
from gym.wrappers import FrameStack
import numpy as np
import tensorflow as tf 
from tensorflow import keras
from keras import backend as K
from collections import deque
import random
from matplotlib import pyplot as plt
from threading import Thread
import math

# change keras setting to use the conv2d NN passing the channel first (as returned from the FrameStack wrapper)
K.set_image_data_format('channels_first')

## Memory

Class used to memorize the trajectory and calculate the advantage

In [2]:
class Memory(object):
    
    STATE = 0
    ACTION = 1
    ACTION_PROB = 2
    EXTRINSIC_REWARD = 3
    INTRINSIC_REWARD = 4
    DONE = 5
    
    def __init__(self, n_trajectories, gamma = 0.4, e_lambda_par = 1, i_lambda_par = 1):
        self.trajectories = np.empty(n_trajectories, dtype=object)
        self.gamma = gamma
        self.e_lambda_par = e_lambda_par
        self.i_lambda_par = i_lambda_par
              
    def collect(self, state, action, action_prob, extrinsic_reward, intrinsic_reward, done, i_episode):
        if (self.trajectories[i_episode] == None):
            self.trajectories[i_episode] = deque(maxlen=N_STEPS)
        self.trajectories[i_episode].append((state, action, action_prob, extrinsic_reward, intrinsic_reward, done))
        
    def calculate_advantages(self, reward_standard_deviation_estimate):
        self.advantages = []
        self.extrinsic_TDerrors = []
        self.intrinsic_TDerrors = [] #list of all the delta, used to uopdate the critic
        
        for trajectory in self.trajectories:
            
            advantage_trajectory = [] #list of advantages for each element in a single trajectory
            e_discounted_return = []
            i_discounted_return = []

            e_discounted_return.append(trajectory[-1][self.EXTRINSIC_REWARD])
            e_old_advantage = trajectory[-1][self.EXTRINSIC_REWARD] - ppo.return_v_values(trajectory[-1][self.STATE]) 
            
            #normalizing the intrinisc reward before calculating the advantage
            i_discounted_return.append(trajectory[-1][self.INTRINSIC_REWARD] / reward_standard_deviation_estimate )
            i_old_advantage = trajectory[-1][self.INTRINSIC_REWARD] / reward_standard_deviation_estimate - ppo.return_v_values(trajectory[-1][self.STATE]) 
            
            advantage_trajectory.append(e_old_advantage + i_old_advantage)

            for i in range(len(trajectory)-2,-1,-1):
                e_discounted_return.append(trajectory[i][self.EXTRINSIC_REWARD] + self.gamma*ppo.return_v_extrinsic_values(trajectory[i+1][self.STATE]))
                new_advantage = e_discounted_return[-1] - ppo.return_v_extrinsic_values(trajectory[-1][self.STATE]) + self.gamma*self.e_lambda_par*e_old_advantage
                
                e_old_advantage = new_advantage
                
                normalized_intrinsic_reward = trajectory[-1][self.INTRINSIC_REWARD] / reward_standard_deviation_estimate
                i_discounted_return.append(normalized_intrinsic_reward + self.gamma*ppo.return_v_intrinsic_values(trajectory[i+1][self.STATE]))
                new_advantage = i_discounted_return[-1] - ppo.return_v_intrinsic_values(trajectory[-1][self.STATE]) + self.gamma*self.i_lambda_par*i_old_advantage
                
                i_old_advantage = new_advantage
                
                advantage_trajectory.append(i_old_advantage[0] + e_old_advantage[0])  
        
            self.extrinsic_TDerrors.append(e_discounted_return)
            self.intrinsic_TDerrors.append(i_discounted_return)
            
            self.advantages.append(advantage_trajectory)
            
        #flat all trajectories in a single deque adding the advantages (easier to sample random batches)
        self.flat_trajectories(self.trajectories, self.advantages, self.extrinsic_TDerrors, self.intrinsic_TDerrors)
    
    def flat_trajectories(self, trajectories, advantages, extrinsic_TDerrors, intrinsic_TDerrors):
        
        size = 0
        for trajectory in trajectories:
            size = size + len(trajectory)
        
        self.flatten_trajectories = deque(maxlen=size)
        
        for trajectory, advantage, e_delta, i_delta in zip(trajectories, advantages, extrinsic_TDerrors, intrinsic_TDerrors):
            for i in range(len(trajectory)-1,-1,-1):
                self.flatten_trajectories.append((trajectory[i][self.STATE], trajectory[i][self.ACTION], trajectory[i][self.ACTION_PROB], trajectory[i][self.EXTRINSIC_REWARD], trajectory[i][self.INTRINSIC_REWARD], advantage[i], e_delta[i], i_delta[i], trajectory[i][self.DONE]))
        
        
    #pick a random batch example from the flatten list of trajectories
    def sample_experiences(self, batch_size):
        if (len(self.flatten_trajectories) >= batch_size):
            indices = np.random.permutation(len(self.flatten_trajectories))[:batch_size]
        else:
            indices = np.random.permutation(len(self.flatten_trajectories))
        batch = [self.flatten_trajectories[index] for index in indices]
        #delete form the memory the used obervations
        for index in sorted(indices, reverse=True):
            del self.flatten_trajectories[index]
        states, actions, actions_prob, e_rewards, i_rewards, advantages, e_TDerrors, i_TDerrors, dones = [np.array([experience[field_index] for experience in batch]) for field_index in range(9)]
        return states, actions, actions_prob, e_rewards, i_rewards, advantages, e_TDerrors, i_TDerrors, dones
        
    def reset(self):
        for trajectory in self.trajectories:
            trajectory.clear()

# RND class

$ s_{t+1} $ is the observed state after the current action $ a_t $ 

Predictor update:
minimize $ \hat{f}(x, \theta) - f(x) $

In [3]:
class RND(object):
    
    input_shape = [1,84,84] 
    n_outputs = 200
    
    N_intrinsic_rewards = 0 #number of intrinsic reward received
    intrinisc_reward_mean = 0.0 #mean of the intrinsic rewards received
    reward_M2 = 0.0 #sum of squares of differences from the current mean
    
    N_observations = 0 #number of observations received
    observations_mean = 0.0 #mean of the observations received
    observation_M2 = 0.0 #sum of squares of differences from the current mean
    
    def __init__(self, env, n_normalization_steps = 40):
        self.target = self.create_target()
        self.predictor = self.create_predictor()
        
        self.MSE = tf.keras.losses.mean_squared_error
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
        
        self.initialize_standard_deviation_estimate(env, n_normalization_steps)
        
    #create the NN of the target
    def create_target(self):
        target = keras.Sequential([
            keras.layers.Conv2D(filters=32, kernel_size = (8,8), strides=4, activation="relu", input_shape = self.input_shape),
            keras.layers.Conv2D(filters=64, kernel_size = (4,4), strides=2, activation="relu"),
            keras.layers.Conv2D(filters=64, kernel_size = (3,3), strides=1, activation="relu"),
            keras.layers.Dense(512),
            tf.keras.layers.Flatten(),
            keras.layers.Dense(self.n_outputs) ])
        return target
        
    #create the NN of the predictor
    def create_predictor(self):
        predictor = keras.Sequential([
            keras.layers.Conv2D(filters=32, kernel_size = (8,8), strides=4, activation="relu", input_shape = self.input_shape),
            keras.layers.Conv2D(filters=64, kernel_size = (4,4), strides=2, activation="relu"),
            keras.layers.Conv2D(filters=64, kernel_size = (3,3), strides=1, activation="relu"),
            keras.layers.Dense(512),
            tf.keras.layers.Flatten(),
            keras.layers.Dense(self.n_outputs) ])
        return predictor
    
    def train_predictor(self, observations):
        # extrinsic critic (rewards from the envirnoment)
        observations = np.array(observations)
        observations = self.normalize_observations(observations)
        # covert shape [BATCH_SIZE, 4, 84, 84] in [BATCH_SIZE,1,84,84]
        observations = [observation[-1,0:observation.shape[1], 0:observation.shape[2]] for observation in observations]
        observations = tf.expand_dims(observations, axis = 1)
        target_values = self.target.predict(observations)
        with tf.GradientTape() as tape:
            all_values = self.predictor(observations)
            loss = tf.reduce_mean(self.MSE(target_values, all_values))
        grads = tape.gradient(loss, self.predictor.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.predictor.trainable_variables))
        
    def calculate_intrinsic_reward(self, observation):
        #passing a (4,84,84) stacked frame from the environment
        observation = np.array(observation)
        #picking the last frame
        observation = observation[-1, 0:observation.shape[1], 0:observation.shape[2]]
        #normalize the last frame
        s = self.calculate_observation_standard_deviation()
        observation = self.normalize_observation(observation, s)
        #calculate intrinsic reward on the last frame
        observation = tf.expand_dims(observation, axis=0)
        f_target = self.target.predict(tf.expand_dims(observation, axis=0))
        f_predictor = self.predictor.predict(tf.expand_dims(observation, axis=0))
        #return self.MSE(f_target, f_predictor)*10000
        return pow(np.linalg.norm(f_predictor - f_target), 2)*100
    
    def initialize_standard_deviation_estimate(self, env, n_normalization_steps):
        observations_mean = np.zeros(env.observation_space.shape ,'float64') #mean of the intrinsic rewards received
        observation_M2 = np.zeros(env.observation_space.shape ,'float64') #sum of squares of differences from the current mean
        
        obsevation = env.reset()
        
        for i_step in range(n_normalization_steps):
            random_action = env.action_space.sample()
            observation, reward, done, info = env.step(random_action)
            for frame in observation:
                self.update_observation_normalization_param(frame)
    
    def update_observation_normalization_param(self, observation):
        #cicle trhough the 4 images that makes up for an observation
        for obs in observation:
            #obs_mean = np.mean([obs_dim], axis=0)
            self.N_observations = self.N_observations + 1
            delta = obs - self.observations_mean
            self.observations_mean = self.observations_mean + delta/self.N_observations # mean_N = mean_{N-1} + (obs_t - mean_{N-1}) / N
            self.observation_M2 = self.observation_M2 + delta*(obs - self.observations_mean)
        
    def calculate_observation_standard_deviation(self):
        standard_deviation = np.sqrt( self.observation_M2 / (self.N_observations - 1))
        return standard_deviation
    
    def normalize_observations(self, observations):

        norm_obs = []
        s = self.calculate_observation_standard_deviation()
        for observation in observations:
            norm_obs.append(self.normalize_observation(observation, s))
        normalized_obs = tf.stack([norm_obs[i] for i in range(len(norm_obs))], 0)

        return normalized_obs
    
    def normalize_observation(self, observation, standard_deviation):
        t = observation - self.observations_mean
        normalized_obs = np.clip(np.divide(t, standard_deviation, out=np.zeros_like(t), where=standard_deviation!=0), a_min =-5, a_max = 5)       
        return normalized_obs
    
    #def normalize_observation(self, observation, standard_deviation):
    #    temp_obs = []
    #    for obs_dim in observation:
    #        t = obs_dim - self.observations_mean
    #        temp_obs.append(np.clip(np.divide(t, standard_deviation, out=np.zeros_like(t), where=standard_deviation!=0), a_min =-5, a_max = 5))
    #    normalized_obs = tf.squeeze(tf.stack([temp_obs[i] for i in range(len(temp_obs))], axis = 0))
    #    return normalized_obs
    
    #Using welford's algorithm
    def update_reward_normalization_param(self, i_reward):
        self.N_intrinsic_rewards = self.N_intrinsic_rewards + 1
        delta = i_reward - self.intrinisc_reward_mean
        self.intrinisc_reward_mean = self.intrinisc_reward_mean + delta/self.N_intrinsic_rewards # mean_N = mean_{N-1} + (i_t - mean_{N-1}) / N
        self.reward_M2 = self.reward_M2 + delta*(i_reward - self.intrinisc_reward_mean)
        
    def calculate_reward_standard_deviation(self):
        standard_deviation = math.sqrt( self.reward_M2 / (self.N_intrinsic_rewards - 1))
        print("===============================================================")
        print("===============================================================")
        print("STANDARD DEVIATION {}".format(standard_deviation))
        print("===============================================================")
        print("===============================================================")
        return standard_deviation

In [4]:
#basic_env = gym.make("GravitarNoFrameskip-v4", obs_type = "image")
#wrapped_env = AtariPreprocessing(basic_env)
#env = FrameStack(wrapped_env, 4)

#rnd = RND(env)

#env.reset()

#obs = []
#for _ in range(32):
#    random_action = env.action_space.sample()
#    new_obs, reward, done, info = env.step(random_action)
#    obs.append(new_obs)
    
#array = np.array(obs)
#rnd.train_predictor(array)

#i = rnd.calculate_intrinsic_reward(new_obs)

#print(i)

# PPO class

$ s_{t+1} $ is the observed state after the current action $ a_t $ 

Actor update formula:
$ \theta_{t+1} = \theta_t + \alpha\nabla min(r_t(\theta)\hat{A}_t, clip(r_t(\theta),1-\epsilon,1+\epsilon)\hat{A}_t)$

Critic update formula:
$ w_{t+1} = w_t + \alpha\delta_t\nabla\hat{v}(s_t,w)$

Probability ratio $ r_t(\theta) \doteq $
$ \pi_\theta(a_t | s_t) \over \pi_{\theta_{old}}(a_t | s_t) $

Advantage:
$ \hat{A}_t \doteq \delta_t + (\gamma\lambda)\delta_{t+1} + (\gamma\lambda)^2\delta_{t+2} + ... + (\gamma\lambda)^{T-t+1}\delta_{T-1} = \delta_t + (\gamma\lambda)\hat{A}_{t+1}$

TDerror:
$ \quad \delta_t  \doteq $
$ r_t + \gamma\hat{v}(s_{t+1},w) - \hat{v}(s_t,w) $ $ \qquad $ (if $ s_{t+1} $ is terminal then $ \hat{v}(s_{t+1},w) = 0$)

In [5]:
class PPO(object):
    
    input_shape = [4,84,84] 
    #n_outputs = 6 #wrapped_env.action_space.n
    
    def __init__(self, env, n_episodes = 1, train_steps = 100, epsilon = 0.2, alpha = 0.95, gamma = 0.4, e_lambda_par = 1, i_lambda_par = 1, n_normalization_steps = 300, train_predictor_keeping_prob = 0.25):
        self.actor = self.create_actor()
        self.intrinsic_critic = self.create_critic()
        self.extrinsic_critic = self.create_critic()
        
        self.MSE = tf.keras.losses.mean_squared_error
        
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
        
        self.epsilon = epsilon
        self.alpha = alpha
        self.n_outputs = env.action_space.n
        
        self.train_steps = train_steps
        self.train_predictor_keeping_prob = train_predictor_keeping_prob
        
        self.memory = Memory(n_episodes, gamma, e_lambda_par, i_lambda_par)
        
        self.rnd = RND(env, n_normalization_steps)
        
    #create the NN of the actor
    # Given the state returns the probability of each action
    def create_actor(self):    
        actor = keras.Sequential([
            keras.layers.Conv2D(filters=32, kernel_size = (8,8), strides=4, activation="relu", input_shape = self.input_shape),
            keras.layers.Conv2D(filters=64, kernel_size = (4,4), strides=2, activation="relu"),
            keras.layers.Conv2D(filters=64, kernel_size = (3,3), strides=1, activation="relu"),
            keras.layers.Dense(512),
            tf.keras.layers.Flatten(),
            keras.layers.Dense(self.n_outputs, activation = 'softmax') ])
        return actor
       
    #create the NN of the critic
    # Given the state returns the value function
    def create_critic(self):
        critic = keras.Sequential([
            keras.layers.Conv2D(filters=32, kernel_size = (8,8), strides=4, activation="relu", input_shape = self.input_shape),
            keras.layers.Conv2D(filters=64, kernel_size = (4,4), strides=2, activation="relu"),
            keras.layers.Conv2D(filters=64, kernel_size = (3,3), strides=1, activation="relu"),
            keras.layers.Dense(512),
            tf.keras.layers.Flatten(),
            keras.layers.Dense(1) ])
        
        return critic
      
    def play_one_step(self, env, observation, i_episode):
        action, action_prob = self.select_action(observation)
        observation, e_reward, done, info = env.step(action)
        
        #put in wrapper
        e_reward = float(e_reward)/100.
        
        #the normalization of the intrinisc reward after before training
        i_reward = self.rnd.calculate_intrinsic_reward(observation)
        
        self.rnd.update_reward_normalization_param(i_reward)
        self.rnd.update_observation_normalization_param(observation)
        
        self.memory.collect(observation, action, action_prob, e_reward, i_reward, done, i_episode)
        
        return observation, action, e_reward, i_reward, done, info
        
    #select the action (returned as a number)
    def select_action(self, observation):
        
        # explanation: tf.expand_dims(observation['pov'], axis=0)
        # since we pass another input of shape (1,) -> we need to tell keras that is one image (it assumes the first dimension to be the batch)
        action_probabilities = self.actor.predict(tf.expand_dims(observation, axis=0))[0]
        
        #choosing an action usign randomly using a "roulette wheel" approach
        r = random.random()
        
        sum_probabilities = 0
        for i in range(len(action_probabilities)):
            sum_probabilities = sum_probabilities + action_probabilities[i]
            
            if (r <= sum_probabilities):
                action = i
                break
        
        return action, action_probabilities[action]
    
    def train(self, batch_size):
        self.memory.calculate_advantages(self.rnd.calculate_reward_standard_deviation())
        
        for i_step in range(self.train_steps):
            done = self.training_step(batch_size)
            if (done):
                break
        
        self.memory.reset()
        
    #training done on the memory (the advantages must be calculated before hand)
    def training_step(self, batch_size):
        #get experiences (parts of a trajectory) from the memory
        experiences = self.memory.sample_experiences(batch_size)
        
        states, actions, actions_prob, extrinsic_rewards, intrinsic_rewards, advantages, extrinsic_TDerrors, intrinsic_TDerrors, dones = experiences
        
        done = False
        if (len(states) != batch_size):
            done = True
        
        #compute the values for the update of the actor
        
        mask = tf.one_hot(actions, self.n_outputs)
        
        states = np.array(states)
        
        og_states = states
        states = states/255
        
        with tf.GradientTape() as tape:
            current_actions_prob = self.actor(states)
            
            current_action_prob = tf.reduce_sum(current_actions_prob*mask, axis=1, keepdims=True)
            old_actions_prob = tf.reshape(tf.convert_to_tensor(actions_prob), [len(states), 1])
            probability_ratio = tf.divide(tf.math.log(current_action_prob), tf.math.log(old_actions_prob))
        
            surrogate_arg_1 = tf.convert_to_tensor([probability_ratio[index]*advantages[index] for index in range(len(advantages))])
            surrogate_arg_2 = tf.convert_to_tensor(np.array([tf.keras.backend.clip(probability_ratio,1-self.epsilon,1+self.epsilon)[index]*advantages[index] for index in range(len(advantages))]).flatten())
            
            L = 0 - tf.minimum( surrogate_arg_1 , surrogate_arg_2 ) 
            loss = tf.reduce_mean(L)

        actor_weights = self.actor.trainable_variables
        grads = tape.gradient(loss, actor_weights)
        self.optimizer.apply_gradients(zip(grads, actor_weights))
        
        #update of the critic. The target is the TD error
        
        # extrinsic critic (rewards from the envirnoment)
        target_v_values = tf.reshape(tf.convert_to_tensor(np.asarray(self.alpha*extrinsic_TDerrors).astype('float32')), (len(states), 1))

        with tf.GradientTape() as tape:
            all_v_values = self.extrinsic_critic(states)
            v_values = tf.reduce_sum(all_v_values*mask, axis=1, keepdims=True)
            loss = tf.reduce_mean(self.MSE(target_v_values, v_values))
        grads = tape.gradient(loss, self.extrinsic_critic.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.extrinsic_critic.trainable_variables))
        
        # intrinsic critic (rewards from the exploration)
        target_v_values = tf.reshape(tf.convert_to_tensor(np.asarray(self.alpha*intrinsic_TDerrors).astype('float32')), (len(states), 1))

        with tf.GradientTape() as tape:
            all_v_values = self.intrinsic_critic(states)
            v_values = tf.reduce_sum(all_v_values*mask, axis=1, keepdims=True)
            loss = tf.reduce_mean(self.MSE(target_v_values, v_values))
        grads = tape.gradient(loss, self.intrinsic_critic.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.intrinsic_critic.trainable_variables))
        
        #since v changed we need to re-calculate the advantages
        #self.memory.calculate_advantages()
        
        keep = random.random()
        if (keep <= self.train_predictor_keeping_prob):
            self.rnd.train_predictor(og_states)
        
        return done
    
    def return_v_extrinsic_values(self, observation):
        v_e = self.extrinsic_critic.predict(tf.expand_dims(observation, axis=0))[0]
        return v_e
    
    def return_v_intrinsic_values(self, observation):
        v_i = self.intrinsic_critic.predict(tf.expand_dims(observation, axis=0))[0]
        return v_i
    
    def save():
        self.actor.save_weights(".")

## Training

In [6]:
class collect_trajectory(Thread):
    
    def __init__(self, environment, i_agent ):
         
        Thread.__init__(self)   
        self.n_agent = i_agent
        self.rewards = [] 
        
        self.basic_env = gym.make(environment, obs_type = "image")
        self.wrapped_env = AtariPreprocessing(self.basic_env)
        self.stack_env = FrameStack(self.wrapped_env, 4)
            
    def run(self):
        print("Starting {}".format(self.n_agent))
        
        observation = self.stack_env.reset()
        
        extrinsic_episode_reward = 0.0
        intrinsic_episode_reward = 0.0
        self.n_episodes = 0.0
        self.extrinsic_tot_reward = 0.0
        self.intrinsic_tot_reward = 0.0
        
        for i_step in range(N_STEPS):   
            observation, action, extrinsic_reward, intrinsic_reward, done, info = ppo.play_one_step(self.stack_env, observation, self.n_agent)

            #wrapped_env.render()
            extrinsic_episode_reward = extrinsic_episode_reward + extrinsic_reward
            intrinsic_episode_reward = intrinsic_episode_reward + intrinsic_reward
            
            #continuing task. if an episode is done we continue until complting the number of steps
            if (done):
                observation = self.stack_env.reset()
                
                self.extrinsic_tot_reward = self.extrinsic_tot_reward + extrinsic_episode_reward
                self.intrinsic_tot_reward = self.intrinsic_tot_reward + intrinsic_episode_reward
                self.n_episodes = self.n_episodes + 1
                episode_reward = 0.0
                
        if (self.n_episodes == 0):
            self.extrinsic_tot_reward = extrinsic_episode_reward
            self.intrinsic_tot_reward = intrinsic_episode_reward
        
        self.stack_env.close()
        
        if (self.n_episodes > 0):
            print("Exiting {} after {} episodes. Average ex reward: {} in reward: {}".format(self.n_agent, self.n_episodes, self.extrinsic_tot_reward/self.n_episodes, self.intrinsic_tot_reward/self.n_episodes))
        else:
            print("Exiting {} after {} episodes. Average ex reward: {} in reward: {}".format(self.n_agent, self.n_episodes, self.extrinsic_tot_reward, self.intrinsic_tot_reward))
    
    
    def get_reward_average(self):
        if (self.n_episodes > 0):
            return (self.extrinsic_tot_reward/self.n_episodes, self.intrinsic_tot_reward/self.n_episodes)
        else:
            return (self.extrinsic_tot_reward, self.intrinsic_tot_reward)
    

In [None]:
N_STEPS_NORMALIZATION = 50
N_EPOCHS = 100
N_EPISODES = 4 # in multi-agent this is the number of agents (each agnet collect 1 trajectory)
N_STEPS = 2400 # max number of step for each episode

TRAIN_STEPS = 150 # number of max steps done during training. if the number of samples is less than TRAIN_STEPS*BATCH_SIZE will stop early after completing the training on all the samples
BATCH_SIZE = 64

environment = "PongNoFrameskip-v4" # "GravitarNoFrameskip-v4", "AirRaidNoFrameskip-v4"

#env used to initialize the parameters inside PPO and RND
norm_env = gym.make(environment, obs_type = "image")
norm_wrapped_env = AtariPreprocessing(norm_env)
norm_stack_env = FrameStack(norm_wrapped_env, 4)

ppo = PPO(norm_stack_env, n_episodes = N_EPISODES, train_steps = TRAIN_STEPS, n_normalization_steps = N_STEPS_NORMALIZATION, i_lambda_par = 2.25, train_predictor_keeping_prob = 0.65)

e_rewards = []
i_rewards = []

for i_epoch in range(N_EPOCHS):
    extrinsic_epoch_reward = 0.0
    intrinsic_epoch_reward = 0.0
    agents = []
    for i_agent in range(N_EPISODES):
        agents.append(collect_trajectory(environment = environment, i_agent = i_agent))
    for agent in agents:
        agent.start()
    for agent in agents:
        agent.join()
        extrinsic_reward_average, intrinsic_reward_average = agent.get_reward_average()
        extrinsic_epoch_reward = extrinsic_epoch_reward + extrinsic_reward_average
        intrinsic_epoch_reward = intrinsic_epoch_reward + intrinsic_reward_average
    e_rewards.append(extrinsic_epoch_reward/N_EPISODES)
    i_rewards.append(intrinsic_epoch_reward/N_EPISODES)
    print("Epoch: {} ended with average extrinsic reward: {} intrinsic reward {} \n".format(i_epoch, extrinsic_epoch_reward/N_EPISODES, intrinsic_epoch_reward/N_EPISODES) )  
    ppo.train(batch_size = BATCH_SIZE)
    

Starting 0
Starting 1
Starting 2
Starting 3
Exiting 1 after 2.0 episodes. Average ex reward: 0.0 in reward: 6751.863908497523
Exiting 2 after 2.0 episodes. Average ex reward: 0.0 in reward: 24920.10628739869
Exiting 3 after 3.0 episodes. Average ex reward: 0.0 in reward: 13684.755940298253
Exiting 0 after 2.0 episodes. Average ex reward: 3.5 in reward: 7975.449099376853
Epoch: 0 ended with average extrinsic reward: 0.875 intrinsic reward 13333.04380889283 

STANDARD DEVIATION 11.512907438602369


  states, actions, actions_prob, e_rewards, i_rewards, advantages, e_TDerrors, i_TDerrors, dones = [np.array([experience[field_index] for experience in batch]) for field_index in range(9)]


Starting 0
Starting 1
Starting 2
Starting 3
Exiting 0 after 4.0 episodes. Average ex reward: 0.0 in reward: 109.57432681615715
Exiting 1 after 4.0 episodes. Average ex reward: 0.0 in reward: 108.12579255951286
Exiting 3 after 4.0 episodes. Average ex reward: 0.0 in reward: 112.74862643345311
Exiting 2 after 4.0 episodes. Average ex reward: 0.0 in reward: 107.91741626214282
Epoch: 1 ended with average extrinsic reward: 0.0 intrinsic reward 109.59154051781648 

STANDARD DEVIATION 8.932455337385417
Starting 0
Starting 1
Starting 2
Starting 3
Exiting 1 after 5.0 episodes. Average ex reward: 0.0 in reward: 92.47033072950268
Exiting 2 after 5.0 episodes. Average ex reward: 0.0 in reward: 93.23142719484608
Exiting 0 after 5.0 episodes. Average ex reward: 0.0 in reward: 96.41312040173348
Exiting 3 after 5.0 episodes. Average ex reward: 0.0 in reward: 93.19296293324082
Epoch: 2 ended with average extrinsic reward: 0.0 intrinsic reward 93.82696031483077 

STANDARD DEVIATION 7.499751789039346
Sta

Exception in thread Thread-29:
Traceback (most recent call last):
  File "D:\Programmi\Anaconda\envs\gputest\lib\threading.py", line 973, in _bootstrap_inner
    self.run()
  File "C:\Users\paci3\AppData\Local\Temp\ipykernel_15428\3921193840.py", line 26, in run
  File "C:\Users\paci3\AppData\Local\Temp\ipykernel_15428\2878981515.py", line 59, in play_one_step
  File "C:\Users\paci3\AppData\Local\Temp\ipykernel_15428\4173246151.py", line 69, in calculate_intrinsic_reward
  File "D:\Programmi\Anaconda\envs\gputest\lib\site-packages\keras\engine\training.py", line 1743, in predict
    self.predict_function = self.make_predict_function()
  File "D:\Programmi\Anaconda\envs\gputest\lib\site-packages\keras\engine\training.py", line 1562, in make_predict_function
    if self.predict_function is not None and not force:
AttributeError: 'Sequential' object has no attribute 'predict_function'


Exiting 1 after 4.0 episodes. Average ex reward: 0.0 in reward: 34.033111564515934
Exiting 3 after 4.0 episodes. Average ex reward: 0.0 in reward: 34.741937894220506
Exiting 2 after 4.0 episodes. Average ex reward: 0.0 in reward: 34.32482956564657
Epoch: 6 ended with average extrinsic reward: 0.0 intrinsic reward 32.6825144003928 

STANDARD DEVIATION 5.085691970539334
Starting 0
Starting 1
Starting 2
Starting 3
Exiting 1 after 4.0 episodes. Average ex reward: 0.0 in reward: 30.096235074386993
Exiting 2 after 4.0 episodes. Average ex reward: 0.0 in reward: 30.27078480284588
Exiting 3 after 4.0 episodes. Average ex reward: 0.0 in reward: 30.330081949532946
Exiting 0 after 4.0 episodes. Average ex reward: 0.0 in reward: 30.44978568809474
Epoch: 7 ended with average extrinsic reward: 0.0 intrinsic reward 30.28672187871514 

STANDARD DEVIATION 4.768347967911577
Starting 0
Starting 1
Starting 2
Starting 3
Exiting 3 after 4.0 episodes. Average ex reward: 0.0 in reward: 28.451304543394738
Exit

In [11]:
env = gym.make("GravitarNoFrameskip-v4", full_action_space = False) #, obs_type = "image")

env.action_space.n

18

## Plot graph

In [None]:
epochs = range(205)#N_EPOCHS)

plt.plot(epochs, i_rewards)

plt.xlabel("Epochs")
plt.ylabel("Rewards")

plt.show()