# AirRaid-PPO implementation

PPO test in the AirRaid environment

In [1]:
import gym
from gym.wrappers import AtariPreprocessing
from gym.wrappers import FrameStack
import numpy as np
import tensorflow as tf 
from tensorflow import keras
from keras import backend as K
K.set_image_data_format('channels_first')
from collections import deque
import random
from matplotlib import pyplot as plt
from threading import Thread

## Preparing the environment

In [2]:
#basic_env = gym.make("GravitarNoFrameskip-v4", obs_type = "image")
#wrapped_env = AtariPreprocessing(basic_env)
#stack_env = FrameStack(wrapped_env, 4)

## Memory

Class used to memorize the trajectory and calculate the advntage

In [2]:
class Memory(object):
    
    STATE = 0
    ACTION = 1
    ACTION_PROB = 2
    REWARD = 3
    DONE = 4
    
    def __init__(self, n_trajectories, gamma = 0.4):
        self.trajectories = np.empty(n_trajectories, dtype=object)
        self.gamma = gamma
              
    def collect(self, state, action, action_prob, reward, done, i_episode):
        if (self.trajectories[i_episode] == None):
            self.trajectories[i_episode] = deque(maxlen=N_STEPS)
        self.trajectories[i_episode].append((state, action, action_prob, reward, done))
        
    def calculate_advantages(self):
        advantages = []
        TDerrors = [] #list of all the delta, used to uopdate the critic
        
        for trajectory in self.trajectories:
            
            advantage_trajectory = [] #list of advantages for each element in a single trajectory
            discounted_return = []

            discounted_return.append(trajectory[-1][self.REWARD])
            old_advantage = trajectory[-1][self.REWARD] - ppo.return_v_values(trajectory[-1][self.STATE]) 
            advantage_trajectory.append(old_advantage)

            for i in range(len(trajectory)-2,-1,-1):
                discounted_return.append(trajectory[i][self.REWARD] + self.gamma*ppo.return_v_values(trajectory[i+1][self.STATE]))
                new_advantage = discounted_return[-1] - ppo.return_v_values(trajectory[-1][self.STATE]) + self.gamma*old_advantage

                advantage_trajectory.append(new_advantage[0])   

                old_advantage = new_advantage

            advantages.append(advantage_trajectory)
            TDerrors.append(discounted_return)
            
        #flat all trajectories in a single deque adding the advantages (easier to sample random batches)
        self.flat_trajectories(advantages, TDerrors)
    
    def flat_trajectories(self, advantages, TDerrors):
        
        size = 0
        for trajectory in self.trajectories:
            size = size + len(trajectory)
        self.flatten_trajectories = deque(maxlen=size)
        
        for trajectory, advantage, delta in zip(self.trajectories, advantages, TDerrors):
            for i in range(len(trajectory)-1, -1, -1):
                self.flatten_trajectories.append((trajectory[i][self.STATE], 
                                                  trajectory[i][self.ACTION], 
                                                  trajectory[i][self.ACTION_PROB], 
                                                  trajectory[i][self.REWARD], 
                                                  advantage[len(trajectory)-1-i], #they are reversed in respect to the trajectory
                                                  delta[len(trajectory)-1-i], 
                                                  trajectory[i][self.DONE]))
        
        
    #pick a random batch example from the flatten list of trajectories
    def sample_experiences(self, batch_size):
        if (len(self.flatten_trajectories) >= batch_size):
            indices = np.random.permutation(len(self.flatten_trajectories))[:batch_size]
        else:
            indices = np.random.permutation(len(self.flatten_trajectories))
        batch = [self.flatten_trajectories[index] for index in indices]
        #delete form the memory the used obervations
        for index in sorted(indices, reverse=True):
            del self.flatten_trajectories[index]
        states, actions, actions_prob, rewards, advantages, TDerrors, dones = [np.array([experience[field_index] for experience in batch]) for field_index in range(7)]
        return states, actions, actions_prob, rewards, advantages, TDerrors, dones
        
    def reset(self):
        for trajectory in self.trajectories:
            trajectory.clear()

# PPO class

$ s_{t+1} $ is the observed state after the current action $ a_t $ 

Actor update formula:
$ \theta_{t+1} = \theta_t + \alpha\nabla min(r_t(\theta)\hat{A}_t, clip(r_t(\theta),1-\epsilon,1+\epsilon)\hat{A}_t)$

Critic update formula:
$ w_{t+1} = w_t + \alpha G_t\nabla\hat{v}(s_t,w)$

Probability ratio $ r_t(\theta) \doteq $
$ \pi_\theta(a_t | s_t) \over \pi_{\theta_old}(a_t | s_t) $

Advantage:
$ \hat{A}_t \doteq \delta_t + (\gamma\lambda)\delta_{t+1} + (\gamma\lambda)^2\delta_{t+2} + ... + (\gamma\lambda)^{T-t+1}\delta_{T-1} = \delta_t + (\gamma\lambda)\hat{A}_{t+1}$

TDerror:
$ \quad \delta_t  \doteq $
$ G_t - \hat{v}(s_t,w) $ $ \qquad $ (if $ s_{t+1} $ is terminal then $ \hat{v}(s_{t+1},w) = 0$)

Discounted return:
$ G_t \doteq $
$ r_t + \gamma\hat{v}(s_{t+1},w) $

In [12]:
class PPO(object):
    
    input_shape = [4,84,84] 
    n_outputs = 4 #stack_env.action_space.n
    
    def __init__(self, n_episodes = 1, train_steps = 100, epsilon = 0.2, alpha = 0.95):
        self.actor = self.create_actor()
        self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
        
        self.critic = self.create_critic()
        self.critic_optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
        
        self.epsilon = epsilon
        self.alpha = alpha
        
        self.train_steps = train_steps
        
        self.memory = Memory(n_episodes)
        
    #create the NN of the actor
    # Given the state returns the probability of each action
    def create_actor(self):    
        actor = keras.Sequential([
            keras.layers.Conv2D(filters=32, kernel_size = (8,8), strides=4, activation="relu", input_shape = self.input_shape),
            keras.layers.Conv2D(filters=64, kernel_size = (4,4), strides=2, activation="relu"),
            keras.layers.Conv2D(filters=64, kernel_size = (3,3), strides=1, activation="relu"),
            keras.layers.Dense(512),
            tf.keras.layers.Flatten(),
            keras.layers.Dense(self.n_outputs, activation = 'softmax') ])
        return actor
       
    #create the NN of the critic
    # Given the state returns the value function
    def create_critic(self):
        critic = keras.Sequential([
            keras.layers.Conv2D(filters=32, kernel_size = (8,8), strides=4, activation="relu", input_shape = self.input_shape),
            keras.layers.Conv2D(filters=64, kernel_size = (4,4), strides=2, activation="relu"),
            keras.layers.Conv2D(filters=64, kernel_size = (3,3), strides=1, activation="relu"),
            keras.layers.Dense(512),
            tf.keras.layers.Flatten(),
            keras.layers.Dense(1) ])
        
        self.critic_loss_fn = tf.keras.losses.mean_squared_error
        
        return critic
      
    def play_one_step(self, env, observation, i_episode):
        action, action_prob = self.select_action(observation)
        observation, reward, done, info = env.step(action)
        
        #put in wrapper
        #reward = float(reward)/100.
        
        self.memory.collect(observation, action, action_prob, reward, done, i_episode)
        return observation, action, reward, done, info
        
    #select the action
    def select_action(self, observation):
        
        action_probabilities = self.actor.predict(tf.expand_dims(np.array(observation) / 255, axis=0))[0]
        
        #choosing an action usign randomly using a "roulette wheel" approach
        r = random.random()
        sum_probabilities = 0
        for i in range(len(action_probabilities)):
            sum_probabilities = sum_probabilities + action_probabilities[i]
            
            if (r <= sum_probabilities):
                action = i
                break
        
        return action, action_probabilities[action]
    
    def train(self, batch_size):
        self.memory.calculate_advantages()
        
        for i_step in range(self.train_steps):
            done = self.training_step(batch_size)
            if (done):
                break
        
        self.memory.reset()
        
    #training done on the memory (the advantages must be calculated before hand)
    def training_step(self, batch_size):
        #get experiences (parts of a trajectory) from the memory
        states, actions, actions_prob, rewards, advantages, TDerrors, dones = self.memory.sample_experiences(batch_size)
        
        done = False
        if (len(states) != batch_size):
            done = True
        
        #compute the values for the update of the actor
        
        mask = tf.one_hot(actions, self.n_outputs)

        states = np.array(states) / 255
        #array of shape (64,) into array of shape (64,1)
        #states =  np.array(np.array_split(states, len(states)))
        
        with tf.GradientTape() as tape:
            current_actions_prob = self.actor(states)
            
            current_action_prob = tf.reduce_sum(current_actions_prob*mask, axis=1, keepdims=True)
            old_actions_prob = tf.reshape(tf.convert_to_tensor(actions_prob), [len(states), 1])
            probability_ratio = tf.divide(tf.math.log(current_action_prob), tf.math.log(old_actions_prob))
        
            surrogate_arg_1 = tf.convert_to_tensor([probability_ratio[index]*advantages[index] for index in range(len(advantages))])
            surrogate_arg_2 = tf.convert_to_tensor(np.array([tf.keras.backend.clip(probability_ratio,1-self.epsilon,1+self.epsilon)[index]*advantages[index] for index in range(len(advantages))]).flatten())
            
            L = 0 - tf.minimum( surrogate_arg_1 , surrogate_arg_2 ) 
            loss = tf.reduce_mean(L)

        actor_weights = self.actor.trainable_variables
        grads = tape.gradient(loss, actor_weights)
        self.actor_optimizer.apply_gradients(zip(grads, actor_weights))
        
        #update of the critic. We need the target is the TD error
        target_v_values = tf.reshape(tf.convert_to_tensor(np.asarray(self.alpha*TDerrors).astype('float32')), (len(states), 1))

        with tf.GradientTape() as tape:
            all_v_values = self.critic(states)
            v_values = tf.reduce_sum(all_v_values*mask, axis=1, keepdims=True)
            loss = tf.reduce_mean(self.critic_loss_fn(target_v_values, v_values))
        grads = tape.gradient(loss, self.critic.trainable_variables)
        self.critic_optimizer.apply_gradients(zip(grads, self.critic.trainable_variables))
        
        return done
    
    def return_v_values(self, observation):
        v = self.critic.predict(tf.expand_dims(np.array(observation) / 255, axis=0))[0]
        return v
    
    def save(self, path = ".\\saved_weights\\ppo\\"):
        self.actor.save_weights(path + 'actor_weights.h5')
        self.critic.save_weights(path + 'critic_weights.h5')
        
    def load(self, path = ".\\saved_weights\\ppo\\"):
        self.actor.load_weights(path + 'actor_weights.h5')
        self.critic.load_weights(path + 'critic_weights.h5')

## Training

In [13]:
class collect_trajectory(Thread):
    
    def __init__(self, environment, i_agent):
         
        Thread.__init__(self)   
        self.n_agent = i_agent
        self.rewards = [] 
        
        self.basic_env = gym.make(environment, obs_type = "image")
        self.wrapped_env = AtariPreprocessing(self.basic_env)
        self.stack_env = FrameStack(self.wrapped_env, 4)
        
    def run(self):
        print("Starting {}".format(self.n_agent))
        
        observation = self.stack_env.reset()
        
        self.episode_reward = 0.0
        
        for i_step in range(N_STEPS):   
            observation, action, reward, done, info = ppo.play_one_step(self.stack_env, observation, self.n_agent)

            self.episode_reward = self.episode_reward + reward
            
            #continuing task. if an episode is done we continue until complting the number of steps
            if (done):
                break
        
        self.stack_env.close()
        
        print("Exiting {} afte n. steps {}. Tot reward: {}".format(self.n_agent, i_step, self.episode_reward))
    
    def get_reward(self):
        return self.episode_reward

In [None]:
N_EPOCHS = 100
N_EPISODES = 4 # in multi-agent this is the number of agents (each agnet collect 1 trajectory)
N_STEPS = 900 # max number of step for each episode

TRAIN_STEPS = 100 # number of max steps done during training. if the number of samples is less than TRAIN_STEPS*BATCH_SIZE will stop early after completing the training on all the samples
BATCH_SIZE = 32

#env used to initialize the parameters inside PPO and RND
environment = "BreakoutNoFrameskip-v4" # "AirRaidNoFrameskip-v4", "GravitarFrameskip-v4"

ppo = PPO(n_episodes = N_EPISODES, train_steps = TRAIN_STEPS)

rewards = []

for i_epoch in range(N_EPOCHS):
    epoch_reward = 0.0
    agents = []
    for i_agent in range(N_EPISODES):
        agents.append(collect_trajectory(environment = environment, i_agent = i_agent))
    for agent in agents:
        agent.start()
    for agent in agents:
        agent.join()
        agent_reward = agent.get_reward()
        epoch_reward = epoch_reward + agent_reward
    rewards.append(epoch_reward/N_EPISODES)
    print("Epoch: {} ended with average reward: {}\n".format(i_epoch, epoch_reward/N_EPISODES))  
    ppo.train(batch_size = BATCH_SIZE)
    ppo.save()

Starting 0
Starting 1
Starting 2
Starting 3


In [10]:
basic_env = gym.make(environment, obs_type = "image")
wrapped_env = AtariPreprocessing(basic_env)
stack_env = FrameStack(wrapped_env, 4)

observation = stack_env.reset()
observation, reward, done, info = stack_env.step(1)
print(observation)
print("Starting demo")
for i_step in range(100):   
    
    action, action_prob = ppo.select_action(observation)
    observation, reward, done, info = stack_env.step(action)
    
    print("selected action {} with prob {} got reward {}".format(action, action_prob, reward))

    stack_env.render()
            
    if (done):
        break
        
stack_env.close()

<gym.wrappers.frame_stack.LazyFrames object at 0x000001BAAE103220>
Starting demo
selected action 0 with prob 0.9999996423721313 got reward 0.0
{'ale.lives': 5}
selected action 0 with prob 0.9999997615814209 got reward 0.0
{'ale.lives': 5}
selected action 0 with prob 0.9999996423721313 got reward 0.0
{'ale.lives': 5}
selected action 0 with prob 0.9999996423721313 got reward 0.0
{'ale.lives': 5}
selected action 0 with prob 0.9999995231628418 got reward 0.0
{'ale.lives': 5}
selected action 0 with prob 0.9999996423721313 got reward 0.0
{'ale.lives': 5}
selected action 0 with prob 0.9999996423721313 got reward 0.0
{'ale.lives': 5}
selected action 0 with prob 0.9999995231628418 got reward 0.0
{'ale.lives': 5}
selected action 0 with prob 0.9999995231628418 got reward 0.0
{'ale.lives': 5}
selected action 0 with prob 0.9999996423721313 got reward 0.0
{'ale.lives': 5}
selected action 0 with prob 0.9999995231628418 got reward 0.0
{'ale.lives': 5}
selected action 0 with prob 0.9999996423721313 got

## Plot graph

In [None]:
epochs = range(N_EPOCHS)

plt.plot(epochs, rewards)

plt.xlabel("Epochs")
plt.ylabel("Rewards")

plt.show()