<a href="https://colab.research.google.com/github/floatoak/fundamentals_of_deep_learning/blob/main/9_deep_reinforcement_learning/policy_gradient_cartpole_tf2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Implementing Pole-Cart with Policy Gradients in Tensorflow 2

In [68]:
import gym
import numpy as np
import random
import tensorflow as tf

from typing import Callable

## Creating an Agent
Define a class PolicyGradientNetwork to interact with the OpenAI environment, which contains
* Model architecture
* Model weights

In [69]:
class PolicyGradientNetwork(tf.keras.Model):
  """A policy gradient based reinformcement learning network."""
  def __init__(self, 
               state_size: int,
               num_actions: int,
               hidden_size: int):
    """Initializes the network architecture.

    Args:
      state_size: the number of values in a state.
      num_actions: the number of actions that the agent can take.
      hidden_size: the number of nodes in a hidden layer.
    """
    super(PolicyGradientNetwork, self).__init__()
    self.model = tf.keras.models.Sequential([
      tf.keras.layers.Dense(hidden_size, activation='relu'),
      tf.keras.layers.Dense(hidden_size, activation='relu'),
      tf.keras.layers.Dense(num_actions, activation='softmax')
    ])

  def call(self, state: tf.Tensor) -> tf.Tensor:
    """Forward propogates input through the network. 

    Args: 
      state: the current state of the agent of shape (num_actions,).

    Returns:
      The output Tensor of shape (num_actions,).
    """
    output = self.model(tf.convert_to_tensor(state))
    return output

## Sampling Actions
* Samples an action based on the model’s action probability distribution
* Supports greedy, e-greedy, annealed e-greedy 


In [70]:
def epsilon_greedy_action(action_distribution: tf.Tensor, 
                          epsilon: float=1e-1) -> int:
  """Picks an action based on e-greedy policy.

  Args:
    action_distribution: the agent's action distribution of shape
      (num_actions,).
    epsilon: a probability that decides whether to select an action randomly or 
      not.

  Returns: 
    the index of the action to be taken.
  """
  if random.random() < epsilon:
    return np.argmax(np.random.random(action_distribution.shape))
  else:
    return np.argmax(action_distribution)

In [71]:
def epsilon_greedy_action_annealed(action_distribution: tf.Tensor,
                                   percentage: float,
                                   epsilon_start: float=1.0,
                                   epsilon_end: float=1e-2) -> int:
  """Picks an action based on annealed e-greedy policy.
  
  Args: 
    action_distribution: the agent's action distribution of shape 
      (num_actions,).
    percentage: a probability between 0 and 1 that decides the progress of 
      epsilon change.
    epsilon_start: the starting probability that decides whether to select an 
      action randomly or not.
    epsilon_end: the end probability that decides whether to select an action 
      randomly or not.

  Returns: 
    the index of the action to be taken.
  """
  annealed_epsilon = epsilon_start * (1.0 - percentage) + \
                     epsilon_end * percentage
  if random.random() < annealed_epsilon:
    return np.argmax(np.random.random(action_distribution.shape))
  else:
    return np.argmax(action_distribution)

In [72]:
def predict_action(action_distribution: tf.Tensor, 
                   epsilon_percentage: float, 
                   explore_exploit_setting:str=
                    'epsilon_greedy_annealed_1.0->0.001') -> int:
  """Chooses an action based on the action probability distribution and an 
  explore vs. exploit policy.

  Args:
    action_distribution: the agent's action distribution of shape 
      (num_actions,).
    epsilon_percentage: a probability that decides the progress of epsilon 
      change.
    explore_exploit_setting: defines the action selection policy.

  Returns:
    the index of the action to be taken.
  """
  if explore_exploit_setting == 'greedy':
    action = epsilon_greedy_action(action_distribution)
  elif explore_exploit_setting == 'epsilon_greedy_0.05':
    action = epsilon_greedy_action(action_distribution, 0.05)
  elif explore_exploit_setting == 'epsilon_greedy_0.25':
    action = epsilon_greedy_action(action_distribution, 0.25)
  elif explore_exploit_setting == 'epsilon_greedy_0.50':
    action = epsilon_greedy_action(action_distribution, 0.50)
  elif explore_exploit_setting == 'epsilon_greedy_0.90':
    action = epsilon_greedy_action(action_distribution, 0.90)
  elif explore_exploit_setting == 'epsilon_greedy_annealed_1.0->0.001':
    action = epsilon_greedy_action_annealed(action_distribution, 
                                            epsilon_percentage, 
                                            1.0, 
                                            0.001)
  elif explore_exploit_setting == 'epsilon_greedy_annealed_0.5->0.001':
    action = epsilon_greedy_action_annealed(action_distribution, 
                                            epsilon_percentage, 
                                            0.5, 
                                            0.001)
  elif explore_exploit_setting == 'epsilon_greedy_annealed_0.25->0.001':
    action = epsilon_greedy_action_annealed(action_distribution, 
                                            epsilon_percentage, 
                                            0.25, 
                                            0.001)
  return action

## Defining Policy Gradient Loss

In [73]:
def policy_gradient_loss(outputs: tf.Tensor, 
                         actions: tf.Tensor, 
                         rewards: tf.Tensor) -> float:
  """Computes the policy gradient loss.

  Args: 
    outputs: the forward propogation output of shape (num_actions,).
    actions: the agent actions taken of shape (update_frequency,).
    rewards: the agent rewards of shape (update_frequency,).

  Returns:
    the policy gradient loss.
  """
  indices = tf.range(0, tf.shape(outputs)[0]) * tf.shape(outputs)[1] + actions
  responsible_outputs = tf.gather(tf.reshape(outputs, [-1]), indices)
  loss = -tf.reduce_mean(tf.math.log(responsible_outputs) * rewards)
  return loss

In [74]:
def train_step(model: Callable, 
               states: tf.Tensor, 
               actions: tf.Tensor, 
               rewards: tf.Tensor) -> float:
  """Performs policy gradient update for a given minibatch.

  Args:
    model: a callable network to perform forward progagation.
    states: the agent states of shape (state_size,) in a given minibath.
    actions: the agent actions taken of shape (update_frequency,).
    rewards: the agent rewards of shape (update_frequency,).

  Returns:
    the policy gradient loss.
  """
  with tf.GradientTape() as tape:
    outputs = model(states)
    loss = policy_gradient_loss(outputs, actions, rewards)
  gradients = tape.gradient(loss, model.trainable_variables)
  optimizer.apply_gradients(zip(gradients, model.trainable_variables))
  return loss

## Using Discounted Rewards
Penalize for rewards that's taking more time steps by a factor of gamma per time step.

In [75]:
def discount_rewards(rewards: np.ndarray, 
                     gamma=0.98) -> np.ndarray:
  """Computes the discounted rewards by a factor of gamma.
  
  Args:
    rewards: rewards in a full episode.
    gamma: discount factor over time step.

  Returns:
    the discounted rewards.
  """
  discounted_returns = [0 for _ in rewards]
  discounted_returns[-1] = rewards[-1]
  for t in range(len(rewards) - 2, -1, -1):  # iterate backwards
      discounted_returns[t] = rewards[t] + discounted_returns[t + 1] * gamma
  return discounted_returns

## Keeping Track of History
Gradients are aggregated from multiple episodes, so it’s necessary to keep track of (state, action, reward) tuples

In [76]:
class EpisodeHistory(object):
  """Records an episode's (state, action, reward, discounted_returns) tuples."""
  def __init__(self):
    self.states = []
    self.actions = []
    self.rewards = []
    self.discounted_returns = []

  def add_to_history(self, 
                     state: np.ndarray, 
                     action: int, 
                     reward: float):
    """Appends each element to their corresponding lists, called per time step.

    Args: 
      state: the current state of the agent of shape (state_size,).
      action: the index of the action taken by the agent.
      reward: the reward get by taking this action
    """
    self.states.append(state)
    self.actions.append(action)
    self.rewards.append(reward)

In [77]:
class Memory(object):
  """Records flattened (state, action, reward, discounted_reward) tuples across 
  all episodes in the minibatch."""
  def __init__(self):
    self.states = []
    self.actions = []
    self.rewards = []
    self.discounted_returns = []

  def reset_memory(self):
    """Clears memory once a policy gradient descent is done on a minibatch."""
    self.states = []
    self.actions = []
    self.rewards = []
    self.discounted_returns = []

  def add_episode(self, episode: EpisodeHistory):
    """Appends each element in an episode to the corresponding list in memory.
    
    Args:
      episode: (state, action, reward, discounted_returns) tuples in an episode.
    """
    self.states += episode.states
    self.actions += episode.actions
    self.rewards += episode.rewards
    self.discounted_returns += episode.discounted_returns

## Policy Gradient Main Function

In [78]:
# Configure Settings
total_episodes = 5000      # number of independent episodes to train
epsilon_stop = 3000        # the max episode to increase the epsilon in annealed 
                           # e-greedy policy
update_frequency = 8       # number of episodes to update model parameters
max_episode_length = 500   # limit a single episode to be of finite length
learning_rate = 0.01
should_render = False
explore_exploit_setting = 'epsilon_greedy_annealed_1.0->0.001'
env = gym.make('CartPole-v0')
state_size = env.observation_space.shape[0]  # 4 for CartPole-v0
num_actions = env.action_space.n             # 2 for CartPole-v0
solved = False

# Declare Policy Gradient Networks
policy_gradient_model = PolicyGradientNetwork(state_size=state_size,
                                              num_actions=num_actions,
                                              hidden_size=16)
optimizer = tf.optimizers.Adam(learning_rate)

episode_rewards = []
batch_losses = []
global_memory = Memory()
steps = 0

# Start training
for i in range(total_episodes):
  state = env.reset()
  episode_reward = 0.0
  episode_history = EpisodeHistory()
  epsilon_percentage = float(min(i / float(epsilon_stop), 1.0))

  # Run a single episode
  for j in range(max_episode_length):
    # Probabilistically pick an action given our network output
    action_distribution = policy_gradient_model([state])
    action = predict_action(action_distribution, epsilon_percentage)
    # Get reward and next state
    next_state, reward, terminal, _ = env.step(action)

    if should_render:
      env.render()

    episode_history.add_to_history(state, action, reward)
    state = next_state
    episode_reward += reward
    steps += 1
    
    if terminal:
      # discounted_returns is updated separately only once every full episode, 
      # computed from the rewards.
      episode_history.discounted_returns = \
        discount_rewards(episode_history.rewards)
    
      global_memory.add_episode(episode_history)
      
      # Update model parameters every update_frequency episodes
      if i % update_frequency == 0 and i != 0:
        batch_loss = train_step(policy_gradient_model, 
                                global_memory.states, 
                                global_memory.actions, 
                                global_memory.discounted_returns)
        batch_losses.append(batch_loss)
        global_memory.reset_memory()

      episode_rewards.append(episode_reward)
      break

  # The problem is solved if per episode reward is greater than 100
  if i % 10 and i != 0:
    if np.mean(episode_rewards[:-100]) > 100.0:
      solved = True

  # Print last 100 episode's mean score
  if i % 100 == 0 and i != 0:
    print("Averaged rewards:", np.mean(episode_rewards[:-100]))


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


Averaged rewards: 29.0
Averaged rewards: 21.346534653465348
Averaged rewards: 22.35323383084577
Averaged rewards: 23.524916943521596
Averaged rewards: 25.00498753117207
Averaged rewards: 26.65868263473054
Averaged rewards: 28.41098169717138
Averaged rewards: 29.823109843081312
Averaged rewards: 32.161048689138575
Averaged rewards: 34.283018867924525
Averaged rewards: 36.603396603396604
Averaged rewards: 39.01634877384196
Averaged rewards: 41.823480432972524
Averaged rewards: 45.562644119907766
Averaged rewards: 49.43540328336902
Averaged rewards: 54.01532311792138
Averaged rewards: 59.04122423485322
Averaged rewards: 64.19870664315108
Averaged rewards: 68.89838978345364
Averaged rewards: 74.62125197264598
Averaged rewards: 79.65717141429285
Averaged rewards: 85.00047596382674
Averaged rewards: 90.12812358019082
Averaged rewards: 94.89265536723164
Averaged rewards: 99.22157434402332
Averaged rewards: 103.23390643742503
Averaged rewards: 106.93848519800076
Averaged rewards: 110.380229544