# Setup
Run the cells below and hide them afterwards using the arrow on the left.

In [0]:
!pip install gym[Box2D] pyvirtualdisplay pyglet > /dev/null 2>&1
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1

In [2]:
import gym
from gym import logger as gymlogger
from gym.wrappers import Monitor
gymlogger.set_level(40) #error only
import numpy as np
import random
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (10.0, 8.0)
import math
import glob
import io
import base64
from IPython.display import HTML

from typing import List, Tuple

import torch
from torch import nn
import torch.nn.functional as F
from collections import deque

from IPython import display as ipythondisplay
from IPython.display import display, update_display, clear_output
from time import sleep

from pyvirtualdisplay import Display
# Start a non-visible 1300x900 virtual display using the xvfb backend.
# NOTE(review): presumably needed so the environments can render frames on a
# machine without a physical screen (e.g. a hosted notebook) — confirm.
xdisplay = Display(visible=0, size=(1300, 900), backend="xvfb")
xdisplay.start()


"""
Utility functions to enable video recording of gym environment and displaying it
To enable video, just do "env = wrap_env(env)"
"""

class DoneWrapper(gym.Wrapper):
  """Gym wrapper whose `step` always reports `done=False`.

  The wrapped environment is stepped as usual; only the termination flag in
  the returned tuple is replaced.
  """

  def step(self, action):
    # the inner env's own termination flag is discarded on purpose
    obs, rew, _inner_done, extra = self.env.step(action)
    return obs, rew, False, extra
      

def show_video():
  """Embed the first recording found under `video/` in the notebook output.

  Looks for `video/*.mp4` relative to the current working directory and
  inlines the first match as a base64-encoded `<video>` element. Prints a
  notice instead when no recording exists.
  """
  mp4list = glob.glob('video/*.mp4')
  if not mp4list:
    print("Could not find video")
    return

  # Open read-only and close the handle promptly: nothing here writes to the
  # file, so write permission must not be required.
  with open(mp4list[0], 'rb') as video_file:
    encoded = base64.b64encode(video_file.read())
  ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
    
    
def wrap_env(env, done=True):
  """Wrap `env` in a `Monitor` writing to `./video`.

  When `done` is false the env is first wrapped in `DoneWrapper`, so the
  monitored env never reports termination from `step`.
  """
  inner = env if done else DoneWrapper(env)
  return Monitor(inner, './video', force=True, mode='evaluation')


def print_ansi(screen, display_id='42', wait=0.5):
    """Clear the cell output, print the text held in `screen`, then pause.

    `screen` must provide `getvalue()`; `wait` is the pause in seconds.
    """
    clear_output(wait=True)
    text = screen.getvalue()
    # print() returns None, so the display identified by `display_id` is
    # updated with None after the text itself has gone to stdout.
    printed = print(text)
    update_display(printed, display_id=display_id)
    sleep(wait)


def plot(img):
  """Show `img` on a new 8x6 figure with the ticks of both axes removed."""
  axes = plt.figure(figsize=(8,6)).add_subplot(111)
  axes.imshow(img)
  # an empty tick list hides the tick marks and their labels
  for set_ticks in (axes.set_xticks, axes.set_yticks):
    set_ticks([])

xdpyinfo was not found, X start can not be checked! Please install xdpyinfo!


In [0]:
def gather_trajectories(env: gym.Env, policy, num_trajs: int = 10):
    """Roll out `policy` in `env` for `num_trajs` episodes.

    Returns a list with one entry per episode; each entry is the list of
    `(obs, action, reward)` tuples collected until the env reported `done`.
    """
    trajectories = []

    for _ in range(num_trajs):
        obs = env.reset()
        finished = False
        steps = []
        while not finished:
            # ask the policy for an action and apply it to the environment
            action = policy.sample(obs)
            following_obs, reward, finished, _info = env.step(action)

            # the stored observation is the one the action was chosen from
            steps.append((obs, action, reward))
            obs = following_obs
        trajectories.append(steps)

    return trajectories

def calculate_return(rewards: List[float]) -> Tuple[float, List[float]]:
    """Calculate the episode return and the (undiscounted) step returns.

    Args:
        rewards: rewards collected during one episode, in time order.

    Returns:
        `(episode_return, step_returns)`: `episode_return` is the sum of all
        rewards and `step_returns[t]` is the sum of the rewards from step `t`
        to the end of the episode. Empty `rewards` yield `(0.0, [])`.
    """
    rewards = np.array(rewards)
    episode_return = np.sum(rewards)

    # an empty episode has no last reward to seed the backward pass with
    if rewards.size == 0:
        return episode_return, []

    # walk backwards through the episode: R_t = r_t + R_{t+1}
    step_returns = [rewards[-1]]
    for reward in reversed(rewards[:-1]):
        step_returns.append(reward + step_returns[-1])
    step_returns.reverse()

    return episode_return, step_returns

def process_trajectories(history: List):
    """Flatten gathered trajectories into tensors and calculate returns.

    Args:
        history: list of trajectories, each a list of
            `(obs, action, reward)` tuples.

    Returns:
        `(obs, actions, step_returns, episode_returns)` as float32 tensors.
        The first three hold one entry per step, concatenated over the
        trajectories in order; `episode_returns` holds one entry per
        trajectory.
    """
    obs_array = []
    action_array = []
    return_array = []
    episode_returns = []

    for traj in history:
        # split the (obs, action, reward) steps into three parallel sequences
        traj_obs, traj_actions, traj_rewards = zip(*traj)

        # the episode is complete here, so its returns can be computed
        episode_return, step_returns = calculate_return(traj_rewards)

        episode_returns.append(episode_return)
        obs_array.extend(traj_obs)
        action_array.extend(traj_actions)
        return_array.extend(step_returns)

    # Stack the observations through NumPy first: building a tensor straight
    # from a Python list of ndarrays takes PyTorch's slow element-wise path.
    obs_array = torch.tensor(np.asarray(obs_array), dtype=torch.float32)
    action_array = torch.tensor(action_array, dtype=torch.float32)
    return_array = torch.tensor(return_array, dtype=torch.float32)
    episode_returns = torch.tensor(episode_returns, dtype=torch.float32)

    return obs_array, action_array, return_array, episode_returns

def visualize(env, policy):
    """Run one episode of `policy` on `env` and show its recording.

    The env is wrapped with `wrap_env` first. It is closed once the episode is
    over, and also when the rollout raises or is interrupted. After a
    completed episode `show_video` is called to display the result.
    """
    env = wrap_env(env)
    try:
        obs = env.reset()
        done = False

        while not done:
            action = policy.sample(obs)
            obs, reward, done, _ = env.step(action)
            env.render()
    finally:
        # always release the wrapped env, even if the episode is cut short
        env.close()

    show_video()

# Part 2. Policy Gradient

Here we will implement the Policy Gradient algorithm and its necessary components.

But before we start, we need to add the discount factor trick to our trajectory processing function from the last part.

## Exercise: Discounting 
Add the discount factor to the episode and step returns calculation.
We'll set the default value for the gamma parameter so we wouldn't have to change the `process_trajectories` function.



In [0]:
def calculate_return(rewards: List[float], gamma: float = 1.0) -> Tuple[float, List[float]]:
    """Calculate the discounted episode return and discounted step returns.

    Args:
        rewards: rewards collected during one episode, in time order.
        gamma: discount factor; the default 1.0 means no discounting.

    Returns:
        `(episode_return, step_returns)`: `episode_return` is
        `sum(gamma**t * rewards[t])` and `step_returns[t]` is the discounted
        sum of the rewards from step `t` onwards, discounted relative to step
        `t`. Empty `rewards` yield `(0.0, [])`.
    """
    rewards = np.array(rewards)

    # discounted sum of the whole episode: weight reward t by gamma**t
    gammas = gamma ** np.arange(len(rewards))
    episode_return = np.sum(rewards * gammas)

    # an empty episode has no last reward to seed the backward pass with
    if rewards.size == 0:
        return episode_return, []

    # walk backwards through the episode: R_t = r_t + gamma * R_{t+1}
    step_returns = [rewards[-1]]
    for reward in reversed(rewards[:-1]):
        step_returns.append(reward + step_returns[-1] * gamma)
    step_returns.reverse()

    return episode_return, step_returns


## Exercise: Network Policy
For our Policy Gradient (and further methods) we need a differentiable policy model with optimizable weights - a *Network Policy*.

The main specification of the network is already implemented, your task is to add the necessary methods. Same as before, we need our policy to provide us with:
* a method to sample action given an observation - `sample` method
* a probability vector, like the RandomPolicy - `probs` method.
* and additionally, a `log_probs` method that returns the same probabilities as the `probs` method, but passed through the logarithm. It will be useful because the policy gradient update uses the logarithm of the probability rather than the probability itself.

In [0]:
class NetworkPolicy(nn.Module):
    """Small MLP policy over a discrete set of actions.

    The network maps an observation to one logit per action:
    `Linear(obs_dim, h_dim) -> Tanh -> Linear(h_dim, action_dim)`.
    """

    def __init__(self, obs_dim: int, action_dim: int, h_dim: int = 16):
        super(NetworkPolicy, self).__init__()

        self.model = nn.Sequential(nn.Linear(obs_dim, h_dim),
                                   nn.Tanh(),
                                   nn.Linear(h_dim, action_dim))

    def _logits(self, obs):
        """Return the model's raw per-action scores for `obs`."""
        # cast anything that is not already a tensor (e.g. a numpy array)
        if not isinstance(obs, torch.Tensor):
            obs = torch.tensor(obs, dtype=torch.float32)
        return self.model(obs)

    def probs(self, obs):
        """Return the action probabilities (softmax over the last dim)."""
        return F.softmax(self._logits(obs), -1)

    def log_probs(self, obs):
        """Return the log of the action probabilities (log-softmax)."""
        return F.log_softmax(self._logits(obs), -1)

    def sample(self, obs):
        """Sample one action index for a single observation.

        Returns a plain Python int obtained with `.item()`.
        """
        # Only a detached integer leaves this method, so there is no point in
        # recording an autograd graph for the forward pass.
        with torch.no_grad():
            probs = self.probs(obs)
        action = torch.multinomial(probs, 1).item()
        return action

## Exercise: Policy Gradient Step and Training

Now that we have the necessary elements, we can implement the Policy Gradient itself. Let's start with a single PG step. Your task is to implement the `target` of the Policy Gradient, i.e. the function that we want to optimize.

Here's the gradient equation as a reminder:

$$ \nabla_\theta \mathbb{E}_{\tau \sim \pi} R(\tau) \approx \frac{1}{N} \sum_{j=1}^{N} \sum_{t=1}^{T} \nabla_\theta \log \pi_\theta (a_t | s_t) R_t = \nabla_\theta \frac{1}{N} \sum_{j=1}^{N} \sum_{t=1}^{T}  \log \pi_\theta (a_t | s_t) R_t = \nabla_\theta \hat{J}$$ 
where action $a_t$, state $s_t$ and step return $R_t$ come from trajectory $\tau_j$ .

PyTorch optimizers by default **minimize** the given function, so your target should in fact be the negative of the objective above (i.e. $-\hat{J}$).

The [`tensor.gather`](https://pytorch.org/docs/stable/torch.html#torch.gather) method may be useful to get the action log-probabilities.


In [0]:
def policy_gradient_step(policy: NetworkPolicy,
                         optimizer: torch.optim.Optimizer,
                         obs: torch.Tensor,
                         actions: torch.Tensor,
                         step_returns: torch.Tensor,
                         num_trajs: int):
    """Perform one policy-gradient update of `policy` via `optimizer`.

    The minimised target is the negated sum, over all steps, of the performed
    action's log-probability times its step return, divided by `num_trajs`.
    """
    # log-probabilities of every action for every observation
    all_log_probs = policy.log_probs(obs)

    # pick, per observation, the entry of the action that was performed
    taken = actions.view(-1, 1).long()
    taken_log_probs = all_log_probs.gather(1, taken).squeeze()

    # optimizers minimise, hence the minus sign in front of the objective
    loss = -torch.sum(taken_log_probs * step_returns) / num_trajs

    # standard optimisation step
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()



The last missing thing is the training loop that uses all the stuff that we have implemented so far. Write the missing code using the functions and classes that you've implemented already.

In [0]:
def train_policy_gradient(env: gym.Env,
                          policy: torch.nn.Module,
                          num_iterations: int = 100,
                          trajs_per_gather: int = 10,
                          lr: float = 5e-3,
                          log_every: int = 10):
    """Train `policy` on `env` with the Policy Gradient algorithm.

    Each iteration gathers fresh trajectories with the current policy,
    processes them into tensors and performs one policy-gradient step.

    Args:
        env: environment to interact with.
        policy: policy network whose parameters are optimised in place.
        num_iterations: index of the last iteration; the loop runs
            `num_iterations + 1` times so that this index is logged too.
        trajs_per_gather: number of trajectories gathered per iteration.
        lr: learning rate of the Adam optimizer.
        log_every: print the mean episode return every this many iterations;
            0 disables logging.
    """
    # we'll use Adam to update the weights of our network
    optimizer = torch.optim.Adam(policy.parameters(), lr=lr)

    for idx in range(num_iterations + 1):
        # gather trajectories using the current policy
        history = gather_trajectories(env, policy, num_trajs=trajs_per_gather)

        # turn the trajectories into obs, action and return tensors
        obs, actions, step_returns, ep_returns = process_trajectories(history)

        # policy gradient training
        policy_gradient_step(policy=policy,
                             optimizer=optimizer,
                             obs=obs,
                             actions=actions,
                             step_returns=step_returns,
                             num_trajs=trajs_per_gather)

        # log training progress
        if log_every and idx % log_every == 0:
            print(f"Traning iteration {idx}, mean episode returns: {ep_returns.mean():.3f}")

In [19]:
# Choose the environment to train on. LunarLander is the alternative target:
# swap which of the two `gym.make` lines is commented out to use it instead.
# env = gym.make("LunarLander-v2")
# cart pole
env = gym.make("CartPole-v1")

# read the network's input and output sizes from the env's spaces
obs_dim = env.observation_space.shape[0]
action_dim = env.action_space.n
# initialize the policy
network_policy = NetworkPolicy(obs_dim, action_dim)

# train the model; progress is printed by `train_policy_gradient`
train_policy_gradient(env, 
                      network_policy, 
                      num_iterations=100,
                      trajs_per_gather=20)

Traning iteration 0, mean episode returns: 16.750
Traning iteration 10, mean episode returns: 16.250
Traning iteration 20, mean episode returns: 23.350
Traning iteration 30, mean episode returns: 29.700
Traning iteration 40, mean episode returns: 43.000
Traning iteration 50, mean episode returns: 44.500
Traning iteration 60, mean episode returns: 51.100
Traning iteration 70, mean episode returns: 70.450
Traning iteration 80, mean episode returns: 72.250
Traning iteration 90, mean episode returns: 198.950
Traning iteration 100, mean episode returns: 348.250


In [8]:
# run the trained policy for one episode and display the recording
visualize(env, network_policy)

# Bonus Exercise 1
Try to find an architecture for the agent that allows for the fastest training for:
1. CartPole
2. LunarLander

Are the best architectures the same for both environments?

## Bonus Exercise 2

It may be tempting to reuse the data we already have. Try to modify the `train_policy_gradient` function so that it updates the parameters using the same trajectories multiple times (e.g. call `policy_gradient_step` 100 times on each gathered batch). Does it work? If not, can you think of the reason why?