# Policy Gradient methods

We will start with the standard policy gradient algorithm. This is a batch algorithm, which means that we will collect a large number of samples per iteration, and perform a single update to the policy using these samples. Recall that the formula for policy gradient is given by

$$\nabla_{\theta}\mathbb{E}_{\pi_{\theta}}\Big[ \sum_{t=0}^T\gamma^t r_t \Big] = 
\mathbb{E}_{\pi_{\theta}}\Big[ \sum_{t=0}^T \nabla_{\theta} \log\pi_{\theta}(a_t|s_t)\big(R_t - b(s_t)\big) \Big]$$

- $\pi_{\theta}$ is a stochastic policy parameterized by $\theta$;
- $\gamma$ is the discount factor;
- $s_t$, $a_t$ and $r_t$ are the state, action, and reward at time $t$;
- $T$ is the length of a single episode;
- $b(s_t)$ is any function which does not depend on the current action $a_t$, and is called the baseline;
- $R_t$ is the discounted cumulative future return (already defined in the DQN exercise).

Instead of optimizing this formula directly, we will optimize a sample-based estimate of the expectation, based on $N$ trajectories. For this you will first implement a function that computes $\log\pi_{\theta}(a_t|s_t)$ given any $s_t,~a_t$.

In [1]:
#!/usr/bin/env python
import numpy as np
import gym
from simplepg.simple_utils import gradient_check, log_softmax, softmax, weighted_sample, include_bias, test_once, nprs
import tests.simplepg_tests

## Constructing a stochastic policy

Let's assume that $\pi_{\theta}$ is a Gaussian with unit variance $\Sigma=I$ and mean $\mu=NN_{\theta}(s)$, where $NN_{\theta}$ is a Neural Network parameterized by $\theta$.

### 1. Create a Linear NN
Single layer, no non-linearity (`NN_linear`). A deeper network with two hidden ReLU layers (`NN`) is defined alongside it.

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.autograd as autograd

class NN(nn.Module):
    """Fully connected network: two hidden ReLU layers of width 256, linear output."""

    def __init__(self, obs_size, act_size):
        """
        :param obs_size: Number of input features
        :param act_size: Number of output features
        """
        super(NN, self).__init__()
        hidden_size = 256
        # Layers are created in input-to-output order under their original names,
        # so parameter names and initialisation order stay the same.
        self.Linear1 = nn.Linear(obs_size, hidden_size)
        self.Linear2 = nn.Linear(hidden_size, hidden_size)
        self.Linear3 = nn.Linear(hidden_size, act_size)

    def forward(self, obs):
        """
        :param obs: A tensor whose last dimension has size obs_size
        :return: A tensor whose last dimension has size act_size
        """
        hidden = obs
        for layer in (self.Linear1, self.Linear2):
            hidden = F.relu(layer(hidden))
        # No non-linearity on the output layer.
        return self.Linear3(hidden)
    
class NN_linear(nn.Module):
    """Single affine layer with no non-linearity."""

    def __init__(self, obs_size, act_size):
        """
        :param obs_size: Number of input features
        :param act_size: Number of output features
        """
        super(NN_linear, self).__init__()
        self.Linear = nn.Linear(in_features=obs_size, out_features=act_size)

    def forward(self, obs):
        """
        :param obs: A tensor whose last dimension has size obs_size
        :return: A tensor whose last dimension has size act_size
        """
        return self.Linear(obs)

In [3]:
class GaussianMLP_Policy(object):
    """
    Gaussian policy with unit variance whose mean is produced by a torch network.
    """

    def __init__(self, obs_size, act_size, NN):
        """
        :param obs_size: Size of the observation vector, |S|
        :param act_size: Size of the action vector, |A|
        :param NN: A callable (e.g. an nn.Module subclass) invoked as
        NN(obs_size, act_size) that returns the network computing the mean
        """
        self.NN = NN(obs_size, act_size)

    def get_action(self, obs, rng=np.random):
        """
        Sample an action from the policy.

        :param obs: An array-like observation, converted with torch.Tensor
        :param rng: Object providing normal(loc=..., scale=...), e.g. a numpy RandomState
        :return: A numpy array drawn from a unit-variance normal centred on the network output
        """
        obs_var = autograd.Variable(torch.Tensor(obs), requires_grad=False)
        mean = self.NN.forward(obs_var)
        return rng.normal(loc=mean.data.numpy(), scale=1.)

    def get_logp_action(self, obs, action):
        """
        Log-density of `action` under the policy at `obs`.

        :param obs: A torch tensor of observations (fed directly to the network)
        :param action: A torch tensor of actions; its last dimension is the action size
        :return: A torch tensor with the last dimension summed out
        """
        mean = self.NN.forward(obs)
        return float(-0.5 * np.log(2 * np.pi) * action.shape[-1]) - 0.5 * torch.sum((action - mean) ** 2, dim=-1)

    def get_grad_logp_action(self, obs, action):
        """
        Gradient of the log-density with respect to every network parameter.

        :param obs: A torch tensor of observations
        :param action: A torch tensor of actions
        :return: A list with one gradient tensor per parameter, in the order of
        self.NN.parameters(). For batched inputs this is the gradient of the
        summed log-densities.
        """
        logp = self.get_logp_action(obs, action)
        params = list(self.NN.parameters())
        # autograd.grad returns the gradients directly instead of accumulating
        # them into p.grad, so repeated calls give independent results.
        # Summing first makes the output a scalar, which also covers batches.
        return list(autograd.grad(logp.sum(), params))
    

In [4]:
##############################
# Methods for Point-v0
##############################

def point_get_logp_action(theta, ob, action):
    """
    Log-density of `action` under a unit-variance Gaussian whose mean is
    theta applied to the bias-augmented observation.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: A vector of size |A|
    :return: A scalar
    """
    # TODO: remove the explicit bias feature and replace the linear map by the NN;
    # the output should then be symbolic.
    residual = action - theta.dot(include_bias(ob))
    log_norm_const = -0.5 * np.log(2 * np.pi) * theta.shape[0]
    return log_norm_const - 0.5 * np.sum(np.square(residual))


def point_get_grad_logp_action(theta, ob, action):
    """
    Analytic gradient of point_get_logp_action with respect to theta.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param action: A vector of size |A|
    :return: A matrix of size |A| * (|S|+1)
    """
    # TODO: remove the explicit bias feature and replace this by autodiff.
    features = include_bias(ob)
    residual = action - theta.dot(features)
    return np.outer(residual, features)


def point_get_action(theta, ob, rng=np.random):
    """
    Sample an action from a unit-variance Gaussian centred on theta applied
    to the bias-augmented observation.

    :param theta: A matrix of size |A| * (|S|+1)
    :param ob: A vector of size |S|
    :param rng: Object providing normal(loc=..., scale=...)
    :return: A vector of size |A|
    """
    return rng.normal(loc=theta.dot(include_bias(ob)), scale=1.)


def point_test_grad_impl():
    """
    Check point_get_grad_logp_action against point_get_logp_action on one
    random (observation, action, theta) triple using gradient_check.
    """
    rng = nprs(42)
    # The draw order (observation, action, theta) is kept so the same inputs are used.
    ob = rng.uniform(size=(4,))
    act = rng.uniform(size=(4,))
    theta0 = rng.uniform(size=(4, 5))
    shape = theta0.shape

    def logp_of_flat(flat_theta):
        # gradient_check is given a flat vector; restore the matrix layout first.
        return point_get_logp_action(flat_theta.reshape(shape), ob, act)

    def grad_of_flat(flat_theta):
        return point_get_grad_logp_action(flat_theta.reshape(shape), ob, act).flatten()

    # The gradient must have the same shape as theta.
    assert point_get_grad_logp_action(theta0, ob, act).shape == shape
    gradient_check(logp_of_flat, grad_of_flat, theta0.flatten())

In [5]:
def compute_baselines(all_returns):
    """
    Average the returns recorded for each time step.

    :param all_returns: A sequence of length T; entry t holds the returns recorded at time step t
    :return: A vector of size T with the mean of each entry (0. where the entry is empty)
    """
    per_step_means = [
        np.mean(step_returns) if len(step_returns) > 0 else 0.
        for step_returns in all_returns
    ]
    return np.array(per_step_means, dtype=float)

In [6]:
# Seeded NumPy random state; passed as `rng` to the action sampler in the training loop.
rng = np.random.RandomState(42)
# Run the gradient check for the analytic Point-v0 gradient before anything else.
point_test_grad_impl()

# NOTE(review): presumably imported for its side effect of registering 'Point-v0' with gym — confirm.
from simplepg import point_env
env = gym.make('Point-v0')
# Sizes of the observation and action vectors, read from the environment spaces.
obs_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]

# Gaussian policy whose mean network is the single linear layer NN_linear.
policy = GaussianMLP_Policy(obs_dim, action_dim, NN_linear)

[2018-04-06 19:32:44,194] Making new env: Point-v0


Gradient check passed!


In [7]:
# One NumPy array per parameter of the policy network, in the order returned by parameters().
params = [p.data.numpy() for p in policy.NN.parameters()]

In [8]:
# Place params[1] (reshaped to a column) to the right of params[0], giving a single matrix
# in the |A| * (|S|+1) layout that the point_* helpers take.
# NOTE(review): assumes params[0] is the linear weight and params[1] its bias — confirm ordering.
theta = np.concatenate([params[0], params[1].reshape(-1, 1)], axis=1)

In [9]:
# just to check the computations

# Actions are sampled from the torch policy; the analytic NumPy sampler is kept for reference.
# get_action = point_get_action
get_action = policy.get_action

# The gradient accumulated into `theta` still uses the analytic Point-v0 formula.
get_grad_logp_action = point_get_grad_logp_action

env.seed(42)
# Read from the environment spec; used below to size the per-time-step baseline and return buffers.
timestep_limit = env.spec.timestep_limit

# Initialize parameters  ###### now taking the NN initialization
# theta = rng.normal(scale=0.1, size=(action_dim, obs_dim + 1))

# Store baselines for each time step.
baselines = np.zeros(timestep_limit)

In [10]:
def compute_update(discount, R_tplus1, theta, s_t, a_t, r_t, b_t, get_grad_logp_action):
    """
    One backward step of the return recursion plus the matching gradient term.

    :param discount: A scalar
    :param R_tplus1: A scalar, the discounted return from time t+1 onwards
    :param theta: A matrix of size |A| * (|S|+1)
    :param s_t: A vector of size |S|
    :param a_t: Either a vector of size |A| or an integer, depending on the environment
    :param r_t: A scalar
    :param b_t: A scalar
    :param get_grad_logp_action: A function, mapping from (theta, ob, action) to the gradient (a
    matrix of size |A| * (|S|+1) )
    :return: A tuple, consisting of a scalar (the return at time t) and a matrix of size
    |A| * (|S|+1) (the log-probability gradient scaled by the baselined return)
    """
    discounted_return = r_t + discount * R_tplus1
    advantage = discounted_return - b_t
    scaled_grad = get_grad_logp_action(theta, s_t, a_t) * advantage
    return discounted_return, scaled_grad

def compute_centered_returns(discount, rewards, baseline):
    """
    Discounted cumulative future returns of one trajectory, minus the baseline.

    :param discount: A scalar
    :param rewards: A sequence of length T with the reward at each time step
    :param baseline: A sequence indexable at 0..T-1 with the baseline for each time step
    :return: A float array shaped like rewards, with R_t - baseline[t] at position t
    """
    # Allocate a float buffer explicitly: np.zeros_like(rewards) would inherit an
    # integer dtype from integer rewards and truncate the discounted returns.
    centered_cum_returns = np.zeros(np.shape(rewards), dtype=float)
    R_tplus1 = 0
    # Walk backwards so each return reuses the one that follows it.
    for t in reversed(range(len(rewards))):
        R = rewards[t] + discount * R_tplus1
        centered_cum_returns[t] = R - baseline[t]
        R_tplus1 = R
    return centered_cum_returns

In [11]:
# Training configuration.
n_itrs = 100  # number of policy updates
batch_size = 2000  # minimum number of environment steps collected per update
discount = 0.99
learning_rate = 0.1  # step length applied along the normalized gradient
render = False  # True
natural_step_size = 0.01  # not used anywhere in this cell

# Policy training loop
# Each iteration collects whole trajectories until at least batch_size steps are gathered,
# then performs two parallel updates: the torch network via the autodiff surrogate gradient,
# and the NumPy matrix `theta` via the analytic gradient.
for itr in range(n_itrs):
    # Collect trajectory loop
    n_samples = 0
    grad = np.zeros_like(theta)
    # Clear gradients left on the network parameters by the previous iteration.
    policy.NN.zero_grad()
    episode_rewards = []

    # Store cumulative returns for each time step
    all_returns = [[] for _ in range(timestep_limit)]

    # Flat, per-step buffers over every trajectory of this iteration (inputs to the autodiff loss).
    all_observations = []
    all_actions = []
    all_centered_cum_rews = []

    while n_samples < batch_size:
        observations = []
        actions = []
        rewards = []
        ob = env.reset()
        done = False
        # Only render the first trajectory
        render_episode = n_samples == 0
        # Collect a new trajectory
        while not done:
#             action = get_action(theta, ob, rng=rng)
            action = get_action(ob, rng=rng)
            next_ob, rew, done, _ = env.step(action)
            observations.append(ob)
            actions.append(action)
            rewards.append(rew)
            ob = next_ob
            n_samples += 1
            if render and render_episode:
                env.render()

        # Go back in time to compute returns and accumulate gradient
        # Compute the gradient along this trajectory

        R = 0.
        centered_cum_rews = np.zeros(np.shape(observations)[0])

        for t in reversed(range(len(observations))):
            # Test the implementation, but only once
            test_once(compute_update)

            # R carries the discounted return from step t+1 into step t.
            R, grad_t = compute_update(
                discount=discount,
                R_tplus1=R,
                theta=theta,
                s_t=observations[t],
                a_t=actions[t],
                r_t=rewards[t],
                b_t=baselines[t],
                get_grad_logp_action=get_grad_logp_action
            )
            centered_cum_rews[t] = R - baselines[t]
            all_returns[t].append(R)
            grad += grad_t

        # Recompute the centered returns with the helper; this overwrites the array filled in the
        # loop above using the same recursion and baselines.
        centered_cum_rews = compute_centered_returns(discount, rewards, baselines)
#         ccr = compute_centered_returns(discount, rewards, baselines)
#         print(np.sum(ccr - centered_cum_rews))

        episode_rewards.append(np.sum(rewards))
        all_observations.extend(observations)
        all_actions.extend(actions)
        all_centered_cum_rews.extend(centered_cum_rews)

    # autodiff loss
    obs_vars = autograd.Variable(torch.Tensor(all_observations), requires_grad=False)
    act_vars = autograd.Variable(torch.Tensor(all_actions), requires_grad=False)
    centered_cum_rews_vars = autograd.Variable(torch.Tensor(all_centered_cum_rews), requires_grad=False)

    logps = policy.get_logp_action(obs_vars, act_vars)

    # Surrogate objective: log-probabilities weighted by the centered returns, summed over all steps.
    surr_loss = torch.dot(logps, centered_cum_rews_vars)
    surr_loss.backward()

    # Norm of the gradient over all network parameters, used to normalize the step below.
    flat_grad = np.concatenate([p.grad.data.numpy().reshape((-1)) for p in policy.NN.parameters()])
    grad_norm = np.linalg.norm(flat_grad)

    # Ascent step on the network parameters (note the +=) along the normalized gradient.
    for p in policy.NN.parameters():
#         print("current data: {}, grad: {}, update: {}".format(p.data, p.grad.data, learning_rate * p.grad.data / (grad_norm + 1e-8)))
        p.data += learning_rate * p.grad.data / (grad_norm + 1e-8)

    # Roughly normalize the gradient
    norm_grad = grad / (np.linalg.norm(grad) + 1e-8)

    # Same kind of step for the NumPy parameters, using the analytic gradient accumulated above.
    theta += learning_rate * norm_grad

    test_once(compute_baselines)

    # Baselines for the next iteration: per-time-step mean of this iteration's returns.
    baselines = compute_baselines(all_returns)

    # The reported norm is that of the NumPy `theta`, not of the torch network parameters.
    print("Iteration: %d AverageReturn: %.2f |theta|_2: %.2f" % (
    itr, np.mean(episode_rewards), np.linalg.norm(theta)))

Test for __main__.compute_update passed!
Test for __main__.compute_baselines passed!
Iteration: 0 AverageReturn: -43.45 |theta|_2: 1.06
Iteration: 1 AverageReturn: -41.55 |theta|_2: 1.04
Iteration: 2 AverageReturn: -40.83 |theta|_2: 0.97
Iteration: 3 AverageReturn: -41.75 |theta|_2: 0.99
Iteration: 4 AverageReturn: -39.26 |theta|_2: 1.05
Iteration: 5 AverageReturn: -40.52 |theta|_2: 1.04
Iteration: 6 AverageReturn: -38.22 |theta|_2: 1.04
Iteration: 7 AverageReturn: -40.50 |theta|_2: 1.05
Iteration: 8 AverageReturn: -38.15 |theta|_2: 1.09
Iteration: 9 AverageReturn: -36.58 |theta|_2: 1.15
Iteration: 10 AverageReturn: -37.19 |theta|_2: 1.13
Iteration: 11 AverageReturn: -38.39 |theta|_2: 1.15
Iteration: 12 AverageReturn: -36.47 |theta|_2: 1.18
Iteration: 13 AverageReturn: -34.71 |theta|_2: 1.19
Iteration: 14 AverageReturn: -36.22 |theta|_2: 1.20
Iteration: 15 AverageReturn: -38.09 |theta|_2: 1.24
Iteration: 16 AverageReturn: -35.61 |theta|_2: 1.23
Iteration: 17 AverageReturn: -35.76 |thet