/
value_ops.py
263 lines (217 loc) · 10.8 KB
/
value_ops.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
# Copyright 2018 The trfl Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""TensorFlow ops for state value learning."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
# Dependency imports
import tensorflow as tf
from trfl import base_ops
from trfl import sequence_ops
# Auxiliary outputs returned alongside TD-style losses: the (stop-gradient)
# regression target for v_tm1 and the temporal difference error (target - v_tm1).
TDExtra = collections.namedtuple("td_extra", ["target", "td_error"])
# Auxiliary outputs for the td_lambda loss: the per-timestep temporal
# differences and the lambda-returns they were computed from, both `[T, B]`.
TDLambdaExtra = collections.namedtuple(
    "td_lambda_extra", ["temporal_differences", "discounted_returns"])
def td_learning(v_tm1, r_t, pcont_t, v_t, name="TDLearning"):
  """Builds the TD(0)-learning loss as a TensorFlow op.

  The loss is `0.5 * (r_t + pcont_t * v_t - v_tm1)^2`, i.e. half the squared
  difference between `v_tm1` and the bootstrapped one-step target. Gradients
  are blocked through the target, so only `v_tm1` is regressed.

  See "Learning to Predict by the Methods of Temporal Differences" by Sutton.
  (https://link.springer.com/article/10.1023/A:1022633531479).

  Args:
    v_tm1: Tensor holding values at previous timestep, shape `[B]`.
    r_t: Tensor holding rewards, shape `[B]`.
    pcont_t: Tensor holding pcontinue values, shape `[B]`.
    v_t: Tensor holding values at current timestep, shape `[B]`.
    name: name to prefix ops created by this function.

  Returns:
    A namedtuple with fields:

    * `loss`: a tensor containing the batch of losses, shape `[B]`.
    * `extra`: a namedtuple with fields:
        * `target`: batch of target values for `v_tm1`, shape `[B]`.
        * `td_error`: batch of temporal difference errors, shape `[B]`.
  """
  # All four inputs must be rank-1 tensors of compatible shape.
  base_ops.wrap_rank_shape_assert([[v_tm1, v_t, r_t, pcont_t]], [1], name)

  with tf.name_scope(name, values=[v_tm1, r_t, pcont_t, v_t]):
    # One-step bootstrapped target; stop_gradient keeps it fixed w.r.t. v_t.
    target = tf.stop_gradient(r_t + pcont_t * v_t)
    delta = target - v_tm1
    # The 0.5 factor makes the gradient of the loss exactly the TD error.
    return base_ops.LossOutput(0.5 * tf.square(delta), TDExtra(target, delta))
def generalized_lambda_returns(rewards,
                               pcontinues,
                               values,
                               bootstrap_value,
                               lambda_=1,
                               name="generalized_lambda_returns"):
  """Computes lambda-returns along a batch of (chunks of) trajectories.

  For lambda=1 these will be multistep returns looking ahead from each
  state to the end of the chunk, where bootstrap_value is used. If you pass an
  entire trajectory and zeros for bootstrap_value, this is just the Monte-Carlo
  return / TD(1) target.

  For lambda=0 these are one-step TD(0) targets.

  For inbetween values of lambda these are lambda-returns / TD(lambda) targets,
  except that traces are always cut off at the end of the chunk, since we can't
  see returns beyond then. If you pass an entire trajectory with zeros for
  bootstrap_value though, then they're plain TD(lambda) targets.

  lambda can also be a tensor of values in [0, 1], determining the mix of
  bootstrapping vs further accumulation of multistep returns at each timestep.
  This can be used to implement Retrace and other algorithms. See
  `sequence_ops.multistep_forward_view` for more info on this. Another way to
  think about the end-of-chunk cutoff is that lambda is always effectively zero
  on the timestep after the end of the chunk, since at the end of the chunk we
  rely entirely on bootstrapping and can't accumulate returns looking further
  into the future.

  The sequences in the tensors should be aligned such that an agent in a state
  with value `V` transitions into another state with value `V'`, receiving
  reward `r` and pcontinue `p`. Then `V`, `r` and `p` are all at the same index
  `i` in the corresponding tensors. `V'` is at index `i+1`, or in the
  `bootstrap_value` tensor if `i == T`.

  Subtracting `values` from these lambda-returns will yield estimates of the
  advantage function which can be used for both the policy gradient loss and
  the baseline value function loss in A3C / GAE.

  Args:
    rewards: 2-D Tensor with shape `[T, B]`.
    pcontinues: 2-D Tensor with shape `[T, B]`.
    values: 2-D Tensor containing estimates of the state values for timesteps
      0 to `T-1`. Shape `[T, B]`.
    bootstrap_value: 1-D Tensor containing an estimate of the value of the
      final state at time `T`, used for bootstrapping the target n-step
      returns. Shape `[B]`.
    lambda_: an optional scalar or 2-D Tensor with shape `[T, B]`.
    name: Customises the name_scope for this op.

  Returns:
    2-D Tensor with shape `[T, B]`
  """
  # Static rank checks: sequences are time-major [T, B]; bootstrap is [B].
  values.get_shape().assert_has_rank(2)
  rewards.get_shape().assert_has_rank(2)
  pcontinues.get_shape().assert_has_rank(2)
  bootstrap_value.get_shape().assert_has_rank(1)
  scoped_values = [rewards, pcontinues, values, bootstrap_value, lambda_]
  with tf.name_scope(name, values=scoped_values):
    # NOTE(review): this equality is evaluated at graph-construction time in
    # Python. It is True for the Python scalar 1 (the default); presumably a
    # Tensor-valued lambda_ compares unequal here and takes the general branch
    # below — confirm against the TF version's Tensor `__eq__` semantics.
    if lambda_ == 1:
      # This is actually equivalent to the branch below, just an optimisation
      # to avoid unnecessary work in this case:
      return sequence_ops.scan_discounted_sum(
          rewards,
          pcontinues,
          initial_value=bootstrap_value,
          reverse=True,
          back_prop=False,
          name="multistep_returns")
    else:
      # Values shifted one step ahead in time: v_tp1[i] estimates the state
      # reached after step i; the final slot comes from bootstrap_value.
      v_tp1 = tf.concat(
          axis=0, values=[values[1:, :],
                          tf.expand_dims(bootstrap_value, 0)])
      # `back_prop=False` prevents gradients flowing into values and
      # bootstrap_value, which is what you want when using the bootstrapped
      # lambda-returns in an update as targets for values.
      return sequence_ops.multistep_forward_view(
          rewards,
          pcontinues,
          v_tp1,
          lambda_,
          back_prop=False,
          name="generalized_lambda_returns")
def td_lambda(state_values,
              rewards,
              pcontinues,
              bootstrap_value,
              lambda_=1,
              name="BaselineLoss"):
  """Constructs a TensorFlow graph computing the L2 loss for sequences.

  This loss learns the baseline for advantage actor-critic models. Gradients
  for this loss flow through each tensor in `state_values`, but no other
  input tensors. The baseline is regressed towards the n-step bootstrapped
  returns given by the reward/pcontinue sequence.

  This function is designed for batches of sequences of data. Tensors are
  assumed to be time major (i.e. the outermost dimension is time, the second
  outermost dimension is the batch dimension). We denote the sequence length
  in the shapes of the arguments with the variable `T`, the batch size with
  the variable `B`, neither of which needs to be known at construction time.
  Index `0` of the time dimension is assumed to be the start of the sequence.

  `rewards` and `pcontinues` are the sequences of data taken directly from the
  environment, possibly modulated by a discount. `state_values` are the
  sequences of (typically learnt) estimates of the values of the states
  visited along a batch of trajectories.

  The sequences in the tensors should be aligned such that an agent in a state
  with value `V` that takes an action transitions into another state
  with value `V'`, receiving reward `r` and pcontinue `p`. Then `V`, `r`
  and `p` are all at the same index `i` in the corresponding tensors. `V'` is
  at index `i+1`, or in the `bootstrap_value` tensor if `i == T`.

  See "High-dimensional continuous control using generalized advantage
  estimation" by Schulman, Moritz, Levine et al.
  (https://arxiv.org/abs/1506.02438).

  Args:
    state_values: 2-D Tensor of state-value estimates with shape `[T, B]`.
    rewards: 2-D Tensor with shape `[T, B]`.
    pcontinues: 2-D Tensor with shape `[T, B]`.
    bootstrap_value: 1-D Tensor with shape `[B]`.
    lambda_: an optional scalar or 2-D Tensor with shape `[T, B]`.
    name: Customises the name_scope for this op.

  Returns:
    A namedtuple with fields:

    * `loss`: a tensor containing the batch of losses, shape `[B]`.
    * `extra`: a namedtuple with fields:
        * temporal_differences, Tensor of shape `[T, B]`
        * discounted_returns, Tensor of shape `[T, B]`
  """
  scoped_values = [state_values, rewards, pcontinues, bootstrap_value]
  with tf.name_scope(name, values=scoped_values):
    # Lambda-return targets; gradients do not flow into them (see
    # generalized_lambda_returns), so only state_values is regressed.
    returns = generalized_lambda_returns(
        rewards, pcontinues, state_values, bootstrap_value, lambda_)
    deltas = returns - state_values
    # Sum squared TDs over time, halve so the gradient equals the TD errors.
    loss = 0.5 * tf.reduce_sum(
        tf.square(deltas), axis=0, name="l2_loss")
    extra = TDLambdaExtra(
        temporal_differences=deltas, discounted_returns=returns)
    return base_ops.LossOutput(loss, extra)
def qv_max(v_tm1, r_t, pcont_t, q_t, name="QVMAX"):
  """Builds the QVMAX learning loss as a TensorFlow op.

  The loss is `0.5 * (r_t + pcont_t * max_a q_t[a] - v_tm1)^2`: half the
  squared difference between `v_tm1` and a target bootstrapped from the
  maximum of `q_t`, where `q_t` is separately learned through QV learning
  (c.f. `action_value_ops.qv_learning`). Gradients are blocked through the
  target.

  See "The QV Family Compared to Other Reinforcement Learning Algorithms" by
  Wiering and van Hasselt (2009).
  (http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.713.1931)

  Args:
    v_tm1: Tensor holding values at previous timestep, shape `[B]`.
    r_t: Tensor holding rewards, shape `[B]`.
    pcont_t: Tensor holding pcontinue values, shape `[B]`.
    q_t: Tensor of action values at current timestep, shape `[B, num_actions]`.
    name: name to prefix ops created by this function.

  Returns:
    A namedtuple with fields:

    * `loss`: a tensor containing the batch of losses, shape `[B]`.
    * `extra`: a namedtuple with fields:
        * `target`: batch of target values for `v_tm1`, shape `[B]`.
        * `td_error`: batch of temporal difference errors, shape `[B]`.
  """
  # v_tm1, r_t, pcont_t must be rank 1; q_t must be rank 2 ([B, num_actions]).
  base_ops.wrap_rank_shape_assert([[v_tm1, r_t, pcont_t], [q_t]], [1, 2], name)

  with tf.name_scope(name, values=[v_tm1, r_t, pcont_t, q_t]):
    # Bootstrap from the greedy action value; stop_gradient fixes the target.
    target = tf.stop_gradient(r_t + pcont_t * tf.reduce_max(q_t, axis=1))
    delta = target - v_tm1
    # The 0.5 factor makes the gradient of the loss exactly the TD error.
    return base_ops.LossOutput(0.5 * tf.square(delta), TDExtra(target, delta))