Re-purposing td_k in advantages, so that it can also serve as n-step returns in DQN.

PiperOrigin-RevId: 332792184
google · Sep 21, 2020 · 5acd9dc · 5acd9dc
1 parent 84fe848
commit 5acd9dc
Showing 1 changed file with 7 additions and 3 deletions.
diff --git a/trax/rl/advantages.py b/trax/rl/advantages.py
@@ -49,8 +49,8 @@ def estimator(rewards, returns, values, dones):
 
 
 @gin.configurable(blacklist=common_args)
-def td_k(gamma, margin):
-  """Calculate TD-k advantage.
+def td_k(gamma, margin, n_step=False):
+  """Calculate TD-k advantage or n_step returns.
 
   The k parameter is assumed to be the same as margin.
 
@@ -64,6 +64,9 @@ def td_k(gamma, margin):
   Args:
     gamma: float, gamma parameter for TD from the underlying task
     margin: number of extra steps in the sequence
+    n_step: if set to True, the value baseline is not subtracted and the
+      estimator returns n-step returns (with n equal to margin) instead:
+      gamma^margin * value(s_{i + margin}) + discounted_rewards
 
   Returns:
     Function (rewards, returns, values, dones) -> advantages, where advantages
@@ -83,7 +86,8 @@ def estimator(rewards, returns, values, dones):
     dones = dones[:, :-k]
     advantages[dones] = rewards[:, :-k][dones]
     # Subtract the baseline (value).
-    advantages -= values[:, :-k]
+    if not n_step:
+      advantages -= values[:, :-k]
     return advantages
   return estimator