Recurrent DQN families with a new interface #436

Merged
Merged 57 commits into master from recurrent-dqn on Aug 9, 2019

Commits (57)
a65be21
Merge branch 'recurrent-ppo-squash' into recurrent-dqn
muupan Apr 6, 2019
03f1e06
Use new recurrent model in DQN
muupan Apr 7, 2019
19b8051
Fix double dqn
muupan Apr 7, 2019
91bc211
Add DRQN example
muupan Apr 7, 2019
a85f32a
Fix dtype error
muupan Apr 7, 2019
24459d8
Use mean of loss to avoid effect from sequence length
muupan Apr 7, 2019
1bdfaa6
Stabilize tests
muupan Apr 8, 2019
b0a4063
Renew recurrent AL
muupan Apr 8, 2019
ac6bd4d
Renew recurrent PAL
muupan Apr 8, 2019
f57e1d9
Renew recurrent DoublePAL
muupan Apr 8, 2019
78e0059
Add recurrent DPP
muupan Apr 8, 2019
15740ff
Make SARSA off-policy, batched, and recurrent
muupan Apr 8, 2019
9783ab7
Renew recurrent CategoricalDQN
muupan Apr 8, 2019
8ed0587
Make IQN recurrent
muupan Apr 8, 2019
e9efeaf
Fix to support py2
muupan Apr 8, 2019
9f41f5c
Renew recurrent ResidualDQN
muupan Apr 8, 2019
c1a1581
Remove --episodic-replay options
muupan Apr 8, 2019
b6fdf53
Update examples/ale/train_drqn_ale.py
prabhatnagarajan May 6, 2019
90f0b5a
Update examples/ale/train_drqn_ale.py
prabhatnagarajan May 6, 2019
9febe00
Update examples/ale/train_ppo_ale.py
prabhatnagarajan May 6, 2019
c7a519d
Add options
muupan Apr 11, 2019
162330a
Merge branch 'recurrent-ppo-squash' into recurrent-dqn
muupan May 6, 2019
496ffc6
Merge branch 'master' into recurrent-dqn
muupan May 6, 2019
bb40059
Use the new recurrent interface for CategoricalDoubleDQN
muupan May 6, 2019
6dc49e9
Mark IQN as supporting recurrent
muupan May 6, 2019
730ee88
Merge branch 'master' into recurrent-dqn
muupan May 11, 2019
61c8d06
Merge branch 'master' into recurrent-dqn
muupan Jun 25, 2019
c631ed7
Add --final-eval-n-episodes for faster testing
muupan Jun 25, 2019
cdf3d2c
Add test for examples/ale/train_drqn_ale.py
muupan Jun 25, 2019
2194d7d
Move to examples/atari
muupan Jul 1, 2019
d974719
Merge branch 'master' into recurrent-dqn
muupan Jul 1, 2019
7e1ef1a
Rename the option
muupan Jul 1, 2019
ba5bfc3
Add test for train_drqn_ale.py
muupan Jul 1, 2019
573d29b
Add DRQN to examples/atari/README.md
muupan Jul 1, 2019
cf848d6
Remove examples_tests/ale/test_drqn.sh
muupan Jul 1, 2019
6ce8fb6
Update chainerrl/agents/dqn.py
muupan Jul 1, 2019
55241fd
Update examples_tests/atari/test_drqn.sh
muupan Jul 1, 2019
52105cd
Update examples_tests/atari/test_drqn.sh
muupan Jul 1, 2019
8499465
Update examples_tests/atari/test_drqn.sh
muupan Jul 1, 2019
8f89ee4
Merge branch 'master' into recurrent-dqn
muupan Jul 17, 2019
eccee63
Update chainerrl/replay_buffer.py
muupan Jul 18, 2019
0722695
Merge branch 'master' into recurrent-dqn
muupan Jul 18, 2019
ea692c7
Add DRQN to README
muupan Jul 18, 2019
df482b2
Add assert message
muupan Jul 18, 2019
aea32d7
Update chainerrl/agents/sarsa.py
muupan Jul 31, 2019
4100079
Update chainerrl/agents/sarsa.py
muupan Jul 31, 2019
c5df80a
Restore chainer.using_config('train', False)
muupan Jul 31, 2019
2c68749
Reduce redundancy
muupan Jul 31, 2019
f0a67bd
Use a full path
muupan Jul 31, 2019
65b25cd
Merge two methods into one
muupan Jul 31, 2019
2360686
Merge branch 'master' into recurrent-dqn
muupan Jul 31, 2019
78b5e3c
Fix syntax error with python2
muupan Jul 31, 2019
4e5d0b9
Update examples_tests/atari/test_drqn.sh
muupan Aug 9, 2019
fdca04d
Remove commented out code
muupan Aug 9, 2019
22d13df
Update examples_tests/atari/test_drqn.sh
muupan Aug 9, 2019
467f845
Update examples/atari/train_drqn_ale.py
muupan Aug 9, 2019
9070c76
Add a link to DRQN paper
muupan Aug 9, 2019
Files changed
6 changes: 4 additions & 2 deletions README.md
@@ -41,7 +41,7 @@ For more information, you can refer to [ChainerRL's documentation](http://chaine
| DQN (including DoubleDQN etc.) | ✓ | ✓ (NAF) | ✓ | x |
| Categorical DQN | ✓ | x | ✓ | x |
| Rainbow | ✓ | x | ✓ | x |
| IQN | ✓ | x | x | x |
| IQN | ✓ | x | ✓ | x |
| DDPG | x | ✓ | ✓ | x |
| A3C | ✓ | ✓ | ✓ | ✓ |
| ACER | ✓ | ✓ | ✓ | ✓ |
@@ -63,7 +63,7 @@ Following algorithms have been implemented in ChainerRL:
- [Categorical DQN](https://arxiv.org/abs/1707.06887)
- examples: [[atari]](examples/atari/train_categorical_dqn_ale.py) [[general gym]](examples/gym/train_categorical_dqn_gym.py)
- [DQN (Deep Q-Network)](https://storage.googleapis.com/deepmind-media/dqn/DQNNaturePaper.pdf) (including [Double DQN](https://arxiv.org/abs/1509.06461), [Persistent Advantage Learning (PAL)](https://arxiv.org/abs/1512.04860), Double PAL, [Dynamic Policy Programming (DPP)](http://www.jmlr.org/papers/volume13/azar12a/azar12a.pdf))
- examples: [[atari reproduction]](examples/atari/reproduction/dqn) [[atari]](examples/atari/train_dqn_ale.py) [[atari (batched)]](examples/atari/train_dqn_batch_ale.py) [[general gym]](examples/gym/train_dqn_gym.py)
- examples: [[atari reproduction]](examples/atari/reproduction/dqn) [[atari]](examples/atari/train_dqn_ale.py) [[atari (batched)]](examples/atari/train_dqn_batch_ale.py) [[flickering atari]](examples/atari/train_drqn_ale.py) [[general gym]](examples/gym/train_dqn_gym.py)
- [DDPG (Deep Deterministic Policy Gradients)](https://arxiv.org/abs/1509.02971) (including [SVG(0)](https://arxiv.org/abs/1510.09142))
- examples: [[mujoco reproduction]](examples/mujoco/reproduction/ddpg) [[mujoco]](examples/mujoco/train_ddpg_gym.py) [[mujoco (batched)]](examples/mujoco/train_ddpg_batch_gym.py)
- [IQN (Implicit Quantile Networks)](https://arxiv.org/abs/1806.06923)
@@ -90,6 +90,8 @@ Following useful techniques have been also implemented in ChainerRL:
- examples: [[Rainbow]](examples/atari/reproduction/rainbow) [[DQN/DoubleDQN/PAL]](examples/atari/train_dqn_ale.py)
- [Normalized Advantage Function](https://arxiv.org/abs/1603.00748)
- examples: [[DQN]](examples/gym/train_dqn_gym.py) (for continuous-action envs only)
- [Deep Recurrent Q-Network](https://arxiv.org/abs/1507.06527)
- examples: [[DQN]](examples/atari/train_drqn_ale.py)


## Visualization
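For orientation, here is a minimal sketch of how a recurrent Q-function might be plugged into the new interface this PR introduces, loosely modelled on the DRQN example (examples/atari/train_drqn_ale.py) referenced in the README hunk above. The helper names (`StatelessRecurrentSequential`, `EpisodicReplayBuffer`) and the DQN constructor arguments (`recurrent`, `episodic_update_len`, `batch_accumulator`) are assumptions inferred from this PR's diffs and commit messages, not a verbatim copy of the example.

```python
import chainer
import chainer.functions as F
import chainer.links as L
import chainerrl
import numpy as np

n_actions = 4  # hypothetical action-space size

# Assumed helper from the new recurrent interface: a sequential link that
# threads recurrent state through NStep* links and exposes the
# n_step_forward() method used throughout the agent diffs below
# (subsequences in, concatenated outputs plus updated recurrent state out).
q_func = chainerrl.links.StatelessRecurrentSequential(
    L.Linear(None, 64),
    F.relu,
    L.NStepLSTM(1, 64, 64, 0),  # recurrent core, DRQN-style
    L.Linear(64, n_actions),
    chainerrl.action_value.DiscreteActionValue,
)

opt = chainer.optimizers.Adam()
opt.setup(q_func)

# Episodic replay so that whole subsequences can be sampled and fed to
# n_step_forward() during updates.
rbuf = chainerrl.replay_buffer.EpisodicReplayBuffer(capacity=10 ** 5)

explorer = chainerrl.explorers.ConstantEpsilonGreedy(
    epsilon=0.1,
    random_action_func=lambda: np.random.randint(n_actions))

agent = chainerrl.agents.DQN(
    q_func, opt, rbuf,
    gamma=0.99,
    explorer=explorer,
    replay_start_size=1000,
    recurrent=True,            # take the new recurrent code path
    episodic_update_len=10,    # assumed name for the sampled sequence length
    batch_accumulator='mean',  # cf. the "Use mean of loss ..." commit above
)
```

The agent can then be trained with the usual `chainerrl.experiments` loops; only the Q-function, the replay buffer, and the `recurrent=True` flag differ from the feed-forward setup.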
28 changes: 17 additions & 11 deletions chainerrl/agents/al.py
@@ -10,7 +10,6 @@
from chainer import functions as F

from chainerrl.agents import dqn
from chainerrl.recurrent import state_kept


class AL(dqn.DQN):
@@ -34,22 +33,32 @@ def _compute_y_and_t(self, exp_batch):
batch_state = exp_batch['state']
batch_size = len(exp_batch['reward'])

qout = self.q_function(batch_state)
if self.recurrent:
qout, _ = self.model.n_step_forward(
batch_state, exp_batch['recurrent_state'],
output_mode='concat')
else:
qout = self.model(batch_state)

batch_actions = exp_batch['action']

batch_q = qout.evaluate_actions(batch_actions)

# Compute target values
batch_next_state = exp_batch['next_state']

with chainer.no_backprop_mode():
target_qout = self.target_q_function(batch_state)
if self.recurrent:
target_qout, _ = self.target_model.n_step_forward(
batch_state, exp_batch['recurrent_state'],
output_mode='concat')
target_next_qout, _ = self.target_model.n_step_forward(
batch_next_state, exp_batch['next_recurrent_state'],
output_mode='concat')
else:
target_qout = self.target_model(batch_state)
target_next_qout = self.target_model(batch_next_state)

batch_next_state = exp_batch['next_state']

with state_kept(self.target_q_function):
target_next_qout = self.target_q_function(
batch_next_state)
next_q_max = F.reshape(target_next_qout.max, (batch_size,))

batch_rewards = exp_batch['reward']
@@ -65,6 +74,3 @@ def _compute_y_and_t(self, exp_batch):
tal_q = t_q + self.alpha * cur_advantage

return batch_q, tal_q

def input_initial_batch_to_target_model(self, batch):
pass
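For reference, the `tal_q` returned above is the Advantage Learning target from the PAL paper linked in the README. The one-step target `t_q` and the advantage term `cur_advantage` are computed in lines omitted from this hunk; the sketch below assumes, consistent with the visible code, that both are evaluated with the target network Q'.

```latex
% Advantage Learning target (alpha = self.alpha, Q' = target network).
% The bracketed term is t_q; the second term is alpha * cur_advantage,
% where cur_advantage = Q'(s,a) - max_b Q'(s,b) <= 0.
y_{\mathrm{AL}}(s,a) = \bigl[\, r + \gamma \max_{a'} Q'(s',a') \,\bigr]
                     + \alpha \bigl( Q'(s,a) - \max_{b} Q'(s,b) \bigr)
```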
16 changes: 11 additions & 5 deletions chainerrl/agents/categorical_double_dqn.py
@@ -9,7 +9,6 @@

from chainerrl.agents import categorical_dqn
from chainerrl.agents.categorical_dqn import _apply_categorical_projection
from chainerrl.recurrent import state_kept


class CategoricalDoubleDQN(categorical_dqn.CategoricalDQN):
@@ -24,10 +23,17 @@ def _compute_target_values(self, exp_batch):
batch_rewards = exp_batch['reward']
batch_terminal = exp_batch['is_state_terminal']

with chainer.using_config('train', False), state_kept(self.q_function):
next_qout = self.q_function(batch_next_state)

target_next_qout = self.target_q_function(batch_next_state)
with chainer.using_config('train', False):
if self.recurrent:
target_next_qout, _ = self.target_model.n_step_forward(
batch_next_state, exp_batch['next_recurrent_state'],
output_mode='concat')
next_qout, _ = self.model.n_step_forward(
batch_next_state, exp_batch['next_recurrent_state'],
output_mode='concat')
else:
target_next_qout = self.target_model(batch_next_state)
next_qout = self.model(batch_next_state)

next_q_max = target_next_qout.evaluate_actions(
next_qout.greedy_actions)
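In equation form, the hunk above is the Double DQN idea applied to the distributional setting: the greedy next action is chosen from the online model's expected returns, while the return distribution that feeds the target comes from the target model (both run through `n_step_forward` when `self.recurrent` is set).

```latex
% Double action selection for Categorical Double DQN.
% Z_theta = online return distribution, Z_theta' = target return distribution,
% Phi = categorical projection onto the fixed atom support
% (cf. _apply_categorical_projection imported at the top of this file).
a^{*} = \arg\max_{a} \, \mathbb{E}\bigl[ Z_{\theta}(s', a) \bigr], \qquad
\text{target} = \Phi\bigl( r + \gamma\, Z_{\theta'}(s', a^{*}) \bigr)
```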
14 changes: 12 additions & 2 deletions chainerrl/agents/categorical_dqn.py
@@ -129,7 +129,12 @@ def _compute_target_values(self, exp_batch):
"""Compute a batch of target return distributions."""

batch_next_state = exp_batch['next_state']
target_next_qout = self.target_model(batch_next_state)
if self.recurrent:
target_next_qout, _ = self.target_model.n_step_forward(
batch_next_state, exp_batch['next_recurrent_state'],
output_mode='concat')
else:
target_next_qout = self.target_model(batch_next_state)

batch_rewards = exp_batch['reward']
batch_terminal = exp_batch['is_state_terminal']
@@ -158,7 +163,12 @@ def _compute_y_and_t(self, exp_batch):
batch_state = exp_batch['state']

# (batch_size, n_actions, n_atoms)
qout = self.model(batch_state)
if self.recurrent:
qout, _ = self.model.n_step_forward(
batch_state, exp_batch['recurrent_state'],
output_mode='concat')
else:
qout = self.model(batch_state)
n_atoms = qout.z_values.size

batch_actions = exp_batch['action']
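For completeness, the target distribution built from `target_next_qout` is, in lines not shown in this hunk, presumably projected back onto the fixed atom support, following the Categorical DQN paper linked in the README:

```latex
% C51 categorical projection (Bellemare et al., 2017).
% Atoms z_1..z_N with spacing Delta z; [.]_a^b denotes clipping to [a, b];
% p_j(s', a*) is the target network's probability mass on atom j.
\hat{\mathcal{T}} z_j = \bigl[ r + \gamma z_j \bigr]_{V_{\min}}^{V_{\max}}, \qquad
\bigl( \Phi \hat{\mathcal{T}} Z(s', a^{*}) \bigr)_i
  = \sum_{j} \Bigl[ 1 - \frac{\lvert \hat{\mathcal{T}} z_j - z_i \rvert}{\Delta z} \Bigr]_0^1
    p_j(s', a^{*})
```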
23 changes: 18 additions & 5 deletions chainerrl/agents/double_dqn.py
@@ -8,7 +8,6 @@
import chainer

from chainerrl.agents import dqn
from chainerrl.recurrent import state_kept


class DoubleDQN(dqn.DQN):
@@ -21,10 +20,24 @@ def _compute_target_values(self, exp_batch):

batch_next_state = exp_batch['next_state']

with chainer.using_config('train', False), state_kept(self.q_function):
next_qout = self.q_function(batch_next_state)

target_next_qout = self.target_q_function(batch_next_state)
with chainer.using_config('train', False):
if self.recurrent:
next_qout, _ = self.model.n_step_forward(
batch_next_state,
exp_batch['next_recurrent_state'],
output_mode='concat',
)
else:
next_qout = self.model(batch_next_state)

if self.recurrent:
target_next_qout, _ = self.target_model.n_step_forward(
batch_next_state,
exp_batch['next_recurrent_state'],
output_mode='concat',
)
else:
target_next_qout = self.target_model(batch_next_state)

next_q_max = target_next_qout.evaluate_actions(
next_qout.greedy_actions)
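The last visible line above is the Double DQN estimator: the online model selects the next action and the target model evaluates it. With the reward and discount terms added outside this hunk, the target is

```latex
% Double DQN target (van Hasselt et al., linked from the README).
% theta = online parameters, theta' = target-network parameters.
y = r + \gamma \, (1 - \mathrm{terminal}) \;
    Q_{\theta'}\!\bigl( s', \, \arg\max_{a'} Q_{\theta}(s', a') \bigr)
```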
30 changes: 20 additions & 10 deletions chainerrl/agents/double_pal.py
@@ -10,7 +10,6 @@
from chainer import functions as F

from chainerrl.agents import pal
from chainerrl.recurrent import state_kept


class DoublePAL(pal.PAL):
@@ -20,24 +19,35 @@ def _compute_y_and_t(self, exp_batch):
batch_state = exp_batch['state']
batch_size = len(exp_batch['reward'])

qout = self.q_function(batch_state)
if self.recurrent:
qout, _ = self.model.n_step_forward(
batch_state, exp_batch['recurrent_state'],
output_mode='concat')
else:
qout = self.model(batch_state)

batch_actions = exp_batch['action']
batch_q = qout.evaluate_actions(batch_actions)

# Compute target values

with chainer.no_backprop_mode():
target_qout = self.target_q_function(batch_state)

batch_next_state = exp_batch['next_state']
if self.recurrent:
next_qout, _ = self.model.n_step_forward(
batch_next_state, exp_batch['next_recurrent_state'],
output_mode='concat')
target_qout, _ = self.target_model.n_step_forward(
batch_state, exp_batch['recurrent_state'],
output_mode='concat')
target_next_qout, _ = self.target_model.n_step_forward(
batch_next_state, exp_batch['next_recurrent_state'],
output_mode='concat')
else:
next_qout = self.model(batch_next_state)
target_qout = self.target_model(batch_state)
target_next_qout = self.target_model(batch_next_state)

with state_kept(self.q_function):
next_qout = self.q_function(batch_next_state)

with state_kept(self.target_q_function):
target_next_qout = self.target_q_function(
batch_next_state)
next_q_max = F.reshape(target_next_qout.evaluate_actions(
next_qout.greedy_actions), (batch_size,))

21 changes: 18 additions & 3 deletions chainerrl/agents/dpp.py
@@ -30,7 +30,12 @@ def _compute_target_values(self, exp_batch):

batch_next_state = exp_batch['next_state']

target_next_qout = self.target_q_function(batch_next_state)
if self.recurrent:
target_next_qout, _ = self.target_model.n_step_forward(
batch_next_state, exp_batch['next_recurrent_state'],
output_mode='concat')
else:
target_next_qout = self.target_model(batch_next_state)
next_q_expect = self._l_operator(target_next_qout)

batch_rewards = exp_batch['reward']
@@ -44,7 +49,12 @@ def _compute_y_and_t(self, exp_batch):
batch_state = exp_batch['state']
batch_size = len(exp_batch['reward'])

qout = self.q_function(batch_state)
if self.recurrent:
qout, _ = self.model.n_step_forward(
batch_state, exp_batch['recurrent_state'],
output_mode='concat')
else:
qout = self.model(batch_state)

batch_actions = exp_batch['action']
# Q(s_t,a_t)
@@ -53,7 +63,12 @@

with chainer.no_backprop_mode():
# Compute target values
target_qout = self.target_q_function(batch_state)
if self.recurrent:
target_qout, _ = self.target_model.n_step_forward(
batch_state, exp_batch['recurrent_state'],
output_mode='concat')
else:
target_qout = self.target_model(batch_state)

# Q'(s_t,a_t)
target_q = F.reshape(target_qout.evaluate_actions(
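For context, `self._l_operator` applied above is the Boltzmann-softmax-style operator of Dynamic Policy Programming, and the two hunks gather the ingredients of the DPP action-preference update from the Azar et al. paper linked in the README (the final combination of `batch_q`, `target_q`, and the target values happens in lines not shown here):

```latex
% DPP action-preference recursion (Azar et al., 2012).
% Psi plays the role of the Q-function/action preferences; Psi' (the target
% network) is used on the right-hand side; L is the softmax-style operator
% implemented by _l_operator.
\Psi_{k+1}(s,a) = \Psi_k(s,a) + r + \gamma \, \mathcal{L}\Psi_k(s') - \mathcal{L}\Psi_k(s)
```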