guide on how to use LSTM version of DDPG on gym environments #562

Closed
junhuang-ifast opened this issue Sep 25, 2019 · 10 comments

@junhuang-ifast

I am trying to run DDPG with the gym Pendulum-v0 environment. However I am getting this error:

TypeError: The batch size of x must be equal to or less thanthe size of the previous state h.

This is my code:

env = gym.make('Pendulum-v0')
obs_size = env.observation_space.shape[0]
n_actions = env.action_space.shape[0]

q_func = q_func_.FCLSTMSAQFunction(obs_size, n_actions, n_hidden_channels=50, n_hidden_layers=2)
pi = policy.FCLSTMDeterministicPolicy(n_input_channels=obs_size, n_hidden_channels=50, n_hidden_layers=2, 
                                      action_size=n_actions, 
                                      min_action=env.action_space.low, 
                                      max_action=env.action_space.high, 
                                      bound_action=True
                                     )
model = DDPGModel(policy=pi, q_func=q_func)
opt_a = optimizers.Adam(alpha=1e-4)
opt_c = optimizers.Adam(alpha=1e-3)
opt_a.setup(model['policy'])
opt_c.setup(model['q_function'])
opt_a.add_hook(chainer.optimizer.GradientClipping(1.0), 'hook_a')
opt_c.add_hook(chainer.optimizer.GradientClipping(1.0), 'hook_c')

ou_sigma = (env.action_space.high - env.action_space.low) * 0.2
explorer = explorers.AdditiveOU(sigma=ou_sigma)

replay_buffer = chainerrl.replay_buffer.ReplayBuffer(capacity=5 * 10 ** 5)

phi = lambda x: x.astype(np.float32, copy=False)

agent = DDPG(model, opt_a, opt_c, replay_buffer, gamma=0.995, explorer=explorer, 
             replay_start_size=5000, target_update_method='soft', 
             target_update_interval=1, update_interval=1,
             soft_update_tau=1e-2, n_times_update=1, 
             gpu=0, minibatch_size=200, phi=phi)

n_episodes = 200
max_episode_len = 200
for i in range(1, n_episodes + 1):
    obs = env.reset()
    reward = 0
    done = False
    R = 0  # return (sum of rewards)
    t = 0  # time step
    while not done and t < max_episode_len:
        # Uncomment to watch the behaviour
#         env.render()
        action = agent.act_and_train(obs, reward)
        obs, reward, done, _ = env.step(action)
        R += reward
        t += 1
    if i % 10 == 0:
        print('episode:', i,
              '\nR:', R,
              '\nstatistics:', agent.get_statistics())
    agent.stop_episode_and_train(obs, reward, done)
print('Finished.')

Here is the full output and error:

episode: 10
R: -1069.3354146961874
statistics: [('average_q', -0.1465160510604003), ('average_actor_loss', 0.0), ('average_critic_loss', 0.0)]
episode: 20
R: -1583.6140918088897
statistics: [('average_q', -0.16802258113631832), ('average_actor_loss', 0.0), ('average_critic_loss', 0.0)]

TypeError Traceback (most recent call last)
in
10 # Uncomment to watch the behaviour
11 # env.render()
---> 12 action = agent.act_and_train(obs, reward)
13 obs, reward, done, _ = env.step(action)
14 R += reward

~\AppData\Local\Continuum\anaconda3\envs\chainer\lib\site-packages\chainerrl\agents\ddpg.py in act_and_train(self, obs, reward)
335 self.last_action = action
336
--> 337 self.replay_updater.update_if_necessary(self.t)
338
339 return self.last_action

~\AppData\Local\Continuum\anaconda3\envs\chainer\lib\site-packages\chainerrl\replay_buffer.py in update_if_necessary(self, iteration)
543 else:
544 transitions = self.replay_buffer.sample(self.batchsize)
--> 545 self.update_func(transitions)

~\AppData\Local\Continuum\anaconda3\envs\chainer\lib\site-packages\chainerrl\agents\ddpg.py in update(self, experiences, errors_out)
263
264 batch = batch_experiences(experiences, self.xp, self.phi, self.gamma)
--> 265 self.critic_optimizer.update(lambda: self.compute_critic_loss(batch))
266 self.actor_optimizer.update(lambda: self.compute_actor_loss(batch))
267

~\AppData\Local\Continuum\anaconda3\envs\chainer\lib\site-packages\chainer\optimizer.py in update(self, lossfun, *args, **kwds)
862 if lossfun is not None:
863 use_cleargrads = getattr(self, '_use_cleargrads', True)
--> 864 loss = lossfun(*args, **kwds)
865 if use_cleargrads:
866 self.target.cleargrads()

~\AppData\Local\Continuum\anaconda3\envs\chainer\lib\site-packages\chainerrl\agents\ddpg.py in <lambda>()
263
264 batch = batch_experiences(experiences, self.xp, self.phi, self.gamma)
--> 265 self.critic_optimizer.update(lambda: self.compute_critic_loss(batch))
266 self.actor_optimizer.update(lambda: self.compute_actor_loss(batch))
267

~\AppData\Local\Continuum\anaconda3\envs\chainer\lib\site-packages\chainerrl\agents\ddpg.py in compute_critic_loss(self, batch)
208 # Estimated Q-function observes s_t and a_t
209 predict_q = F.reshape(
--> 210 self.q_function(batch_state, batch_actions),
211 (batchsize,))
212

~\AppData\Local\Continuum\anaconda3\envs\chainer\lib\site-packages\chainerrl\q_functions\state_action_q_functions.py in __call__(self, x, a)
105 h = F.concat((x, a), axis=1)
106 h = self.nonlinearity(self.fc(h))
--> 107 h = self.lstm(h)
108 return self.out(h)
109

~\AppData\Local\Continuum\anaconda3\envs\chainer\lib\site-packages\chainer\link.py in __call__(self, *args, **kwargs)
292 # forward is implemented in the child classes
293 forward = self.forward # type: ignore
--> 294 out = forward(*args, **kwargs)
295
296 # Call forward_postprocess hook

~\AppData\Local\Continuum\anaconda3\envs\chainer\lib\site-packages\chainer\links\connection\lstm.py in forward(self, x)
296 msg = ('The batch size of x must be equal to or less than'
297 'the size of the previous state h.')
--> 298 raise TypeError(msg)
299 elif h_size > batch:
300 h_update, h_rest = split_axis.split_axis(

TypeError: The batch size of x must be equal to or less thanthe size of the previous state h.

@muupan self-assigned this Sep 26, 2019
@muupan (Member) commented Sep 26, 2019

When you use a recurrent model with DDPG, you need to

  • pass episodic_update=True and
  • use chainerrl.replay_buffers.EpisodicReplayBuffer instead of ReplayBuffer

so that it uses a batch of sequences, not a batch of transitions, for updates. You can also specify the maximum length of the sequences by episodic_update_len.
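
A rough sketch of those two changes applied to the snippet in the original post (it reuses model, opt_a, opt_c, explorer, and phi defined there; the episodic_update_len value is only an illustrative choice, not something stated in this thread):

import chainerrl
from chainerrl.agents.ddpg import DDPG

# Store whole episodes (sequences of transitions) instead of independent transitions.
replay_buffer = chainerrl.replay_buffers.EpisodicReplayBuffer(capacity=5 * 10 ** 5)

# episodic_update=True makes DDPG update from sampled sequences and unroll the
# recurrent model over them; episodic_update_len caps the sequence length.
agent = DDPG(model, opt_a, opt_c, replay_buffer, gamma=0.995, explorer=explorer,
             replay_start_size=5000, target_update_method='soft',
             target_update_interval=1, update_interval=1,
             soft_update_tau=1e-2, n_times_update=1,
             gpu=0, minibatch_size=200, phi=phi,
             episodic_update=True,
             episodic_update_len=16)  # illustrative value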

@junhuang-ifast (Author)

That works, thanks!

@junhuang-ifast (Author) commented Sep 27, 2019

Hi @muupan, just a quick follow-up on how EpisodicReplayBuffer differs from the normal ReplayBuffer (I'm trying to understand the code, but it's a bit difficult for me :( ).

Does EpisodicReplayBuffer accumulate each transition (S, A, S', R) in sequence and sample that sequence (without the random sampling of a traditional ReplayBuffer)?

If so, does this mean that for a multivariate time-series problem where I want to use an LSTM in the networks, I could input the states with shape (1, N), i.e. (sequence_length, number_of_features), or simply (N,) since sequence_length=1 anyway? And EpisodicReplayBuffer would automatically store each state and accumulate the subsequent states, with the accumulation limited by episodic_update_len?

I ask because in traditional sequential deep learning (without RL), a time-series input to an LSTM would have, for example, sequence_length=5 if I wanted to consider the past 5 steps (and, from my understanding, 5 would correspond to episodic_update_len in that case).

Besides an explanation, maybe you could point me to a paper on EpisodicReplayBuffer, or to the code where its behaviour differs from ReplayBuffer?

Any help would be much appreciated :)

@muupan (Member) commented Sep 27, 2019

Does EpisodicReplayBuffer accumulate each transition (S, A, S', R) in sequence and sample that sequence (without the random sampling of a traditional ReplayBuffer)?

EpisodicReplayBuffer has a FIFO queue holding lists of transitions. Each list corresponds to an episode. DDPG uniformly samples lists from it via the sample_episodes method. https://github.com/chainer/chainerrl/blob/master/chainerrl/replay_buffers/episodic.py#L41

If so, does this mean that for a multivariate time-series problem where I want to use an LSTM in the networks, I could input the states with shape (1, N), i.e. (sequence_length, number_of_features), or simply (N,) since sequence_length=1 anyway?

You don't need to change the shape of a state. If the shape of a state is (N,), DDPG internally concatenates states to make an input batch of shape (minibatch_size, N) and feeds it to the network. If episodic_update=True, this is repeated episodic_update_len times sequentially using the sampled sequences.
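
A small illustration of that flow, assuming chainerrl's usual append(state, action, reward, next_state, ..., is_state_terminal=...) replay buffer interface (the shapes and values below are made up):

import numpy as np
import chainerrl

buf = chainerrl.replay_buffers.EpisodicReplayBuffer(capacity=10 ** 5)

# Append one 3-step episode of states with shape (N,) = (2,); the buffer keeps
# them grouped as a single episode internally.
for t in range(3):
    buf.append(state=np.random.randn(2).astype(np.float32),
               action=np.zeros(1, dtype=np.float32),
               reward=0.0,
               next_state=np.random.randn(2).astype(np.float32),
               is_state_terminal=(t == 2))  # terminal step closes the episode

# Sample 1 episode, truncated to a random consecutive sub-sequence of at most 2 steps.
episodes = buf.sample_episodes(n_episodes=1, max_len=2)
print(len(episodes), len(episodes[0]))  # -> 1 2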

@junhuang-ifast (Author) commented Sep 27, 2019

@muupan thanks for the quick reply. To confirm my understanding, for an env with 2 observation features X and Y, if one episode has 3 time steps, and we run for 2 episodes, EpisodicReplayBuffer will store the states as follows:


Episode 1:
(X1, Y1)
(X2, Y2)
(X3, Y3)

Episode 2:
(X4, Y4)
(X5, Y5)
(X6, Y6)

in FIFO order.

And during learning, when sampling with, say, sample_episodes(n_episodes=1, max_len=2):
First Sample:
(X1, Y1)
(X2, Y2)
Second Sample:
(X4, Y4)
(X5, Y5)

And just to highlight: since max_len=2 doesn't cover all the transitions in the first list, the 2 samples taken won't be random (i.e. they are sampled in FIFO order)?

@muupan (Member) commented Sep 27, 2019

the 2 samples taken won't be random (i.e. they are sampled in FIFO order)

EpisodicReplayBuffer is FIFO in the sense that the oldest episode in the buffer is discarded when the buffer hits its capacity limit. Sampling from it is random, not FIFO.

The content of the two samples can be random, i.e., they can be as below.
First Sample:
(X4, Y4)
(X5, Y5)
Second Sample:
(X2, Y2)
(X3, Y3)
The way of choosing a subsequence from each episode is random. https://github.com/chainer/chainerrl/blob/master/chainerrl/replay_buffer.py#L136

It is still guaranteed that one sample is taken from the first episode and the other from the second episode when the buffer holds only two episodes.

@junhuang-ifast (Author) commented Sep 27, 2019

@muupan

def random_subseq(seq, subseq_len):
    if len(seq) <= subseq_len:
        return seq
    else:
        i = np.random.randint(0, len(seq) - subseq_len + 1)
        return seq[i:i + subseq_len]

Although the sub-sequence to sample is picked at random, looking at the line return seq[i:i + subseq_len], the resulting sampled sub-sequence is in order (i.e. the order in which the environment returned the states at each time step).

That means when sampling a sub-sequence from, say, episode 2 (in the example above), the possible combinations are:


(X4, Y4)
(X5, Y5)

and

(X5, Y5)
(X6, Y6)


ONLY

and in no situation would this
(X4, Y4)
(X6, Y6)
occur?

@muupan (Member) commented Sep 27, 2019

Correct, it is always in order.
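
A quick way to check this yourself, reusing the random_subseq function quoted above (the toy episode is made up):

import numpy as np

def random_subseq(seq, subseq_len):
    if len(seq) <= subseq_len:
        return seq
    else:
        i = np.random.randint(0, len(seq) - subseq_len + 1)
        return seq[i:i + subseq_len]

episode = [('X4', 'Y4'), ('X5', 'Y5'), ('X6', 'Y6')]
# Over many draws, only the two consecutive windows ever appear.
subseqs = {tuple(random_subseq(episode, 2)) for _ in range(1000)}
print(sorted(subseqs))  # (X4,Y4)-(X5,Y5) and (X5,Y5)-(X6,Y6) only; (X4,Y4)-(X6,Y6) never occurs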

@junhuang-ifast (Author)

Thanks @muupan, you helped me a lot! Really appreciate your time! :)

@junhuang-ifast (Author) commented Sep 27, 2019

Hi @muupan, may I ask where the sample_episodes function is used in the code (for DDPG)? I can't seem to find it.

def sample_episodes(self, n_episodes, max_len=None):
