ddpg_agent.py
215 lines (158 loc) · 6.76 KB
import collections

import gym
import numpy as np
import tensorflow as tf

from rltf.agents.agent import OffPolicyAgent
from rltf.memory import ReplayBuffer


class AgentDDPG(OffPolicyAgent):

  def __init__(self,
               model_type,
               model_kwargs,
               actor_opt_conf,
               critic_opt_conf,
               action_noise,
               update_target_freq=1,
               memory_size=int(1e6),
               obs_hist_len=1,
               **agent_kwargs
              ):
    """
    Args:
      agent_kwargs: dict. Keyword arguments for the base OffPolicyAgent class. Must
        contain all parameters that do not have default values
      model_type: rltf.models.Model. TF implementation of a model network
      model_kwargs: dict. Model-specific keyword arguments to pass to the model
      actor_opt_conf: rltf.optimizers.OptimizerConf. Config for the actor network optimizer
      critic_opt_conf: rltf.optimizers.OptimizerConf. Config for the critic network optimizer
      action_noise: rltf.exploration.ExplorationNoise. Action exploration noise
        to add to the selected action
      update_target_freq: int. Frequency (in agent steps) at which the target network is
        updated. Must be a multiple of train_freq
      memory_size: int. Size of the replay buffer
      obs_hist_len: int. How many environment observations comprise a single state
    """
    super().__init__(**agent_kwargs)

    assert isinstance(self.env.observation_space, gym.spaces.Box)
    assert isinstance(self.env.action_space, gym.spaces.Box)
    assert update_target_freq % self.train_freq == 0

    self.action_noise = action_noise
    self.actor_opt_conf = actor_opt_conf
    self.critic_opt_conf = critic_opt_conf
    self.update_target_freq = update_target_freq

    # Get environment specs
    act_shape = list(self.env.action_space.shape)
    obs_shape = list(self.env.observation_space.shape)

    # Image observations: store as uint8 and stack obs_hist_len frames along the channel axis
    if len(obs_shape) == 3:
      obs_dtype = np.uint8
      obs_shape[-1] *= obs_hist_len
    # Low-dimensional observations
    else:
      obs_dtype = np.float32

    model_kwargs["obs_shape"] = obs_shape
    model_kwargs["n_actions"] = act_shape[0]
    model_kwargs["actor_opt_conf"] = actor_opt_conf
    model_kwargs["critic_opt_conf"] = critic_opt_conf

    self.model = model_type(**model_kwargs)
    self.replay_buf = ReplayBuffer(memory_size, obs_shape, obs_dtype, act_shape, np.float32, obs_hist_len)

    # Configure what information to log
    super()._build_log_info()

    # Custom TF Tensors and Ops
    self.actor_learn_rate_ph = None
    self.critic_learn_rate_ph = None

    # Custom stats
    self.act_noise_stats = collections.deque([], maxlen=self.log_freq)
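
  # The learning rates are fed through placeholders rather than fixed inside the
  # optimizers: actor_opt_conf.lr_value(t) and critic_opt_conf.lr_value(t) are
  # evaluated in Python on every training step (see _train_model) and the resulting
  # values are also exported as TensorBoard summaries. _build() creates the
  # placeholders on a fresh graph; _restore() recovers them by name when the agent
  # is restored from a saved graph.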
  def _build(self):
    # Create learning rate placeholders
    self.actor_learn_rate_ph = tf.placeholder(tf.float32, shape=(), name="actor_learn_rate_ph")
    self.critic_learn_rate_ph = tf.placeholder(tf.float32, shape=(), name="critic_learn_rate_ph")

    # Set the learn rate placeholders for the model
    self.actor_opt_conf.lr_ph = self.actor_learn_rate_ph
    self.critic_opt_conf.lr_ph = self.critic_learn_rate_ph

    # Create learn rate summaries
    tf.summary.scalar("actor_learn_rate", self.actor_learn_rate_ph)
    tf.summary.scalar("critic_learn_rate", self.critic_learn_rate_ph)

  def _restore(self, graph):
    self.actor_learn_rate_ph = graph.get_tensor_by_name("actor_learn_rate_ph:0")
    self.critic_learn_rate_ph = graph.get_tensor_by_name("critic_learn_rate_ph:0")

  def _custom_log_info(self):
    t = self.log_freq
    log_info = [
      ("train/actor_learn_rate",             "f", self.actor_opt_conf.lr_value),
      ("train/critic_learn_rate",            "f", self.critic_opt_conf.lr_value),
      ("mean/act_noise_mean (%d steps)" % t, "f", self._stats_act_noise_mean),
      ("mean/act_noise_std (%d steps)" % t,  "f", self._stats_act_noise_std),
    ]
    return log_info

  def reset(self):
    self.action_noise.reset()

  def _stats_act_noise_mean(self, *args):
    if len(self.act_noise_stats) == 0:
      return float("nan")
    return np.mean(self.act_noise_stats)

  def _stats_act_noise_std(self, *args):
    if len(self.act_noise_stats) == 0:
      return float("nan")
    return np.std(self.act_noise_stats)
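
  # _run_env and _train_model are intended to run concurrently (the comments below
  # refer to the training thread as "net_thread"). Each environment step and each
  # training step are kept in lock-step via the OffPolicyAgent synchronization
  # helpers: the environment thread calls _wait_train_done()/_signal_act_chosen(),
  # while the training thread calls _wait_act_chosen()/_signal_train_done().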
  def _run_env(self):

    last_obs = self.env.reset()

    for t in range(self.start_step, self.max_steps + 1):
      # Wait until net_thread is done with the previous training step
      self._wait_train_done()

      # Store the latest observation in the buffer
      idx = self.replay_buf.store_frame(last_obs)

      # Get an action to run
      if self.learn_started:
        noise = self.action_noise.sample()
        state = self.replay_buf.encode_recent_obs()
        action = self.model.control_action(self.sess, state)
        action = action + noise

        # Add action noise to stats
        self.act_noise_stats.append(noise)
      else:
        # Choose a random action while the model is not yet initialized
        action = self.env.action_space.sample()

      # Signal to net_thread that the action has been chosen
      self._signal_act_chosen()

      # Increment the TF timestep variable
      self.sess.run(self.t_tf_inc)

      # Run the action
      last_obs, reward, done, _ = self.env.step(action)

      # Store the effect of the action taken upon last_obs
      self.replay_buf.store_effect(idx, action, reward, done)

      # Reset the environment and the exploration noise at the end of an episode
      if done:
        last_obs = self.env.reset()
        self.reset()

      self._log_progress(t)
  def _train_model(self):

    for t in range(self.start_step, self.max_steps + 1):
      if t >= self.start_train and t % self.train_freq == 0:

        self.learn_started = True

        # Sample the replay buffer
        batch = self.replay_buf.sample(self.batch_size)

        # Compose feed_dict
        feed_dict = {
          self.model.obs_t_ph: batch["obs"],
          self.model.act_t_ph: batch["act"],
          self.model.rew_t_ph: batch["rew"],
          self.model.obs_tp1_ph: batch["obs_tp1"],
          self.model.done_ph: batch["done"],
          self.actor_learn_rate_ph: self.actor_opt_conf.lr_value(t),
          self.critic_learn_rate_ph: self.critic_opt_conf.lr_value(t),
          self.mean_ep_rew_ph: self.mean_ep_rew,
          self.best_mean_ep_rew_ph: self.best_mean_ep_rew,
        }

        # Wait until the environment thread has chosen an action for this step
        self._wait_act_chosen()

        # Run a training step
        self.summary, _ = self.sess.run([self.summary_op, self.model.train_op], feed_dict=feed_dict)

        # Update the target network
        if t % self.update_target_freq == 0:
          self.sess.run(self.model.update_target)

      else:
        self._wait_act_chosen()

      if t % self.save_freq == 0:
        self._save()

      self._signal_train_done()
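

# Illustrative construction sketch (not part of the original module). The model,
# optimizer-config, and noise class names below are assumptions about the wider
# rltf package, shown only to indicate how the constructor arguments fit together:
#
#   from rltf.models import DDPG                          # hypothetical actor-critic model class
#   from rltf.optimizers import OptimizerConf             # assumed signature: (optimizer, learn_rate)
#   from rltf.exploration import OrnsteinUhlenbeckNoise   # hypothetical noise class
#
#   agent = AgentDDPG(
#     model_type=DDPG,
#     model_kwargs=dict(hidden_sizes=[400, 300]),          # model-specific options (illustrative)
#     actor_opt_conf=OptimizerConf(tf.train.AdamOptimizer, 1e-4),
#     critic_opt_conf=OptimizerConf(tf.train.AdamOptimizer, 1e-3),
#     action_noise=OrnsteinUhlenbeckNoise(mu=0.0, sigma=0.2),
#     update_target_freq=1,
#     memory_size=int(1e6),
#     **agent_kwargs,                                      # base OffPolicyAgent args: env, train_freq, batch_size, ...
#   )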