
Commit

Allow configuring gradient clipping and optimizer
Find better hyperparameters for Reinforce
danijar committed Oct 17, 2016
1 parent 3c68dda commit 7fe9a44
Showing 10 changed files with 180 additions and 72 deletions.
8 changes: 8 additions & 0 deletions definition/cartpole.yaml
@@ -41,4 +41,12 @@ algorithms:
preprocess: default
preprocess_config: {subsample: 1, frame_skip: 1, history: 4, noop_max: 5}
initial_learning_rate: 1e-3
heads: 10
update_every: 10000
batch_size: 50
preprocess: default
preprocess_config: {subsample: 0, frame_skip: 1, history: 4, noop_max: 5}
approximation: {scale_critic_loss: 0.5, regularize: 0.01}
gradient_clipping: 1
initial_learning_rate: 1e-4
network: control
24 changes: 24 additions & 0 deletions definition/reinforce.yaml
@@ -0,0 +1,24 @@
epochs: 20
test_steps: 1000
repeats: 1
envs:
- CartPole-v1
algorithms:
-
name: Reinforce
type: Reinforce
train_steps: 20000
config:
heads: 32
preprocess: default
preprocess_config: {subsample: 1, frame_skip: 1, history: 2, noop_max: 3}
approximation: 'advantage_policy_gradient'
approximation_config: {scale_critic_loss: 0.5, regularize: 0.01}
discount: 0.95
network: control
update_every: 10000
batch_size: 32
optimizer: AdamOptimizer
optimizer_config: {}
initial_learning_rate: 1e-3
gradient_clipping: 10
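
The definition selects the optimizer by class name rather than hard-coding it. A minimal sketch of how such a string can be resolved and instantiated, mirroring the _parse_config method added to reinforce.py below (assumes TensorFlow 1.x, where tf.train.AdamOptimizer exists):

import tensorflow as tf

# Resolve the optimizer named in the YAML definition to its class and
# instantiate it with the configured learning rate; an empty
# optimizer_config adds no further keyword arguments.
optimizer_cls = getattr(tf.train, 'AdamOptimizer')
optimizer = optimizer_cls(learning_rate=1e-3)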
2 changes: 1 addition & 1 deletion mindpark/algorithm/a3c.py
@@ -74,7 +74,7 @@ def _create_network(self, model):
network = getattr(mp.part.network, self.config.network)
observs = self._preprocess.above_task.observs.shape
actions = self._preprocess.above_task.actions.n
mp.part.approximation.value_policy_gradient(
mp.part.approximation.advantage_policy_gradient(
model, network, observs, actions, self.config.approximation)

def _create_preprocess(self):
2 changes: 1 addition & 1 deletion mindpark/algorithm/dqn.py
@@ -55,7 +55,7 @@ def end_epoch(self):
self._model.save(self.task.directory, 'model')

def perform(self, observ):
return self._model.compute('values', state=observ)
return self._model.compute('qvalues', state=observ)

def experience(self, observ, action, reward, successor):
action = action.argmax()
32 changes: 22 additions & 10 deletions mindpark/algorithm/reinforce.py
@@ -22,19 +22,24 @@ def defaults(cls):
preprocess = 'default'
preprocess_config = dict()
network = 'dqn_2015'
approximation = dict(scale_critic_loss=0.5, regularize=0.01)
update_every = 10000
batch_size = 32
heads = 16
discount = 0.999
initial_learning_rate = 2.5e-4
optimizer = tf.train.RMSPropOptimizer
optimizer_config = dict(decay=0.95, epsilon=0.1)
optimizer = tf.train.AdamOptimizer
gradient_clipping = 10 # 1e-2
optimizer_config = dict()
approximation = 'advantage_policy_gradient'
approximation_config = dict(scale_critic_loss=0.5, regularize=0.01)
return mp.utility.merge_dicts(super().defaults(), locals())

def __init__(self, task, config):
mp.Algorithm.__init__(self, task, config)
super().__init__(task, config)
self._parse_config()
self._preprocess = self._create_preprocess()
self.model = mp.model.Model(self._create_network)
self.model = mp.model.Model(
self._create_network, clip_delta=self.config.gradient_clipping)
print(str(self.model))
self._learning_rate = mp.utility.Decay(
self.config.initial_learning_rate, 0, self.task.steps)
@@ -110,11 +115,11 @@ def _create_network(self, model):
model.set_optimizer(self.config.optimizer(
learning_rate=learning_rate,
**self.config.optimizer_config))
network = getattr(mp.part.network, self.config.network)
observs = self._preprocess.above_task.observs.shape
actions = self._preprocess.above_task.actions.n
mp.part.approximation.value_policy_gradient(
model, network, observs, actions, self.config.approximation)
self.config.approximation(
model, self.config.network, observs, actions,
self.config.approximation_config)

def _create_memory(self):
observ_shape = self._preprocess.above_task.observs.shape
@@ -124,8 +129,7 @@ def _create_memory(self):
return memory

def _compute_eligibilities(self, rewards):
returns = []
return_ = 0
return_, returns = 0, []
for reward in reversed(rewards):
return_ = reward + self.config.discount * return_
returns.append(return_)
@@ -136,6 +140,14 @@ def _decay_learning_rate(self):
learning_rate = self._learning_rate(self.task.step)
self.model.set_option('learning_rate', learning_rate)

def _parse_config(self):
self.config.optimizer = getattr(
tf.train, self.config.optimizer)
self.config.network = getattr(
mp.part.network, self.config.network)
self.config.approximation = getattr(
mp.part.approximation, self.config.approximation)


class Head(mp.step.Experience):

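
The discounted-return computation in _compute_eligibilities is compact enough to restate standalone. A sketch under the assumption that the collapsed tail of the function reverses the list before use (the diff cuts off before the return statement):

def discounted_returns(rewards, discount):
    # Accumulate returns from the end of the episode backwards, then flip
    # the list so that returns[t] lines up with rewards[t].
    return_, returns = 0, []
    for reward in reversed(rewards):
        return_ = reward + discount * return_
        returns.append(return_)
    return list(reversed(returns))

# With the discount of 0.95 from definition/reinforce.yaml:
# discounted_returns([1, 1, 1], 0.95)  ->  [2.8525, 1.95, 1.0]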
5 changes: 3 additions & 2 deletions mindpark/model/model.py
@@ -11,7 +11,8 @@ class Model:
cost functions, compute gradients for costs, and apply gradients.
"""

def __init__(self, creator=None, load_path=None, threads=None):
def __init__(
self, creator=None, load_path=None, threads=None, clip_delta=10):
"""
Create a new model. Either load_path or creator must be specified.
@@ -22,7 +23,7 @@ def __init__(self, creator=None, load_path=None, threads=None):
of the model as default graph. After this function, no further
operations can be added to the graph.
"""
self._clip_delta = 10
self._clip_delta = clip_delta
self._graph = Graph(threads)
self._optimizer = None
if load_path:
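
The clipping threshold is now passed in instead of being hard-coded to 10. As an illustration only, not the actual Model internals: one common way such a delta is applied is element-wise gradient clipping before the update, sketched here with TensorFlow 1.x ops (clipped_minimize is a hypothetical helper):

import tensorflow as tf

def clipped_minimize(optimizer, cost, clip_delta):
    # Clip every gradient element to [-clip_delta, clip_delta] before
    # applying the update, skipping variables that received no gradient.
    grads_and_vars = optimizer.compute_gradients(cost)
    clipped = [(tf.clip_by_value(grad, -clip_delta, clip_delta), var)
               for grad, var in grads_and_vars if grad is not None]
    return optimizer.apply_gradients(clipped)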
129 changes: 93 additions & 36 deletions mindpark/part/approximation.py
@@ -6,39 +6,96 @@ def q_function(model, network, observs, actions, config=None):
"""
Action value approximation.
"""
# Perception.
state = model.add_input('state', observs)
hidden = network(model, state)
values = dense(hidden, actions, tf.identity)
values = model.add_output('values', values)
# Training.
action = model.add_input('action', type_=tf.int32)
action = tf.one_hot(action, actions)
return_ = model.add_input('return_')
model.add_output('value', tf.reduce_max(values, 1))
model.add_cost(
'cost', (tf.reduce_sum(action * values, 1) - return_) ** 2)


def value_policy_gradient(model, network, observs, actions, config):
"""
Policy gradient with value function baseline.
"""
# perception.
state = model.add_input('state', observs)
hidden = network(model, state)
value = model.add_output(
'value', tf.squeeze(dense(hidden, 1, tf.identity), [1]))
policy = dense(value, actions, tf.nn.softmax)
model.add_output(
'choice', tf.squeeze(tf.multinomial(tf.log(policy), 1), [1]))
# training.
action = model.add_input('action', type_=tf.int32)
action = tf.one_hot(action, actions)
return_ = model.add_input('return_')
logprob = tf.log(tf.reduce_sum(policy * action, 1) + 1e-13)
entropy = -tf.reduce_sum(tf.log(policy + 1e-13) * policy)
advantage = tf.stop_gradient(return_ - value)
actor = advantage * logprob + config.regularize * entropy
critic = config.scale_critic_loss * (return_ - value) ** 2 / 2
model.add_cost('cost', critic - actor)
with tf.variable_scope('behavior'):
state = model.add_input('state', observs)
hidden = network(model, state)
qvalues = dense(hidden, actions, tf.identity)
qvalues = model.add_output('qvalues', qvalues)
with tf.variable_scope('learning'):
action = model.add_input('action', type_=tf.int32)
action = tf.one_hot(action, actions)
return_ = model.add_input('return_')
model.add_output('value', tf.reduce_max(qvalues, 1))
model.add_cost(
'cost', (tf.reduce_sum(action * qvalues, 1) - return_) ** 2)


def policy_gradient(model, network, observs, actions, config):
"""
Vanilla policy gradient. Weights the log probability of the chosen action
by the experienced return, without an advantage baseline.
"""
with tf.variable_scope('behavior'):
state = model.add_input('state', observs)
hidden = network(model, state)
value = model.add_output(
'value', tf.squeeze(dense(hidden, 1, tf.identity), [1]))
policy = dense(value, actions, tf.nn.softmax)
model.add_output(
'choice', tf.squeeze(tf.multinomial(tf.log(policy), 1), [1]))
with tf.variable_scope('learning'):
action = model.add_input('action', type_=tf.int32)
action = tf.one_hot(action, actions)
return_ = model.add_input('return_')
logprob = tf.log(tf.reduce_sum(policy * action, 1) + 1e-13)
entropy = -tf.reduce_sum(tf.log(policy + 1e-13) * policy)
model.add_cost('cost', -(return_ * logprob + config.regularize * entropy))


def advantage_policy_gradient(model, network, observs, actions, config):
"""
Policy gradient of the advantage function. Estimates the advantage from a
learned value function and experienced returns.
"""
with tf.variable_scope('behavior'):
state = model.add_input('state', observs)
hidden = network(model, state)
value = model.add_output(
'value', tf.squeeze(dense(hidden, 1, tf.identity), [1]))
policy = dense(value, actions, tf.nn.softmax)
model.add_output(
'choice', tf.squeeze(tf.multinomial(tf.log(policy), 1), [1]))
with tf.variable_scope('learning'):
action = model.add_input('action', type_=tf.int32)
action = tf.one_hot(action, actions)
return_ = model.add_input('return_')
logprob = tf.log(tf.reduce_sum(policy * action, 1) + 1e-13)
entropy = -tf.reduce_sum(tf.log(policy + 1e-13) * policy)
advantage = tf.stop_gradient(return_ - value)
actor = advantage * logprob + config.regularize * entropy
critic = config.scale_critic_loss * (return_ - value) ** 2 / 2
model.add_cost('cost', critic - actor)


def approx_advantage_policy_gradient(model, network, observs, actions, config):
"""
Policy gradient of the advantage function. Estimates the advantage from
learned value and action-value functions.
"""
with tf.variable_scope('behavior'):
state = model.add_input('state', observs)
hidden = network(model, state)
value = model.add_output(
'value', tf.squeeze(dense(hidden, 1, tf.identity), [1]))
advantages = model.add_output(
'advantages', dense(hidden, actions, tf.identity))
policy = dense(value, actions, tf.nn.softmax)
model.add_output(
'choice', tf.squeeze(tf.multinomial(tf.log(policy), 1), [1]))
with tf.variable_scope('learning'):
action = model.add_input('action', type_=tf.int32)
return_ = model.add_input('return_')
action = tf.one_hot(action, actions)
with tf.variable_scope('value'):
critic_v = (return_ - value) ** 2 / 2
with tf.variable_scope('advantage'):
advantage = tf.reduce_sum(action * advantages, [1])
qvalue = value + advantage
critic_q = (return_ - qvalue) ** 2 / 2
with tf.variable_scope('policy'):
advantage = tf.stop_gradient(advantage)
logprob = tf.log(tf.reduce_sum(policy * action, 1) + 1e-13)
entropy = -tf.reduce_sum(tf.log(policy + 1e-13) * policy)
actor = advantage * logprob + config.regularize * entropy
critic = config.scale_critic_loss * (critic_v + critic_q)
model.add_cost('cost', critic - actor)
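
For reference, the cost that advantage_policy_gradient (the approximation selected in definition/reinforce.yaml above) asks the model to minimize, with c = scale_critic_loss, β = regularize, R the experienced return, and the advantage held constant by tf.stop_gradient:

\mathrm{cost} \;=\; \underbrace{c \cdot \tfrac{1}{2}\bigl(R - V(s)\bigr)^{2}}_{\text{critic}} \;-\; \underbrace{\Bigl[\bigl(R - V(s)\bigr)\,\log\pi(a \mid s) \;+\; \beta\,H\bigl(\pi(\cdot \mid s)\bigr)\Bigr]}_{\text{actor}}

Minimizing this shrinks the critic's squared error while raising the advantage-weighted log likelihood and the policy entropy, matching cost = critic - actor in the code above.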
4 changes: 2 additions & 2 deletions mindpark/part/network.py
@@ -75,6 +75,6 @@ def test(model, x):


def control(model, x):
x = dense(x, 32, tf.tanh)
x = dense(x, 32, tf.tanh)
x = dense(x, 100, tf.nn.relu)
x = dense(x, 50, tf.nn.relu)
return x
5 changes: 3 additions & 2 deletions mindpark/run/benchmark.py
@@ -35,7 +35,7 @@ def __call__(self, definition):
jobs = self._create_jobs(experiment, definition)
with ThreadPoolExecutor(max_workers=self._parallel) as executor:
for job in jobs:
executor.submit(job, self._lock)
executor.submit(job)
duration = round((time.time() - start) / 3600, 1)
self._log_finish(experiment, duration)

@@ -71,7 +71,8 @@ def _create_job(self, experiment, env_name, algo_def, repeat, definition):
(definition.epochs + 1) * definition.test_steps,
definition.epochs + 1, False)
prefix = '{} on {} ({}):'.format(algo_def.name, env_name, repeat)
return Job(train, test, env_name, algo_def, prefix, self._videos)
return Job(
train, test, env_name, algo_def, prefix, self._videos, self._lock)

def _start_experiment(self, name):
print_headline('Start experiment', style='=')
