
Commit

Allow configuring gradient clipping and optimizer
Find better hyperparameters for Reinforce
danijar committed Oct 17, 2016
1 parent 3c68dda commit 7fe9a44
Showing 10 changed files with 180 additions and 72 deletions.
8 changes: 8 additions & 0 deletions definition/cartpole.yaml
@@ -41,4 +41,12 @@ algorithms:
preprocess: default
preprocess_config: {subsample: 1, frame_skip: 1, history: 4, noop_max: 5}
initial_learning_rate: 1e-3
heads: 10
update_every: 10000
batch_size: 50
preprocess: default
preprocess_config: {subsample: 0, frame_skip: 1, history: 4, noop_max: 5}
approximation: {scale_critic_loss: 0.5, regularize: 0.01}
gradient_clipping: 1
initial_learning_rate: 1e-4
network: control
24 changes: 24 additions & 0 deletions definition/reinforce.yaml
@@ -0,0 +1,24 @@
epochs: 20
test_steps: 1000
repeats: 1
envs:
- CartPole-v1
algorithms:
-
name: Reinforce
type: Reinforce
train_steps: 20000
config:
heads: 32
preprocess: default
preprocess_config: {subsample: 1, frame_skip: 1, history: 2, noop_max: 3}
approximation: 'advantage_policy_gradient'
approximation_config: {scale_critic_loss: 0.5, regularize: 0.01}
discount: 0.95
network: control
update_every: 10000
batch_size: 32
optimizer: AdamOptimizer
optimizer_config: {}
initial_learning_rate: 1e-3
gradient_clipping: 10
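
The definition selects the optimizer by class name rather than hard-coding it. A minimal sketch of how such a string can be resolved and instantiated, mirroring the _parse_config method added to reinforce.py below (assumes TensorFlow 1.x, where tf.train.AdamOptimizer exists):

import tensorflow as tf

# Resolve the optimizer named in the YAML definition to its class and
# instantiate it with the configured learning rate; an empty
# optimizer_config adds no further keyword arguments.
optimizer_cls = getattr(tf.train, 'AdamOptimizer')
optimizer = optimizer_cls(learning_rate=1e-3)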
2 changes: 1 addition & 1 deletion mindpark/algorithm/a3c.py
@@ -74,7 +74,7 @@ def _create_network(self, model):
network = getattr(mp.part.network, self.config.network)
observs = self._preprocess.above_task.observs.shape
actions = self._preprocess.above_task.actions.n
mp.part.approximation.value_policy_gradient(
mp.part.approximation.advantage_policy_gradient(
model, network, observs, actions, self.config.approximation)

def _create_preprocess(self):
2 changes: 1 addition & 1 deletion mindpark/algorithm/dqn.py
@@ -55,7 +55,7 @@ def end_epoch(self):
self._model.save(self.task.directory, 'model')

def perform(self, observ):
return self._model.compute('values', state=observ)
return self._model.compute('qvalues', state=observ)

def experience(self, observ, action, reward, successor):
action = action.argmax()
32 changes: 22 additions & 10 deletions mindpark/algorithm/reinforce.py
@@ -22,19 +22,24 @@ def defaults(cls):
preprocess = 'default'
preprocess_config = dict()
network = 'dqn_2015'
approximation = dict(scale_critic_loss=0.5, regularize=0.01)
update_every = 10000
batch_size = 32
heads = 16
discount = 0.999
initial_learning_rate = 2.5e-4
optimizer = tf.train.RMSPropOptimizer
optimizer_config = dict(decay=0.95, epsilon=0.1)
optimizer = tf.train.AdamOptimizer
gradient_clipping = 10 # 1e-2
optimizer_config = dict()
approximation = 'advantage_policy_gradient'
approximation_config = dict(scale_critic_loss=0.5, regularize=0.01)
return mp.utility.merge_dicts(super().defaults(), locals())

def __init__(self, task, config):
mp.Algorithm.__init__(self, task, config)
super().__init__(task, config)
self._parse_config()
self._preprocess = self._create_preprocess()
self.model = mp.model.Model(self._create_network)
self.model = mp.model.Model(
self._create_network, clip_delta=self.config.gradient_clipping)
print(str(self.model))
self._learning_rate = mp.utility.Decay(
self.config.initial_learning_rate, 0, self.task.steps)
@@ -110,11 +115,11 @@ def _create_network(self, model):
model.set_optimizer(self.config.optimizer(
learning_rate=learning_rate,
**self.config.optimizer_config))
network = getattr(mp.part.network, self.config.network)
observs = self._preprocess.above_task.observs.shape
actions = self._preprocess.above_task.actions.n
mp.part.approximation.value_policy_gradient(
model, network, observs, actions, self.config.approximation)
self.config.approximation(
model, self.config.network, observs, actions,
self.config.approximation_config)

def _create_memory(self):
observ_shape = self._preprocess.above_task.observs.shape
@@ -124,8 +129,7 @@ def _create_memory(self):
return memory

def _compute_eligibilities(self, rewards):
returns = []
return_ = 0
return_, returns = 0, []
for reward in reversed(rewards):
return_ = reward + self.config.discount * return_
returns.append(return_)
@@ -136,6 +140,14 @@ def _decay_learning_rate(self):
learning_rate = self._learning_rate(self.task.step)
self.model.set_option('learning_rate', learning_rate)

def _parse_config(self):
self.config.optimizer = getattr(
tf.train, self.config.optimizer)
self.config.network = getattr(
mp.part.network, self.config.network)
self.config.approximation = getattr(
mp.part.approximation, self.config.approximation)


class Head(mp.step.Experience):

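
The discounted-return computation in _compute_eligibilities is compact enough to restate standalone. A sketch under the assumption that the collapsed tail of the function reverses the list before use (the diff cuts off before the return statement):

def discounted_returns(rewards, discount):
    # Accumulate returns from the end of the episode backwards, then flip
    # the list so that returns[t] lines up with rewards[t].
    return_, returns = 0, []
    for reward in reversed(rewards):
        return_ = reward + discount * return_
        returns.append(return_)
    return list(reversed(returns))

# With the discount of 0.95 from definition/reinforce.yaml:
# discounted_returns([1, 1, 1], 0.95)  ->  [2.8525, 1.95, 1.0]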
5 changes: 3 additions & 2 deletions mindpark/model/model.py
@@ -11,7 +11,8 @@ class Model:
cost functions, compute gradients for costs, and apply gradients.
"""

def __init__(self, creator=None, load_path=None, threads=None):
def __init__(
self, creator=None, load_path=None, threads=None, clip_delta=10):
"""
Create a new model. Either load_path or creator must be specified.
@@ -22,7 +23,7 @@ def __init__(self, creator=None, load_path=None, threads=None):
of the model as default graph. After this function, no further
operations can be added to the graph.
"""
self._clip_delta = 10
self._clip_delta = clip_delta
self._graph = Graph(threads)
self._optimizer = None
if load_path:
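
The clipping threshold is now passed in instead of being hard-coded to 10. As an illustration only, not the actual Model internals: one common way such a delta is applied is element-wise gradient clipping before the update, sketched here with TensorFlow 1.x ops (clipped_minimize is a hypothetical helper):

import tensorflow as tf

def clipped_minimize(optimizer, cost, clip_delta):
    # Clip every gradient element to [-clip_delta, clip_delta] before
    # applying the update, skipping variables that received no gradient.
    grads_and_vars = optimizer.compute_gradients(cost)
    clipped = [(tf.clip_by_value(grad, -clip_delta, clip_delta), var)
               for grad, var in grads_and_vars if grad is not None]
    return optimizer.apply_gradients(clipped)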
129 changes: 93 additions & 36 deletions mindpark/part/approximation.py
@@ -6,39 +6,96 @@ def q_function(model, network, observs, actions, config=None):
"""
Action value approximation.
"""
# Perception.
state = model.add_input('state', observs)
hidden = network(model, state)
values = dense(hidden, actions, tf.identity)
values = model.add_output('values', values)
# Training.
action = model.add_input('action', type_=tf.int32)
action = tf.one_hot(action, actions)
return_ = model.add_input('return_')
model.add_output('value', tf.reduce_max(values, 1))
model.add_cost(
'cost', (tf.reduce_sum(action * values, 1) - return_) ** 2)


def value_policy_gradient(model, network, observs, actions, config):
"""
Policy gradient with value function baseline.
"""
# perception.
state = model.add_input('state', observs)
hidden = network(model, state)
value = model.add_output(
'value', tf.squeeze(dense(hidden, 1, tf.identity), [1]))
policy = dense(value, actions, tf.nn.softmax)
model.add_output(
'choice', tf.squeeze(tf.multinomial(tf.log(policy), 1), [1]))
# training.
action = model.add_input('action', type_=tf.int32)
action = tf.one_hot(action, actions)
return_ = model.add_input('return_')
logprob = tf.log(tf.reduce_sum(policy * action, 1) + 1e-13)
entropy = -tf.reduce_sum(tf.log(policy + 1e-13) * policy)
advantage = tf.stop_gradient(return_ - value)
actor = advantage * logprob + config.regularize * entropy
critic = config.scale_critic_loss * (return_ - value) ** 2 / 2
model.add_cost('cost', critic - actor)
with tf.variable_scope('behavior'):
state = model.add_input('state', observs)
hidden = network(model, state)
qvalues = dense(hidden, actions, tf.identity)
qvalues = model.add_output('qvalues', qvalues)
with tf.variable_scope('learning'):
action = model.add_input('action', type_=tf.int32)
action = tf.one_hot(action, actions)
return_ = model.add_input('return_')
model.add_output('value', tf.reduce_max(qvalues, 1))
model.add_cost(
'cost', (tf.reduce_sum(action * qvalues, 1) - return_) ** 2)


def policy_gradient(model, network, observs, actions, config):
"""
Vanilla policy gradient. Weights the log probability of the chosen action
by the experienced return, without an advantage baseline.
"""
with tf.variable_scope('behavior'):
state = model.add_input('state', observs)
hidden = network(model, state)
value = model.add_output(
'value', tf.squeeze(dense(hidden, 1, tf.identity), [1]))
policy = dense(value, actions, tf.nn.softmax)
model.add_output(
'choice', tf.squeeze(tf.multinomial(tf.log(policy), 1), [1]))
with tf.variable_scope('learning'):
action = model.add_input('action', type_=tf.int32)
action = tf.one_hot(action, actions)
return_ = model.add_input('return_')
logprob = tf.log(tf.reduce_sum(policy * action, 1) + 1e-13)
entropy = -tf.reduce_sum(tf.log(policy + 1e-13) * policy)
model.add_cost('cost', -(return_ * logprob + config.regularize * entropy))


def advantage_policy_gradient(model, network, observs, actions, config):
"""
Policy gradient of the advantage function. Estimates the advantage from a
learned value function and experienced returns.
"""
with tf.variable_scope('behavior'):
state = model.add_input('state', observs)
hidden = network(model, state)
value = model.add_output(
'value', tf.squeeze(dense(hidden, 1, tf.identity), [1]))
policy = dense(value, actions, tf.nn.softmax)
model.add_output(
'choice', tf.squeeze(tf.multinomial(tf.log(policy), 1), [1]))
with tf.variable_scope('learning'):
action = model.add_input('action', type_=tf.int32)
action = tf.one_hot(action, actions)
return_ = model.add_input('return_')
logprob = tf.log(tf.reduce_sum(policy * action, 1) + 1e-13)
entropy = -tf.reduce_sum(tf.log(policy + 1e-13) * policy)
advantage = tf.stop_gradient(return_ - value)
actor = advantage * logprob + config.regularize * entropy
critic = config.scale_critic_loss * (return_ - value) ** 2 / 2
model.add_cost('cost', critic - actor)


def approx_advantage_policy_gradient(model, network, observs, actions, config):
"""
Policy gradient of the advantage function. Estimates the advantage from
learned value and action-value functions.
"""
with tf.variable_scope('behavior'):
state = model.add_input('state', observs)
hidden = network(model, state)
value = model.add_output(
'value', tf.squeeze(dense(hidden, 1, tf.identity), [1]))
advantages = model.add_output(
'advantages', dense(hidden, actions, tf.identity))
policy = dense(value, actions, tf.nn.softmax)
model.add_output(
'choice', tf.squeeze(tf.multinomial(tf.log(policy), 1), [1]))
with tf.variable_scope('learning'):
action = model.add_input('action', type_=tf.int32)
return_ = model.add_input('return_')
action = tf.one_hot(action, actions)
with tf.variable_scope('value'):
critic_v = (return_ - value) ** 2 / 2
with tf.variable_scope('advantage'):
advantage = tf.reduce_sum(action * advantages, [1])
qvalue = value + advantage
critic_q = (return_ - qvalue) ** 2 / 2
with tf.variable_scope('policy'):
advantage = tf.stop_gradient(advantage)
logprob = tf.log(tf.reduce_sum(policy * action, 1) + 1e-13)
entropy = -tf.reduce_sum(tf.log(policy + 1e-13) * policy)
actor = advantage * logprob + config.regularize * entropy
critic = config.scale_critic_loss * (critic_v + critic_q)
model.add_cost('cost', critic - actor)
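
For reference, the cost that advantage_policy_gradient (the approximation selected in definition/reinforce.yaml above) asks the model to minimize, with c = scale_critic_loss, β = regularize, R the experienced return, and the advantage held constant by tf.stop_gradient:

\mathrm{cost} \;=\; \underbrace{c \cdot \tfrac{1}{2}\bigl(R - V(s)\bigr)^{2}}_{\text{critic}} \;-\; \underbrace{\Bigl[\bigl(R - V(s)\bigr)\,\log\pi(a \mid s) \;+\; \beta\,H\bigl(\pi(\cdot \mid s)\bigr)\Bigr]}_{\text{actor}}

Minimizing this shrinks the critic's squared error while raising the advantage-weighted log likelihood and the policy entropy, matching cost = critic - actor in the code above.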
4 changes: 2 additions & 2 deletions mindpark/part/network.py
@@ -75,6 +75,6 @@ def test(model, x):


def control(model, x):
x = dense(x, 32, tf.tanh)
x = dense(x, 32, tf.tanh)
x = dense(x, 100, tf.nn.relu)
x = dense(x, 50, tf.nn.relu)
return x
5 changes: 3 additions & 2 deletions mindpark/run/benchmark.py
@@ -35,7 +35,7 @@ def __call__(self, definition):
jobs = self._create_jobs(experiment, definition)
with ThreadPoolExecutor(max_workers=self._parallel) as executor:
for job in jobs:
executor.submit(job, self._lock)
executor.submit(job)
duration = round((time.time() - start) / 3600, 1)
self._log_finish(experiment, duration)

@@ -71,7 +71,8 @@ def _create_job(self, experiment, env_name, algo_def, repeat, definition):
(definition.epochs + 1) * definition.test_steps,
definition.epochs + 1, False)
prefix = '{} on {} ({}):'.format(algo_def.name, env_name, repeat)
return Job(train, test, env_name, algo_def, prefix, self._videos)
return Job(
train, test, env_name, algo_def, prefix, self._videos, self._lock)

def _start_experiment(self, name):
print_headline('Start experiment', style='=')
