
Commit 32375b8
Merge pull request #1580 from linmx0130/master
add AdaGrad optimizer in python, a revised version
piiswrong committed Mar 9, 2016 · 2 parents f91b2e5 + e20de1a
Showing 1 changed file with 36 additions and 0 deletions.
36 changes: 36 additions & 0 deletions python/mxnet/optimizer.py
@@ -546,6 +546,42 @@ def update(self, index, weight, grad, state):
mean[:] = mean_t
variance[:] = variance_t

@register
class AdaGrad(Optimizer):
    """AdaGrad optimizer of Duchi et al., 2011.

    This implementation follows Eq. (5) of http://arxiv.org/pdf/1212.5701v1.pdf
    (Matthew D. Zeiler, 2012). AdaGrad can help the network converge faster
    in some cases.

    Parameters
    ----------
    learning_rate : float, optional
        Step size. Default value is 0.05.
    wd : float, optional
        L2 regularization coefficient added to all the weights.
    rescale_grad : float, optional
        Rescaling factor of the gradient.
    eps : float, optional
        A small constant added for numerical stability.
        Default value is 1e-7.
    """
    def __init__(self, learning_rate=0.05, wd=0., rescale_grad=1, eps=1e-7, arg_names=None):
        super(AdaGrad, self).__init__(rescale_grad, arg_names, wd)
        self.lr = learning_rate
        self.float_stable_eps = eps
        self.rescale_grad = rescale_grad

    def create_state(self, index, weight):
        # Accumulated sum of squared gradients (the AdaGrad "history").
        return zeros(weight.shape, weight.context)

    def update(self, index, weight, grad, state):
        assert(isinstance(weight, NDArray))
        assert(isinstance(grad, NDArray))
        grad = grad * self.rescale_grad
        history = state
        history[:] += (grad * grad)
        weight[:] += -self.lr * (grad / sqrt(history + self.float_stable_eps) +
                                 self.wd * weight)

@register
class RMSProp(Optimizer):
"""RMSProp optimizer of Tieleman & Hinton, 2012,
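For context, below is a minimal standalone sketch of the same AdaGrad update rule in plain NumPy, independent of MXNet (the function name adagrad_update and the toy arrays are illustrative, not part of this patch): the per-parameter history accumulates squared gradients, and each step divides the gradient by the square root of that history plus eps, with L2 weight decay applied directly to the weights.

import numpy as np

def adagrad_update(weight, grad, history, lr=0.05, wd=0.0,
                   rescale_grad=1.0, eps=1e-7):
    # One AdaGrad step mirroring the update in this diff (illustrative only).
    grad = grad * rescale_grad
    history += grad * grad                      # accumulate squared gradients
    weight -= lr * (grad / np.sqrt(history + eps) + wd * weight)
    return weight, history

# Toy usage on f(w) = sum(w**2): each parameter's effective step size
# shrinks as its own squared-gradient history grows.
w = np.array([1.0, -2.0])
h = np.zeros_like(w)
for _ in range(3):
    g = 2 * w
    w, h = adagrad_update(w, g, h)

Because the accumulated history only grows, the effective per-parameter learning rate decays over training; eps guards against division by zero before any gradient has been accumulated.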
