
Commit 1312ed1
Merge d0341fc into 2f8acfe

uralik committed May 19, 2016
2 parents 2f8acfe + d0341fc
Showing 2 changed files with 82 additions and 1 deletion.
77 changes: 77 additions & 0 deletions keras/optimizers.py
@@ -415,13 +415,90 @@ def get_config(self):
        return dict(list(base_config.items()) + list(config.items()))


class Nadam(Optimizer):
    '''Nesterov Adam optimizer: Adam ~ RMSprop + momentum,
    Nadam ~ RMSprop + Nesterov accelerated gradient (NAG).

    Default parameters follow those provided in the paper.

    # Arguments
        lr: float >= 0. Learning rate.
        beta_1/beta_2: floats, 0 < beta < 1. Generally close to 1.
        epsilon: float >= 0. Fuzz factor.

    # References
        [1] Nadam report - http://cs229.stanford.edu/proj2015/054_report.pdf
        [2] On the importance of initialization and momentum in deep learning -
            http://www.cs.toronto.edu/~fritz/absps/momentum.pdf
    '''
    def __init__(self, lr=0.002, beta_1=0.9, beta_2=0.999,
                 epsilon=1e-8, **kwargs):
        super(Nadam, self).__init__(**kwargs)
        self.__dict__.update(locals())
        self.iterations = K.variable(0.)
        self.m_schedule = K.variable(1.)
        self.lr = K.variable(lr)
        self.beta_1 = K.variable(beta_1)
        self.beta_2 = K.variable(beta_2)

    def get_updates(self, params, constraints, loss):
        grads = self.get_gradients(loss, params)
        self.updates = [(self.iterations, self.iterations + 1)]

        t = self.iterations + 1

        # Warming momentum schedule, following the recommendations in [2].
        schedule_decay = 0.004  # value given exactly in [1] and [2]
        momentum_cache_t = self.beta_1 * (1. - 0.5 * (K.pow(0.96, t * schedule_decay)))
        momentum_cache_t_1 = self.beta_1 * (1. - 0.5 * (K.pow(0.96, (t + 1) * schedule_decay)))
        m_schedule_new = self.m_schedule * momentum_cache_t
        m_schedule_next = self.m_schedule * momentum_cache_t * momentum_cache_t_1
        self.updates.append((self.m_schedule, m_schedule_new))

        ms = [K.variable(np.zeros(K.get_value(p).shape)) for p in params]
        vs = [K.variable(np.zeros(K.get_value(p).shape)) for p in params]

        self.weights = ms + vs

        for p, g, m, v in zip(params, grads, ms, vs):
            # the following equations are given in [1]
            g_prime = g / (1. - m_schedule_new)
            m_t = self.beta_1 * m + (1. - self.beta_1) * g
            m_t_prime = m_t / (1. - m_schedule_next)
            v_t = self.beta_2 * v + (1. - self.beta_2) * K.square(g)
            v_t_prime = v_t / (1. - K.pow(self.beta_2, t))
            m_t_bar = (1. - momentum_cache_t) * g_prime + momentum_cache_t_1 * m_t_prime

            self.updates.append((m, m_t))
            self.updates.append((v, v_t))

            p_t = p - self.lr * m_t_bar / (K.sqrt(v_t_prime) + self.epsilon)
            new_p = p_t

            # apply constraints
            if p in constraints:
                c = constraints[p]
                new_p = c(new_p)
            self.updates.append((p, new_p))
        return self.updates

    def get_config(self):
        config = {'lr': float(K.get_value(self.lr)),
                  'beta_1': float(K.get_value(self.beta_1)),
                  'beta_2': float(K.get_value(self.beta_2)),
                  'epsilon': self.epsilon}
        base_config = super(Nadam, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
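
For reference, the per-parameter update performed inside the loop of get_updates above can be written as a standalone NumPy sketch. Variable names mirror those in get_updates; the helper below is only illustrative and is not part of the patch (it treats t, the hyperparameters, and the running state as plain floats and arrays):

import numpy as np

def nadam_step(p, g, m, v, m_schedule, t,
               lr=0.002, beta_1=0.9, beta_2=0.999,
               epsilon=1e-8, schedule_decay=0.004):
    """One illustrative Nadam update for a single parameter array."""
    # Warming momentum schedule from [1] and [2].
    momentum_cache_t = beta_1 * (1. - 0.5 * 0.96 ** (t * schedule_decay))
    momentum_cache_t_1 = beta_1 * (1. - 0.5 * 0.96 ** ((t + 1) * schedule_decay))
    m_schedule_new = m_schedule * momentum_cache_t
    m_schedule_next = m_schedule_new * momentum_cache_t_1

    # Bias-corrected gradient and first/second moment estimates.
    g_prime = g / (1. - m_schedule_new)
    m_t = beta_1 * m + (1. - beta_1) * g
    m_t_prime = m_t / (1. - m_schedule_next)
    v_t = beta_2 * v + (1. - beta_2) * np.square(g)
    v_t_prime = v_t / (1. - beta_2 ** t)

    # Nesterov-style combination of the corrected gradient and momentum.
    m_t_bar = (1. - momentum_cache_t) * g_prime + momentum_cache_t_1 * m_t_prime

    p_new = p - lr * m_t_bar / (np.sqrt(v_t_prime) + epsilon)
    return p_new, m_t, v_t, m_schedule_new

Iterating this step with t starting at 1, m_schedule at 1.0, and m and v at zero reproduces the sequence of updates the optimizer applies.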


# aliases
sgd = SGD
rmsprop = RMSprop
adagrad = Adagrad
adadelta = Adadelta
adam = Adam
adamax = Adamax
nadam = Nadam


def get(identifier, kwargs=None):
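
With the lowercase alias registered above and resolved through get(), the new optimizer can be selected either by instance or by name when compiling a model. A minimal sketch, assuming the Sequential, Dense and Activation APIs imported in the test file below; the layer sizes and loss are illustrative:

from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.optimizers import Nadam

model = Sequential()
model.add(Dense(64, input_dim=20))
model.add(Activation('relu'))
model.add(Dense(10))
model.add(Activation('softmax'))

# Pass an instance to control the hyperparameters explicitly...
model.compile(loss='categorical_crossentropy', optimizer=Nadam(lr=0.002))

# ...or rely on the string alias, which is resolved to the Nadam class.
model.compile(loss='categorical_crossentropy', optimizer='nadam')
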
6 changes: 5 additions & 1 deletion tests/keras/test_optimizers.py
@@ -2,7 +2,7 @@
import pytest

from keras.utils.test_utils import get_test_data
from keras.optimizers import SGD, RMSprop, Adagrad, Adadelta, Adam, Adamax
from keras.optimizers import SGD, RMSprop, Adagrad, Adadelta, Adam, Adamax, Nadam
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.utils.np_utils import to_categorical
@@ -63,5 +63,9 @@ def test_adamax():
    _test_optimizer(Adamax())


def test_nadam():
    _test_optimizer(Nadam())


if __name__ == '__main__':
    pytest.main([__file__])
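
While iterating on the new optimizer, the test module's entry point can be narrowed to the Nadam case with pytest's keyword selection; a sketch, with the -k expression being illustrative:

import pytest

if __name__ == '__main__':
    # Run only the tests whose names match 'nadam' in this module.
    pytest.main(['-k', 'nadam', __file__])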
