Strengthen docstring for Adam optimizer.
PiperOrigin-RevId: 318953658
j2i2 authored and Copybara-Service committed Jun 30, 2020
1 parent dcf806d commit 07ee4f1
Showing 1 changed file with 16 additions and 11 deletions.
27 changes: 16 additions & 11 deletions trax/optimizers/adam.py
@@ -21,22 +21,27 @@
 
 
 class Adam(opt_base.Optimizer):
-  """Adam optimizer."""
+  """Adam optimizer; described in https://arxiv.org/abs/1412.6980."""
 
   def __init__(self, learning_rate, weight_decay_rate=1e-5,  # pylint: disable=useless-super-delegation
               b1=0.9, b2=0.999, eps=1e-5, clip_grad_norm=None):
-    """Create the Adam optimizer.
+    """Creates an Adam optimizer.
 
     Args:
-      learning_rate: a postitive scalar value for the initial learning rate.
-      weight_decay_rate: rate at which to decay weights.
-      b1: optional, a positive scalar value for beta_1, the exponential decay
-        rate for the first moment estimates (default 0.9).
-      b2: optional, a positive scalar value for beta_2, the exponential decay
-        rate for the second moment estimates (default 0.999).
-      eps: optional, a positive scalar value for epsilon, a small constant for
-        numerical stability (default 1e-5).
-      clip_grad_norm: the number used for gradient clipping.
+      learning_rate: Initial (unadapted) learning rate; original paper calls
+        this 'Stepsize' and suggests .001 as a generally good value.
+      weight_decay_rate: Fraction of prior weight values to subtract on each
+        step; equivalent to multiplying each weight element by
+        `1 - weight_decay_rate`. (This is not part of the core Adam
+        algorithm.)
+      b1: Positive scalar value for beta_1, the exponential decay rate for the
+        first moment estimates (default 0.9).
+      b2: Positive scalar value for beta_2, the exponential decay rate for the
+        second moment estimates (default 0.999).
+      eps: Positive scalar value for epsilon, a small constant for numerical
+        stability (default 1e-5).
+      clip_grad_norm: Threshold value above which gradient clipping occurs.
+        (This is not part of the core Adam algorithm.)
     """
     super(Adam, self).__init__(
         learning_rate=learning_rate,
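For reference, a minimal sketch of how the documented parameters are passed when constructing the optimizer, assuming the trax package is installed. The values simply echo the defaults and the paper-suggested learning rate quoted in the revised docstring; the variable name `opt` is illustrative, not from the commit.

from trax import optimizers

# Instantiate Adam with the values the revised docstring describes.
opt = optimizers.Adam(
    learning_rate=0.001,     # initial (unadapted) step size; the paper's 'Stepsize'
    weight_decay_rate=1e-5,  # multiplies each weight by 1 - 1e-5 per step (not core Adam)
    b1=0.9,                  # exponential decay rate for first-moment estimates
    b2=0.999,                # exponential decay rate for second-moment estimates
    eps=1e-5,                # small constant for numerical stability
    clip_grad_norm=None,     # set a float threshold to enable gradient clipping (not core Adam)
)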
