-
Notifications
You must be signed in to change notification settings - Fork 230
/
update.jl
608 lines (486 loc) · 22.5 KB
/
update.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
export update!, clone, optimizers, SGD, Sgd, sgd, sgd!, Momentum, momentum, momentum!, Nesterov, nesterov, nesterov!, Adagrad, adagrad, adagrad!, Rmsprop, rmsprop, rmsprop!, Adadelta, adadelta, adadelta!, Adam, adam, adam!
using LinearAlgebra: norm, lmul!, axpy!
using AutoGrad: Param, full
# These types define per-parameter config and state for various optimization algorithms. The
# lowercase functions create an optimization iterator and the bang versions run the iterator
# both calling minimize. Minimize takes the given type as a global default and attaches a copy
# of it to any parameter's Param.opt if not already set. One can override this default by
# pre-setting Param.opt of a parameter, in which case it will not be overridden by minimize.
# TODO: handle common tasks like gclip and decay in minimize?
# TODO: use common/consistent keyword arg names.
# https://jlmelville.github.io/mize/nesterov.html
"""
minimize(func, data, optimizer=Adam(); params)
sgd (func, data; lr=0.1, gclip, params)
momentum(func, data; lr=0.05, gamma=0.95, gclip, params)
nesterov(func, data; lr=0.05, gamma=0.95, gclip, params)
adagrad (func, data; lr=0.05, eps=1e-6, gclip, params)
rmsprop (func, data; lr=0.01, rho=0.9, eps=1e-6, gclip, params)
adadelta(func, data; lr=1.0, rho=0.9, eps=1e-6, gclip, params)
adam (func, data; lr=0.001, beta1=0.9, beta2=0.999, eps=1e-8, gclip, params)
Return an iterator which applies `func` to arguments in `data`, i.e. `(func(args...) for
args in data)`, and updates the parameters every iteration to minimize `func`. `func`
should return a scalar value.
The common keyword argument `params` can be used to list the `Param`s to be optimized. If
not specified, any `Param` that takes part in the computation of `func(args...)` will be
updated.
The common keyword argument `gclip` can be used to implement per-parameter gradient
clipping. For a parameter gradient `g`, if `norm(g) > gclip > 0`, `g` is scaled so that its
norm is equal to `gclip`. If not specified no gradient clipping is performed.
These functions do not perform optimization, but return an iterator that can. Any function
that produces values from an iterator can be used with such an object, e.g.
`progress!(sgd(f,d))` iterates the sgd optimizer and displays a progress bar. For convenience,
appending `!` to the name of the function iterates and returns `nothing`, i.e. `sgd!(...)` is
equivalent to `(for x in sgd(...) end)`.
We define optimizers as lazy iterators to have explicit control over them:
* To report progress use `progress(sgd(f,d))`.
* To run until convergence use `converge(sgd(f,cycle(d)))`.
* To run multiple epochs use `sgd(f,repeat(d,n))`.
* To run a given number of iterations use `sgd(f,take(cycle(d),n))`.
* To do a task every n iterations use `(task() for (i,j) in enumerate(sgd(f,d)) if i%n == 1)`.
These functions apply the same algorithm with the same configuration to every parameter by
default. `minimize` takes an explicit optimizer argument, all others call `minimize` with an
appropriate optimizer argument (see `@doc update!` for a list of possible optimizers). Before
calling [`update!`](@ref) on a `Param`, `minimize` sets its `opt` field to a copy of this
default optimizer if it is not already set. The `opt` field is used by the `update!` function
to determine the type of update performed on that parameter. If you need finer grained
control, you can set the optimizer of an individual `Param` by setting its `opt` field before
calling one of these functions. They will not override the `opt` field if it is already set,
e.g. `sgd(model,data)` will perform an `Adam` update for a parameter whose `opt` field is an
`Adam` object. This also means you can stop and start the training without losing optimization
state, the first call will set the `opt` fields and the subsequent calls will not override
them.
Given a parameter `w` and its gradient `g` here are the updates applied by each optimizer:
# sgd (http://en.wikipedia.org/wiki/Stochastic_gradient_descent)
w .= w - lr * g
# momentum (http://jlmelville.github.io/mize/nesterov.html)
v .= gamma * v - lr * g
w .= w + v
# nesterov (http://jlmelville.github.io/mize/nesterov.html)
w .= w - gamma * v
v .= gamma * v - lr * g
w .= w + (1 + gamma) * v
# adagrad (http://www.jmlr.org/papers/v12/duchi11a.html)
G .= G + g .^ 2
w .= w - lr * g ./ sqrt(G + eps)
# rmsprop (http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf)
G .= rho * G + (1-rho) * g .^ 2
w .= w - lr * g ./ sqrt(G + eps)
# adadelta (http://arxiv.org/abs/1212.5701)
G .= rho * G + (1-rho) * g .^ 2
update = sqrt(delta + eps) .* g ./ sqrt(G + eps)
w = w - lr * update
delta = rho * delta + (1-rho) * update .^ 2
# adam (http://arxiv.org/abs/1412.6980)
v = beta1 * v + (1 - beta1) * g
G = beta2 * G + (1 - beta2) * g .^ 2
vhat = v ./ (1 - beta1 ^ t)
Ghat = G ./ (1 - beta2 ^ t)
w = w - (lr / (sqrt(Ghat) + eps)) * vhat
"""
minimize, minimize!, sgd, sgd!, momentum, momentum!, nesterov, nesterov!, adagrad, adagrad!, rmsprop, rmsprop!, adadelta, adadelta!, adam, adam!
"""
SGD(;lr=0.1,gclip=0)
update!(w,g,p::SGD)
update!(w,g;lr=0.1)
Container for parameters of the Stochastic gradient descent (SGD)
optimization algorithm used by [`update!`](@ref).
SGD is an optimization technique to minimize an objective function by
updating its weights in the opposite direction of their gradient. The
learning rate (lr) determines the size of the step. SGD updates the
weights with the following formula:
w = w - lr * g
where `w` is a weight array, `g` is the gradient of the loss function
w.r.t `w` and `lr` is the learning rate.
If `norm(g) > gclip > 0`, `g` is scaled so that its norm is equal
to `gclip`. If `gclip==0` no scaling takes place.
SGD is used by default if no algorithm is specified in the two
argument version of [`update!`](@ref).
"""
mutable struct SGD
    lr::AbstractFloat     # learning rate (step size)
    gclip::AbstractFloat  # gradient-norm clipping threshold; 0 disables clipping
    # TODO: should gclip, decay etc be global?
end

# Default learning rate used by `SGD`, `sgd` and the two-argument `update!`.
const SGDLR = 0.1

# Keyword constructor.
function SGD(; lr=SGDLR, gclip=0)
    return SGD(lr, gclip)
end

# Lazy training iterator: passes a new SGD configuration to `minimize`.
function sgd(f, d; lr=SGDLR, gclip=0, o...)
    optimizer = SGD(lr, gclip)
    return minimize(f, d, optimizer; o...)
end

# Eager variant: exhausts the `sgd` iterator and returns `nothing`.
function sgd!(x...; o...)
    for _ in sgd(x...; o...)
    end
    return nothing
end

# New SGD object carrying the same hyperparameters.
function clone(s::SGD)
    return SGD(s.lr, s.gclip)
end

# Deprecated spelling of `SGD`; warns once and forwards all arguments.
function Sgd(x...; o...)
    @warn "Sgd is deprecated, use SGD instead." maxlog=1
    return SGD(x...; o...)
end
"""
Momentum(;lr=0.05, gclip=0, gamma=0.95)
update!(w,g,p::Momentum)
Container for parameters of the Momentum optimization algorithm used
by [`update!`](@ref).
The Momentum method tries to accelerate SGD by adding a velocity term
to the update. This also decreases the oscillation between successive
steps. It updates the weights with the following formulas:
velocity = gamma * velocity - lr * g
w = w + velocity
where `w` is a weight array, `g` is the gradient of the objective
function w.r.t `w`, `lr` is the learning rate, `gamma` is the momentum
parameter, `velocity` is an array with the same size and type of `w`
and holds the accelerated gradients.
If `norm(g) > gclip > 0`, `g` is scaled so that its norm is equal
to `gclip`. If `gclip==0` no scaling takes place.
Reference: [Qian,
N. (1999)](http://doi.org/10.1016/S0893-6080(98)00116-6). On the
momentum term in gradient descent learning algorithms. Neural
Networks : The Official Journal of the International Neural Network
Society, 12(1), 145–151.
"""
mutable struct Momentum
    lr::AbstractFloat     # learning rate
    gamma::AbstractFloat  # momentum coefficient
    gclip::AbstractFloat  # gradient-norm clipping threshold; 0 disables clipping
    velocity              # state buffer; `nothing` until the first update allocates it
end

# Keyword constructor; the velocity buffer starts out unallocated.
function Momentum(; lr=0.05, gclip=0, gamma=0.95)
    return Momentum(lr, gamma, gclip, nothing)
end

# Lazy training iterator: passes a new Momentum configuration to `minimize`.
function momentum(f, d; lr=0.05, gclip=0, gamma=0.95, o...)
    optimizer = Momentum(lr, gamma, gclip, nothing)
    return minimize(f, d, optimizer; o...)
end

# Eager variant: exhausts the `momentum` iterator and returns `nothing`.
function momentum!(x...; o...)
    for _ in momentum(x...; o...)
    end
    return nothing
end

# Same hyperparameters, no optimization state.
function clone(m::Momentum)
    return Momentum(m.lr, m.gamma, m.gclip, nothing)
end
"""
Nesterov(; lr=0.05, gclip=0, gamma=0.95)
update!(w,g,p::Nesterov)
Container for parameters of Nesterov's momentum optimization algorithm used
by [`update!`](@ref).
It is similar to standard [`Momentum`](@ref) but with a slightly different update
rule:
velocity = gamma * velocity_old - lr * g
w = w_old - gamma * velocity_old + (1+gamma) * velocity
where `w` is a weight array, `g` is the gradient of the objective
function w.r.t `w`, `lr` is the learning rate, `gamma` is the momentum
parameter, `velocity` is an array with the same size and type of `w`
and holds the accelerated gradients.
If `norm(g) > gclip > 0`, `g` is scaled so that its norm is equal
to `gclip`. If `gclip == 0` no scaling takes place.
Reference Implementation : [Yoshua Bengio, Nicolas Boulanger-Lewandowski and Razvan
Pascanu](https://arxiv.org/pdf/1212.0901.pdf)
"""
mutable struct Nesterov
    lr::AbstractFloat     # learning rate
    gamma::AbstractFloat  # momentum coefficient
    gclip::AbstractFloat  # gradient-norm clipping threshold; 0 disables clipping
    velocity              # state buffer; `nothing` until the first update allocates it
end

# Keyword constructor; the velocity buffer starts out unallocated.
function Nesterov(; lr=0.05, gclip=0, gamma=0.95)
    return Nesterov(lr, gamma, gclip, nothing)
end

# Lazy training iterator: passes a new Nesterov configuration to `minimize`.
function nesterov(f, d; lr=0.05, gclip=0, gamma=0.95, o...)
    optimizer = Nesterov(lr, gamma, gclip, nothing)
    return minimize(f, d, optimizer; o...)
end

# Eager variant: exhausts the `nesterov` iterator and returns `nothing`.
function nesterov!(x...; o...)
    for _ in nesterov(x...; o...)
    end
    return nothing
end

# Same hyperparameters, no optimization state.
function clone(m::Nesterov)
    return Nesterov(m.lr, m.gamma, m.gclip, nothing)
end
"""
Adagrad(;lr=0.05, gclip=0, eps=1e-6)
update!(w,g,p::Adagrad)
Container for parameters of the Adagrad optimization algorithm used by
[`update!`](@ref).
Adagrad is one of the methods that adapts the learning rate to each of
the weights. It stores the sum of the squares of the gradients to
scale the learning rate. The learning rate is adapted for each weight
by the value of current gradient divided by the accumulated
gradients. Hence, the learning rate is greater for the parameters
where the accumulated gradients are small and the learning rate is
small if the accumulated gradients are large. It updates the weights
with the following formulas:
G = G + g .^ 2
w = w - g .* lr ./ sqrt(G + eps)
where `w` is the weight, `g` is the gradient of the objective function
w.r.t `w`, `lr` is the learning rate, `G` is an array with the same
size and type of `w` and holds the sum of the squares of the
gradients. `eps` is a small constant to prevent a zero value in the
denominator.
If `norm(g) > gclip > 0`, `g` is scaled so that its norm is equal
to `gclip`. If `gclip==0` no scaling takes place.
Reference: [Duchi, J., Hazan, E., & Singer,
Y. (2011)](http://jmlr.org/papers/v12/duchi11a.html). Adaptive
Subgradient Methods for Online Learning and Stochastic Optimization.
Journal of Machine Learning Research, 12, 2121–2159.
"""
mutable struct Adagrad
    lr::AbstractFloat     # learning rate
    eps::AbstractFloat    # small constant added under the square root
    gclip::AbstractFloat  # gradient-norm clipping threshold; 0 disables clipping
    G                     # accumulated squared gradients; `nothing` until first update
end

# Keyword constructor; the accumulator starts out unallocated.
function Adagrad(; lr=0.05, gclip=0, eps=1e-6)
    return Adagrad(lr, eps, gclip, nothing)
end

# Lazy training iterator: passes a new Adagrad configuration to `minimize`.
function adagrad(f, d; lr=0.05, gclip=0, eps=1e-6, o...)
    optimizer = Adagrad(lr, eps, gclip, nothing)
    return minimize(f, d, optimizer; o...)
end

# Eager variant: exhausts the `adagrad` iterator and returns `nothing`.
function adagrad!(x...; o...)
    for _ in adagrad(x...; o...)
    end
    return nothing
end

# Same hyperparameters, no optimization state.
function clone(a::Adagrad)
    return Adagrad(a.lr, a.eps, a.gclip, nothing)
end
"""
Rmsprop(;lr=0.01, gclip=0, rho=0.9, eps=1e-6)
update!(w,g,p::Rmsprop)
Container for parameters of the Rmsprop optimization algorithm used by
[`update!`](@ref).
Rmsprop scales the learning rates by dividing by the root mean square of
the gradients. It updates the weights with the following formula:
G = (1-rho) * g .^ 2 + rho * G
w = w - lr * g ./ sqrt(G + eps)
where `w` is the weight, `g` is the gradient of the objective function
w.r.t `w`, `lr` is the learning rate, `G` is an array with the same
size and type of `w` and holds a running average of the squares of the
gradients. `eps` is a small constant to prevent a zero value in the
denominator. `rho` is the decay rate of the running average.
If `norm(g) > gclip > 0`, `g` is scaled so that its norm is equal
to `gclip`. If `gclip==0` no scaling takes place.
Reference: [Tijmen Tieleman and Geoffrey Hinton
(2012)](https://dirtysalt.github.io/images/nn-class-lec6.pdf). "Lecture
6.5-rmsprop: Divide the gradient by a running average of its recent
magnitude." COURSERA: Neural Networks for Machine Learning 4.2.
"""
mutable struct Rmsprop
    lr::AbstractFloat     # learning rate
    rho::AbstractFloat    # decay rate of the squared-gradient average
    eps::AbstractFloat    # small constant added under the square root
    gclip::AbstractFloat  # gradient-norm clipping threshold; 0 disables clipping
    G                     # squared-gradient average; `nothing` until first update
end

# Keyword constructor; the accumulator starts out unallocated.
function Rmsprop(; lr=0.01, gclip=0, rho=0.9, eps=1e-6)
    return Rmsprop(lr, rho, eps, gclip, nothing)
end

# Lazy training iterator: passes a new Rmsprop configuration to `minimize`.
function rmsprop(f, d; lr=0.01, gclip=0, rho=0.9, eps=1e-6, o...)
    optimizer = Rmsprop(lr, rho, eps, gclip, nothing)
    return minimize(f, d, optimizer; o...)
end

# Eager variant: exhausts the `rmsprop` iterator and returns `nothing`.
function rmsprop!(x...; o...)
    for _ in rmsprop(x...; o...)
    end
    return nothing
end

# Same hyperparameters, no optimization state.
function clone(r::Rmsprop)
    return Rmsprop(r.lr, r.rho, r.eps, r.gclip, nothing)
end
"""
Adadelta(;lr=1.0, gclip=0, rho=0.9, eps=1e-6)
update!(w,g,p::Adadelta)
Container for parameters of the Adadelta optimization algorithm used by
[`update!`](@ref).
Adadelta is an extension of Adagrad that tries to prevent the decrease
of the learning rates to zero as training progresses. It scales the
learning rate based on the accumulated gradients like Adagrad and
holds the acceleration term like Momentum. It updates the weights with
the following formulas:
G = (1-rho) * g .^ 2 + rho * G
update = g .* sqrt(delta + eps) ./ sqrt(G + eps)
w = w - lr * update
delta = rho * delta + (1-rho) * update .^ 2
where `w` is the weight, `g` is the gradient of the objective function
w.r.t `w`, `lr` is the learning rate, `G` is an array with the same
size and type of `w` and holds the sum of the squares of the
gradients. `eps` is a small constant to prevent a zero value in the
denominator. `rho` is the momentum parameter and `delta` is an array
with the same size and type of `w` and holds the sum of the squared
updates.
If `norm(g) > gclip > 0`, `g` is scaled so that its norm is equal
to `gclip`. If `gclip==0` no scaling takes place.
Reference: [Zeiler,
M. D. (2012)](http://arxiv.org/abs/1212.5701). ADADELTA: An Adaptive
Learning Rate Method.
"""
mutable struct Adadelta
    lr::AbstractFloat     # learning rate
    rho::AbstractFloat    # decay rate of both running averages
    eps::AbstractFloat    # small constant added under the square roots
    gclip::AbstractFloat  # gradient-norm clipping threshold; 0 disables clipping
    G                     # squared-gradient average; `nothing` until first update
    delta                 # squared-update average; `nothing` until first update
end

# Keyword constructor; both accumulators start out unallocated.
function Adadelta(; lr=1.0, gclip=0, rho=0.9, eps=1e-6)
    return Adadelta(lr, rho, eps, gclip, nothing, nothing)
end

# Lazy training iterator: passes a new Adadelta configuration to `minimize`.
function adadelta(f, d; lr=1.0, gclip=0, rho=0.9, eps=1e-6, o...)
    optimizer = Adadelta(lr, rho, eps, gclip, nothing, nothing)
    return minimize(f, d, optimizer; o...)
end

# Eager variant: exhausts the `adadelta` iterator and returns `nothing`.
function adadelta!(x...; o...)
    for _ in adadelta(x...; o...)
    end
    return nothing
end

# Same hyperparameters, no optimization state.
function clone(a::Adadelta)
    return Adadelta(a.lr, a.rho, a.eps, a.gclip, nothing, nothing)
end
"""
Adam(;lr=0.001, gclip=0, beta1=0.9, beta2=0.999, eps=1e-8)
update!(w,g,p::Adam)
Container for parameters of the Adam optimization algorithm used by
[`update!`](@ref).
Adam is one of the methods that compute an adaptive learning rate. It
stores running averages of the gradients (first moment) and of the
squared gradients (second moment), and rescales both moments as a
function of time. Here are the update formulas:
m = beta1 * m + (1 - beta1) * g
v = beta2 * v + (1 - beta2) * g .* g
mhat = m ./ (1 - beta1 ^ t)
vhat = v ./ (1 - beta2 ^ t)
w = w - (lr / (sqrt(vhat) + eps)) * mhat
where `w` is the weight, `g` is the gradient of the objective function
w.r.t `w`, `lr` is the learning rate, `m` is an array with the same
size and type of `w` and holds the accumulated gradients. `v` is an
array with the same size and type of `w` and holds the sum of the
squares of the gradients. `eps` is a small constant to prevent a zero
denominator. `beta1` and `beta2` are the parameters to calculate bias
corrected first and second moments. `t` is the update count.
If `norm(g) > gclip > 0`, `g` is scaled so that its norm is equal
to `gclip`. If `gclip==0` no scaling takes place.
Reference: [Kingma, D. P., & Ba,
J. L. (2015)](https://arxiv.org/abs/1412.6980). Adam: a Method for
Stochastic Optimization. International Conference on Learning
Representations, 1–13.
"""
mutable struct Adam
    lr::AbstractFloat     # learning rate
    beta1::AbstractFloat  # decay rate of the first-moment average
    beta2::AbstractFloat  # decay rate of the second-moment average
    eps::AbstractFloat    # small constant added to the denominator
    t::Int                # number of updates performed so far
    gclip::AbstractFloat  # gradient-norm clipping threshold; 0 disables clipping
    fstm                  # first-moment buffer; `nothing` until first update
    scndm                 # second-moment buffer; `nothing` until first update
end

# Keyword constructor; update count is zero and both buffers are unallocated.
function Adam(; lr=0.001, gclip=0, beta1=0.9, beta2=0.999, eps=1e-8)
    return Adam(lr, beta1, beta2, eps, 0, gclip, nothing, nothing)
end

# Lazy training iterator: passes a new Adam configuration to `minimize`.
function adam(f, d; lr=0.001, gclip=0, beta1=0.9, beta2=0.999, eps=1e-8, o...)
    optimizer = Adam(lr, beta1, beta2, eps, 0, gclip, nothing, nothing)
    return minimize(f, d, optimizer; o...)
end

# Eager variant: exhausts the `adam` iterator and returns `nothing`.
function adam!(x...; o...)
    for _ in adam(x...; o...)
    end
    return nothing
end

# Same hyperparameters; update count and moment buffers are reset.
function clone(a::Adam)
    return Adam(a.lr, a.beta1, a.beta2, a.eps, 0, a.gclip, nothing, nothing)
end
"""
update!(weights::Param, gradients)
update!(weights, gradients; lr=0.1, gclip=0)
update!(weights, gradients, optimizers)
Update the `weights` using their `gradients` and the optimization algorithms specified using
(1) the `opt` field of a `Param`, (2) keyword arguments, (3) the third argument.
`weights` can be an individual `Param`, numeric array, or a collection of arrays/Params
represented by an iterator or dictionary. `gradients` should be a matching individual array or
collection. In the first form, the optimizer should be specified in `weights.opt`. In the
second form the optimizer defaults to [`SGD`](@ref) with learning rate `lr` and gradient clip
`gclip`. In the third form `optimizers` should be a matching individual optimizer or
collection of optimizers. The `weights` and possibly `gradients` and `optimizers` are
modified in-place.
Individual optimization parameters can be one of the following types. The keyword arguments
for each constructor and their default values are listed as well.
* [`SGD`](@ref)`(;lr=0.1, gclip=0)`
* [`Momentum`](@ref)`(;lr=0.05, gamma=0.95, gclip=0)`
* [`Nesterov`](@ref)`(;lr=0.05, gamma=0.95, gclip=0)`
* [`Adagrad`](@ref)`(;lr=0.05, eps=1e-6, gclip=0)`
* [`Rmsprop`](@ref)`(;lr=0.01, rho=0.9, eps=1e-6, gclip=0)`
* [`Adadelta`](@ref)`(;lr=1.0, rho=0.9, eps=1e-6, gclip=0)`
* [`Adam`](@ref)`(;lr=0.001, beta1=0.9, beta2=0.999, eps=1e-8, gclip=0)`
# Example:
w = Param(rand(d), Adam()) # a Param with a specified optimizer
g = lossgradient0(w) # gradient g has the same shape as w
update!(w, g) # update w in-place with Adam()
w = rand(d) # an individual weight array
g = lossgradient1(w) # gradient g has the same shape as w
update!(w, g) # update w in-place with SGD()
update!(w, g; lr=0.1) # update w in-place with SGD(lr=0.1)
update!(w, g, SGD(lr=0.1)) # update w in-place with SGD(lr=0.1)
w = (rand(d1), rand(d2)) # a tuple of weight arrays
g = lossgradient2(w) # g will also be a tuple
p = (Adam(), SGD()) # p has optimizers for each w[i]
update!(w, g, p) # update each w[i] in-place with g[i],p[i]
w = Any[rand(d1), rand(d2)] # any iterator can be used
g = lossgradient3(w) # g will be similar to w
p = Any[Adam(), SGD()] # p should be an iterator of same length
update!(w, g, p) # update each w[i] in-place with g[i],p[i]
w = Dict(:a => rand(d1), :b => rand(d2)) # dictionaries can be used
g = lossgradient4(w)
p = Dict(:a => Adam(), :b => SGD())
update!(w, g, p)
"""
function update!(x::Param, g)
    opt = x.opt
    if opt === nothing
        # No optimizer attached: fall back to the two-argument default on the raw value.
        return update!(x.value, g)
    end
    return update!(x.value, g, opt)
end

# AutoGrad may return Nothing for a zero gradient
function update!(w::Param, g::Nothing)
    return w
end

# Two arg version defaults to SGD
function update!(w, g; lr=SGDLR, gclip=0)
    return update!(w, g, SGD(lr, gclip))
end

# A missing gradient leaves the weights untouched, whatever keywords are given.
function update!(w, g::Nothing; o...)
    return w
end
# This fallback takes care of arrays, tuples, iterators in general: it walks the three
# collections in lockstep and updates each element with its own gradient and optimizer.
function update!(w, g, p)
    # Containers of plain bits elements should have been caught by a more specific method.
    isbitstype(eltype(w)) &&
        error("Bad args: $((typeof(w),typeof(g),typeof(p)))")
    # A single SGD comes from the 2-arg version; reuse it for every element of w.
    opts = p isa SGD ? (p for _ in w) : p
    length(w) == length(g) == length(opts) ||
        error("weight, gradient, and optimization parameters not the same length.")
    for (wi, gi, oi) in zip(w, g, opts)
        update!(wi, gi, oi)
    end
    return nothing
end

# A missing gradient leaves the weights untouched.
function update!(w, g::Nothing, p)
    return w
end
# We still need an extra method for Dict: entries are matched by key, not by position.
function update!(w::AbstractDict, g::AbstractDict, p::AbstractDict)
    # Driven by the keys of g, which may have fewer keys than w.
    foreach(keys(g)) do key
        update!(w[key], g[key], p[key])
    end
    return nothing
end
# Generic three arg version for float arrays
# Fix #579: leave g untyped, it can be Sparse.
function update!(w::AbstractArray{<:Number}, g, p)
    return gclip_update!(w, g, p)
end

# Clip the gradient in place, convert it with `full`, then run the optimizer-specific
# `_update!`.
function gclip_update!(w, g, p)
    gclip!(g, p.gclip)  # gclip! supports AutoGrad.Sparse
    return _update!(w, full(g), p)
end
# Plain gradient step, in place: w = w - lr * g.
_update!(w, g, p::SGD) = axpy!(-p.lr, g, w)
# Classical momentum, in place:
#   velocity = gamma * velocity - lr * g
#   w        = w + velocity
function _update!(w, g, p::Momentum)
    # Allocate the velocity buffer on first use, shaped like w.
    p.velocity === nothing && (p.velocity = zero(w))
    vel = p.velocity
    lmul!(p.gamma, vel)
    axpy!(-p.lr, g, vel)
    return axpy!(1, vel, w)
end
# https://arxiv.org/pdf/1212.0901.pdf Eq. (7)
function _update!(w, g, p::Nesterov)
    # Allocate the velocity buffer on first use, shaped like w.
    if p.velocity === nothing
        p.velocity = zero(w)
    end
    vel = p.velocity
    lmul!(p.gamma, vel)                # vel = gamma * vel_old
    axpy!(-1, vel, w)                  # w   = w - gamma * vel_old
    axpy!(-p.lr, g, vel)               # vel = gamma * vel_old - lr * g
    return axpy!(1 + p.gamma, vel, w)  # w   = w + (1 + gamma) * vel
end
# Adagrad, in place:
#   G = G + g .^ 2
#   w = w - lr * g ./ sqrt(G + eps)
function _update!(w, g, p::Adagrad)
    # Allocate the accumulator on first use, shaped like w.
    if p.G === nothing
        p.G = zero(w)
    end
    tiny = eltype(w)(p.eps)  # eps converted to the element type of w
    axpy!(1, g .* g, p.G)
    step = g ./ sqrt.(p.G .+ tiny)
    return axpy!(-p.lr, step, w)
end
# Rmsprop, in place:
#   G = rho * G + (1 - rho) * g .^ 2
#   w = w - lr * g ./ sqrt(G + eps)
function _update!(w, g, p::Rmsprop)
    # Allocate the accumulator on first use, shaped like w.
    if p.G === nothing
        p.G = zero(w)
    end
    tiny = eltype(w)(p.eps)  # eps converted to the element type of w
    lmul!(p.rho, p.G)
    axpy!(1 - p.rho, g .* g, p.G)
    step = g ./ sqrt.(p.G .+ tiny)
    return axpy!(-p.lr, step, w)
end
# Adadelta, in place:
#   G     = rho * G + (1 - rho) * g .^ 2
#   dw    = g .* sqrt(delta + eps) ./ sqrt(G + eps)   (uses the previous delta)
#   delta = rho * delta + (1 - rho) * dw .^ 2
#   w     = w - lr * dw
function _update!(w, g, p::Adadelta)
    # Allocate both accumulators on first use, shaped like w.
    if p.G === nothing
        p.G = zero(w)
        p.delta = zero(w)
    end
    tiny = eltype(w)(p.eps)  # eps converted to the element type of w
    lmul!(p.rho, p.G)
    axpy!(1 - p.rho, g .* g, p.G)
    dw = g .* sqrt.(p.delta .+ tiny) ./ sqrt.(p.G .+ tiny)
    lmul!(p.rho, p.delta)
    axpy!(1 - p.rho, dw .* dw, p.delta)
    return axpy!(-p.lr, dw, w)
end
# Adam, in place:
#   fstm  = beta1 * fstm  + (1 - beta1) * g
#   scndm = beta2 * scndm + (1 - beta2) * g .^ 2
#   w     = w - lr * mhat ./ (sqrt(vhat) + eps), with bias-corrected mhat, vhat
function _update!(w, g, p::Adam)
    T = eltype(w)
    # Allocate both moment buffers on first use, shaped like w.
    if p.fstm === nothing
        p.fstm = zero(w)
        p.scndm = zero(w)
    end
    p.t += 1
    lmul!(p.beta1, p.fstm)
    axpy!(1 - p.beta1, g, p.fstm)
    lmul!(p.beta2, p.scndm)
    axpy!(1 - p.beta2, g .* g, p.scndm)
    # Bias correction depends on the update count t.
    mhat = p.fstm / T(1 - p.beta1 ^ p.t)
    vhat = p.scndm / T(1 - p.beta2 ^ p.t)
    return axpy!(-p.lr, mhat ./ (sqrt.(vhat) .+ T(p.eps)), w)
end
# Rescale the gradient `g` in place so that its norm does not exceed `gclip`.
#
# Clipping applies only when `norm(g) > gclip > 0`. A zero or negative `gclip` disables
# clipping; a negative value used to scale `g` by a negative factor and flip its sign.
# Returns `g` in every case.
function gclip!(g, gclip)
    gclip <= 0 && return g       # no positive threshold: nothing to clip
    gnorm = norm(g)
    gnorm <= gclip && return g   # already within the allowed norm
    return lmul!(gclip / gnorm, g)
end
"""
optimizers(model, otype; options...)
Given parameters of a `model`, initialize and return corresponding
optimization parameters for a given optimization type `otype` and
optimization options `options`. This is useful because each numeric
array in model needs its own distinct optimization
parameter. `optimizers` makes the creation of optimization parameters
that parallel model parameters easy when all of them use the same type
and options.
"""
function optimizers(x...; o...)
    @warn "optimizers is deprecated, use sgd, adam etc. instead." maxlog=1
    return _optimizers(x...; o...)
end

# Leaf case: a numeric array gets one optimizer built from the keyword options.
_optimizers(::AbstractArray{<:Number}, otype; o...) = otype(; o...)

# Dictionaries: same keys, each value replaced by its optimizer structure.
function _optimizers(a::AbstractDict, otype; o...)
    return Dict([k => _optimizers(v, otype; o...) for (k, v) in a])
end

# Tuples: recurse element-wise.
function _optimizers(a::Tuple, otype; o...)
    return map(a) do x
        _optimizers(x, otype; o...)
    end
end

# Arrays of non-numeric elements: recurse element-wise.
function _optimizers(a::AbstractArray, otype; o...)
    return map(a) do x
        _optimizers(x, otype; o...)
    end
end

# Anything else has no optimizer.
_optimizers(a, otype; o...) = nothing