Merged
155 changes: 76 additions & 79 deletions lbfgs.go
@@ -1,52 +1,60 @@
// Copyright ©2014 The gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package optimize

import (
"github.com/gonum/floats"
)

// LBFGS implements the limited-memory BFGS algorithm. While the normal BFGS algorithm
// makes a full approximation to the inverse Hessian, LBFGS instead approximates the
// Hessian from the last Store optimization steps. The Store parameter is a tradeoff
// between the cost of the method and the accuracy of the Hessian approximation.
// LBFGS has a cost (both in memory and time) of O(Store * inputDimension).
// Since BFGS has a cost of O(inputDimension^2), LBFGS is more appropriate
// for very large problems. This "forgetful" nature of LBFGS may also make it perform
// better than BFGS for functions with Hessians that vary rapidly spatially.
// LBFGS implements the limited-memory BFGS method for gradient-based
// unconstrained minimization.
//
// If Store is 0, Store is defaulted to 15.
// A Linesearcher for LBFGS must satisfy the strong Wolfe conditions at every
// iteration. If Linesearcher == nil, an appropriate default is chosen.
// It stores a modified version of the inverse Hessian approximation H
// implicitly from the last Store iterations while the normal BFGS method
// stores and manipulates H directly as a dense matrix. Therefore LBFGS is more
// appropriate than BFGS for large problems as the cost of LBFGS scales as
// O(Store * dim) while BFGS scales as O(dim^2). The "forgetful" nature of
// LBFGS may also make it perform better than BFGS for functions with Hessians
// that vary rapidly spatially.
type LBFGS struct {
// Linesearcher selects suitable steps along the descent direction.
// Accepted steps should satisfy the strong Wolfe conditions.
// If Linesearcher is nil, a reasonable default will be chosen.
Linesearcher Linesearcher
Store int // how many past iterations to store
// Store is the size of the limited-memory storage.
// If Store is 0, it will be defaulted to 15.
Member: We should choose one of is and ==. @kortschak, English is preferred in comments, right?

Store int

ls *LinesearchMethod

dim int
oldest int // element of the history slices that is the oldest

x []float64 // location at the last major iteration
grad []float64 // gradient at the last major iteration

y []float64 // holds g_{k+1} - g_k
s []float64 // holds x_{k+1} - x_k
a []float64 // holds cache of hessian updates
dim int // Dimension of the problem
x []float64 // Location at the last major iteration
grad []float64 // Gradient at the last major iteration

// History
yHist [][]float64 // last Store iterations of y
sHist [][]float64 // last Store iterations of s
rhoHist []float64 // last Store iterations of rho
oldest int // Index of the oldest element of the history
y [][]float64 // Last Store values of y
s [][]float64 // Last Store values of s
rho []float64 // Last Store values of rho
a []float64 // Cache of Hessian updates
}
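
For scale, the history above costs roughly 2 * Store * dim float64 values (the y and s slices) plus O(Store) scalars (rho and a). The following sketch shows the two knobs the struct exposes, using only types that appear in this file; newLBFGS is a hypothetical helper and the field values are illustrative:

// newLBFGS is a hypothetical constructor showing how a caller might
// configure the method. Both fields may also be left zero-valued, in
// which case the code above falls back to &Bisection{} and Store = 15.
func newLBFGS() *LBFGS {
	return &LBFGS{
		// Keep the last 20 curvature pairs instead of the default 15.
		Store: 20,
		// Any Linesearcher supplied here must accept only steps that
		// satisfy the strong Wolfe conditions.
		Linesearcher: &Bisection{},
	}
}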

func (l *LBFGS) Init(loc *Location) (Operation, error) {
if l.Linesearcher == nil {
l.Linesearcher = &Bisection{}
}
if l.Store == 0 {
l.Store = 15
}

if l.ls == nil {
l.ls = &LinesearchMethod{}
}
l.ls.Linesearcher = l.Linesearcher
l.ls.NextDirectioner = l

return l.ls.Init(loc)
}

@@ -57,53 +65,44 @@ func (l *LBFGS) Iterate(loc *Location) (Operation, error) {
func (l *LBFGS) InitDirection(loc *Location, dir []float64) (stepSize float64) {
dim := len(loc.X)
l.dim = dim
l.oldest = 0

if l.Store == 0 {
l.Store = 15
}

l.oldest = l.Store - 1 // the first vector will be put in at 0
l.a = resize(l.a, l.Store)
l.rho = resize(l.rho, l.Store)
l.y = l.initHistory(l.y)
l.s = l.initHistory(l.s)

l.x = resize(l.x, dim)
l.grad = resize(l.grad, dim)
copy(l.x, loc.X)

l.grad = resize(l.grad, dim)
copy(l.grad, loc.Gradient)

l.y = resize(l.y, dim)
l.s = resize(l.s, dim)
l.a = resize(l.a, l.Store)
l.rhoHist = resize(l.rhoHist, l.Store)
copy(dir, loc.Gradient)
floats.Scale(-1, dir)
return 1 / floats.Norm(dir, 2)
}

if cap(l.yHist) < l.Store {
n := make([][]float64, l.Store-cap(l.yHist))
l.yHist = append(l.yHist, n...)
func (l *LBFGS) initHistory(hist [][]float64) [][]float64 {
c := cap(hist)
if c < l.Store {
n := make([][]float64, l.Store-c)
hist = append(hist[:c], n...)
}
if cap(l.sHist) < l.Store {
n := make([][]float64, l.Store-cap(l.sHist))
l.sHist = append(l.sHist, n...)
}
l.yHist = l.yHist[:l.Store]
l.sHist = l.sHist[:l.Store]
for i := range l.sHist {
l.sHist[i] = resize(l.sHist[i], dim)
for j := range l.sHist[i] {
l.sHist[i][j] = 0
hist = hist[:l.Store]
for i := range hist {
hist[i] = resize(hist[i], l.dim)
for j := range hist[i] {
hist[i][j] = 0
}
}
for i := range l.yHist {
l.yHist[i] = resize(l.yHist[i], dim)
for j := range l.yHist[i] {
l.yHist[i][j] = 0
}
}

copy(dir, loc.Gradient)
floats.Scale(-1, dir)

return 1 / floats.Norm(dir, 2)
return hist
}
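
Note that initHistory zeroes every row explicitly. That is necessary because resize, as used throughout this diff, reuses existing capacity and may hand back stale data. resize itself is a package-internal helper that does not appear in this diff; the sketch below is an assumption consistent with its call sites here, not necessarily the repository's exact code:

// resize returns a slice of length dim, reusing x's backing array when
// it has sufficient capacity. Reused elements keep their old values,
// which is why callers such as initHistory clear them afterwards.
func resize(x []float64, dim int) []float64 {
	if dim > cap(x) {
		return make([]float64, dim)
	}
	return x[:dim]
}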

func (l *LBFGS) NextDirection(loc *Location, dir []float64) (stepSize float64) {
// Uses two-loop correction as described in
// Nocedal, J., Wright, S.: Numerical Optimization (2nd ed). Springer (2006), chapter 7, page 178.

if len(loc.X) != l.dim {
panic("lbfgs: unexpected size mismatch")
}
@@ -114,46 +113,44 @@ func (l *LBFGS) NextDirection(loc *Location, dir []float64) (stepSize float64) {
panic("lbfgs: unexpected size mismatch")
}

// Update direction. Uses two-loop correction as described in
// Nocedal, Wright (2006), Numerical Optimization (2nd ed.). Chapter 7, page 178.
copy(dir, loc.Gradient)
floats.SubTo(l.y, loc.Gradient, l.grad)
floats.SubTo(l.s, loc.X, l.x)
copy(l.sHist[l.oldest], l.s)
copy(l.yHist[l.oldest], l.y)
sDotY := floats.Dot(l.y, l.s)
l.rhoHist[l.oldest] = 1 / sDotY

l.oldest++
l.oldest = l.oldest % l.Store
y := l.y[l.oldest]
floats.SubTo(y, loc.Gradient, l.grad)
s := l.s[l.oldest]
floats.SubTo(s, loc.X, l.x)
sDotY := floats.Dot(s, y)
l.rho[l.oldest] = 1 / sDotY

l.oldest = (l.oldest + 1) % l.Store

copy(l.x, loc.X)
copy(l.grad, loc.Gradient)
copy(dir, loc.Gradient)

// Two-loop update. The first loop starts with the most recent element
// and goes backward; the second starts with the oldest element and goes
// forward. At the end we have computed H^-1 * g, so flip the direction
// for minimization.
// Start with the most recent element and go backward.
for i := 0; i < l.Store; i++ {
idx := l.oldest - i - 1
if idx < 0 {
idx += l.Store
}
l.a[idx] = l.rhoHist[idx] * floats.Dot(l.sHist[idx], dir)
floats.AddScaled(dir, -l.a[idx], l.yHist[idx])
l.a[idx] = l.rho[idx] * floats.Dot(l.s[idx], dir)
floats.AddScaled(dir, -l.a[idx], l.y[idx])
}

// Scale the initial Hessian.
gamma := sDotY / floats.Dot(l.y, l.y)
gamma := sDotY / floats.Dot(y, y)
floats.Scale(gamma, dir)

// Start with the oldest element and go forward.
for i := 0; i < l.Store; i++ {
idx := i + l.oldest
if idx >= l.Store {
idx -= l.Store
}
beta := l.rhoHist[idx] * floats.Dot(l.yHist[idx], dir)
floats.AddScaled(dir, l.a[idx]-beta, l.sHist[idx])
beta := l.rho[idx] * floats.Dot(l.y[idx], dir)
floats.AddScaled(dir, l.a[idx]-beta, l.s[idx])
}

// dir contains H^{-1} * g, so flip the direction for minimization.
floats.Scale(-1, dir)

return 1
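
For readers following the Nocedal and Wright reference, the sketch below isolates the two-loop recursion from the circular-buffer bookkeeping in NextDirection. It is self-contained with illustrative names, not the package's API, and it assumes the (s, y) pairs are ordered oldest to newest with no wraparound:

// twoLoop computes d = -H^{-1} g by the two-loop recursion
// (Nocedal & Wright, Algorithm 7.4). s[i] holds x_{k+1} - x_k and
// y[i] holds g_{k+1} - g_k for stored iterations ordered oldest to
// newest; rho[i] = 1 / (s[i] . y[i]).
func twoLoop(g []float64, s, y [][]float64, rho []float64) []float64 {
	k := len(s)
	a := make([]float64, k)
	d := append([]float64(nil), g...) // d starts as a copy of g.

	// First loop: newest pair to oldest.
	for i := k - 1; i >= 0; i-- {
		a[i] = rho[i] * dot(s[i], d)
		axpy(-a[i], y[i], d) // d -= a[i] * y[i]
	}

	// Scale by gamma = (s.y)/(y.y) of the newest pair: the initial
	// Hessian approximation is gamma * I.
	gamma := dot(s[k-1], y[k-1]) / dot(y[k-1], y[k-1])
	for i := range d {
		d[i] *= gamma
	}

	// Second loop: oldest pair to newest.
	for i := 0; i < k; i++ {
		beta := rho[i] * dot(y[i], d)
		axpy(a[i]-beta, s[i], d) // d += (a[i] - beta) * s[i]
	}

	// d now holds H^{-1} g; negate it for a descent direction.
	for i := range d {
		d[i] = -d[i]
	}
	return d
}

func dot(a, b []float64) (v float64) {
	for i := range a {
		v += a[i] * b[i]
	}
	return v
}

// axpy adds alpha * x to y in place.
func axpy(alpha float64, x, y []float64) {
	for i := range x {
		y[i] += alpha * x[i]
	}
}

The modular idx arithmetic in the two loops of NextDirection is exactly this pair of loops expressed over a ring buffer, with l.oldest marking the boundary between the oldest and newest stored pairs.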