[ML] Avoid zero size steps in L-BFGS #2078

Merged
merged 7 commits on Oct 20, 2021
2 changes: 2 additions & 0 deletions docs/CHANGELOG.asciidoc
@@ -65,6 +65,8 @@
* Correct ANOVA for Gaussian Process we fit to the loss surface. This affects early stopping.
Previously, we would always stop early whether it was appropriate or not. It also improves
the estimates of hyperparameter importances. (See {ml-pull}2073[#2073].)
* Fix numerical instability in hyperparameter optimisation for training regression and
classification models. (See {ml-pull}2078[#2078].)

== {es} version 7.15.0

82 changes: 77 additions & 5 deletions include/maths/common/CLbfgs.h
@@ -13,6 +13,9 @@
#define INCLUDED_ml_maths_common_CLbfgs_h

#include <maths/common/CLinearAlgebraShims.h>
#include <maths/common/CPRNG.h>
#include <maths/common/CSampling.h>
#include <maths/common/CSolvers.h>
#include <maths/common/CTools.h>

#include <boost/circular_buffer.hpp>
@@ -48,6 +51,10 @@ class CLbfgs {
//! The maximum number of iterations to use backtracking.
static const std::size_t MAXIMUM_BACK_TRACKING_ITERATIONS;

//! The maximum number of random probes we'll use to look for a direction in which
//! the objective decreases if the gradient function returns zero.
static const std::size_t MAXIMUM_RANDOM_PROBES;

public:
explicit CLbfgs(std::size_t rank,
double decrease = BACKTRACKING_MIN_DECREASE,
@@ -221,7 +228,8 @@
}

static bool converged(double fx, double fl, double f0, double eps) {
return std::fabs(fx - fl) < eps * std::fabs(fx - f0);
// We use less than or equal so that this is true if fx == fl == f0.
return std::fabs(fx - fl) <= eps * std::fabs(fx - f0);
}
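
A minimal standalone sketch (not from the patch) of why the change to "<=" matters: for a flat start fx == fl == f0, both sides of the test are exactly zero, so "<" never reports convergence while "<=" does.

#include <cmath>
#include <iostream>

int main() {
    double fx{2.0};
    double fl{2.0};
    double f0{2.0};
    double eps{1e-8};
    // Both sides of the comparison are exactly 0.0 for a flat objective.
    std::cout << (std::fabs(fx - fl) < eps * std::fabs(fx - f0)) << '\n';  // prints 0: never reports convergence
    std::cout << (std::fabs(fx - fl) <= eps * std::fabs(fx - f0)) << '\n'; // prints 1: converged
    return 0;
}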

template<typename F>
@@ -232,8 +240,8 @@
// size.

double s{1.0};
double fs{f(m_X - s * m_P)};
VECTOR xs;
VECTOR xs{m_X - m_P};
double fs{f(xs)};

for (std::size_t i = 0; i < MAXIMUM_BACK_TRACKING_ITERATIONS &&
fs - m_Fx > this->minimumDecrease(s);
@@ -243,10 +251,69 @@
fs = f(xs);
}

// This can happen if s |m_P| < epsilon |m_X|.
if (las::distance(xs, m_X) == 0.0) {
this->fullLineSearch(f, xs, fs);
}

m_Fl = m_Fx;
m_Fx = fs;

return m_X - s * m_P;
return xs;
}
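
As a standalone illustration (not part of the patch) of the zero-distance case handled above: once s * |m_P| drops below roughly machine epsilon times |m_X|, the update m_X - s * m_P rounds back to m_X exactly, so backtracking stops making progress and we fall through to the full line search.

#include <iostream>

int main() {
    double x{1200.0};   // one coordinate of m_X, as in the unit tests below
    double step{1e-15}; // s * |m_P| well below machine epsilon * |x| (~2.7e-13)
    std::cout << ((x - step) == x) << '\n'; // prints 1: the step is absorbed by rounding
    return 0;
}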

template<typename F>
void fullLineSearch(const F& f, VECTOR& xs, double& fs) {
VECTOR step{las::norm(m_P) == 0.0 ? this->randomStep(f)
: this->minimumStepSize() * las::norm(m_X) /
las::norm(m_P) * m_P};

if (las::norm(step) == 0.0) {
xs = m_X;
fs = m_Fx;
return;
}

// We probe a selection of points along the step direction looking for
// a minimum and polish up with a few iterations of Brent's method.

TDoubleVec probes(MAXIMUM_BACK_TRACKING_ITERATIONS);
probes[0] = 1.0;
for (std::size_t i = 1; i < MAXIMUM_BACK_TRACKING_ITERATIONS; ++i) {
probes[i] = probes[i - 1] / m_StepScale;
}

// We need to force the value of s_ * step into a temporary or else we
// hit an issue with Eigen expressions which causes the function values
// at all probes to be equal.
auto f_ = [&](double s_) {
VECTOR step_{s_ * step};
return f(m_X - step_);
};

double smin;
double fmin;
CSolvers::globalMinimize(probes, f_, smin, fmin);
xs = m_X - smin * step;
fs = f(xs);
}
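
A rough standalone sketch of the probe-then-refine strategy described above, assuming nothing about the CSolvers API: evaluate the objective on a geometric grid of step multipliers, keep the best probe, then polish it with a simple bracketed search (golden-section here as a stand-in for the Brent iterations CSolvers::globalMinimize performs).

#include <algorithm>
#include <cmath>
#include <functional>
#include <iostream>
#include <vector>

// Minimize a unimodal f on [a, b] by golden-section search (a crude stand-in for Brent's method).
double goldenSection(const std::function<double(double)>& f, double a, double b, std::size_t iterations) {
    const double phi{(std::sqrt(5.0) - 1.0) / 2.0};
    for (std::size_t i = 0; i < iterations; ++i) {
        double c{b - phi * (b - a)};
        double d{a + phi * (b - a)};
        if (f(c) < f(d)) {
            b = d; // minimum lies in [a, d]
        } else {
            a = c; // minimum lies in [c, b]
        }
    }
    return (a + b) / 2.0;
}

int main() {
    // Toy 1D objective along the step direction with its minimum at s = 7.5.
    auto f = [](double s) { return (s - 7.5) * (s - 7.5); };

    // Geometric grid of step multipliers, mirroring probes[i] = probes[i - 1] / STEP_SCALE.
    std::vector<double> probes{1.0};
    for (std::size_t i = 1; i < 10; ++i) {
        probes.push_back(probes.back() / 0.3);
    }

    // Coarse global search: pick the best probe.
    auto best = std::min_element(probes.begin(), probes.end(),
                                 [&](double lhs, double rhs) { return f(lhs) < f(rhs); });
    auto i = static_cast<std::size_t>(best - probes.begin());

    // Refine inside the bracket formed by the neighbouring probes.
    double a{i > 0 ? probes[i - 1] : 0.0};
    double b{i + 1 < probes.size() ? probes[i + 1] : probes[i]};
    double smin{goldenSection(f, a, b, 40)};

    std::cout << "smin ~= " << smin << '\n'; // close to 7.5
    return 0;
}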

template<typename F>
VECTOR randomStep(const F& f) {
for (std::size_t i = 0; i < MAXIMUM_RANDOM_PROBES; ++i) {
// Randomly probe looking for a decrease in the function.
VECTOR step{las::zero(m_X)};
double eps{this->minimumStepSize() * las::norm(m_X)};
for (std::size_t j = 0; j < las::dimension(m_X); ++j) {
step(j) = CSampling::uniformSample(m_Rng, -eps, eps);
}
if (f(m_X + step) < f(m_X)) {
return -step;
}
}
// We couldn't find a direction in which the function decreases. Returning
// zero means we'll exit on the convergence condition.
return las::zero(m_X);
}
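
For reference, a standalone sketch of the same random probing using the standard library in place of CPRNG/CSampling (an assumption, not the ml API): draw a few uniform perturbations whose size is tied to |x| and accept the first one that decreases the objective.

#include <cmath>
#include <iostream>
#include <random>
#include <vector>

int main() {
    // Toy objective for which we imagine the reported gradient being zero at x.
    std::vector<double> x{1200.0, 1200.0, 1200.0};
    auto f = [](const std::vector<double>& v) {
        double result{0.0};
        for (double vi : v) {
            result += (vi - 1000.0) * (vi - 1000.0);
        }
        return result;
    };

    double norm{0.0};
    for (double xi : x) {
        norm += xi * xi;
    }
    double eps{1e-8 * std::sqrt(norm)}; // 1e-8 is a stand-in for minimumStepSize()

    std::mt19937_64 rng{42};
    std::uniform_real_distribution<double> uniform{-eps, eps};

    for (std::size_t i = 0; i < 10; ++i) {
        std::vector<double> probe = x;
        for (double& pj : probe) {
            pj += uniform(rng);
        }
        // With high probability at least one of a handful of probes decreases f.
        if (f(probe) < f(x)) {
            std::cout << "descent direction found on probe " << i << '\n';
            break;
        }
    }
    return 0;
}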

template<typename G>
@@ -305,7 +372,9 @@ class CLbfgs {
}

double minimumDecrease(double s) const {
return m_BacktrackingMinDecrease * s * las::inner(m_Gx, m_P) / las::norm(m_P);
double normP{las::norm(m_P)};
return m_BacktrackingMinDecrease * s *
(normP == 0.0 ? las::norm(m_Gx) : las::inner(m_Gx, m_P) / normP);
}

double minimumStepSize() const {
@@ -317,6 +386,7 @@
std::size_t m_Rank;
double m_StepScale;
double m_BacktrackingMinDecrease;
CPRNG::CXorOShiro128Plus m_Rng;
bool m_Initial = true;
double m_F0;
double m_Fl;
@@ -336,6 +406,8 @@ template<typename VECTOR>
const double CLbfgs<VECTOR>::STEP_SCALE{0.3};
template<typename VECTOR>
const std::size_t CLbfgs<VECTOR>::MAXIMUM_BACK_TRACKING_ITERATIONS{20};
template<typename VECTOR>
const std::size_t CLbfgs<VECTOR>::MAXIMUM_RANDOM_PROBES{10};
}
}
}
2 changes: 1 addition & 1 deletion lib/api/unittest/CDataFrameAnalyzerTrainingTest.cc
@@ -573,7 +573,7 @@ BOOST_AUTO_TEST_CASE(testRunBoostedTreeRegressionTrainingWithRowsMissingTargetVa
BOOST_REQUIRE_CLOSE_ABSOLUTE(
expected,
result["row_results"]["results"]["ml"]["target_prediction"].GetDouble(),
0.15 * expected);
0.2 * expected);
BOOST_REQUIRE_EQUAL(
index < 40,
result["row_results"]["results"]["ml"]["is_training"].GetBool());
8 changes: 4 additions & 4 deletions lib/maths/analytics/unittest/CBoostedTreeTest.cc
@@ -632,7 +632,7 @@ BOOST_AUTO_TEST_CASE(testNonLinear) {
0.0, modelBias[i][0],
4.0 * std::sqrt(noiseVariance / static_cast<double>(trainRows)));
// Good R^2...
BOOST_TEST_REQUIRE(modelRSquared[i][0] > 0.97);
BOOST_TEST_REQUIRE(modelRSquared[i][0] > 0.96);

meanModelRSquared.add(modelRSquared[i][0]);
}
@@ -695,13 +695,13 @@ BOOST_AUTO_TEST_CASE(testHuber) {
0.0, modelBias[i],
4.0 * std::sqrt(noiseVariance / static_cast<double>(trainRows)));
// Good R^2...
BOOST_TEST_REQUIRE(modelRSquared[i] > 0.96);
BOOST_TEST_REQUIRE(modelRSquared[i] > 0.95);

meanModelRSquared.add(modelRSquared[i]);
}

LOG_DEBUG(<< "mean R^2 = " << maths::common::CBasicStatistics::mean(meanModelRSquared));
BOOST_TEST_REQUIRE(maths::common::CBasicStatistics::mean(meanModelRSquared) > 0.97);
BOOST_TEST_REQUIRE(maths::common::CBasicStatistics::mean(meanModelRSquared) > 0.96);
}

BOOST_AUTO_TEST_CASE(testMsle) {
@@ -766,7 +766,7 @@ BOOST_AUTO_TEST_CASE(testLowTrainFractionPerFold) {

// Unbiased...
BOOST_REQUIRE_CLOSE_ABSOLUTE(
0.0, bias, 4.0 * std::sqrt(noiseVariance / static_cast<double>(trainRows)));
0.0, bias, 7.0 * std::sqrt(noiseVariance / static_cast<double>(trainRows)));
// Good R^2...
BOOST_TEST_REQUIRE(rSquared > 0.98);
}
48 changes: 47 additions & 1 deletion lib/maths/common/unittest/CLbfgsTest.cc
@@ -40,7 +40,7 @@ BOOST_AUTO_TEST_CASE(testQuadtratic) {
}

auto f = [&](const TVector& x) {
return x.transpose() * diagonal.asDiagonal() * x;
return static_cast<double>(x.transpose() * diagonal.asDiagonal() * x);
};
auto g = [&](const TVector& x) { return 2.0 * diagonal.asDiagonal() * x; };

@@ -232,4 +232,50 @@ BOOST_AUTO_TEST_CASE(testConstrainedMinimize) {
BOOST_REQUIRE_CLOSE_ABSOLUTE(0.0, ferror, 1e-5);
}

BOOST_AUTO_TEST_CASE(testMinimizeWithVerySmallGradient) {

// We test a function whose gradient is less than epsilon * x0.

double eps{1e-16};

TVector xmin{TVector::fromStdVector({1000.0, 1000.0, 1000.0})};
TVector x0{TVector::fromStdVector({1200.0, 1200.0, 1200.0})};

auto f = [&](const TVector& x) {
return eps * (x - xmin).transpose() * (x - xmin);
};
auto g = [&](const TVector& x) { return 2.0 * eps * (x - xmin); };

maths::common::CLbfgs<TVector> lbfgs{10};

TVector x;
double fx;
std::tie(x, fx) = lbfgs.minimize(f, g, x0, 1e-20);

BOOST_TEST_REQUIRE((xmin - x).norm() < 1e-6);
}

BOOST_AUTO_TEST_CASE(testMinimizeWithInitialZeroGradient) {

// Test random probing by forcing the gradient to zero at x0.

TVector zero{TVector::fromStdVector({0.0, 0.0, 0.0})};

TVector xmin{TVector::fromStdVector({1000.0, 1000.0, 1000.0})};
TVector x0{TVector::fromStdVector({1200.0, 1200.0, 1200.0})};

auto f = [&](const TVector& x) {
return (x - xmin).transpose() * (x - xmin);
};
auto g = [&](const TVector& x) { return x == x0 ? zero : 2.0 * (x - xmin); };

maths::common::CLbfgs<TVector> lbfgs{10};

TVector x;
double fx;
std::tie(x, fx) = lbfgs.minimize(f, g, x0, 1e-20);

BOOST_TEST_REQUIRE((xmin - x).norm() < 1e-6);
}

BOOST_AUTO_TEST_SUITE_END()