[ML] Avoid zero size steps in L-BFGS #2078

Merged
merged 7 commits on Oct 20, 2021
2 changes: 2 additions & 0 deletions docs/CHANGELOG.asciidoc
@@ -65,6 +65,8 @@
* Correct ANOVA for Gaussian Process we fit to the loss surface. This affects early stopping.
Previously, we would always stop early whether it was appropriate or not. It also improves
the estimates of hyperparameter importances. (See {ml-pull}2073[#2073].)
* Fix numerical instability in hyperparameter optimisation for training regression and
classification models. (See {ml-pull}2078[#2078].)

== {es} version 7.15.0

82 changes: 77 additions & 5 deletions include/maths/common/CLbfgs.h
@@ -13,6 +13,9 @@
#define INCLUDED_ml_maths_common_CLbfgs_h

#include <maths/common/CLinearAlgebraShims.h>
#include <maths/common/CPRNG.h>
#include <maths/common/CSampling.h>
#include <maths/common/CSolvers.h>
#include <maths/common/CTools.h>

#include <boost/circular_buffer.hpp>
@@ -48,6 +51,10 @@ class CLbfgs {
//! The maximum number of iterations to use backtracking.
static const std::size_t MAXIMUM_BACK_TRACKING_ITERATIONS;

//! The maximum number of random probes we'll use to look for a direction in which
//! the objective decreases if the gradient function returns zero.
static const std::size_t MAXIMUM_RANDOM_PROBES;

public:
explicit CLbfgs(std::size_t rank,
double decrease = BACKTRACKING_MIN_DECREASE,
@@ -221,7 +228,8 @@
}

static bool converged(double fx, double fl, double f0, double eps) {
return std::fabs(fx - fl) < eps * std::fabs(fx - f0);
// We use less than or equal so that this is true if fx == fl == f0.
return std::fabs(fx - fl) <= eps * std::fabs(fx - f0);
}
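
A minimal standalone sketch (not from the patch) of why the change to "<=" matters: for a flat start fx == fl == f0, both sides of the test are exactly zero, so "<" never reports convergence while "<=" does.

#include <cmath>
#include <iostream>

int main() {
    double fx{2.0};
    double fl{2.0};
    double f0{2.0};
    double eps{1e-8};
    // Both sides of the comparison are exactly 0.0 for a flat objective.
    std::cout << (std::fabs(fx - fl) < eps * std::fabs(fx - f0)) << '\n';  // prints 0: never reports convergence
    std::cout << (std::fabs(fx - fl) <= eps * std::fabs(fx - f0)) << '\n'; // prints 1: converged
    return 0;
}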

template<typename F>
@@ -232,8 +240,8 @@
// size.

double s{1.0};
double fs{f(m_X - s * m_P)};
VECTOR xs;
VECTOR xs{m_X - m_P};
double fs{f(xs)};

for (std::size_t i = 0; i < MAXIMUM_BACK_TRACKING_ITERATIONS &&
fs - m_Fx > this->minimumDecrease(s);
@@ -243,10 +251,69 @@
fs = f(xs);
}

// This can happen if s |m_P| < epsilon |m_X|.
if (las::distance(xs, m_X) == 0.0) {
this->fullLineSearch(f, xs, fs);
}

m_Fl = m_Fx;
m_Fx = fs;

return m_X - s * m_P;
return xs;
}
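
As a standalone illustration (not part of the patch) of the zero-distance case handled above: once s * |m_P| drops below roughly machine epsilon times |m_X|, the update m_X - s * m_P rounds back to m_X exactly, so backtracking stops making progress and we fall through to the full line search.

#include <iostream>

int main() {
    double x{1200.0};   // one coordinate of m_X, as in the unit tests below
    double step{1e-15}; // s * |m_P| well below machine epsilon * |x| (~2.7e-13)
    std::cout << ((x - step) == x) << '\n'; // prints 1: the step is absorbed by rounding
    return 0;
}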

template<typename F>
void fullLineSearch(const F& f, VECTOR& xs, double& fs) {
VECTOR step{las::norm(m_P) == 0.0 ? this->randomStep(f)
: this->minimumStepSize() * las::norm(m_X) /
las::norm(m_P) * m_P};

if (las::norm(step) == 0.0) {
xs = m_X;
fs = m_Fx;
return;
}

// We probe a selection of points along the step direction looking for
// a minimum and polish up with a few iterations of Brent's method.

TDoubleVec probes(MAXIMUM_BACK_TRACKING_ITERATIONS);
probes[0] = 1.0;
for (std::size_t i = 1; i < MAXIMUM_BACK_TRACKING_ITERATIONS; ++i) {
probes[i] = probes[i - 1] / m_StepScale;
}

// We need to force the value of s_ * step into a temporary or else we
// hit an issue with Eigen expressions which causes the function values
// at all probes to be equal.
auto f_ = [&](double s_) {
VECTOR step_{s_ * step};
return f(m_X - step_);
};

double smin;
double fmin;
CSolvers::globalMinimize(probes, f_, smin, fmin);
xs = m_X - smin * step;
fs = f(xs);
}
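
A rough standalone sketch of the probe-then-refine strategy described above, assuming nothing about the CSolvers API: evaluate the objective on a geometric grid of step multipliers, keep the best probe, then polish it with a simple bracketed search (golden-section here as a stand-in for the Brent iterations CSolvers::globalMinimize performs).

#include <algorithm>
#include <cmath>
#include <functional>
#include <iostream>
#include <vector>

// Minimize a unimodal f on [a, b] by golden-section search (a crude stand-in for Brent's method).
double goldenSection(const std::function<double(double)>& f, double a, double b, std::size_t iterations) {
    const double phi{(std::sqrt(5.0) - 1.0) / 2.0};
    for (std::size_t i = 0; i < iterations; ++i) {
        double c{b - phi * (b - a)};
        double d{a + phi * (b - a)};
        if (f(c) < f(d)) {
            b = d; // minimum lies in [a, d]
        } else {
            a = c; // minimum lies in [c, b]
        }
    }
    return (a + b) / 2.0;
}

int main() {
    // Toy 1D objective along the step direction with its minimum at s = 7.5.
    auto f = [](double s) { return (s - 7.5) * (s - 7.5); };

    // Geometric grid of step multipliers, mirroring probes[i] = probes[i - 1] / STEP_SCALE.
    std::vector<double> probes{1.0};
    for (std::size_t i = 1; i < 10; ++i) {
        probes.push_back(probes.back() / 0.3);
    }

    // Coarse global search: pick the best probe.
    auto best = std::min_element(probes.begin(), probes.end(),
                                 [&](double lhs, double rhs) { return f(lhs) < f(rhs); });
    auto i = static_cast<std::size_t>(best - probes.begin());

    // Refine inside the bracket formed by the neighbouring probes.
    double a{i > 0 ? probes[i - 1] : 0.0};
    double b{i + 1 < probes.size() ? probes[i + 1] : probes[i]};
    double smin{goldenSection(f, a, b, 40)};

    std::cout << "smin ~= " << smin << '\n'; // close to 7.5
    return 0;
}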

template<typename F>
VECTOR randomStep(const F& f) {
for (std::size_t i = 0; i < MAXIMUM_RANDOM_PROBES; ++i) {
// Randomly probe looking for a decrease in the function.
VECTOR step{las::zero(m_X)};
double eps{this->minimumStepSize() * las::norm(m_X)};
for (std::size_t j = 0; j < las::dimension(m_X); ++j) {
step(j) = CSampling::uniformSample(m_Rng, -eps, eps);
}
if (f(m_X + step) < f(m_X)) {
return -step;
}
}
// We couldn't find a direction in which the function decreases. Returning
// zero means we'll exit on the convergence condition.
return las::zero(m_X);
}
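
For reference, a standalone sketch of the same random probing using the standard library in place of CPRNG/CSampling (an assumption, not the ml API): draw a few uniform perturbations whose size is tied to |x| and accept the first one that decreases the objective.

#include <cmath>
#include <iostream>
#include <random>
#include <vector>

int main() {
    // Toy objective for which we imagine the reported gradient being zero at x.
    std::vector<double> x{1200.0, 1200.0, 1200.0};
    auto f = [](const std::vector<double>& v) {
        double result{0.0};
        for (double vi : v) {
            result += (vi - 1000.0) * (vi - 1000.0);
        }
        return result;
    };

    double norm{0.0};
    for (double xi : x) {
        norm += xi * xi;
    }
    double eps{1e-8 * std::sqrt(norm)}; // 1e-8 is a stand-in for minimumStepSize()

    std::mt19937_64 rng{42};
    std::uniform_real_distribution<double> uniform{-eps, eps};

    for (std::size_t i = 0; i < 10; ++i) {
        std::vector<double> probe = x;
        for (double& pj : probe) {
            pj += uniform(rng);
        }
        // With high probability at least one of a handful of probes decreases f.
        if (f(probe) < f(x)) {
            std::cout << "descent direction found on probe " << i << '\n';
            break;
        }
    }
    return 0;
}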

template<typename G>
@@ -305,7 +372,9 @@ class CLbfgs {
}

double minimumDecrease(double s) const {
return m_BacktrackingMinDecrease * s * las::inner(m_Gx, m_P) / las::norm(m_P);
double normP{las::norm(m_P)};
return m_BacktrackingMinDecrease * s *
(normP == 0.0 ? las::norm(m_Gx) : las::inner(m_Gx, m_P) / normP);
}

double minimumStepSize() const {
@@ -317,6 +386,7 @@
std::size_t m_Rank;
double m_StepScale;
double m_BacktrackingMinDecrease;
CPRNG::CXorOShiro128Plus m_Rng;
bool m_Initial = true;
double m_F0;
double m_Fl;
@@ -336,6 +406,8 @@ template<typename VECTOR>
const double CLbfgs<VECTOR>::STEP_SCALE{0.3};
template<typename VECTOR>
const std::size_t CLbfgs<VECTOR>::MAXIMUM_BACK_TRACKING_ITERATIONS{20};
template<typename VECTOR>
const std::size_t CLbfgs<VECTOR>::MAXIMUM_RANDOM_PROBES{10};
}
}
}
2 changes: 1 addition & 1 deletion lib/api/unittest/CDataFrameAnalyzerTrainingTest.cc
@@ -573,7 +573,7 @@ BOOST_AUTO_TEST_CASE(testRunBoostedTreeRegressionTrainingWithRowsMissingTargetVa
BOOST_REQUIRE_CLOSE_ABSOLUTE(
expected,
result["row_results"]["results"]["ml"]["target_prediction"].GetDouble(),
0.15 * expected);
0.2 * expected);
BOOST_REQUIRE_EQUAL(
index < 40,
result["row_results"]["results"]["ml"]["is_training"].GetBool());
8 changes: 4 additions & 4 deletions lib/maths/analytics/unittest/CBoostedTreeTest.cc
@@ -632,7 +632,7 @@ BOOST_AUTO_TEST_CASE(testNonLinear) {
0.0, modelBias[i][0],
4.0 * std::sqrt(noiseVariance / static_cast<double>(trainRows)));
// Good R^2...
BOOST_TEST_REQUIRE(modelRSquared[i][0] > 0.97);
BOOST_TEST_REQUIRE(modelRSquared[i][0] > 0.96);

meanModelRSquared.add(modelRSquared[i][0]);
}
@@ -695,13 +695,13 @@ BOOST_AUTO_TEST_CASE(testHuber) {
0.0, modelBias[i],
4.0 * std::sqrt(noiseVariance / static_cast<double>(trainRows)));
// Good R^2...
BOOST_TEST_REQUIRE(modelRSquared[i] > 0.96);
BOOST_TEST_REQUIRE(modelRSquared[i] > 0.95);

meanModelRSquared.add(modelRSquared[i]);
}

LOG_DEBUG(<< "mean R^2 = " << maths::common::CBasicStatistics::mean(meanModelRSquared));
BOOST_TEST_REQUIRE(maths::common::CBasicStatistics::mean(meanModelRSquared) > 0.97);
BOOST_TEST_REQUIRE(maths::common::CBasicStatistics::mean(meanModelRSquared) > 0.96);
}

BOOST_AUTO_TEST_CASE(testMsle) {
@@ -766,7 +766,7 @@ BOOST_AUTO_TEST_CASE(testLowTrainFractionPerFold) {

// Unbiased...
BOOST_REQUIRE_CLOSE_ABSOLUTE(
0.0, bias, 4.0 * std::sqrt(noiseVariance / static_cast<double>(trainRows)));
0.0, bias, 7.0 * std::sqrt(noiseVariance / static_cast<double>(trainRows)));
// Good R^2...
BOOST_TEST_REQUIRE(rSquared > 0.98);
}
48 changes: 47 additions & 1 deletion lib/maths/common/unittest/CLbfgsTest.cc
@@ -40,7 +40,7 @@ BOOST_AUTO_TEST_CASE(testQuadtratic) {
}

auto f = [&](const TVector& x) {
return x.transpose() * diagonal.asDiagonal() * x;
return static_cast<double>(x.transpose() * diagonal.asDiagonal() * x);
};
auto g = [&](const TVector& x) { return 2.0 * diagonal.asDiagonal() * x; };

@@ -232,4 +232,50 @@ BOOST_AUTO_TEST_CASE(testConstrainedMinimize) {
BOOST_REQUIRE_CLOSE_ABSOLUTE(0.0, ferror, 1e-5);
}

BOOST_AUTO_TEST_CASE(testMinimizeWithVerySmallGradient) {

// We test a function whose gradient is less than epsilon * x0.

double eps{1e-16};

TVector xmin{TVector::fromStdVector({1000.0, 1000.0, 1000.0})};
TVector x0{TVector::fromStdVector({1200.0, 1200.0, 1200.0})};

auto f = [&](const TVector& x) {
return eps * (x - xmin).transpose() * (x - xmin);
};
auto g = [&](const TVector& x) { return 2.0 * eps * (x - xmin); };

maths::common::CLbfgs<TVector> lbfgs{10};

TVector x;
double fx;
std::tie(x, fx) = lbfgs.minimize(f, g, x0, 1e-20);

BOOST_TEST_REQUIRE((xmin - x).norm() < 1e-6);
}

BOOST_AUTO_TEST_CASE(testMinimizeWithInitialZeroGradient) {

// Test random probing by forcing the gradient to zero at x0.

TVector zero{TVector::fromStdVector({0.0, 0.0, 0.0})};

TVector xmin{TVector::fromStdVector({1000.0, 1000.0, 1000.0})};
TVector x0{TVector::fromStdVector({1200.0, 1200.0, 1200.0})};

auto f = [&](const TVector& x) {
return (x - xmin).transpose() * (x - xmin);
};
auto g = [&](const TVector& x) { return x == x0 ? zero : 2.0 * (x - xmin); };

maths::common::CLbfgs<TVector> lbfgs{10};

TVector x;
double fx;
std::tie(x, fx) = lbfgs.minimize(f, g, x0, 1e-20);

BOOST_TEST_REQUIRE((xmin - x).norm() < 1e-6);
}

BOOST_AUTO_TEST_SUITE_END()