From 4c54adf387146aa3d207892d114b6eeea879c8e5 Mon Sep 17 00:00:00 2001
From: Tom Veasey
Date: Tue, 19 Oct 2021 13:13:01 +0100
Subject: [PATCH 1/6] Handle divide by zero cases

---
 include/maths/common/CLbfgs.h           | 82 +++++++++++++++++++++++--
 lib/maths/common/unittest/CLbfgsTest.cc | 48 ++++++++++++++-
 2 files changed, 124 insertions(+), 6 deletions(-)

diff --git a/include/maths/common/CLbfgs.h b/include/maths/common/CLbfgs.h
index 7b6cdc350c..897380058b 100644
--- a/include/maths/common/CLbfgs.h
+++ b/include/maths/common/CLbfgs.h
@@ -13,6 +13,9 @@
 #define INCLUDED_ml_maths_common_CLbfgs_h
 
 #include
+#include <maths/common/CPRNG.h>
+#include <maths/common/CSampling.h>
+#include <maths/common/CSolvers.h>
 
 #include
 #include
@@ -48,6 +51,10 @@ class CLbfgs {
     //! The maximum number of iterations to use backtracking.
     static const std::size_t MAXIMUM_BACK_TRACKING_ITERATIONS;
 
+    //! The maximum number of random probes we'll use to look for a direction in which
+    //! the objective decreases if the gradient function returns zero.
+    static const std::size_t MAXIMUM_RANDOM_PROBES;
+
 public:
     explicit CLbfgs(std::size_t rank,
                     double decrease = BACKTRACKING_MIN_DECREASE,
@@ -221,7 +228,8 @@ class CLbfgs {
     }
 
     static bool converged(double fx, double fl, double f0, double eps) {
-        return std::fabs(fx - fl) < eps * std::fabs(fx - f0);
+        // We use less than or equal so that this is true if fx == fl == f0.
+        return std::fabs(fx - fl) <= eps * std::fabs(fx - f0);
     }
 
     template<typename F>
@@ -232,8 +240,8 @@
         // size.
 
         double s{1.0};
-        double fs{f(m_X - s * m_P)};
-        VECTOR xs;
+        VECTOR xs{m_X - m_P};
+        double fs{f(xs)};
 
         for (std::size_t i = 0;
              i < MAXIMUM_BACK_TRACKING_ITERATIONS && fs - m_Fx > this->minimumDecrease(s);
              ++i) {
@@ -243,10 +251,69 @@
             fs = f(xs);
         }
 
+        // This can happen if |s m_P| < epsilon |m_X|.
+        if (las::distance(xs, m_X) == 0.0) {
+            this->fullLineSearch(f, xs, fs);
+        }
+
         m_Fl = m_Fx;
         m_Fx = fs;
 
-        return m_X - s * m_P;
+        return xs;
     }
 
+    template<typename F>
+    void fullLineSearch(const F& f, VECTOR& xs, double& fs) {
+        VECTOR step{las::norm(m_P) == 0.0 ? this->randomStep(f)
+                                          : this->minimumStepSize() * las::norm(m_X) /
+                                                las::norm(m_P) * m_P};
+
+        if (las::norm(step) == 0.0) {
+            xs = m_X;
+            fs = m_Fx;
+            return;
+        }
+
+        // We probe a selection of points along the step direction looking for
+        // a minimum and polish up with a few iterations of Brent's method.
+
+        TDoubleVec probes(MAXIMUM_BACK_TRACKING_ITERATIONS);
+        probes[0] = 1.0;
+        for (std::size_t i = 1; i < MAXIMUM_BACK_TRACKING_ITERATIONS; ++i) {
+            probes[i] = probes[i - 1] / m_StepScale;
+        }
+
+        // We need to force the value of s_ * step into a temporary or else we
+        // hit an issue with Eigen expressions which causes the function values
+        // at all probes to be equal.
+        auto f_ = [&](double s_) {
+            VECTOR step_{s_ * step};
+            return f(m_X - step_);
+        };
+
+        double smin;
+        double fmin;
+        CSolvers::globalMinimize(probes, f_, smin, fmin);
+        xs = m_X - smin * step;
+        fs = f(xs);
+    }
+
+    template<typename F>
+    VECTOR randomStep(const F& f) {
+        for (std::size_t i = 0; i < MAXIMUM_RANDOM_PROBES; ++i) {
+            // Randomly probe looking for a decrease in the function.
+            VECTOR step{las::zero(m_X)};
+            double eps{this->minimumStepSize() * las::norm(m_X)};
+            for (std::size_t j = 0; j < las::dimension(m_X); ++j) {
+                step(j) = CSampling::uniformSample(m_Rng, -eps, eps);
+            }
+            if (f(m_X + step) < f(m_X)) {
+                return -step;
+            }
+        }
+        // We could find a direction in which the function decreases. Returning
+        // zero means we'll exit on the convergence condition.
+        return las::zero(m_X);
+    }
+
     template<typename F>
@@ -305,7 +372,9 @@ class CLbfgs {
     }
 
     double minimumDecrease(double s) const {
-        return m_BacktrackingMinDecrease * s * las::inner(m_Gx, m_P) / las::norm(m_P);
+        double normP{las::norm(m_P)};
+        return m_BacktrackingMinDecrease * s *
+               (normP == 0.0 ? las::norm(m_Gx) : las::inner(m_Gx, m_P) / normP);
     }
 
     double minimumStepSize() const {
@@ -317,6 +386,7 @@ class CLbfgs {
     std::size_t m_Rank;
     double m_StepScale;
     double m_BacktrackingMinDecrease;
+    CPRNG::CXorOShiro128Plus m_Rng;
     bool m_Initial = true;
     double m_F0;
     double m_Fl;
@@ -336,6 +406,8 @@
 template<typename VECTOR>
 const double CLbfgs<VECTOR>::STEP_SCALE{0.3};
 template<typename VECTOR>
 const std::size_t CLbfgs<VECTOR>::MAXIMUM_BACK_TRACKING_ITERATIONS{20};
+template<typename VECTOR>
+const std::size_t CLbfgs<VECTOR>::MAXIMUM_RANDOM_PROBES{10};
 }
 }
 }
diff --git a/lib/maths/common/unittest/CLbfgsTest.cc b/lib/maths/common/unittest/CLbfgsTest.cc
index 8ad3b31be7..4c82f5dc19 100644
--- a/lib/maths/common/unittest/CLbfgsTest.cc
+++ b/lib/maths/common/unittest/CLbfgsTest.cc
@@ -40,7 +40,7 @@ BOOST_AUTO_TEST_CASE(testQuadtratic) {
     }
 
     auto f = [&](const TVector& x) {
-        return x.transpose() * diagonal.asDiagonal() * x;
+        return static_cast<double>(x.transpose() * diagonal.asDiagonal() * x);
     };
     auto g = [&](const TVector& x) { return 2.0 * diagonal.asDiagonal() * x; };
 
@@ -232,4 +232,50 @@ BOOST_AUTO_TEST_CASE(testConstrainedMinimize) {
     BOOST_REQUIRE_CLOSE_ABSOLUTE(0.0, ferror, 1e-5);
 }
 
+BOOST_AUTO_TEST_CASE(testMinimizeWithVerySmallGradient) {
+
+    // We test a function whose gradient is less than epsilon * x0.
+
+    double eps{1e-16};
+
+    TVector xmin{TVector::fromStdVector({1000.0, 1000.0, 1000.0})};
+    TVector x0{TVector::fromStdVector({1200.0, 1200.0, 1200.0})};
+
+    auto f = [&](const TVector& x) {
+        return static_cast<double>(eps * (x - xmin).transpose() * (x - xmin));
+    };
+    auto g = [&](const TVector& x) { return 2.0 * eps * (x - xmin); };
+
+    maths::CLbfgs<TVector> lbfgs{10};
+
+    TVector x;
+    double fx;
+    std::tie(x, fx) = lbfgs.minimize(f, g, x0, 1e-20);
+
+    BOOST_TEST_REQUIRE((xmin - x).norm() < 1e-6);
+}
+
+BOOST_AUTO_TEST_CASE(testMinimizeWithInitialZeroGradient) {
+
+    // Test random probing by forcing the gradient to zero at x0.
+
+    TVector zero{TVector::fromStdVector({0.0, 0.0, 0.0})};
+
+    TVector xmin{TVector::fromStdVector({1000.0, 1000.0, 1000.0})};
+    TVector x0{TVector::fromStdVector({1200.0, 1200.0, 1200.0})};
+
+    auto f = [&](const TVector& x) {
+        return static_cast<double>((x - xmin).transpose() * (x - xmin));
+    };
+    auto g = [&](const TVector& x) { return x == x0 ? zero : 2.0 * (x - xmin); };
+
+    maths::CLbfgs<TVector> lbfgs{10};
+
+    TVector x;
+    double fx;
+    std::tie(x, fx) = lbfgs.minimize(f, g, x0, 1e-20);
+
+    BOOST_TEST_REQUIRE((xmin - x).norm() < 1e-6);
+}
+
 BOOST_AUTO_TEST_SUITE_END()

From 4d908d31cb8154273f505e7a843b0d75044c6a71 Mon Sep 17 00:00:00 2001
From: Tom Veasey
Date: Tue, 19 Oct 2021 13:27:20 +0100
Subject: [PATCH 2/6] Docs

---
 docs/CHANGELOG.asciidoc | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/docs/CHANGELOG.asciidoc b/docs/CHANGELOG.asciidoc
index 51296d71f7..d97b44f3e2 100644
--- a/docs/CHANGELOG.asciidoc
+++ b/docs/CHANGELOG.asciidoc
@@ -60,6 +60,11 @@
 * Make sure instrumentation captures the best hyperparameters we found for training
   classification and regression models. (See {ml-pull}2057[#2057].)
 
+=== Bug Fixes
+
+* Fix numerical instability in hyperparameter optimisation for training regression and
+  classification models. (See {ml-pull}2078[#2078].)
+
 == {es} version 7.15.0
 
 === Enhancements

From a7f0c5e2f4976fc4a01c72dc69e4adef306e5cff Mon Sep 17 00:00:00 2001
From: Tom Veasey
Date: Tue, 19 Oct 2021 14:36:12 +0100
Subject: [PATCH 3/6] Test thresholds

---
 include/maths/common/CLbfgs.h                    | 2 +-
 lib/maths/analytics/unittest/CBoostedTreeTest.cc | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/maths/common/CLbfgs.h b/include/maths/common/CLbfgs.h
index 897380058b..1fbf71b360 100644
--- a/include/maths/common/CLbfgs.h
+++ b/include/maths/common/CLbfgs.h
@@ -311,7 +311,7 @@ class CLbfgs {
                 return -step;
             }
         }
-        // We could find a direction in which the function decreases. Returning
+        // We couldn't find a direction in which the function decreases. Returning
         // zero means we'll exit on the convergence condition.
         return las::zero(m_X);
     }
diff --git a/lib/maths/analytics/unittest/CBoostedTreeTest.cc b/lib/maths/analytics/unittest/CBoostedTreeTest.cc
index f0471a3eed..1ca79890c1 100644
--- a/lib/maths/analytics/unittest/CBoostedTreeTest.cc
+++ b/lib/maths/analytics/unittest/CBoostedTreeTest.cc
@@ -632,7 +632,7 @@ BOOST_AUTO_TEST_CASE(testNonLinear) {
             0.0, modelBias[i][0],
             4.0 * std::sqrt(noiseVariance / static_cast<double>(trainRows)));
         // Good R^2...
-        BOOST_TEST_REQUIRE(modelRSquared[i][0] > 0.97);
+        BOOST_TEST_REQUIRE(modelRSquared[i][0] > 0.96);
 
         meanModelRSquared.add(modelRSquared[i][0]);
     }
@@ -701,7 +701,7 @@ BOOST_AUTO_TEST_CASE(testHuber) {
     }
     LOG_DEBUG(<< "mean R^2 = " << maths::common::CBasicStatistics::mean(meanModelRSquared));
-    BOOST_TEST_REQUIRE(maths::common::CBasicStatistics::mean(meanModelRSquared) > 0.97);
+    BOOST_TEST_REQUIRE(maths::common::CBasicStatistics::mean(meanModelRSquared) > 0.96);
 }
 
 BOOST_AUTO_TEST_CASE(testMsle) {

From 79400ff5f6e545933c2953a2f9c41dd03aa380dd Mon Sep 17 00:00:00 2001
From: Tom Veasey
Date: Tue, 19 Oct 2021 15:44:41 +0100
Subject: [PATCH 4/6] Build fix

---
 lib/maths/common/unittest/CLbfgsTest.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/maths/common/unittest/CLbfgsTest.cc b/lib/maths/common/unittest/CLbfgsTest.cc
index 4c82f5dc19..d207aeb5ee 100644
--- a/lib/maths/common/unittest/CLbfgsTest.cc
+++ b/lib/maths/common/unittest/CLbfgsTest.cc
@@ -246,7 +246,7 @@ BOOST_AUTO_TEST_CASE(testMinimizeWithVerySmallGradient) {
     };
     auto g = [&](const TVector& x) { return 2.0 * eps * (x - xmin); };
 
-    maths::CLbfgs<TVector> lbfgs{10};
+    maths::common::CLbfgs<TVector> lbfgs{10};
 
     TVector x;
     double fx;
@@ -269,7 +269,7 @@ BOOST_AUTO_TEST_CASE(testMinimizeWithInitialZeroGradient) {
     };
     auto g = [&](const TVector& x) { return x == x0 ? zero : 2.0 * (x - xmin); };
 
-    maths::CLbfgs<TVector> lbfgs{10};
+    maths::common::CLbfgs<TVector> lbfgs{10};
 
     TVector x;
     double fx;

From c2e1cc1c8ed848d0d35fbe4f7e58885eb8fd3454 Mon Sep 17 00:00:00 2001
From: Tom Veasey
Date: Tue, 19 Oct 2021 17:52:29 +0100
Subject: [PATCH 5/6] Test thresholds for ARM

---
 lib/api/unittest/CDataFrameAnalyzerTrainingTest.cc | 2 +-
 lib/maths/analytics/unittest/CBoostedTreeTest.cc   | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/lib/api/unittest/CDataFrameAnalyzerTrainingTest.cc b/lib/api/unittest/CDataFrameAnalyzerTrainingTest.cc
index 11d13e2dc5..6a380162b7 100644
--- a/lib/api/unittest/CDataFrameAnalyzerTrainingTest.cc
+++ b/lib/api/unittest/CDataFrameAnalyzerTrainingTest.cc
@@ -573,7 +573,7 @@ BOOST_AUTO_TEST_CASE(testRunBoostedTreeRegressionTrainingWithRowsMissingTargetVa
         BOOST_REQUIRE_CLOSE_ABSOLUTE(
             expected,
             result["row_results"]["results"]["ml"]["target_prediction"].GetDouble(),
-            0.15 * expected);
+            0.2 * expected);
         BOOST_REQUIRE_EQUAL(
             index < 40, result["row_results"]["results"]["ml"]["is_training"].GetBool());
diff --git a/lib/maths/analytics/unittest/CBoostedTreeTest.cc b/lib/maths/analytics/unittest/CBoostedTreeTest.cc
index 1ca79890c1..427bd9a1a0 100644
--- a/lib/maths/analytics/unittest/CBoostedTreeTest.cc
+++ b/lib/maths/analytics/unittest/CBoostedTreeTest.cc
@@ -695,7 +695,7 @@ BOOST_AUTO_TEST_CASE(testHuber) {
             0.0, modelBias[i],
             4.0 * std::sqrt(noiseVariance / static_cast<double>(trainRows)));
         // Good R^2...
-        BOOST_TEST_REQUIRE(modelRSquared[i] > 0.96);
+        BOOST_TEST_REQUIRE(modelRSquared[i] > 0.95);
 
         meanModelRSquared.add(modelRSquared[i]);
     }
@@ -766,7 +766,7 @@ BOOST_AUTO_TEST_CASE(testLowTrainFractionPerFold) {
 
     // Unbiased...
     BOOST_REQUIRE_CLOSE_ABSOLUTE(
-        0.0, bias, 4.0 * std::sqrt(noiseVariance / static_cast<double>(trainRows)));
+        0.0, bias, 7.0 * std::sqrt(noiseVariance / static_cast<double>(trainRows)));
     // Good R^2...
     BOOST_TEST_REQUIRE(rSquared > 0.98);
 }

From bfb10143a16198d1ab10ce00519102c18614c80a Mon Sep 17 00:00:00 2001
From: Tom Veasey
Date: Tue, 19 Oct 2021 18:11:03 +0100
Subject: [PATCH 6/6] Comment

---
 include/maths/common/CLbfgs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/maths/common/CLbfgs.h b/include/maths/common/CLbfgs.h
index 1fbf71b360..e6d8940806 100644
--- a/include/maths/common/CLbfgs.h
+++ b/include/maths/common/CLbfgs.h
@@ -251,7 +251,7 @@ class CLbfgs {
             fs = f(xs);
         }
 
-        // This can happen if |s m_P| < epsilon |m_X|.
+        // This can happen if s |m_P| < epsilon |m_X|.
         if (las::distance(xs, m_X) == 0.0) {
             this->fullLineSearch(f, xs, fs);
         }
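
Postscript: a standalone sketch of the zero-gradient fallback these patches add, for trying it outside the unit test suite. This is not part of the patch series: it mirrors testMinimizeWithInitialZeroGradient from PATCH 1/6, and the TVector alias plus the CLinearAlgebraEigen.h include are assumptions about what the test fixture provides (everything else, the minimize call, the f and g lambdas, the rank and tolerances, is taken directly from the diffs above).

    // Drive CLbfgs through the zero-gradient path: g lies at the start point
    // x0, so the first search direction is zero and the solver must fall back
    // to random probing instead of dividing by the zero norm of m_P.
    #include <maths/common/CLbfgs.h>
    #include <maths/common/CLinearAlgebraEigen.h> // assumed home of the vector alias

    #include <iostream>
    #include <tuple>

    // Assumption: the unit tests alias their vector type roughly like this.
    using TVector = ml::maths::common::CDenseVector<double>;

    int main() {
        TVector zero{TVector::fromStdVector({0.0, 0.0, 0.0})};
        TVector xmin{TVector::fromStdVector({1000.0, 1000.0, 1000.0})};
        TVector x0{TVector::fromStdVector({1200.0, 1200.0, 1200.0})};

        // Quadratic objective with its minimum at xmin.
        auto f = [&](const TVector& x) {
            return static_cast<double>((x - xmin).transpose() * (x - xmin));
        };
        // The true gradient everywhere except x0, where we force it to zero.
        auto g = [&](const TVector& x) {
            return x == x0 ? zero : TVector{2.0 * (x - xmin)};
        };

        ml::maths::common::CLbfgs<TVector> lbfgs{10};

        TVector x;
        double fx;
        std::tie(x, fx) = lbfgs.minimize(f, g, x0, 1e-20);

        // With PATCH 1/6 applied this prints a distance near zero; before it,
        // the line search divided by the zero gradient norm on iteration one.
        std::cout << "|x - xmin| = " << (xmin - x).norm() << ", f(x) = " << fx << "\n";
    }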