From 2adce69651b88b8bd033617b44fe071975bc0232 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Mon, 22 Feb 2021 10:14:56 +0000 Subject: [PATCH 01/35] WIP --- include/maths/CBoostedTreeFactory.h | 2 - include/maths/CLowess.h | 89 +++++++ include/maths/CLowessDetail.h | 337 +++++++++++++++++++++++++ include/maths/CMixtureDistribution.h | 1 + include/maths/CSolvers.h | 11 +- lib/maths/CBoostedTreeFactory.cc | 173 +++---------- lib/maths/unittest/CLowessTest.cc | 262 +++++++++++++++++++ lib/maths/unittest/COneOfNPriorTest.cc | 1 + lib/maths/unittest/CSolversTest.cc | 1 + lib/maths/unittest/Makefile | 1 + lib/maths/unittest/TestUtils.cc | 1 + 11 files changed, 740 insertions(+), 139 deletions(-) create mode 100644 include/maths/CLowess.h create mode 100644 include/maths/CLowessDetail.h create mode 100644 lib/maths/unittest/CLowessTest.cc diff --git a/include/maths/CBoostedTreeFactory.h b/include/maths/CBoostedTreeFactory.h index 53fc2982bb..02564e594f 100644 --- a/include/maths/CBoostedTreeFactory.h +++ b/include/maths/CBoostedTreeFactory.h @@ -215,8 +215,6 @@ class MATHS_EXPORT CBoostedTreeFactory final { const TApplyParameter& applyParameterStep, double intervalLeftEnd, double intervalRightEnd, - double returnedIntervalLeftEndOffset, - double returnedIntervalRightEndOffset, const TAdjustTestLoss& adjustTestLoss = noopAdjustTestLoss) const; //! Initialize the state for hyperparameter optimisation. diff --git a/include/maths/CLowess.h b/include/maths/CLowess.h new file mode 100644 index 0000000000..c440f497f8 --- /dev/null +++ b/include/maths/CLowess.h @@ -0,0 +1,89 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ + +#ifndef INCLUDED_ml_maths_CLowess_h +#define INCLUDED_ml_maths_CLowess_h + +#include + +#include + +#include +#include + +namespace ml { +namespace maths { + +//! \brief LOWESS regression using order N polynomial. +//! +//! DESCRIPTION:\n +//! For more details see https://en.wikipedia.org/wiki/Local_regression. +template +class CLowess { +public: + using TDoubleDoublePr = std::pair; + using TDoubleDoublePrVec = std::vector; + using TPolynomial = CLeastSquaresOnlineRegression; + +public: + //! Fit a polynomial LOWESS model to \p data choosing the weight function to + //! maximize the likelihood of \p numberFolds hold out sets. + //! + //! \param[in] data The training data. + //! \param[in] numberFolds The number of folds to use in cross-validation to + // compute the best weight function from the family exp(-k |xi - xj|). + void fit(TDoubleDoublePrVec data, std::size_t numberFolds); + + //! Predict the value at \p x. + //! + //! \note Defined as zero if no data have been fit. + double predict(double x) const; + + //! Compute the minimum of the function on the training data interval. + //! + //! \note Defined as (0,0) if no data have been fit. + TDoubleDoublePr minimum() const; + + //! Get an estimate of residual variance at the observed values. + //! + //! \note Defined as zero if no data have been fit. + double residualVariance() const; + + //! Compute the sublevel set of \p f containing \p xmin. + //! + //! \param[in] xmin The argument of the minimum of the interpolated function. + //! \param[in] fmin The value of the minimum of the function. + //! \param[in] f The value of the function for which to compute the sublevel set. + //! \note \p f should be greater than fmin. + //! 
\note Defined as (0,0) if no data have been fit. + TDoubleDoublePr sublevelSet(double xmin, double fmin, double f) const; + + //! Get how far we are prepared to extrapolate as the interval we will search + //! in the minimum and sublevelSet functions. + TDoubleDoublePr extrapolationInterval() const; + +private: + using TDoubleVec = std::vector; + using TSizeVec = std::vector; + using TSizeVecVec = std::vector; + using TSizeVecCItr = TSizeVec::const_iterator; + using TMeanVarAccumulator = CBasicStatistics::SSampleMeanVar::TAccumulator; + +private: + void setupMasks(std::size_t numberFolds, TSizeVecVec& trainingMasks, TSizeVecVec& testingMasks) const; + double likelihood(TSizeVecVec& trainingMasks, TSizeVecVec& testingMasks, double k) const; + TPolynomial fit(TSizeVecCItr beginMask, TSizeVecCItr endMask, double k, double x) const; + double weight(double k, double x1, double x2) const; + +private: + TDoubleDoublePrVec m_Data; + TSizeVec m_Mask; + double m_K = 0.0; +}; +} +} + +#endif // INCLUDED_ml_maths_CLowess_h diff --git a/include/maths/CLowessDetail.h b/include/maths/CLowessDetail.h new file mode 100644 index 0000000000..1c6e5be6ed --- /dev/null +++ b/include/maths/CLowessDetail.h @@ -0,0 +1,337 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ + +#ifndef INCLUDED_ml_maths_CLowessDetail_h +#define INCLUDED_ml_maths_CLowessDetail_h + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace ml { +namespace maths { + +template +void CLowess::fit(TDoubleDoublePrVec data, std::size_t numberFolds) { + + m_K = 0.0; + m_Data = std::move(data); + std::sort(m_Data.begin(), m_Data.end(), COrderings::SFirstLess{}); + + if (m_Data.size() < 4) { + return; + } + + // We use exponential decay in the weights and cross-validated maximum likelihood + // to choose the decay constant. Formally, we are fitting + // + // f(x | p^*) = poly(x | p^*(x)) + // + // p^*(x) = argmin_p{ sum_i{ w_i (Y_i - poly(X_i | p))^2 } } + // + // where w = exp(-k (x - X_i)) and (X, Y) are the data to fit. We determine k by + // solving + // + // k^* = argmin_k{ sum_{Yi in H}{ L(Yi | f(x | p^*)) } } + // + // where H is a hold out set and we assume Y_i ~ N(poly(X_i | p^*), sigma) with + // sigma estimated from the training data prediction residuals. 
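+    //
+    // As a purely illustrative usage sketch (not prescriptive), a caller passes
+    // the raw (x, y) pairs and a fold count and then reads values off the fitted
+    // curve, e.g.
+    //
+    //   maths::CLowess<2> lowess;
+    //   lowess.fit({{0.0, 1.0}, {1.0, 0.4}, {2.0, 0.1}, {3.0, 0.3}, {4.0, 0.9}}, 2);
+    //   double xmin, fmin;
+    //   std::tie(xmin, fmin) = lowess.minimum();
+    //   double f15{lowess.predict(1.5)};
+    //
+    // Note that larger k localises the fit (the weights decay faster with
+    // distance) while k -> 0 tends towards a single global order N polynomial fit.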
+ + m_Mask.resize(m_Data.size()); + std::iota(m_Mask.begin(), m_Mask.end(), 0); + + TSizeVecVec trainingMasks; + TSizeVecVec testingMasks; + this->setupMasks(numberFolds, trainingMasks, testingMasks); + + TDoubleVec K(17); + double range{m_Data.back().first - m_Data.front().first}; + for (std::size_t i = 0; i < K.size(); ++i) { + K[i] = 2.0 * static_cast(i) / range; + } + LOG_TRACE(<< "range = " << range << ", K = " << core::CContainerPrinter::print(K)); + + double kmax; + double likelihoodMax; + CSolvers::globalMaximize(K, + [&](double k) { + return this->likelihood(trainingMasks, testingMasks, k); + }, + kmax, likelihoodMax); + LOG_TRACE(<< "kmax = " << kmax << " likelihood(kmax) = " << likelihoodMax); + + m_K = kmax; +} + +template +double CLowess::predict(double x) const { + if (m_Data.empty()) { + return 0.0; + } + auto poly = this->fit(m_Mask.begin(), m_Mask.end(), m_K, x); + return poly.predict(x); +} + +template +typename CLowess::TDoubleDoublePr CLowess::minimum() const { + + if (m_Data.empty()) { + return {0.0, 0.0}; + } + + // There is no guaranty the function is convex so we need a global method. + // We choose something simple: + // 1. Find (local) minimum near a data point. + // 2. Search around here for the true local minimum. + // + // All in all this has complexity O(2 |data| function evaluations). + + TDoubleVec X; + + double xa, xb; + std::tie(xa, xb) = this->extrapolationInterval(); + + // Coarse. + X.reserve(m_Data.size() + 2); + X.push_back(xa); + for (std::size_t i = 0; i < m_Data.size(); ++i) { + X.push_back(m_Data[i].first); + } + X.push_back(xb); + double xmin, fmin; + CSolvers::globalMinimize(X, [&](double x) { return this->predict(x); }, xmin, fmin); + + // Refine. + double range{(xb - xa) / static_cast(X.size())}; + xa = std::max(xa, xmin - 0.5 * range); + xb = std::min(xb, xmin + 0.5 * range); + double dx{2.0 * (xb - xa) / static_cast(X.size())}; + X.clear(); + for (double x = xa; x < xb; x += dx) { + X.push_back(x); + } + double xcand, fcand; + CSolvers::globalMinimize(X, [&](double x) { return this->predict(x); }, xcand, fcand); + + if (fcand < fmin) { + xmin = xcand; + fmin = fcand; + } + + return {xmin, fmin}; +} + +template +double CLowess::residualVariance() const { + + if (m_Data.empty()) { + return 0.0; + } + + TMeanVarAccumulator moments; + + std::size_t n{m_Data.size()}; + + TSizeVec mask(n); + std::iota(mask.begin(), mask.end(), 1); + for (std::size_t i = 0; i < n; ++i) { + double xi, yi; + std::tie(xi, yi) = m_Data[i]; + auto poly = this->fit(mask.begin(), mask.begin() + n - 1, m_K, xi); + moments.add(yi - poly.predict(xi)); + mask[i] = i; + } + + return CBasicStatistics::variance(moments); +} + +template +typename CLowess::TDoubleDoublePr +CLowess::sublevelSet(double xmin, double fmin, double f) const { + + if (m_Data.empty()) { + return {0.0, 0.0}; + } + if (f <= fmin) { + return {xmin, xmin}; + } + + auto solve = [&](double n, double stop) { + double fx{fmin}; + for (double i = 1.0; i <= n; i += 1.0) { + double xlast{((i - 1.0) * stop + (n - i + 1.0) * xmin) / n}; + double x{(i * stop + (n - i) * xmin) / n}; + double flast{fx}; + fx = this->predict(x); + if (fx > f) { + return CTools::linearlyInterpolate(flast, fx, xlast, x, f); + } + } + return stop; + }; + + double xa, xb; + std::tie(xa, xb) = this->extrapolationInterval(); + double alpha{(xmin - xa) / (xb - xa)}; + double beta{1.0 - alpha}; + LOG_TRACE(<< "alpha = " << alpha << ", beta = " << beta); + + return {solve(std::ceil(alpha * 40.0), xa), + solve(std::ceil((1.0 - alpha) * 40.0), xb)}; 
+} + +template +typename CLowess::TDoubleDoublePr CLowess::extrapolationInterval() const { + double xa{m_Data.front().first}; + double xb{m_Data.back().first}; + xa -= std::min(0.1 * (xb - xa), 0.5 / m_K); + xb += std::min(0.1 * (xb - xa), 0.5 / m_K); + return {xa, xb}; +} + +template +void CLowess::setupMasks(std::size_t numberFolds, + TSizeVecVec& trainingMasks, + TSizeVecVec& testingMasks) const { + + numberFolds = CTools::truncate(numberFolds, std::size_t{2}, m_Data.size()); + + trainingMasks.resize(numberFolds); + testingMasks.resize(numberFolds); + + if (numberFolds == m_Data.size()) { + // Leave-out-one cross-validation. + trainingMasks[0].resize(m_Data.size() - 1); + std::iota(trainingMasks[0].begin(), trainingMasks[0].end(), 1); + testingMasks[0].push_back(0); + for (std::size_t i = 1; i < numberFolds; ++i) { + trainingMasks[i] = trainingMasks[0]; + trainingMasks[i][i - 1] = 0; + std::sort(trainingMasks[i].begin(), trainingMasks[i].end()); + testingMasks[i].push_back(i); + } + } else { + // K-fold cross-validation. + CPRNG::CXorOShiro128Plus rng; + TSizeVec all(m_Data.size()); + TSizeVec remaining; + TSizeVec sample; + TDoubleVec probabilities; + + std::iota(all.begin(), all.end(), 0); + remaining = all; + + for (std::size_t i = 0; i < numberFolds; ++i) { + std::size_t n{std::min((m_Data.size() + numberFolds - 1) / numberFolds, + remaining.size())}; + probabilities.assign(remaining.size(), 1.0); + CSampling::categoricalSampleWithoutReplacement(rng, probabilities, n, sample); + + testingMasks[i].reserve(sample.size()); + for (auto j : sample) { + testingMasks[i].push_back(remaining[j]); + } + std::sort(testingMasks[i].begin(), testingMasks[i].end()); + + trainingMasks[i].reserve(all.size() - testingMasks[i].size()); + std::set_difference(all.begin(), all.end(), testingMasks[i].begin(), + testingMasks[i].end(), + std::back_inserter(trainingMasks[i])); + + CSetTools::inplace_set_difference(remaining, testingMasks[i].begin(), + testingMasks[i].end()); + rng.discard(100000); + } + } + + LOG_TRACE(<< "training masks = " << core::CContainerPrinter::print(trainingMasks)); + LOG_TRACE(<< "testing masks = " << core::CContainerPrinter::print(testingMasks)); +} + +template +double CLowess::likelihood(TSizeVecVec& trainingMasks, TSizeVecVec& testingMasks, double k) const { + + double result{0.0}; + + CNormalMeanPrecConjugate::TDouble1Vec samples; + CNormalMeanPrecConjugate::TDoubleWeightsAry1Vec weights; + + for (std::size_t i = 0; i < trainingMasks.size(); ++i) { + + CNormalMeanPrecConjugate residuals{ + CNormalMeanPrecConjugate::nonInformativePrior(maths_t::E_ContinuousData)}; + + std::size_t last{trainingMasks[i].size() - 1}; + + for (auto& j : trainingMasks[i]) { + double xj, yj; + std::tie(xj, yj) = m_Data[j]; + std::swap(j, trainingMasks[i][last]); + auto poly = this->fit(trainingMasks[i].cbegin(), + trainingMasks[i].cbegin() + last, k, xj); + std::swap(j, trainingMasks[i][last]); + residuals.addSamples({yj - poly.predict(xj)}, maths_t::CUnitWeights::SINGLE_UNIT); + } + LOG_TRACE(<< "residual distribution = " << residuals.print()); + + samples.clear(); + samples.reserve(testingMasks[i].size()); + for (auto j : testingMasks[i]) { + double xj, yj; + std::tie(xj, yj) = m_Data[j]; + auto poly = this->fit(trainingMasks[i].cbegin(), + trainingMasks[i].cend(), k, xj); + samples.push_back(yj - poly.predict(xj)); + } + weights.assign(testingMasks[i].size(), maths_t::CUnitWeights::UNIT); + LOG_TRACE(<< "samples = " << samples); + + double likelihood; + residuals.jointLogMarginalLikelihood(samples, 
weights, likelihood); + result += likelihood; + } + LOG_TRACE(<< "k = " << k << ", likelihood = " << result); + + return result; +} + +template +typename CLowess::TPolynomial +CLowess::fit(TSizeVecCItr beginMask, TSizeVecCItr endMask, double k, double x) const { + TPolynomial poly; + for (auto i = beginMask; i != endMask; ++i) { + double xi, yi; + std::tie(xi, yi) = m_Data[*i]; + poly.add(xi, yi, this->weight(k, xi, x)); + } + return poly; +} + +template +double CLowess::weight(double k, double x1, double x2) const { + return std::exp(-k * std::fabs(x2 - x1)); +} +} +} + +#endif // INCLUDED_ml_maths_CLowessDetail_h diff --git a/include/maths/CMixtureDistribution.h b/include/maths/CMixtureDistribution.h index 69ac0c4721..6a7e4d29b2 100644 --- a/include/maths/CMixtureDistribution.h +++ b/include/maths/CMixtureDistribution.h @@ -10,6 +10,7 @@ #include #include +#include #include #include #include diff --git a/include/maths/CSolvers.h b/include/maths/CSolvers.h index 8f1b5bba54..57ab149c88 100644 --- a/include/maths/CSolvers.h +++ b/include/maths/CSolvers.h @@ -11,7 +11,6 @@ #include #include -#include #include #include #include @@ -860,8 +859,8 @@ class MATHS_EXPORT CSolvers { //! \param[out] fx Set to the value of f at \p x. template static bool globalMaximize(const T& p, const F& f, double& x, double& fx) { - CCompositeFunctions::CMinus f_(f); - bool result = globalMinimize(p, f_, x, fx); + auto minusF = [&](double x_) { return -f(x_); }; + bool result{globalMinimize(p, minusF, x, fx)}; fx = -fx; return result; } @@ -923,7 +922,7 @@ class MATHS_EXPORT CSolvers { // [a, x] and [b, r] bracket the sublevel set end points. - CCompositeFunctions::CMinusConstant f_(f, fc); + auto fMinusFc = [=](double x_) { return f(x_) - fc; }; LOG_TRACE(<< "a = " << a << ", x = " << x << ", b = " << b); LOG_TRACE(<< "f_(a) = " << fa - fc << ", f_(x) = " << fx - fc @@ -935,7 +934,7 @@ class MATHS_EXPORT CSolvers { try { std::size_t n = maxIterations; - solve(a, x, fa - fc, fx - fc, f_, n, equal, result.first); + solve(a, x, fa - fc, fx - fc, fMinusFc, n, equal, result.first); LOG_TRACE(<< "iterations = " << n); } catch (const std::exception& e) { LOG_ERROR(<< "Failed to find left end point: " << e.what()); @@ -944,7 +943,7 @@ class MATHS_EXPORT CSolvers { try { std::size_t n = maxIterations; - solve(x, b, fx - fc, fb - fc, f_, n, equal, result.second); + solve(x, b, fx - fc, fb - fc, fMinusFc, n, equal, result.second); LOG_TRACE(<< "iterations = " << n); } catch (std::exception& e) { LOG_ERROR(<< "Failed to find right end point: " << e.what()); diff --git a/lib/maths/CBoostedTreeFactory.cc b/lib/maths/CBoostedTreeFactory.cc index c2477c3f4a..8037ff4f6e 100644 --- a/lib/maths/CBoostedTreeFactory.cc +++ b/lib/maths/CBoostedTreeFactory.cc @@ -18,8 +18,8 @@ #include #include #include -#include -#include +#include +#include #include #include @@ -56,13 +56,11 @@ const double MIN_DOWNSAMPLE_FACTOR{1e-3}; const double MIN_INITIAL_DOWNSAMPLE_FACTOR{0.05}; const double MAX_INITIAL_DOWNSAMPLE_FACTOR{0.5}; const double MIN_DOWNSAMPLE_FACTOR_SCALE{0.3}; -const double MAX_DOWNSAMPLE_FACTOR_SCALE{3.0}; // This isn't a hard limit but we increase the number of default training folds // if the initial downsample fraction would be larger than this. const double MAX_DESIRED_INITIAL_DOWNSAMPLE_FRACTION{0.5}; const double MAX_NUMBER_FOLDS{5.0}; const std::size_t MAX_NUMBER_TREES{static_cast(2.0 / MIN_ETA + 0.5)}; -const double EPS{0.01}; double computeEta(std::size_t numberRegressors) { // eta is the learning rate. 
There is a lot of empirical evidence that @@ -106,6 +104,9 @@ CBoostedTreeFactory::buildFor(core::CDataFrame& frame, std::size_t dependentVari ? this->skipProgressMonitoringFeatureSelection() : this->startProgressMonitoringFeatureSelection(); + // Find the maximum number of rows at which the selected tree depth does not change significantly. + // Need to call hyperparameter set up first. + skipIfAfter(CBoostedTreeImpl::E_NotInitialized, [&] { this->initializeCrossValidation(frame); }); skipIfAfter(CBoostedTreeImpl::E_NotInitialized, @@ -124,6 +125,8 @@ CBoostedTreeFactory::buildFor(core::CDataFrame& frame, std::size_t dependentVari this->initializeHyperparameterOptimisation(); } + LOG_INFO(<< "number threads = " << m_NumberThreads); + auto treeImpl = std::make_unique(m_NumberThreads, m_TreeImpl->m_Loss->clone()); std::swap(m_TreeImpl, treeImpl); @@ -551,7 +554,6 @@ void CBoostedTreeFactory::initializeUnsetRegularizationHyperparameters(core::CDa double minSoftDepthLimit{MIN_SOFT_DEPTH_LIMIT}; double maxSoftDepthLimit{MIN_SOFT_DEPTH_LIMIT + log2MaxTreeSize}; double meanSoftDepthLimit{(minSoftDepthLimit + maxSoftDepthLimit) / 2.0}; - double mainLoopSearchInterval{log2MaxTreeSize / 2.0}; LOG_TRACE(<< "mean soft depth limit = " << meanSoftDepthLimit); auto applySoftDepthLimit = [](CBoostedTreeImpl& tree, double softDepthLimit) { @@ -562,9 +564,7 @@ void CBoostedTreeFactory::initializeUnsetRegularizationHyperparameters(core::CDa TVector fallback{{minSoftDepthLimit, meanSoftDepthLimit, maxSoftDepthLimit}}; m_SoftDepthLimitSearchInterval = this->testLossLineSearch(frame, applySoftDepthLimit, - minSoftDepthLimit, maxSoftDepthLimit, - -mainLoopSearchInterval / 2.0, - mainLoopSearchInterval / 2.0) + minSoftDepthLimit, maxSoftDepthLimit) .value_or(fallback); m_SoftDepthLimitSearchInterval = max(m_SoftDepthLimitSearchInterval, TVector{1.0}); @@ -597,7 +597,6 @@ void CBoostedTreeFactory::initializeUnsetRegularizationHyperparameters(core::CDa logMaxDepthPenaltyMultiplier - CTools::stableLog(searchIntervalSize)}; double meanLogDepthPenaltyMultiplier{ (logMinDepthPenaltyMultiplier + logMaxDepthPenaltyMultiplier) / 2.0}; - double mainLoopSearchInterval{CTools::stableLog(searchIntervalSize) / 2.0}; LOG_TRACE(<< "mean log depth penalty multiplier = " << meanLogDepthPenaltyMultiplier); @@ -616,9 +615,7 @@ void CBoostedTreeFactory::initializeUnsetRegularizationHyperparameters(core::CDa m_LogDepthPenaltyMultiplierSearchInterval = this->testLossLineSearch(frame, applyDepthPenaltyMultiplier, logMinDepthPenaltyMultiplier, - logMaxDepthPenaltyMultiplier, - -mainLoopSearchInterval / 2.0, - mainLoopSearchInterval / 2.0) + logMaxDepthPenaltyMultiplier) .value_or(fallback); LOG_TRACE(<< "log depth penalty multiplier search interval = [" << m_LogDepthPenaltyMultiplierSearchInterval.toDelimited() @@ -651,7 +648,6 @@ void CBoostedTreeFactory::initializeUnsetRegularizationHyperparameters(core::CDa logMaxTreeSizePenaltyMultiplier - CTools::stableLog(searchIntervalSize)}; double meanLogTreeSizePenaltyMultiplier{ (logMinTreeSizePenaltyMultiplier + logMaxTreeSizePenaltyMultiplier) / 2.0}; - double mainLoopSearchInterval{0.5 * CTools::stableLog(searchIntervalSize)}; LOG_TRACE(<< "mean log tree size penalty multiplier = " << meanLogTreeSizePenaltyMultiplier); @@ -670,9 +666,7 @@ void CBoostedTreeFactory::initializeUnsetRegularizationHyperparameters(core::CDa m_LogTreeSizePenaltyMultiplierSearchInterval = this->testLossLineSearch(frame, applyTreeSizePenaltyMultiplier, logMinTreeSizePenaltyMultiplier, - 
logMaxTreeSizePenaltyMultiplier, - -mainLoopSearchInterval / 2.0, - mainLoopSearchInterval / 2.0) + logMaxTreeSizePenaltyMultiplier) .value_or(fallback); LOG_TRACE(<< "log tree size penalty multiplier search interval = [" << m_LogTreeSizePenaltyMultiplierSearchInterval.toDelimited() @@ -706,7 +700,6 @@ void CBoostedTreeFactory::initializeUnsetRegularizationHyperparameters(core::CDa CTools::stableLog(searchIntervalSize)}; double meanLogLeafWeightPenaltyMultiplier{ (logMinLeafWeightPenaltyMultiplier + logMaxLeafWeightPenaltyMultiplier) / 2.0}; - double mainLoopSearchInterval{0.5 * CTools::stableLog(searchIntervalSize)}; LOG_TRACE(<< "mean log leaf weight penalty multiplier = " << meanLogLeafWeightPenaltyMultiplier); @@ -725,9 +718,7 @@ void CBoostedTreeFactory::initializeUnsetRegularizationHyperparameters(core::CDa m_LogLeafWeightPenaltyMultiplierSearchInterval = this->testLossLineSearch(frame, applyLeafWeightPenaltyMultiplier, logMinLeafWeightPenaltyMultiplier, - logMaxLeafWeightPenaltyMultiplier, - -mainLoopSearchInterval / 2.0, - mainLoopSearchInterval / 2.0) + logMaxLeafWeightPenaltyMultiplier) .value_or(fallback); LOG_TRACE(<< "log leaf weight penalty multiplier search interval = [" << m_LogLeafWeightPenaltyMultiplierSearchInterval.toDelimited() @@ -818,7 +809,7 @@ void CBoostedTreeFactory::initializeUnsetDownsampleFactor(core::CDataFrame& fram double minTestLoss, double testLoss) { return testLoss + CTools::linearlyInterpolate( logMinDownsampleFactor, logMaxDownsampleFactor, - 0.0, EPS * minTestLoss, logDownsampleFactor); + 0.0, 0.01 * minTestLoss, logDownsampleFactor); }; TVector fallback; @@ -827,11 +818,9 @@ void CBoostedTreeFactory::initializeUnsetDownsampleFactor(core::CDataFrame& fram fallback(MAX_PARAMETER_INDEX) = logMaxDownsampleFactor; m_LogDownsampleFactorSearchInterval = - this->testLossLineSearch( - frame, applyDownsampleFactor, - logMinDownsampleFactor, logMaxDownsampleFactor, - CTools::stableLog(MIN_DOWNSAMPLE_FACTOR_SCALE), - CTools::stableLog(MAX_DOWNSAMPLE_FACTOR_SCALE), adjustTestLoss) + this->testLossLineSearch(frame, applyDownsampleFactor, + logMinDownsampleFactor, + logMaxDownsampleFactor, adjustTestLoss) .value_or(fallback); // Truncate the log(factor) to be less than or equal to log(1.0) and the @@ -870,7 +859,6 @@ void CBoostedTreeFactory::initializeUnsetFeatureBagFraction(core::CDataFrame& fr 2.0 * m_TreeImpl->m_FeatureBagFraction, MAX_FEATURE_BAG_FRACTION))}; double logMinFeatureBagFraction{logMaxFeatureBagFraction - CTools::stableLog(searchIntervalSize)}; - double mainLoopSearchInterval{CTools::stableLog(0.2 * searchIntervalSize)}; auto applyFeatureBagFraction = [&](CBoostedTreeImpl& tree, double logFeatureBagFraction) { @@ -888,7 +876,7 @@ void CBoostedTreeFactory::initializeUnsetFeatureBagFraction(core::CDataFrame& fr double minTestLoss, double testLoss) { return testLoss + CTools::linearlyInterpolate( logMinFeatureBagFraction, logMaxFeatureBagFraction, - 0.0, EPS * minTestLoss, logFeatureBagFraction); + 0.0, 0.01 * minTestLoss, logFeatureBagFraction); }; TVector fallback; @@ -896,10 +884,9 @@ void CBoostedTreeFactory::initializeUnsetFeatureBagFraction(core::CDataFrame& fr fallback(BEST_PARAMETER_INDEX) = logMaxFeatureBagFraction; fallback(MAX_PARAMETER_INDEX) = logMaxFeatureBagFraction; m_LogFeatureBagFractionInterval = - this->testLossLineSearch( - frame, applyFeatureBagFraction, logMinFeatureBagFraction, - logMaxFeatureBagFraction, -mainLoopSearchInterval / 2.0, - mainLoopSearchInterval / 2.0, adjustTestLoss) + this->testLossLineSearch(frame, 
applyFeatureBagFraction, + logMinFeatureBagFraction, + logMaxFeatureBagFraction, adjustTestLoss) .value_or(fallback); // Truncate the log(fraction) to be less than or equal to log(MAX_FEATURE_BAG_FRACTION). @@ -931,7 +918,6 @@ void CBoostedTreeFactory::initializeUnsetEta(core::CDataFrame& frame) { m_TreeImpl->m_Eta)}; double logMinEta{logMaxEta - CTools::stableLog(searchIntervalSize)}; double meanLogEta{(logMaxEta + logMinEta) / 2.0}; - double mainLoopSearchInterval{CTools::stableLog(0.2 * searchIntervalSize)}; LOG_TRACE(<< "mean log eta = " << meanLogEta); auto applyEta = [](CBoostedTreeImpl& tree, double eta) { @@ -951,9 +937,7 @@ void CBoostedTreeFactory::initializeUnsetEta(core::CDataFrame& frame) { fallback(MAX_PARAMETER_INDEX) = logMaxEta; m_LogEtaSearchInterval = - this->testLossLineSearch(frame, applyEta, logMinEta, logMaxEta, - -mainLoopSearchInterval / 2.0, - mainLoopSearchInterval / 2.0) + this->testLossLineSearch(frame, applyEta, logMinEta, logMaxEta) .value_or(fallback); m_LogEtaSearchInterval = min(m_LogEtaSearchInterval, TVector{0.0}); LOG_TRACE(<< "log eta search interval = [" @@ -1006,8 +990,6 @@ CBoostedTreeFactory::testLossLineSearch(core::CDataFrame& frame, const TApplyParameter& applyParameter, double intervalLeftEnd, double intervalRightEnd, - double returnedIntervalLeftEndOffset, - double returnedIntervalRightEndOffset, const TAdjustTestLoss& adjustTestLoss_) const { // This has the following steps: @@ -1024,14 +1006,13 @@ CBoostedTreeFactory::testLossLineSearch(core::CDataFrame& frame, // the returned interval if we can determine there is a low chance of // missing the best solution by doing so. - using TMeanVarAccumulator = CBasicStatistics::SSampleMeanVar::TAccumulator; using TMinAccumulator = CBasicStatistics::SMin::TAccumulator; TMinAccumulator minTestLoss; TDoubleDoublePrVec testLosses; testLosses.reserve(MAX_LINE_SEARCH_ITERATIONS); // Ensure we choose one value based on expected improvement. - std::size_t minNumberTestLosses{5}; + std::size_t minNumberTestLosses{6}; for (auto parameter : {intervalLeftEnd, (2.0 * intervalLeftEnd + intervalRightEnd) / 3.0, @@ -1101,97 +1082,27 @@ CBoostedTreeFactory::testLossLineSearch(core::CDataFrame& frame, } std::sort(testLosses.begin(), testLosses.end()); - LOG_TRACE(<< "test losses = " << core::CContainerPrinter::print(testLosses)); - - // Find the smallest test losses and the corresponding parameter interval. 
- auto minimumTestLosses = CBasicStatistics::orderStatisticsAccumulator( - minNumberTestLosses - 1, COrderings::SSecondLess{}); - minimumTestLosses.add(testLosses); - double minGoodParameter{std::min_element(minimumTestLosses.begin(), - minimumTestLosses.end(), COrderings::SFirstLess{}) - ->first}; - double maxGoodParameter{std::max_element(minimumTestLosses.begin(), - minimumTestLosses.end(), COrderings::SFirstLess{}) - ->first}; - auto beginGoodParameterLosses = - std::find_if(testLosses.begin(), testLosses.end(), - [minGoodParameter](const TDoubleDoublePr& loss) { - return loss.first == minGoodParameter; - }); - auto endGoodParameterLosses = - std::find_if(testLosses.begin(), testLosses.end(), - [maxGoodParameter](const TDoubleDoublePr& loss) { - return loss.first == maxGoodParameter; - }) + - 1; - LOG_TRACE(<< "good parameter range = [" << minGoodParameter << "," - << maxGoodParameter << "]"); - - CLeastSquaresOnlineRegression<2, double> leastSquaresQuadraticTestLoss; - for (auto loss = beginGoodParameterLosses; loss != endGoodParameterLosses; ++loss) { - leastSquaresQuadraticTestLoss.add(loss->first, loss->second); - } - CLeastSquaresOnlineRegression<2, double>::TArray params; - if (leastSquaresQuadraticTestLoss.parameters(params) == false) { - return TOptionalVector{}; - } - - double gradient{params[1]}; - double curvature{params[2]}; - LOG_TRACE(<< "[intercept, slope, curvature] = " - << core::CContainerPrinter::print(params)); - - // Find the minimizer of the least squares quadratic fit to the test loss - // in the search interval. (Note step size is negative.) - double stationaryPoint{-(gradient == curvature ? 0.5 : gradient / 2.0 / curvature)}; - double bestParameter{[&] { - if (curvature < 0.0) { - // Stationary point is a maximum so use furthest point in interval. - double distanceToLeftEndpoint{std::fabs(minGoodParameter - stationaryPoint)}; - double distanceToRightEndpoint{std::fabs(maxGoodParameter - stationaryPoint)}; - return distanceToLeftEndpoint > distanceToRightEndpoint ? minGoodParameter - : maxGoodParameter; - } - // Stationary point is a minimum so use nearest point in the interval. - return CTools::truncate(stationaryPoint, minGoodParameter, maxGoodParameter); - }()}; - LOG_TRACE(<< "best parameter = " << bestParameter); - - TVector interval{{returnedIntervalLeftEndOffset, 0.0, returnedIntervalRightEndOffset}}; - if (minGoodParameter > intervalLeftEnd) { - interval(MIN_PARAMETER_INDEX) = std::max(minGoodParameter - bestParameter, - interval(MIN_PARAMETER_INDEX)); - } - if (maxGoodParameter < intervalRightEnd) { - interval(MAX_PARAMETER_INDEX) = std::min(maxGoodParameter - bestParameter, - interval(MAX_PARAMETER_INDEX)); - } - if (curvature > 0.0) { - // Find a short interval with a high probability of containing the optimal - // regularisation parameter if we found a minimum. In particular, we solve - // curvature * (x - best)^2 = 3 sigma where sigma is the standard deviation - // of the test loss residuals to get the interval endpoints. We don't - // extrapolate the loss function outside the line segment we searched so - // don't truncate if an endpoint lies outside the searched interval. 
- TMeanVarAccumulator residualMoments; - for (auto loss = beginGoodParameterLosses; loss != endGoodParameterLosses; ++loss) { - residualMoments.add(loss->second - - leastSquaresQuadraticTestLoss.predict(loss->first)); - } - double sigma{std::sqrt(CBasicStatistics::variance(residualMoments))}; - double threeSigmaInterval{std::sqrt(3.0 * sigma / curvature)}; - if (bestParameter - threeSigmaInterval >= minGoodParameter) { - interval(MIN_PARAMETER_INDEX) = - std::max(-threeSigmaInterval, returnedIntervalLeftEndOffset); - } - if (bestParameter + threeSigmaInterval <= maxGoodParameter) { - interval(MAX_PARAMETER_INDEX) = - std::min(threeSigmaInterval, returnedIntervalRightEndOffset); - } - } - interval += TVector{bestParameter}; - - return TOptionalVector{interval}; + LOG_INFO(<< "test losses = " << core::CContainerPrinter::print(testLosses)); + + CLowess<2> lowess; + lowess.fit(std::move(testLosses), testLosses.size()); + + double bestParameter, bestParameterTestLoss; + std::tie(bestParameter, bestParameterTestLoss) = lowess.minimum(); + LOG_INFO(<< "best parameter = " << bestParameter << ", test loss = " << bestParameterTestLoss); + + double width{(intervalRightEnd - intervalLeftEnd) / static_cast(MAX_LINE_SEARCH_ITERATIONS)}; + intervalLeftEnd = bestParameter - width; + intervalRightEnd = bestParameter + width; + LOG_INFO(<< "interval = [" << intervalLeftEnd << "," << intervalRightEnd << "]"); + //double residualVariance{lowess.residualVariance()}; + //std::tie(intervalLeftEnd, intervalRightEnd) = + // lowess.sublevelSet(bestParameter, bestParameterTestLoss, + // bestParameterTestLoss + std::sqrt(residualVariance)); + //LOG_INFO(<< "residual variance = " << residualVariance << " interval = [" + // << intervalLeftEnd << "," << intervalRightEnd << "]"); + + return TVector{{intervalLeftEnd, bestParameter, intervalRightEnd}}; } CBoostedTreeFactory CBoostedTreeFactory::constructFromParameters(std::size_t numberThreads, diff --git a/lib/maths/unittest/CLowessTest.cc b/lib/maths/unittest/CLowessTest.cc new file mode 100644 index 0000000000..cd3073949b --- /dev/null +++ b/lib/maths/unittest/CLowessTest.cc @@ -0,0 +1,262 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ + +#include + +#include +#include +#include + +#include +#include + +#include +#include + +#include + +BOOST_AUTO_TEST_SUITE(CLowessTest) + +using namespace ml; + +using TDoubleVec = std::vector; +using TDoubleVecVec = std::vector; +using TMeanVarAccumulator = maths::CBasicStatistics::SSampleMeanVar::TAccumulator; + +BOOST_AUTO_TEST_CASE(testInvariants) { + + // Test invariants are satisfied on random input. + + // We check: + // 1. Minimum is a local minimum. + // 2. The sublevel set contains the minimum. + // 3. The minimum is within 10% of the training data interval. + // 4. The ends of the sublevel set is within 10% of the training data interval. + // 5. The variance is greater than or equal to the variance of the residuals at + // the training data. 
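+    // (For reference: the sublevel set of the fit at level f is {x : prediction(x) <= f};
+    // sublevelSet returns an interval [xa, xb] around xmin approximating the
+    // component of that set which contains the minimum.)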
+ + test::CRandomNumbers rng; + + TDoubleVec scale; + TDoubleVec offset; + TDoubleVec noise; + maths::CLowess<2>::TDoubleDoublePrVec data; + + std::function trends[]{ + [&](double x) { + return scale[0] * std::sin(boost::math::double_constants::two_pi / + 20.0 * (x + offset[0])); + }, + [&](double x) { return scale[0] * x / 10.0; }, + [&](double x) { + return scale[0] * (x - offset[0]) * (x - offset[0]) / 100.0; + } + }; + + for (std::size_t i = 0; i < 100; ++i) { + + for (const auto& trend : trends) { + rng.generateUniformSamples(0.0, 10.0, 1, scale); + rng.generateUniformSamples(0.0, 20.0, 1, offset); + rng.generateNormalSamples(0.0, 4.0, 20, noise); + + data.clear(); + for (std::size_t j = 0; j < noise.size(); ++j) { + double x{static_cast(j)}; + data.emplace_back(x, trend(x) + noise[j]); + } + + maths::CLowess<2> lowess; + lowess.fit(data, 5); + + double xea, xeb; + std::tie(xea, xeb) = lowess.extrapolationInterval(); + + double xmin, fmin; + std::tie(xmin, fmin) = lowess.minimum(); + BOOST_REQUIRE_EQUAL(fmin, lowess.predict(xmin)); + BOOST_TEST_REQUIRE(fmin <= lowess.predict(std::max(xmin - 0.1, xea))); + BOOST_TEST_REQUIRE(fmin <= lowess.predict(std::min(xmin + 0.1, xeb))); + + double xa, xb; + std::tie(xa, xb) = lowess.sublevelSet(xmin, fmin, fmin + 0.1); + BOOST_TEST_REQUIRE(xa <= xmin); + BOOST_TEST_REQUIRE(xb >= xmin); + + BOOST_TEST_REQUIRE(xmin >= xea); + BOOST_TEST_REQUIRE(xmin <= xeb); + BOOST_TEST_REQUIRE(xa >= xea); + BOOST_TEST_REQUIRE(xb <= xeb); + BOOST_TEST_REQUIRE(xa >= xea); + BOOST_TEST_REQUIRE(xb <= xeb); + + TMeanVarAccumulator residualMoments; + for (const auto& x : data) { + residualMoments.add(x.second - lowess.predict(x.first)); + } + BOOST_TEST_REQUIRE(maths::CBasicStatistics::variance(residualMoments) < + lowess.residualVariance()); + } + } +} + +BOOST_AUTO_TEST_CASE(testSmooth) { + + // Test the prediction errors on a smooth function. + + test::CRandomNumbers rng; + + auto trend = [](double x) { + return 8.0 * std::sin(boost::math::double_constants::two_pi / 20.0 * x); + }; + + maths::CLowess<2>::TDoubleDoublePrVec data; + for (std::size_t i = 0; i < 20; ++i) { + double x{static_cast(i)}; + data.emplace_back(x, trend(x)); + } + + maths::CLowess<2> lowess; + lowess.fit(data, 5); + + TMeanVarAccumulator errorMoments; + for (std::size_t i = 0; i < 20; ++i) { + double x{static_cast(i)}; + errorMoments.add(std::fabs(lowess.predict(x) - trend(x))); + } + LOG_DEBUG(<< "mean error = " << maths::CBasicStatistics::mean(errorMoments)); + + BOOST_TEST_REQUIRE(maths::CBasicStatistics::mean(errorMoments) < 0.1); +} + +BOOST_AUTO_TEST_CASE(testSmoothPlusNoise) { + + // Test the prediction errors on a smooth function plus noise. 
+ + test::CRandomNumbers rng; + + TDoubleVec noise; + rng.generateNormalSamples(0.0, 4.0, 20, noise); + + auto trend = [](double x) { + return 8.0 * std::sin(boost::math::double_constants::two_pi / 20.0 * x); + }; + + maths::CLowess<2>::TDoubleDoublePrVec data; + for (std::size_t i = 0; i < noise.size(); ++i) { + double x{static_cast(i)}; + data.emplace_back(x, trend(x) + noise[i]); + } + + maths::CLowess<2> lowess; + lowess.fit(data, 5); + + TMeanVarAccumulator errorMoments; + for (std::size_t i = 0; i < 20; ++i) { + double x{static_cast(i)}; + errorMoments.add(std::fabs(lowess.predict(x) - trend(x))); + } + LOG_DEBUG(<< "mean error = " << maths::CBasicStatistics::mean(errorMoments)); + + BOOST_TEST_REQUIRE(maths::CBasicStatistics::mean(errorMoments) < 0.8); + BOOST_TEST_REQUIRE(std::fabs(std::sqrt(lowess.residualVariance()) - 2.0) < 0.6); +} + +BOOST_AUTO_TEST_CASE(testMinimum) { + + // Check that the minimum and the predicted value at the minimum is close to + // what we'd expect. + + test::CRandomNumbers rng; + + auto trend = [](double x) { + return 8.0 * std::sin(boost::math::double_constants::two_pi / 20.0 * x); + }; + + maths::CLowess<2>::TDoubleDoublePrVec data; + for (std::size_t i = 0; i < 20; ++i) { + double x{static_cast(i)}; + data.emplace_back(x, trend(x)); + } + + maths::CLowess<2> lowess; + lowess.fit(data, 5); + + double x, fx; + std::tie(x, fx) = lowess.minimum(); + + // Expect minimum at ((3 / 2) * pi) / (2 pi / 20) = 15 and a value of around -8.0; + + LOG_DEBUG(<< "xmin = " << x << ", f(xmin) = " << fx); + BOOST_REQUIRE_CLOSE(15.0, x, 1.0); // 1% + BOOST_REQUIRE_CLOSE(-8.0, fx, 5.0); // 5% +} + +BOOST_AUTO_TEST_CASE(testTrainingLossCurves) { + + // Test minimization of some training loss curves from boosted tree hyperparameter + // line searches for: + // 1. Miniboone + // 2. Car-parts + // 3. 
Boston + + using TDoubleDoublePrVecVec = std::vector::TDoubleDoublePrVec>; + + // clang-format off + TDoubleDoublePrVecVec curves{ + {{2.0, 0.1767327}, {6.080264, 0.1659147}, {9.615924, 0.1607294}, {10.16053, 0.1614871}, {14.24079, 0.1633198}}, + {{-2.561376, 0.1672884}, {-1.085517, 0.1647196}, {0.3903422, 0.1639279}, {1.474411, 0.1662013}, {1.866201, 0.1628465}}, + {{-2.561376, 0.162188}, {-1.085517, 0.1600827}, {-0.5958557, 0.1598617}, {0.3903422, 0.1642588}, {1.866201, 0.1778405}}, + {{-1.600108, 0.1588888}, {0.342874, 0.1574784}, {2.285856, 0.1569175}, {3.825301, 0.1527161}, {4.228838, 0.1555854}}, + {{-4.969813, 0.5935475}, {-3.313209, 0.2387051}, {-1.656604, 0.1552702}, {-0.7187975, 0.1507938}, {0, 0.1494794}}, + {{-2.302585, 0.1651654}, {-1.609438, 0.1712131}, {-0.9162907, 0.1550724}, {-0.4452244, 0.1491943}, {-0.2231436, 0.1489314}}, + {{2.0, 0.01361971}, {5.811543, 0.002268836}, {6.648845, 0.001762906}, {6.731061, 0.001930386}, {8.76648, 0.001210521}, + {9.58383, 0.002405683}, {9.623085, 0.002132054}, {9.787585, 0.002502508}, {10.42778, 0.001915853}, {13.43463, 0.001321818}}, + {{1.71972, 0.003296972}, {3.890569, 0.002917327}, {3.939936, 0.00103488}, {3.97139, 0.003646344}, {6.022504, 0.002943863}, + {6.061419, 0.001830975}, {7.801588, 0.003221994}, {7.930129, 0.003912988}, {8.232269, 0.004673212}}, + {{1.71972, 0.003408918}, {2.043608, 0.003519984}, {3.890569, 0.01988785}, {6.061419, 0.0764257}, {8.232269, 0.1406254}}, + {{-0.05942321, 0.003394985}, {0.6689442, 0.003665651}, {1.924394, 0.004942474}, {3.908212, 0.006659611}, {5.892029, 0.0157031}}, + {{-4.969813, 0.1798482}, {-3.313209, 0.1798566}, {-1.656604, 0.01256459}, {-1.154333, 0.004852421}, {-0.8191196, 0.003527397}, + {-0.2381196, 0.001983409}, {0, 0.002551422}}, + {{-2.302585, 0.001822712}, {-1.609438, 0.00345773}, {-0.9162907, 0.003139631}, {-0.2855592, 0.003175851}, {-0.2231436, 0.002630656}}, + {{-3.800451, 0.002890249}, {-2.801874, 0.002432233}, {-2.446324, 0.002333384}, {-2.291018, 0.001627785}, {-2.190441, 0.001669799}, + {-1.999605, 0.002137923}, {-1.803296, 0.001832592}, {-1.628174, 0.003295475}, {-0.8946376, 0.001722856}, {-0.804719, 0.001301327}}, + {{2.0, 10.71672}, {4.827566, 9.507881}, {4.830618, 8.36871}, {7.661235, 9.822492}, {10.49185, 10.09627}}, + {{-5.991457, 9.803939}, {-2.538955, 9.975635}, {0.9135475, 9.298096}, {3.543894, 8.223675}, {4.36605, 8.962077}}, + {{-5.991457, 9.35017}, {-3.357034, 9.962562}, {-2.538955, 9.027685}, {-1.97598, 8.668243}, {0.9135475, 10.19129}, {4.36605, 11.89721}}, + {{0.6931472, 9.422628}, {1.610698, 9.089348}, {1.691725, 8.93955}, {2.158699, 10.18192}, {2.545694, 9.212234}, {2.690302, 9.148424}, + {2.943044, 10.4056}, {3.688879, 11.13337}}, + {{-1.279388, 11.9904}, {-0.8885609, 9.800607}, {-0.6476757, 8.581057}, {-0.5692195, 7.907454}, {-0.4977335, 8.514873}, + {-0.1069061, 9.885219}}, + {{-3.800451, 8.317797}, {-3.738576, 8.053429}, {-3.403612, 8.338234}, {-2.801874, 8.890816}, {-2.333564, 8.705093}, + {-2.208987, 10.69139}, {-1.803296, 9.234116}, {-1.002829, 10.67219}, {-0.9090844, 12.46085}, {-0.804719, 13.98731}}}; + // clang-format on + + for (const auto& curve : curves) { + maths::CLowess<2> lowess; + lowess.fit(curve, curve.size()); + double xmin, fmin; + std::tie(xmin, fmin) = lowess.minimum(); + double variance{lowess.residualVariance()}; + + double xa, xb; + double ftarget{fmin + std::sqrt(variance)}; + std::tie(xa, xb) = lowess.sublevelSet(xmin, fmin, ftarget); + + if (xa <= curve.front().first) { + BOOST_TEST_REQUIRE(lowess.predict(xa) <= 1.01 * ftarget); + } 
else { + BOOST_REQUIRE_CLOSE(lowess.predict(xa), ftarget, 1.0); // 1.0% + } + if (xb >= curve.back().first) { + BOOST_TEST_REQUIRE(lowess.predict(xb) <= 1.01 * ftarget); + } else { + BOOST_REQUIRE_CLOSE(lowess.predict(xb), ftarget, 1.0); // 1.0% + } + } +} + +BOOST_AUTO_TEST_SUITE_END() diff --git a/lib/maths/unittest/COneOfNPriorTest.cc b/lib/maths/unittest/COneOfNPriorTest.cc index 7cfedbfe4d..8be97e715e 100644 --- a/lib/maths/unittest/COneOfNPriorTest.cc +++ b/lib/maths/unittest/COneOfNPriorTest.cc @@ -10,6 +10,7 @@ #include #include +#include #include #include #include diff --git a/lib/maths/unittest/CSolversTest.cc b/lib/maths/unittest/CSolversTest.cc index 671376bf45..058a66897c 100644 --- a/lib/maths/unittest/CSolversTest.cc +++ b/lib/maths/unittest/CSolversTest.cc @@ -8,6 +8,7 @@ #include #include +#include #include #include diff --git a/lib/maths/unittest/Makefile b/lib/maths/unittest/Makefile index 0ff2bd9b53..4209ad3eb7 100644 --- a/lib/maths/unittest/Makefile +++ b/lib/maths/unittest/Makefile @@ -58,6 +58,7 @@ SRCS=\ CLinearAlgebraTest.cc \ CLogNormalMeanPrecConjugateTest.cc \ CLogTDistributionTest.cc \ + CLowessTest.cc \ CMathsFuncsTest.cc \ CMathsMemoryTest.cc \ CMicTest.cc \ diff --git a/lib/maths/unittest/TestUtils.cc b/lib/maths/unittest/TestUtils.cc index 43394d1d32..6b4036dfea 100644 --- a/lib/maths/unittest/TestUtils.cc +++ b/lib/maths/unittest/TestUtils.cc @@ -6,6 +6,7 @@ #include "TestUtils.h" +#include #include #include #include From b2af7143daf445ea0aa3166f6a92b24f9df37053 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Tue, 29 Jun 2021 13:07:14 +0100 Subject: [PATCH 02/35] Restrict the maximum number of rows used during hyperparameter tuning to avoid runtime blowup --- include/maths/CBoostedTreeFactory.h | 3 ++ include/maths/CBoostedTreeImpl.h | 5 ++- include/maths/CDataFrameUtils.h | 10 +++-- lib/maths/CBoostedTreeFactory.cc | 49 ++++++++++++++------ lib/maths/CBoostedTreeImpl.cc | 42 +++++++++-------- lib/maths/CDataFrameUtils.cc | 70 ++++++++++++++++------------- 6 files changed, 110 insertions(+), 69 deletions(-) diff --git a/include/maths/CBoostedTreeFactory.h b/include/maths/CBoostedTreeFactory.h index 02564e594f..74eeae688f 100644 --- a/include/maths/CBoostedTreeFactory.h +++ b/include/maths/CBoostedTreeFactory.h @@ -80,6 +80,8 @@ class MATHS_EXPORT CBoostedTreeFactory final { CBoostedTreeFactory& minimumFrequencyToOneHotEncode(double frequency); //! Set the number of folds to use for estimating the generalisation error. CBoostedTreeFactory& numberFolds(std::size_t numberFolds); + //! Set the maximum number of rows to use for training when tuning hyperparameters. + CBoostedTreeFactory& maximumNumberTrainRows(std::size_t rows); //! Stratify the cross-validation we do for regression. CBoostedTreeFactory& stratifyRegressionCrossValidation(bool stratify); //! Stop cross-validation early if the test loss is not promising. @@ -275,6 +277,7 @@ class MATHS_EXPORT CBoostedTreeFactory final { TOptionalSize m_BayesianOptimisationRestarts; bool m_StratifyRegressionCrossValidation = true; double m_InitialDownsampleRowsPerFeature = 200.0; + std::size_t m_MaximumNumberOfTrainRows = 1000000; double m_GainPerNode1stPercentile = 0.0; double m_GainPerNode50thPercentile = 0.0; double m_GainPerNode90thPercentile = 0.0; diff --git a/include/maths/CBoostedTreeImpl.h b/include/maths/CBoostedTreeImpl.h index fad8343531..ad19e896c7 100644 --- a/include/maths/CBoostedTreeImpl.h +++ b/include/maths/CBoostedTreeImpl.h @@ -320,6 +320,9 @@ class MATHS_EXPORT CBoostedTreeImpl final { //! 
Check invariants which are assumed to hold in order to train on \p frame. void checkTrainInvariants(const core::CDataFrame& frame) const; + //! Get the count of train/validation folds. + std::size_t numberFolds() const; + //! Get the number of hyperparameters to tune. std::size_t numberHyperparametersToTune() const; @@ -380,7 +383,7 @@ class MATHS_EXPORT CBoostedTreeImpl final { double m_DownsampleFactor = 0.5; double m_Eta = 0.1; double m_EtaGrowthRatePerTree = 1.05; - std::size_t m_NumberFolds = 4; + double m_FractionalFolds = 4.0; std::size_t m_MaximumNumberTrees = 20; std::size_t m_MaximumAttemptsToAddTree = 3; std::size_t m_NumberSplitsPerFeature = 75; diff --git a/include/maths/CDataFrameUtils.h b/include/maths/CDataFrameUtils.h index 581cd74f30..411a680455 100644 --- a/include/maths/CDataFrameUtils.h +++ b/include/maths/CDataFrameUtils.h @@ -280,9 +280,11 @@ class MATHS_EXPORT CDataFrameUtils : private core::CNonInstantiatable { //! \param[in] frame The data frame for which to compute the row masks. //! \param[in] targetColumn The index of the column to predict. //! \param[in] rng The random number generator to use. - //! \param[in] numberFolds The number of folds to use. - //! \param[in] numberBuckets The number of buckets to use when stratifying by - //! target quantiles for regression. + //! \param[in] numberFolds The number of folds to use. If this is less than + //! two, there will be two train masks, but their size will be less than 50% + //! of the data. + //! \param[in] numberBuckets The number of buckets to use when stratifying + //! by target quantiles for regression. //! \param[in] allTrainingRowsMask A mask of the candidate training rows. //! \warning This fails if the target is not categorical. static std::tuple @@ -290,7 +292,7 @@ class MATHS_EXPORT CDataFrameUtils : private core::CNonInstantiatable { const core::CDataFrame& frame, std::size_t targetColumn, CPRNG::CXorOShiro128Plus rng, - std::size_t numberFolds, + double numberFolds, std::size_t numberBuckets, const core::CPackedBitVector& allTrainingRowsMask); diff --git a/lib/maths/CBoostedTreeFactory.cc b/lib/maths/CBoostedTreeFactory.cc index 1d1a6305b5..8218489619 100644 --- a/lib/maths/CBoostedTreeFactory.cc +++ b/lib/maths/CBoostedTreeFactory.cc @@ -125,8 +125,6 @@ CBoostedTreeFactory::buildFor(core::CDataFrame& frame, std::size_t dependentVari this->initializeHyperparameterOptimisation(); } - LOG_INFO(<< "number threads = " << m_NumberThreads); - auto treeImpl = std::make_unique(m_NumberThreads, m_TreeImpl->m_Loss->clone()); std::swap(m_TreeImpl, treeImpl); @@ -334,18 +332,35 @@ void CBoostedTreeFactory::initializeNumberFolds(core::CDataFrame& frame) const { // to find the smallest integer k s.t. c * f * # rows <= (1 - 1 / k) * # rows. // This gives k = ceil(1 / (1 - c * f)). However, we also upper bound this // by MAX_NUMBER_FOLDS. + // + // In addition, we want to constrain the maximum amount of training data we'll + // use during hyperparameter search to avoid very long run times. To do this + // we set the number of folds to be less than two. We define the size of the + // training data set to be (k - 1) / k * # rows, with k the number of folds. + // If k < 2 this means we end up selecting less than half the data for training. + // To meet the constraint on the maximum number of rows M we must choose k + // which satisfies M >= (k - 1) / k * # rows. 
This is trivially satisfied for + // # rows less than M and, given we also constrain the maximum number of folds, + // we only care if # rows > MAX_NUMBER_FOLDS * M / (MAX_NUMBER_FOLDS - 1). double initialDownsampleFraction{(m_InitialDownsampleRowsPerFeature * static_cast(frame.numberColumns() - 1)) / static_cast(totalNumberTrainingRows)}; - - m_TreeImpl->m_NumberFolds = static_cast( + double minimumTrainingDataConstraintNumberFolds{ std::ceil(1.0 / std::max(1.0 - initialDownsampleFraction / MAX_DESIRED_INITIAL_DOWNSAMPLE_FRACTION, - 1.0 / MAX_NUMBER_FOLDS))); + 1.0 / MAX_NUMBER_FOLDS))}; + double maximumTrainingDataConstraintNumberFolds{ + 1.0 / (1.0 - static_cast(m_MaximumNumberOfTrainRows) / + std::max(static_cast(frame.numberRows()), + MAX_NUMBER_FOLDS / (MAX_NUMBER_FOLDS - 1.0) * + static_cast(m_MaximumNumberOfTrainRows)))}; + + m_TreeImpl->m_FractionalFolds = std::min(minimumTrainingDataConstraintNumberFolds, + maximumTrainingDataConstraintNumberFolds); LOG_TRACE(<< "initial downsample fraction = " << initialDownsampleFraction - << " # folds = " << m_TreeImpl->m_NumberFolds); + << " # folds = " << m_TreeImpl->m_FractionalFolds); } else { - m_TreeImpl->m_NumberFolds = *m_TreeImpl->m_NumberFoldsOverride; + m_TreeImpl->m_FractionalFolds = static_cast(*m_TreeImpl->m_NumberFoldsOverride); } } @@ -378,7 +393,7 @@ void CBoostedTreeFactory::initializeCrossValidation(core::CDataFrame& frame) con std::tie(m_TreeImpl->m_TrainingRowMasks, m_TreeImpl->m_TestingRowMasks, std::ignore) = CDataFrameUtils::stratifiedCrossValidationRowMasks( m_TreeImpl->m_NumberThreads, frame, dependentVariable, m_TreeImpl->m_Rng, - m_TreeImpl->m_NumberFolds, numberBuckets, allTrainingRowsMask); + m_TreeImpl->m_FractionalFolds, numberBuckets, allTrainingRowsMask); } void CBoostedTreeFactory::selectFeaturesAndEncodeCategories(const core::CDataFrame& frame) const { @@ -813,8 +828,7 @@ void CBoostedTreeFactory::initializeUnsetDownsampleFactor(core::CDataFrame& fram fallback(MAX_PARAMETER_INDEX) = logMaxDownsampleFactor; m_LogDownsampleFactorSearchInterval = - this->testLossLineSearch(frame, applyDownsampleFactor, - logMinDownsampleFactor, + this->testLossLineSearch(frame, applyDownsampleFactor, logMinDownsampleFactor, logMaxDownsampleFactor, adjustTestLoss) .value_or(fallback); @@ -869,9 +883,10 @@ void CBoostedTreeFactory::initializeUnsetFeatureBagFraction(core::CDataFrame& fr // larger than the minimum. 
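+    // Illustrative worked example of the adjustment below: if minTestLoss were
+    // 0.10 then a candidate at logMaxFeatureBagFraction is penalised by
+    // 0.01 * 0.10 = 0.001, one at logMinFeatureBagFraction is not penalised at
+    // all, and points in between are penalised linearly in the log feature bag
+    // fraction.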
auto adjustTestLoss = [=](double logFeatureBagFraction, double minTestLoss, double testLoss) { - return testLoss + CTools::linearlyInterpolate( - logMinFeatureBagFraction, logMaxFeatureBagFraction, - 0.0, 0.01 * minTestLoss, logFeatureBagFraction); + return testLoss + + CTools::linearlyInterpolate( + logMinFeatureBagFraction, logMaxFeatureBagFraction, + 0.0, 0.01 * minTestLoss, logFeatureBagFraction); }; TVector fallback; @@ -1086,7 +1101,8 @@ CBoostedTreeFactory::testLossLineSearch(core::CDataFrame& frame, std::tie(bestParameter, bestParameterTestLoss) = lowess.minimum(); LOG_INFO(<< "best parameter = " << bestParameter << ", test loss = " << bestParameterTestLoss); - double width{(intervalRightEnd - intervalLeftEnd) / static_cast(MAX_LINE_SEARCH_ITERATIONS)}; + double width{(intervalRightEnd - intervalLeftEnd) / + static_cast(MAX_LINE_SEARCH_ITERATIONS)}; intervalLeftEnd = bestParameter - width; intervalRightEnd = bestParameter + width; LOG_INFO(<< "interval = [" << intervalLeftEnd << "," << intervalRightEnd << "]"); @@ -1164,6 +1180,11 @@ CBoostedTreeFactory& CBoostedTreeFactory::numberFolds(std::size_t numberFolds) { return *this; } +CBoostedTreeFactory& CBoostedTreeFactory::maximumNumberTrainRows(std::size_t rows) { + m_MaximumNumberOfTrainRows = rows; + return *this; +} + CBoostedTreeFactory& CBoostedTreeFactory::stratifyRegressionCrossValidation(bool stratify) { m_StratifyRegressionCrossValidation = stratify; return *this; diff --git a/lib/maths/CBoostedTreeImpl.cc b/lib/maths/CBoostedTreeImpl.cc index fe133c6a11..07d9b27ec5 100644 --- a/lib/maths/CBoostedTreeImpl.cc +++ b/lib/maths/CBoostedTreeImpl.cc @@ -334,7 +334,7 @@ std::size_t CBoostedTreeImpl::estimateMemoryUsage(std::size_t numberRows, m_MaximumNumberTrees * (sizeof(TNodeVec) + maximumNumberNodes * CBoostedTreeNode::estimateMemoryUsage( m_Loss->numberParameters()))}; - std::size_t foldRoundLossMemoryUsage{m_NumberFolds * m_NumberRounds * + std::size_t foldRoundLossMemoryUsage{this->numberFolds() * m_NumberRounds * sizeof(TOptionalDouble)}; std::size_t hyperparametersMemoryUsage{numberColumns * sizeof(double)}; std::size_t tunableHyperparametersMemoryUsage{ @@ -367,8 +367,7 @@ std::size_t CBoostedTreeImpl::estimateMemoryUsage(std::size_t numberRows, // we get a constant 8 / 64. std::size_t missingFeatureMaskMemoryUsage{8 * numberColumns * numberRows / 64}; std::size_t trainTestMaskMemoryUsage{ - 2 * static_cast(std::ceil(std::log2(static_cast(m_NumberFolds)))) * - numberRows}; + 2 * static_cast(std::ceil(std::log2(m_FractionalFolds))) * numberRows}; std::size_t bayesianOptimisationMemoryUsage{CBayesianOptimisation::estimateMemoryUsage( this->numberHyperparametersToTune(), m_NumberRounds)}; std::size_t worstCaseMemoryUsage{ @@ -436,7 +435,7 @@ CBoostedTreeImpl::gainAndCurvatureAtPercentile(double percentile, } void CBoostedTreeImpl::initializePerFoldTestLosses() { - m_FoldRoundTestLosses.resize(m_NumberFolds); + m_FoldRoundTestLosses.resize(this->numberFolds()); for (auto& losses : m_FoldRoundTestLosses) { losses.resize(m_NumberRounds); } @@ -523,7 +522,7 @@ CBoostedTreeImpl::TMeanVarAccumulatorSizeDoubleTuple CBoostedTreeImpl::crossValidateForest(core::CDataFrame& frame) { // We want to ensure we evaluate on equal proportions for each fold. 
- TSizeVec folds(m_NumberFolds); + TSizeVec folds(this->numberFolds()); std::iota(folds.begin(), folds.end(), 0); CSampling::random_shuffle(m_Rng, folds.begin(), folds.end()); @@ -533,8 +532,8 @@ CBoostedTreeImpl::crossValidateForest(core::CDataFrame& frame) { // that the test error is not close to the minimum test error. We use // the estimated test error for each remaining fold at two standard // deviations below the mean for this. - if (m_StopCrossValidationEarly && m_CurrentRound >= m_NumberFolds && - folds.size() < m_NumberFolds) { + if (m_StopCrossValidationEarly && m_CurrentRound >= this->numberFolds() && + folds.size() < this->numberFolds()) { for (const auto& testLoss : this->estimateMissingTestLosses(folds)) { testLossMoments.add( CBasicStatistics::mean(testLoss) - @@ -547,7 +546,7 @@ CBoostedTreeImpl::crossValidateForest(core::CDataFrame& frame) { TMeanVarAccumulator lossMoments; TDoubleVec numberTrees; - numberTrees.reserve(m_NumberFolds); + numberTrees.reserve(this->numberFolds()); TMeanAccumulator meanForestSizeAccumulator; while (folds.size() > 0 && stopCrossValidationEarly(lossMoments) == false) { @@ -973,12 +972,13 @@ double CBoostedTreeImpl::minimumTestLoss() const { TMinAccumulator minimumTestLoss; for (std::size_t round = 0; round < m_CurrentRound - 1; ++round) { TMeanVarAccumulator roundLossMoments; - for (std::size_t fold = 0; fold < m_NumberFolds; ++fold) { + for (std::size_t fold = 0; fold < this->numberFolds(); ++fold) { if (m_FoldRoundTestLosses[fold][round] != boost::none) { roundLossMoments.add(*m_FoldRoundTestLosses[fold][round]); } } - if (static_cast(CBasicStatistics::count(roundLossMoments)) == m_NumberFolds) { + if (static_cast(CBasicStatistics::count(roundLossMoments)) == + this->numberFolds()) { minimumTestLoss.add(CBasicStatistics::mean(roundLossMoments)); } } @@ -1027,7 +1027,7 @@ CBoostedTreeImpl::estimateMissingTestLosses(const TSizeVec& missing) const { // where the indices range over the folds for which we have errors in the // current round. - TSizeVec present(m_NumberFolds); + TSizeVec present(this->numberFolds()); std::iota(present.begin(), present.end(), 0); TSizeVec ordered{missing}; std::sort(ordered.begin(), ordered.end()); @@ -1478,6 +1478,10 @@ void CBoostedTreeImpl::scaleRegularizers(double scale) { } } +std::size_t CBoostedTreeImpl::numberFolds() const { + return static_cast(std::ceil(m_FractionalFolds)); +} + std::size_t CBoostedTreeImpl::numberHyperparametersToTune() const { return m_RegularizationOverride.countNotSet() + (m_DownsampleFactorOverride != boost::none ? 
0 : 1) + @@ -1500,7 +1504,7 @@ void CBoostedTreeImpl::recordHyperparameters() { m_Instrumentation->hyperparameters().s_Eta = m_Eta; m_Instrumentation->hyperparameters().s_ClassAssignmentObjective = m_ClassAssignmentObjective; m_Instrumentation->hyperparameters().s_DownsampleFactor = m_DownsampleFactor; - m_Instrumentation->hyperparameters().s_NumFolds = m_NumberFolds; + m_Instrumentation->hyperparameters().s_NumFolds = m_FractionalFolds; m_Instrumentation->hyperparameters().s_MaxTrees = m_MaximumNumberTrees; m_Instrumentation->hyperparameters().s_FeatureBagFraction = m_FeatureBagFraction; m_Instrumentation->hyperparameters().s_EtaGrowthRatePerTree = m_EtaGrowthRatePerTree; @@ -1583,13 +1587,13 @@ void CBoostedTreeImpl::startProgressMonitoringFineTuneHyperparameters() { m_Instrumentation->startNewProgressMonitoredTask(CBoostedTreeFactory::FINE_TUNING_PARAMETERS); - std::size_t totalNumberSteps{m_NumberRounds * m_MaximumNumberTrees * m_NumberFolds}; + std::size_t totalNumberSteps{m_NumberRounds * m_MaximumNumberTrees * this->numberFolds()}; LOG_TRACE(<< "main loop total number steps = " << totalNumberSteps); m_TrainingProgress = core::CLoopProgress{ totalNumberSteps, m_Instrumentation->progressCallback(), 1.0, 1024}; // Make sure progress starts where it left off. - m_TrainingProgress.increment(m_CurrentRound * m_MaximumNumberTrees * m_NumberFolds); + m_TrainingProgress.increment(m_CurrentRound * m_MaximumNumberTrees * this->numberFolds()); } void CBoostedTreeImpl::startProgressMonitoringFinalTrain() { @@ -1629,6 +1633,7 @@ const std::string FEATURE_BAG_FRACTION_TAG{"feature_bag_fraction"}; const std::string FEATURE_DATA_TYPES_TAG{"feature_data_types"}; const std::string FEATURE_SAMPLE_PROBABILITIES_TAG{"feature_sample_probabilities"}; const std::string FOLD_ROUND_TEST_LOSSES_TAG{"fold_round_test_losses"}; +const std::string FRACTIONAL_FOLDS_TAG{"number_folds"}; const std::string INITIALIZATION_STAGE_TAG{"initialization_progress"}; const std::string LOSS_TAG{"loss"}; const std::string LOSS_NAME_TAG{"loss_name"}; @@ -1640,7 +1645,6 @@ const std::string MAXIMUM_OPTIMISATION_ROUNDS_PER_HYPERPARAMETER_TAG{ const std::string MEAN_FOREST_SIZE_ACCUMULATOR_TAG{"mean_forest_size"}; const std::string MEAN_LOSS_ACCUMULATOR_TAG{"mean_loss"}; const std::string MISSING_FEATURE_ROW_MASKS_TAG{"missing_feature_row_masks"}; -const std::string NUMBER_FOLDS_TAG{"number_folds"}; const std::string NUMBER_FOLDS_OVERRIDE_TAG{"number_folds_override"}; const std::string NUMBER_ROUNDS_TAG{"number_rounds"}; const std::string NUMBER_SPLITS_PER_FEATURE_TAG{"number_splits_per_feature"}; @@ -1704,6 +1708,7 @@ void CBoostedTreeImpl::acceptPersistInserter(core::CStatePersistInserter& insert core::CPersistUtils::persist(FEATURE_SAMPLE_PROBABILITIES_TAG, m_FeatureSampleProbabilities, inserter); core::CPersistUtils::persist(FOLD_ROUND_TEST_LOSSES_TAG, m_FoldRoundTestLosses, inserter); + core::CPersistUtils::persist(FRACTIONAL_FOLDS_TAG, m_FractionalFolds, inserter); core::CPersistUtils::persist(INITIALIZATION_STAGE_TAG, static_cast(m_InitializationStage), inserter); if (m_Loss != nullptr) { @@ -1723,7 +1728,6 @@ void CBoostedTreeImpl::acceptPersistInserter(core::CStatePersistInserter& insert core::CPersistUtils::persist(MEAN_LOSS_ACCUMULATOR_TAG, m_MeanLossAccumulator, inserter); core::CPersistUtils::persist(MISSING_FEATURE_ROW_MASKS_TAG, m_MissingFeatureRowMasks, inserter); - core::CPersistUtils::persist(NUMBER_FOLDS_TAG, m_NumberFolds, inserter); core::CPersistUtils::persist(NUMBER_FOLDS_OVERRIDE_TAG, m_NumberFoldsOverride, 
inserter); core::CPersistUtils::persist(NUMBER_ROUNDS_TAG, m_NumberRounds, inserter); core::CPersistUtils::persist(NUMBER_SPLITS_PER_FEATURE_TAG, @@ -1820,6 +1824,8 @@ bool CBoostedTreeImpl::acceptRestoreTraverser(core::CStateRestoreTraverser& trav RESTORE(FOLD_ROUND_TEST_LOSSES_TAG, core::CPersistUtils::restore(FOLD_ROUND_TEST_LOSSES_TAG, m_FoldRoundTestLosses, traverser)) + RESTORE(FRACTIONAL_FOLDS_TAG, + core::CPersistUtils::restore(FRACTIONAL_FOLDS_TAG, m_FractionalFolds, traverser)) RESTORE(INITIALIZATION_STAGE_TAG, core::CPersistUtils::restore(INITIALIZATION_STAGE_TAG, initializationStage, traverser)) @@ -1846,8 +1852,6 @@ bool CBoostedTreeImpl::acceptRestoreTraverser(core::CStateRestoreTraverser& trav RESTORE(MISSING_FEATURE_ROW_MASKS_TAG, core::CPersistUtils::restore(MISSING_FEATURE_ROW_MASKS_TAG, m_MissingFeatureRowMasks, traverser)) - RESTORE(NUMBER_FOLDS_TAG, - core::CPersistUtils::restore(NUMBER_FOLDS_TAG, m_NumberFolds, traverser)) RESTORE(NUMBER_FOLDS_OVERRIDE_TAG, core::CPersistUtils::restore(NUMBER_FOLDS_OVERRIDE_TAG, m_NumberFoldsOverride, traverser)) @@ -1909,7 +1913,7 @@ void CBoostedTreeImpl::checkRestoredInvariants() const { VIOLATES_INVARIANT(m_TunableHyperparameters.size(), ==, samples.size()); } if (m_FoldRoundTestLosses.size() > 0) { - VIOLATES_INVARIANT(m_FoldRoundTestLosses.size(), ==, m_NumberFolds); + VIOLATES_INVARIANT(m_FoldRoundTestLosses.size(), ==, this->numberFolds()); for (const auto& losses : m_FoldRoundTestLosses) { VIOLATES_INVARIANT(losses.size(), ==, m_NumberRounds); } diff --git a/lib/maths/CDataFrameUtils.cc b/lib/maths/CDataFrameUtils.cc index 26759110fd..fed947f074 100644 --- a/lib/maths/CDataFrameUtils.cc +++ b/lib/maths/CDataFrameUtils.cc @@ -494,66 +494,74 @@ CDataFrameUtils::stratifiedCrossValidationRowMasks(std::size_t numberThreads, const core::CDataFrame& frame, std::size_t targetColumn, CPRNG::CXorOShiro128Plus rng, - std::size_t numberFolds, + double numberFolds, std::size_t numberBuckets, const core::CPackedBitVector& allTrainingRowsMask) { TDoubleVec frequencies; TStratifiedSamplerPtr sampler; - double numberTrainingRows{allTrainingRowsMask.manhattan()}; - if (numberTrainingRows < 2.0) { + double numberRows{allTrainingRowsMask.manhattan()}; + if (numberRows < std::max(numberFolds, 2.0)) { HANDLE_FATAL(<< "Input error: unsufficient training data provided."); return {{}, {}, {}}; } - std::size_t desiredCount{ - (static_cast(numberTrainingRows) + numberFolds / 2) / numberFolds}; + // We sample the smaller of the test/train sets in the loop. 
+ std::size_t numberTrainingRows{static_cast( + 1.0 - (numberFolds - 1.0) / numberFolds * numberRows + 0.5)}; + std::size_t numberTestingRows{static_cast(numberRows) - numberTrainingRows}; + std::size_t sampleSize{std::min(numberTrainingRows, numberTestingRows)}; if (frame.columnIsCategorical()[targetColumn]) { std::tie(sampler, frequencies) = classifierStratifiedCrossValidationRowSampler( - numberThreads, frame, targetColumn, rng, desiredCount, allTrainingRowsMask); + numberThreads, frame, targetColumn, rng, sampleSize, allTrainingRowsMask); } else { sampler = regressionStratifiedCrossValiationRowSampler( - numberThreads, frame, targetColumn, rng, desiredCount, - numberBuckets, allTrainingRowsMask); + numberThreads, frame, targetColumn, rng, sampleSize, numberBuckets, + allTrainingRowsMask); } LOG_TRACE(<< "number training rows = " << allTrainingRowsMask.manhattan()); - TPackedBitVectorVec testingRowMasks(numberFolds); + TPackedBitVectorVec testingRowMasks(static_cast(std::ceil(numberFolds))); TSizeVec rowIndices; core::CPackedBitVector candidateTestingRowsMask{allTrainingRowsMask}; - for (std::size_t fold = 0; fold < numberFolds - 1; ++fold) { - frame.readRows(1, 0, frame.numberRows(), - [&](const TRowItr& beginRows, const TRowItr& endRows) { - for (auto row = beginRows; row != endRows; ++row) { - sampler->sample(*row); - } - }, - &candidateTestingRowsMask); - sampler->finishSampling(rng, rowIndices); - std::sort(rowIndices.begin(), rowIndices.end()); - LOG_TRACE(<< "# row indices = " << rowIndices.size()); - - for (auto row : rowIndices) { - testingRowMasks[fold].extend(false, row - testingRowMasks[fold].size()); - testingRowMasks[fold].extend(true); + for (std::size_t fold = 0; fold < testingRowMasks.size(); ++fold) { + if (candidateTestingRowsMask.manhattan() < + static_cast(sampleSize - numberFolds)) { + frame.readRows(1, 0, frame.numberRows(), + [&](const TRowItr& beginRows, const TRowItr& endRows) { + for (auto row = beginRows; row != endRows; ++row) { + sampler->sample(*row); + } + }, + &candidateTestingRowsMask); + sampler->finishSampling(rng, rowIndices); + std::sort(rowIndices.begin(), rowIndices.end()); + LOG_TRACE(<< "# row indices = " << rowIndices.size()); + + for (auto row : rowIndices) { + testingRowMasks[fold].extend(false, row - testingRowMasks[fold].size()); + testingRowMasks[fold].extend(true); + } + testingRowMasks[fold].extend(false, allTrainingRowsMask.size() - + testingRowMasks[fold].size()); + } else { + testingRowMasks[fold] = candidateTestingRowsMask; } - testingRowMasks[fold].extend(false, allTrainingRowsMask.size() - - testingRowMasks[fold].size()); // We exclusive or here to remove the rows we've selected for the current - //test fold. This is equivalent to samplng without replacement + // test/train fold. This is equivalent to sampling without replacement. candidateTestingRowsMask ^= testingRowMasks[fold]; } - // Everything which is left. 
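A simplified sketch of the fold construction idea, using plain index sets and std::sample as stand-ins for core::CPackedBitVector and the stratified sampler (both stand-ins are assumptions made for illustration; the shipped code also stratifies by the target and extends bit masks rather than erasing from a set). Each fold samples the smaller of its train/test partition from the rows still available and then removes those rows, so later folds effectively sample without replacement, which is the role played by the exclusive or above.

// Sketch only, not the shipped implementation.
#include <algorithm>
#include <iterator>
#include <random>
#include <set>
#include <vector>

std::vector<std::set<std::size_t>>
makeTestFoldIndexSets(std::set<std::size_t> candidates, // all candidate training rows
                      std::size_t numberFolds,
                      std::size_t sampleSize, // min(#train, #test) rows per fold
                      std::mt19937& rng) {
    std::vector<std::set<std::size_t>> folds(numberFolds);
    for (auto& fold : folds) {
        if (candidates.size() < sampleSize + numberFolds) {
            fold = candidates; // too few rows remain: take everything which is left
        } else {
            std::sample(candidates.begin(), candidates.end(),
                        std::inserter(fold, fold.end()), sampleSize, rng);
        }
        // Dropping the sampled rows plays the role of the exclusive or with
        // the fold mask: subsequent folds draw only from what remains.
        for (auto row : fold) {
            candidates.erase(row);
        }
    }
    return folds;
}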
- testingRowMasks.back() = std::move(candidateTestingRowsMask); - LOG_TRACE(<< "# remaining rows = " << testingRowMasks.back().manhattan()); - TPackedBitVectorVec trainingRowMasks{complementRowMasks(testingRowMasks, allTrainingRowsMask)}; + if (numberTrainingRows < numberTestingRows) { + std::swap(trainingRowMasks, testingRowMasks); + } + return {std::move(trainingRowMasks), std::move(testingRowMasks), std::move(frequencies)}; } From 26070c4de30cad30e2fab43e3c5a3c94bb0608f6 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Thu, 1 Jul 2021 11:38:38 +0100 Subject: [PATCH 03/35] Allow one to disable fine tuning entirely for fast mode --- lib/api/CDataFrameTrainBoostedTreeRunner.cc | 10 +++++-- lib/maths/CBoostedTreeFactory.cc | 30 ++++++++++----------- 2 files changed, 23 insertions(+), 17 deletions(-) diff --git a/lib/api/CDataFrameTrainBoostedTreeRunner.cc b/lib/api/CDataFrameTrainBoostedTreeRunner.cc index 0e29efe898..4acbbecbe6 100644 --- a/lib/api/CDataFrameTrainBoostedTreeRunner.cc +++ b/lib/api/CDataFrameTrainBoostedTreeRunner.cc @@ -14,6 +14,7 @@ #include #include +#include #include #include #include @@ -29,6 +30,10 @@ namespace ml { namespace api { +namespace { +const std::size_t UNSET_NUMBER_ROUNDS_PER_HYPERPARAMETER{ + std::numeric_limits::max()}; +} const CDataFrameAnalysisConfigReader& CDataFrameTrainBoostedTreeRunner::parameterReader() { static const CDataFrameAnalysisConfigReader PARAMETER_READER{[] { @@ -96,7 +101,8 @@ CDataFrameTrainBoostedTreeRunner::CDataFrameTrainBoostedTreeRunner( std::size_t maxTrees{parameters[MAX_TREES].fallback(std::size_t{0})}; std::size_t numberFolds{parameters[NUM_FOLDS].fallback(std::size_t{0})}; std::size_t numberRoundsPerHyperparameter{ - parameters[MAX_OPTIMIZATION_ROUNDS_PER_HYPERPARAMETER].fallback(std::size_t{0})}; + parameters[MAX_OPTIMIZATION_ROUNDS_PER_HYPERPARAMETER].fallback( + UNSET_NUMBER_ROUNDS_PER_HYPERPARAMETER)}; std::size_t bayesianOptimisationRestarts{ parameters[BAYESIAN_OPTIMISATION_RESTARTS].fallback(std::size_t{0})}; bool stopCrossValidationEarly{parameters[STOP_CROSS_VALIDATION_EARLY].fallback(true)}; @@ -192,7 +198,7 @@ CDataFrameTrainBoostedTreeRunner::CDataFrameTrainBoostedTreeRunner( if (numberFolds > 1) { m_BoostedTreeFactory->numberFolds(numberFolds); } - if (numberRoundsPerHyperparameter > 0) { + if (numberRoundsPerHyperparameter != UNSET_NUMBER_ROUNDS_PER_HYPERPARAMETER) { m_BoostedTreeFactory->maximumOptimisationRoundsPerHyperparameter(numberRoundsPerHyperparameter); } if (bayesianOptimisationRestarts > 0) { diff --git a/lib/maths/CBoostedTreeFactory.cc b/lib/maths/CBoostedTreeFactory.cc index 8218489619..ec093d66fe 100644 --- a/lib/maths/CBoostedTreeFactory.cc +++ b/lib/maths/CBoostedTreeFactory.cc @@ -359,8 +359,6 @@ void CBoostedTreeFactory::initializeNumberFolds(core::CDataFrame& frame) const { maximumTrainingDataConstraintNumberFolds); LOG_TRACE(<< "initial downsample fraction = " << initialDownsampleFraction << " # folds = " << m_TreeImpl->m_FractionalFolds); - } else { - m_TreeImpl->m_FractionalFolds = static_cast(*m_TreeImpl->m_NumberFoldsOverride); } } @@ -467,30 +465,20 @@ void CBoostedTreeFactory::initializeHyperparameters(core::CDataFrame& frame) { } void CBoostedTreeFactory::initializeHyperparametersSetup(core::CDataFrame& frame) { - if (m_TreeImpl->m_EtaOverride != boost::none) { - m_TreeImpl->m_Eta = *(m_TreeImpl->m_EtaOverride); - } else { + if (m_TreeImpl->m_EtaOverride == boost::none) { m_TreeImpl->m_Eta = computeEta(frame.numberColumns() - this->numberExtraColumnsForTrain()); 
m_TreeImpl->m_EtaGrowthRatePerTree = 1.0 + m_TreeImpl->m_Eta / 2.0; } - if (m_TreeImpl->m_EtaGrowthRatePerTreeOverride != boost::none) { - m_TreeImpl->m_EtaGrowthRatePerTree = *(m_TreeImpl->m_EtaGrowthRatePerTreeOverride); - } - - if (m_TreeImpl->m_MaximumNumberTreesOverride != boost::none) { - m_TreeImpl->m_MaximumNumberTrees = *(m_TreeImpl->m_MaximumNumberTreesOverride); - } else { + if (m_TreeImpl->m_MaximumNumberTreesOverride == boost::none) { // This needs to be tied to the learn rate to avoid bias. m_TreeImpl->m_MaximumNumberTrees = computeMaximumNumberTrees(m_TreeImpl->m_Eta); } double numberFeatures{static_cast(m_TreeImpl->m_Encoder->numberEncodedColumns())}; - if (m_TreeImpl->m_FeatureBagFractionOverride != boost::none) { - m_TreeImpl->m_FeatureBagFraction = *(m_TreeImpl->m_FeatureBagFractionOverride); - } else { + if (m_TreeImpl->m_FeatureBagFractionOverride == boost::none) { m_TreeImpl->m_FeatureBagFraction = std::min(m_TreeImpl->m_FeatureBagFraction, m_TreeImpl->m_TrainingRowMasks[0].manhattan() / @@ -1159,6 +1147,7 @@ CBoostedTreeFactory::classAssignmentObjective(CBoostedTree::EClassAssignmentObje CBoostedTreeFactory& CBoostedTreeFactory::classificationWeights(TStrDoublePrVec weights) { m_TreeImpl->m_ClassificationWeightsOverride = std::move(weights); + m_TreeImpl->m_ClassificationWeights = *m_TreeImpl->m_ClassificationWeightsOverride; return *this; } @@ -1177,6 +1166,7 @@ CBoostedTreeFactory& CBoostedTreeFactory::numberFolds(std::size_t numberFolds) { numberFolds = 2; } m_TreeImpl->m_NumberFoldsOverride = numberFolds; + m_TreeImpl->m_FractionalFolds = static_cast(numberFolds); return *this; } @@ -1209,6 +1199,7 @@ CBoostedTreeFactory& CBoostedTreeFactory::downsampleFactor(double factor) { factor = 1.0; } m_TreeImpl->m_DownsampleFactorOverride = factor; + m_TreeImpl->m_DownsampleFactor = factor; return *this; } @@ -1218,6 +1209,7 @@ CBoostedTreeFactory& CBoostedTreeFactory::depthPenaltyMultiplier(double depthPen depthPenaltyMultiplier = 0.0; } m_TreeImpl->m_RegularizationOverride.depthPenaltyMultiplier(depthPenaltyMultiplier); + m_TreeImpl->m_Regularization.depthPenaltyMultiplier(depthPenaltyMultiplier); return *this; } @@ -1227,6 +1219,7 @@ CBoostedTreeFactory& CBoostedTreeFactory::treeSizePenaltyMultiplier(double treeS treeSizePenaltyMultiplier = 0.0; } m_TreeImpl->m_RegularizationOverride.treeSizePenaltyMultiplier(treeSizePenaltyMultiplier); + m_TreeImpl->m_Regularization.treeSizePenaltyMultiplier(treeSizePenaltyMultiplier); return *this; } @@ -1236,6 +1229,7 @@ CBoostedTreeFactory& CBoostedTreeFactory::leafWeightPenaltyMultiplier(double lea leafWeightPenaltyMultiplier = 0.0; } m_TreeImpl->m_RegularizationOverride.leafWeightPenaltyMultiplier(leafWeightPenaltyMultiplier); + m_TreeImpl->m_Regularization.leafWeightPenaltyMultiplier(leafWeightPenaltyMultiplier); return *this; } @@ -1245,6 +1239,7 @@ CBoostedTreeFactory& CBoostedTreeFactory::softTreeDepthLimit(double softTreeDept softTreeDepthLimit = MIN_SOFT_DEPTH_LIMIT; } m_TreeImpl->m_RegularizationOverride.softTreeDepthLimit(softTreeDepthLimit); + m_TreeImpl->m_Regularization.softTreeDepthLimit(softTreeDepthLimit); return *this; } @@ -1254,6 +1249,7 @@ CBoostedTreeFactory& CBoostedTreeFactory::softTreeDepthTolerance(double softTree softTreeDepthTolerance = 0.01; } m_TreeImpl->m_RegularizationOverride.softTreeDepthTolerance(softTreeDepthTolerance); + m_TreeImpl->m_Regularization.softTreeDepthTolerance(softTreeDepthTolerance); return *this; } @@ -1268,6 +1264,7 @@ CBoostedTreeFactory& CBoostedTreeFactory::eta(double eta) { eta 
= 1.0; } m_TreeImpl->m_EtaOverride = eta; + m_TreeImpl->m_Eta = eta; return *this; } @@ -1278,6 +1275,7 @@ CBoostedTreeFactory& CBoostedTreeFactory::etaGrowthRatePerTree(double etaGrowthR etaGrowthRatePerTree = std::max(etaGrowthRatePerTree, MIN_ETA); } m_TreeImpl->m_EtaGrowthRatePerTreeOverride = etaGrowthRatePerTree; + m_TreeImpl->m_EtaGrowthRatePerTree = etaGrowthRatePerTree; return *this; } @@ -1292,6 +1290,7 @@ CBoostedTreeFactory& CBoostedTreeFactory::maximumNumberTrees(std::size_t maximum maximumNumberTrees = std::min(maximumNumberTrees, MAX_NUMBER_TREES); } m_TreeImpl->m_MaximumNumberTreesOverride = maximumNumberTrees; + m_TreeImpl->m_MaximumNumberTrees = maximumNumberTrees; return *this; } @@ -1302,6 +1301,7 @@ CBoostedTreeFactory& CBoostedTreeFactory::featureBagFraction(double featureBagFr featureBagFraction = CTools::truncate(featureBagFraction, 0.0, 1.0); } m_TreeImpl->m_FeatureBagFractionOverride = featureBagFraction; + m_TreeImpl->m_FeatureBagFraction = featureBagFraction; return *this; } From 81d3ffd3a7626380c1a6c6b351c21d308d789b6a Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Thu, 1 Jul 2021 17:51:56 +0100 Subject: [PATCH 04/35] Uncouple training fraction parameter from the number of folds --- .../api/CDataFrameTrainBoostedTreeRunner.h | 1 + include/maths/CBoostedTreeFactory.h | 4 +- include/maths/CBoostedTreeImpl.h | 7 +- ...ataFrameAnalysisInstrumentationInterface.h | 29 ++--- include/maths/CDataFrameUtils.h | 8 +- lib/api/CDataFrameAnalysisInstrumentation.cc | 4 + lib/api/CDataFrameTrainBoostedTreeRunner.cc | 12 +- lib/maths/CBoostedTreeFactory.cc | 104 +++++++++--------- lib/maths/CBoostedTreeImpl.cc | 57 ++++++---- lib/maths/CDataFrameUtils.cc | 28 ++--- lib/maths/unittest/CBoostedTreeTest.cc | 64 +++++++++++ lib/maths/unittest/CDataFrameUtilsTest.cc | 8 +- 12 files changed, 207 insertions(+), 119 deletions(-) diff --git a/include/api/CDataFrameTrainBoostedTreeRunner.h b/include/api/CDataFrameTrainBoostedTreeRunner.h index 7fe4ff7100..5dc3c87f21 100644 --- a/include/api/CDataFrameTrainBoostedTreeRunner.h +++ b/include/api/CDataFrameTrainBoostedTreeRunner.h @@ -48,6 +48,7 @@ class API_EXPORT CDataFrameTrainBoostedTreeRunner : public CDataFrameAnalysisRun static const std::string MAX_TREES; static const std::string FEATURE_BAG_FRACTION; static const std::string NUM_FOLDS; + static const std::string TRAIN_FRACTION_PER_FOLD; static const std::string STOP_CROSS_VALIDATION_EARLY; static const std::string MAX_OPTIMIZATION_ROUNDS_PER_HYPERPARAMETER; static const std::string BAYESIAN_OPTIMISATION_RESTARTS; diff --git a/include/maths/CBoostedTreeFactory.h b/include/maths/CBoostedTreeFactory.h index 74eeae688f..ca06cdf0f2 100644 --- a/include/maths/CBoostedTreeFactory.h +++ b/include/maths/CBoostedTreeFactory.h @@ -80,6 +80,8 @@ class MATHS_EXPORT CBoostedTreeFactory final { CBoostedTreeFactory& minimumFrequencyToOneHotEncode(double frequency); //! Set the number of folds to use for estimating the generalisation error. CBoostedTreeFactory& numberFolds(std::size_t numberFolds); + //! Set the fraction fold data to use for training. + CBoostedTreeFactory& trainFractionPerFold(double fraction); //! Set the maximum number of rows to use for training when tuning hyperparameters. CBoostedTreeFactory& maximumNumberTrainRows(std::size_t rows); //! Stratify the cross-validation we do for regression. 
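A minimal usage sketch of the new knob, assuming an already populated data frame frame, a target column index targetColumn and a helper makeLoss() returning the loss object (all three are illustrative placeholders, not names introduced by this patch; the call chain itself follows the declarations above and the unit test added later in this series):

// Sketch only: frame, targetColumn and makeLoss() are placeholders.
auto regression = maths::CBoostedTreeFactory::constructFromParameters(1, makeLoss())
                      .numberFolds(4)             // optional: override the heuristic choice
                      .trainFractionPerFold(0.05) // optional: train on ~5% of the rows per fold
                      .buildFor(*frame, targetColumn);
regression->train();
regression->predict();

The same setting is exposed end to end as the train_fraction_per_fold analysis parameter, which CDataFrameTrainBoostedTreeRunner forwards to the factory whenever it is greater than zero.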
@@ -277,7 +279,7 @@ class MATHS_EXPORT CBoostedTreeFactory final { TOptionalSize m_BayesianOptimisationRestarts; bool m_StratifyRegressionCrossValidation = true; double m_InitialDownsampleRowsPerFeature = 200.0; - std::size_t m_MaximumNumberOfTrainRows = 1000000; + std::size_t m_MaximumNumberOfTrainRows = 750000; double m_GainPerNode1stPercentile = 0.0; double m_GainPerNode50thPercentile = 0.0; double m_GainPerNode90thPercentile = 0.0; diff --git a/include/maths/CBoostedTreeImpl.h b/include/maths/CBoostedTreeImpl.h index ad19e896c7..a2d948aa33 100644 --- a/include/maths/CBoostedTreeImpl.h +++ b/include/maths/CBoostedTreeImpl.h @@ -320,9 +320,6 @@ class MATHS_EXPORT CBoostedTreeImpl final { //! Check invariants which are assumed to hold in order to train on \p frame. void checkTrainInvariants(const core::CDataFrame& frame) const; - //! Get the count of train/validation folds. - std::size_t numberFolds() const; - //! Get the number of hyperparameters to tune. std::size_t numberHyperparametersToTune() const; @@ -375,6 +372,7 @@ class MATHS_EXPORT CBoostedTreeImpl final { TOptionalDouble m_EtaOverride; TOptionalDouble m_EtaGrowthRatePerTreeOverride; TOptionalSize m_NumberFoldsOverride; + TOptionalSize m_TrainFractionPerFoldOverride; TOptionalSize m_MaximumNumberTreesOverride; TOptionalDouble m_FeatureBagFractionOverride; TOptionalStrDoublePrVec m_ClassificationWeightsOverride; @@ -383,7 +381,8 @@ class MATHS_EXPORT CBoostedTreeImpl final { double m_DownsampleFactor = 0.5; double m_Eta = 0.1; double m_EtaGrowthRatePerTree = 1.05; - double m_FractionalFolds = 4.0; + std::size_t m_NumberFolds = 4; + double m_TrainFractionPerFold = 0.75; std::size_t m_MaximumNumberTrees = 20; std::size_t m_MaximumAttemptsToAddTree = 3; std::size_t m_NumberSplitsPerFeature = 75; diff --git a/include/maths/CDataFrameAnalysisInstrumentationInterface.h b/include/maths/CDataFrameAnalysisInstrumentationInterface.h index 23a876dd84..f6b35916b0 100644 --- a/include/maths/CDataFrameAnalysisInstrumentationInterface.h +++ b/include/maths/CDataFrameAnalysisInstrumentationInterface.h @@ -103,25 +103,26 @@ class MATHS_EXPORT CDataFrameTrainBoostedTreeInstrumentationInterface s_SoftTreeDepthLimit{softTreeDepthLimit}, s_SoftTreeDepthTolerance{softTreeDepthTolerance}, s_TreeSizePenaltyMultiplier{treeSizePenaltyMultiplier}, s_LeafWeightPenaltyMultiplier{leafWeightPenaltyMultiplier} {} - double s_DepthPenaltyMultiplier = -1.0; - double s_SoftTreeDepthLimit = -1.0; - double s_SoftTreeDepthTolerance = -1.0; - double s_TreeSizePenaltyMultiplier = -1.0; - double s_LeafWeightPenaltyMultiplier = -1.0; + double s_DepthPenaltyMultiplier{-1.0}; + double s_SoftTreeDepthLimit{-1.0}; + double s_SoftTreeDepthTolerance{-1.0}; + double s_TreeSizePenaltyMultiplier{-1.0}; + double s_LeafWeightPenaltyMultiplier{-1.0}; }; struct SHyperparameters { - double s_Eta = -1.0; + double s_Eta{-1.0}; CBoostedTree::EClassAssignmentObjective s_ClassAssignmentObjective = CBoostedTree::E_MinimumRecall; SRegularization s_Regularization; - double s_DownsampleFactor = -1.0; - std::size_t s_NumFolds = 0; - std::size_t s_MaxTrees = 0; - double s_FeatureBagFraction = -1.0; - double s_EtaGrowthRatePerTree = -1.0; - std::size_t s_MaxAttemptsToAddTree = 0; - std::size_t s_NumSplitsPerFeature = 0; - std::size_t s_MaxOptimizationRoundsPerHyperparameter = 0; + double s_DownsampleFactor{-1.0}; + std::size_t s_NumFolds{0}; + double s_TrainFractionPerFold{0.0}; + std::size_t s_MaxTrees{0}; + double s_FeatureBagFraction{-1.0}; + double s_EtaGrowthRatePerTree{-1.0}; + std::size_t 
s_MaxAttemptsToAddTree{0}; + std::size_t s_NumSplitsPerFeature{0}; + std::size_t s_MaxOptimizationRoundsPerHyperparameter{0}; }; using TDoubleVec = std::vector; diff --git a/include/maths/CDataFrameUtils.h b/include/maths/CDataFrameUtils.h index 411a680455..0e08f1dca0 100644 --- a/include/maths/CDataFrameUtils.h +++ b/include/maths/CDataFrameUtils.h @@ -280,9 +280,8 @@ class MATHS_EXPORT CDataFrameUtils : private core::CNonInstantiatable { //! \param[in] frame The data frame for which to compute the row masks. //! \param[in] targetColumn The index of the column to predict. //! \param[in] rng The random number generator to use. - //! \param[in] numberFolds The number of folds to use. If this is less than - //! two, there will be two train masks, but their size will be less than 50% - //! of the data. + //! \param[in] numberFolds The number of folds to use. + //! \param[in] trainFractionPerFold The fraction of train data to use per fold. //! \param[in] numberBuckets The number of buckets to use when stratifying //! by target quantiles for regression. //! \param[in] allTrainingRowsMask A mask of the candidate training rows. @@ -292,7 +291,8 @@ class MATHS_EXPORT CDataFrameUtils : private core::CNonInstantiatable { const core::CDataFrame& frame, std::size_t targetColumn, CPRNG::CXorOShiro128Plus rng, - double numberFolds, + std::size_t numberFolds, + double trainFractionPerFold, std::size_t numberBuckets, const core::CPackedBitVector& allTrainingRowsMask); diff --git a/lib/api/CDataFrameAnalysisInstrumentation.cc b/lib/api/CDataFrameAnalysisInstrumentation.cc index ec1c18e9f5..5d059cea08 100644 --- a/lib/api/CDataFrameAnalysisInstrumentation.cc +++ b/lib/api/CDataFrameAnalysisInstrumentation.cc @@ -483,6 +483,10 @@ void CDataFrameTrainBoostedTreeInstrumentation::writeHyperparameters(rapidjson:: rapidjson::Value(static_cast(this->m_Hyperparameters.s_NumFolds)) .Move(), parentObject); + writer->addMember( + CDataFrameTrainBoostedTreeRunner::TRAIN_FRACTION_PER_FOLD, + rapidjson::Value(this->m_Hyperparameters.s_TrainFractionPerFold).Move(), + parentObject); writer->addMember( CDataFrameTrainBoostedTreeRunner::MAX_TREES, rapidjson::Value(static_cast(this->m_Hyperparameters.s_MaxTrees)) diff --git a/lib/api/CDataFrameTrainBoostedTreeRunner.cc b/lib/api/CDataFrameTrainBoostedTreeRunner.cc index 4acbbecbe6..b10932a5b2 100644 --- a/lib/api/CDataFrameTrainBoostedTreeRunner.cc +++ b/lib/api/CDataFrameTrainBoostedTreeRunner.cc @@ -14,20 +14,21 @@ #include #include -#include #include #include #include #include +#include #include #include #include #include -#include #include +#include + namespace ml { namespace api { namespace { @@ -60,6 +61,8 @@ const CDataFrameAnalysisConfigReader& CDataFrameTrainBoostedTreeRunner::paramete theReader.addParameter(FEATURE_BAG_FRACTION, CDataFrameAnalysisConfigReader::E_OptionalParameter); theReader.addParameter(NUM_FOLDS, CDataFrameAnalysisConfigReader::E_OptionalParameter); + theReader.addParameter(TRAIN_FRACTION_PER_FOLD, + CDataFrameAnalysisConfigReader::E_OptionalParameter); theReader.addParameter(STOP_CROSS_VALIDATION_EARLY, CDataFrameAnalysisConfigReader::E_OptionalParameter); theReader.addParameter(MAX_OPTIMIZATION_ROUNDS_PER_HYPERPARAMETER, @@ -100,6 +103,7 @@ CDataFrameTrainBoostedTreeRunner::CDataFrameTrainBoostedTreeRunner( std::size_t maxTrees{parameters[MAX_TREES].fallback(std::size_t{0})}; std::size_t numberFolds{parameters[NUM_FOLDS].fallback(std::size_t{0})}; + double trainFractionPerFold{parameters[TRAIN_FRACTION_PER_FOLD].fallback(-1.0)}; std::size_t 
numberRoundsPerHyperparameter{ parameters[MAX_OPTIMIZATION_ROUNDS_PER_HYPERPARAMETER].fallback( UNSET_NUMBER_ROUNDS_PER_HYPERPARAMETER)}; @@ -198,6 +202,9 @@ CDataFrameTrainBoostedTreeRunner::CDataFrameTrainBoostedTreeRunner( if (numberFolds > 1) { m_BoostedTreeFactory->numberFolds(numberFolds); } + if (trainFractionPerFold > 0.0) { + m_BoostedTreeFactory->trainFractionPerFold(trainFractionPerFold); + } if (numberRoundsPerHyperparameter != UNSET_NUMBER_ROUNDS_PER_HYPERPARAMETER) { m_BoostedTreeFactory->maximumOptimisationRoundsPerHyperparameter(numberRoundsPerHyperparameter); } @@ -394,6 +401,7 @@ const std::string CDataFrameTrainBoostedTreeRunner::SOFT_TREE_DEPTH_TOLERANCE{"s const std::string CDataFrameTrainBoostedTreeRunner::MAX_TREES{"max_trees"}; const std::string CDataFrameTrainBoostedTreeRunner::FEATURE_BAG_FRACTION{"feature_bag_fraction"}; const std::string CDataFrameTrainBoostedTreeRunner::NUM_FOLDS{"num_folds"}; +const std::string CDataFrameTrainBoostedTreeRunner::TRAIN_FRACTION_PER_FOLD{"train_fraction_per_fold"}; const std::string CDataFrameTrainBoostedTreeRunner::STOP_CROSS_VALIDATION_EARLY{"stop_cross_validation_early"}; const std::string CDataFrameTrainBoostedTreeRunner::MAX_OPTIMIZATION_ROUNDS_PER_HYPERPARAMETER{"max_optimization_rounds_per_hyperparameter"}; const std::string CDataFrameTrainBoostedTreeRunner::BAYESIAN_OPTIMISATION_RESTARTS{"bayesian_optimisation_restarts"}; diff --git a/lib/maths/CBoostedTreeFactory.cc b/lib/maths/CBoostedTreeFactory.cc index ec093d66fe..821991d496 100644 --- a/lib/maths/CBoostedTreeFactory.cc +++ b/lib/maths/CBoostedTreeFactory.cc @@ -104,9 +104,6 @@ CBoostedTreeFactory::buildFor(core::CDataFrame& frame, std::size_t dependentVari ? this->skipProgressMonitoringFeatureSelection() : this->startProgressMonitoringFeatureSelection(); - // Find the maximum number of rows at which the selected tree depth does not change significantly. - // Need to call hyperparameter set up first. 
- skipIfAfter(CBoostedTreeImpl::E_NotInitialized, [&] { this->initializeCrossValidation(frame); }); skipIfAfter(CBoostedTreeImpl::E_NotInitialized, @@ -295,25 +292,25 @@ void CBoostedTreeFactory::initializeMissingFeatureMasks(const core::CDataFrame& void CBoostedTreeFactory::initializeNumberFolds(core::CDataFrame& frame) const { - if (m_TreeImpl->m_NumberFoldsOverride == boost::none) { - auto result = frame.readRows( - m_NumberThreads, - core::bindRetrievableState( - [this](std::size_t& numberTrainingRows, - const TRowItr& beginRows, const TRowItr& endRows) { - for (auto row = beginRows; row != endRows; ++row) { - double target{(*row)[m_TreeImpl->m_DependentVariable]}; - if (CDataFrameUtils::isMissing(target) == false) { - ++numberTrainingRows; - } + auto result = frame.readRows( + m_NumberThreads, + core::bindRetrievableState( + [this](std::size_t& numberTrainingRows, const TRowItr& beginRows, const TRowItr& endRows) { + for (auto row = beginRows; row != endRows; ++row) { + double target{(*row)[m_TreeImpl->m_DependentVariable]}; + if (CDataFrameUtils::isMissing(target) == false) { + ++numberTrainingRows; } - }, - std::size_t{0})); - std::size_t totalNumberTrainingRows{0}; - for (const auto& numberTrainingRows : result.first) { - totalNumberTrainingRows += numberTrainingRows.s_FunctionState; - } - LOG_TRACE(<< "total number training rows = " << totalNumberTrainingRows); + } + }, + std::size_t{0})); + std::size_t totalNumberTrainingRows{0}; + for (const auto& numberTrainingRows : result.first) { + totalNumberTrainingRows += numberTrainingRows.s_FunctionState; + } + LOG_TRACE(<< "total number training rows = " << totalNumberTrainingRows); + + if (m_TreeImpl->m_NumberFoldsOverride == boost::none) { // We want to choose the number of folds so we'll have enough training data // after leaving out one fold. We choose the initial downsample size based @@ -335,31 +332,25 @@ void CBoostedTreeFactory::initializeNumberFolds(core::CDataFrame& frame) const { // // In addition, we want to constrain the maximum amount of training data we'll // use during hyperparameter search to avoid very long run times. To do this - // we set the number of folds to be less than two. We define the size of the - // training data set to be (k - 1) / k * # rows, with k the number of folds. - // If k < 2 this means we end up selecting less than half the data for training. - // To meet the constraint on the maximum number of rows M we must choose k - // which satisfies M >= (k - 1) / k * # rows. This is trivially satisfied for - // # rows less than M and, given we also constrain the maximum number of folds, - // we only care if # rows > MAX_NUMBER_FOLDS * M / (MAX_NUMBER_FOLDS - 1). + // we use less than the implied 1 - 1/k : 1/k train : test split when it results + // in more train rows than the defined maximum. 
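As a sketch, the heuristic described in the comment above amounts to the two small formulas below (MAX_DESIRED_INITIAL_DOWNSAMPLE_FRACTION and MAX_NUMBER_FOLDS are constants defined elsewhere in CBoostedTreeFactory.cc, so they are taken as parameters here rather than guessed at):

// Sketch of the fold-count and train-fraction heuristic.
#include <algorithm>
#include <cmath>
#include <cstddef>

std::size_t chooseNumberFolds(double initialDownsampleFraction,
                              double maxDesiredInitialDownsampleFraction,
                              double maxNumberFolds) {
    // Enough folds that leaving one fold out still supports the desired
    // initial downsample fraction, capped at maxNumberFolds.
    return static_cast<std::size_t>(std::ceil(
        1.0 / std::max(1.0 - initialDownsampleFraction / maxDesiredInitialDownsampleFraction,
                       1.0 / maxNumberFolds)));
}

double chooseTrainFractionPerFold(std::size_t numberFolds,
                                  double maximumNumberTrainRows,
                                  double totalNumberTrainingRows) {
    // Use the implied (k - 1) / k train fraction unless that would exceed the
    // cap on the number of rows used while tuning hyperparameters.
    return std::min(1.0 - 1.0 / static_cast<double>(numberFolds),
                    maximumNumberTrainRows / totalNumberTrainingRows);
}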
double initialDownsampleFraction{(m_InitialDownsampleRowsPerFeature * static_cast(frame.numberColumns() - 1)) / static_cast(totalNumberTrainingRows)}; - double minimumTrainingDataConstraintNumberFolds{ + LOG_TRACE(<< "initial downsample fraction = " << initialDownsampleFraction); + m_TreeImpl->m_NumberFolds = static_cast( std::ceil(1.0 / std::max(1.0 - initialDownsampleFraction / MAX_DESIRED_INITIAL_DOWNSAMPLE_FRACTION, - 1.0 / MAX_NUMBER_FOLDS))}; - double maximumTrainingDataConstraintNumberFolds{ - 1.0 / (1.0 - static_cast(m_MaximumNumberOfTrainRows) / - std::max(static_cast(frame.numberRows()), - MAX_NUMBER_FOLDS / (MAX_NUMBER_FOLDS - 1.0) * - static_cast(m_MaximumNumberOfTrainRows)))}; - - m_TreeImpl->m_FractionalFolds = std::min(minimumTrainingDataConstraintNumberFolds, - maximumTrainingDataConstraintNumberFolds); - LOG_TRACE(<< "initial downsample fraction = " << initialDownsampleFraction - << " # folds = " << m_TreeImpl->m_FractionalFolds); + 1.0 / MAX_NUMBER_FOLDS))); + } + if (m_TreeImpl->m_TrainFractionPerFoldOverride == boost::none) { + m_TreeImpl->m_TrainFractionPerFold = + std::min(1.0 - 1.0 / static_cast(m_TreeImpl->m_NumberFolds), + static_cast(m_MaximumNumberOfTrainRows) / + static_cast(totalNumberTrainingRows)); } + LOG_TRACE(<< "# folds = " << m_TreeImpl->m_NumberFolds + << ", train fraction per fold = " << m_TreeImpl->m_TrainFractionPerFold); } void CBoostedTreeFactory::resizeDataFrame(core::CDataFrame& frame) const { @@ -390,8 +381,9 @@ void CBoostedTreeFactory::initializeCrossValidation(core::CDataFrame& frame) con std::size_t numberBuckets(m_StratifyRegressionCrossValidation ? 10 : 1); std::tie(m_TreeImpl->m_TrainingRowMasks, m_TreeImpl->m_TestingRowMasks, std::ignore) = CDataFrameUtils::stratifiedCrossValidationRowMasks( - m_TreeImpl->m_NumberThreads, frame, dependentVariable, m_TreeImpl->m_Rng, - m_TreeImpl->m_FractionalFolds, numberBuckets, allTrainingRowsMask); + m_TreeImpl->m_NumberThreads, frame, dependentVariable, + m_TreeImpl->m_Rng, m_TreeImpl->m_NumberFolds, + m_TreeImpl->m_TrainFractionPerFold, numberBuckets, allTrainingRowsMask); } void CBoostedTreeFactory::selectFeaturesAndEncodeCategories(const core::CDataFrame& frame) const { @@ -1080,26 +1072,21 @@ CBoostedTreeFactory::testLossLineSearch(core::CDataFrame& frame, } std::sort(testLosses.begin(), testLosses.end()); - LOG_INFO(<< "test losses = " << core::CContainerPrinter::print(testLosses)); + LOG_TRACE(<< "test losses = " << core::CContainerPrinter::print(testLosses)); CLowess<2> lowess; lowess.fit(std::move(testLosses), testLosses.size()); - double bestParameter, bestParameterTestLoss; + double bestParameter; + double bestParameterTestLoss; std::tie(bestParameter, bestParameterTestLoss) = lowess.minimum(); - LOG_INFO(<< "best parameter = " << bestParameter << ", test loss = " << bestParameterTestLoss); + LOG_TRACE(<< "best parameter = " << bestParameter << ", test loss = " << bestParameterTestLoss); double width{(intervalRightEnd - intervalLeftEnd) / static_cast(MAX_LINE_SEARCH_ITERATIONS)}; intervalLeftEnd = bestParameter - width; intervalRightEnd = bestParameter + width; - LOG_INFO(<< "interval = [" << intervalLeftEnd << "," << intervalRightEnd << "]"); - //double residualVariance{lowess.residualVariance()}; - //std::tie(intervalLeftEnd, intervalRightEnd) = - // lowess.sublevelSet(bestParameter, bestParameterTestLoss, - // bestParameterTestLoss + std::sqrt(residualVariance)); - //LOG_INFO(<< "residual variance = " << residualVariance << " interval = [" - // << intervalLeftEnd << "," << 
intervalRightEnd << "]"); + LOG_TRACE(<< "interval = [" << intervalLeftEnd << "," << intervalRightEnd << "]"); return TVector{{intervalLeftEnd, bestParameter, intervalRightEnd}}; } @@ -1147,7 +1134,6 @@ CBoostedTreeFactory::classAssignmentObjective(CBoostedTree::EClassAssignmentObje CBoostedTreeFactory& CBoostedTreeFactory::classificationWeights(TStrDoublePrVec weights) { m_TreeImpl->m_ClassificationWeightsOverride = std::move(weights); - m_TreeImpl->m_ClassificationWeights = *m_TreeImpl->m_ClassificationWeightsOverride; return *this; } @@ -1166,7 +1152,17 @@ CBoostedTreeFactory& CBoostedTreeFactory::numberFolds(std::size_t numberFolds) { numberFolds = 2; } m_TreeImpl->m_NumberFoldsOverride = numberFolds; - m_TreeImpl->m_FractionalFolds = static_cast(numberFolds); + m_TreeImpl->m_NumberFolds = numberFolds; + return *this; +} + +CBoostedTreeFactory& CBoostedTreeFactory::trainFractionPerFold(double fraction) { + if (fraction <= 0.0 || fraction >= 1.0) { + LOG_WARN(<< "Training data fraction " << fraction << " per fold out of range"); + } else { + m_TreeImpl->m_TrainFractionPerFoldOverride = fraction; + m_TreeImpl->m_TrainFractionPerFold = fraction; + } return *this; } diff --git a/lib/maths/CBoostedTreeImpl.cc b/lib/maths/CBoostedTreeImpl.cc index 07d9b27ec5..93d2560434 100644 --- a/lib/maths/CBoostedTreeImpl.cc +++ b/lib/maths/CBoostedTreeImpl.cc @@ -334,7 +334,7 @@ std::size_t CBoostedTreeImpl::estimateMemoryUsage(std::size_t numberRows, m_MaximumNumberTrees * (sizeof(TNodeVec) + maximumNumberNodes * CBoostedTreeNode::estimateMemoryUsage( m_Loss->numberParameters()))}; - std::size_t foldRoundLossMemoryUsage{this->numberFolds() * m_NumberRounds * + std::size_t foldRoundLossMemoryUsage{m_NumberFolds * m_NumberRounds * sizeof(TOptionalDouble)}; std::size_t hyperparametersMemoryUsage{numberColumns * sizeof(double)}; std::size_t tunableHyperparametersMemoryUsage{ @@ -367,7 +367,9 @@ std::size_t CBoostedTreeImpl::estimateMemoryUsage(std::size_t numberRows, // we get a constant 8 / 64. std::size_t missingFeatureMaskMemoryUsage{8 * numberColumns * numberRows / 64}; std::size_t trainTestMaskMemoryUsage{ - 2 * static_cast(std::ceil(std::log2(m_FractionalFolds))) * numberRows}; + 2 * m_NumberFolds * + static_cast(std::ceil( + std::min(m_TrainFractionPerFold, 1.0 - m_TrainFractionPerFold) * numberRows))}; std::size_t bayesianOptimisationMemoryUsage{CBayesianOptimisation::estimateMemoryUsage( this->numberHyperparametersToTune(), m_NumberRounds)}; std::size_t worstCaseMemoryUsage{ @@ -435,7 +437,7 @@ CBoostedTreeImpl::gainAndCurvatureAtPercentile(double percentile, } void CBoostedTreeImpl::initializePerFoldTestLosses() { - m_FoldRoundTestLosses.resize(this->numberFolds()); + m_FoldRoundTestLosses.resize(m_NumberFolds); for (auto& losses : m_FoldRoundTestLosses) { losses.resize(m_NumberRounds); } @@ -522,7 +524,7 @@ CBoostedTreeImpl::TMeanVarAccumulatorSizeDoubleTuple CBoostedTreeImpl::crossValidateForest(core::CDataFrame& frame) { // We want to ensure we evaluate on equal proportions for each fold. - TSizeVec folds(this->numberFolds()); + TSizeVec folds(m_NumberFolds); std::iota(folds.begin(), folds.end(), 0); CSampling::random_shuffle(m_Rng, folds.begin(), folds.end()); @@ -532,8 +534,8 @@ CBoostedTreeImpl::crossValidateForest(core::CDataFrame& frame) { // that the test error is not close to the minimum test error. We use // the estimated test error for each remaining fold at two standard // deviations below the mean for this. 
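A minimal sketch of the "two standard deviations below the mean" adjustment the comment describes, assuming the TMeanVarAccumulator alias and CBasicStatistics helpers used throughout this patch (the stopping decision itself is made in stopCrossValidationEarly, which this series does not change):

// Sketch: optimistic estimate of a missing fold's test loss, added to the
// loss moments before deciding whether to stop cross-validation early.
// Assumes <cmath> and maths/CBasicStatistics.h are available.
double optimisticTestLossEstimate(const TMeanVarAccumulator& estimatedTestLoss) {
    return CBasicStatistics::mean(estimatedTestLoss) -
           2.0 * std::sqrt(CBasicStatistics::variance(estimatedTestLoss));
}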
- if (m_StopCrossValidationEarly && m_CurrentRound >= this->numberFolds() && - folds.size() < this->numberFolds()) { + if (m_StopCrossValidationEarly && m_CurrentRound >= m_NumberFolds && + folds.size() < m_NumberFolds) { for (const auto& testLoss : this->estimateMissingTestLosses(folds)) { testLossMoments.add( CBasicStatistics::mean(testLoss) - @@ -546,7 +548,7 @@ CBoostedTreeImpl::crossValidateForest(core::CDataFrame& frame) { TMeanVarAccumulator lossMoments; TDoubleVec numberTrees; - numberTrees.reserve(this->numberFolds()); + numberTrees.reserve(m_NumberFolds); TMeanAccumulator meanForestSizeAccumulator; while (folds.size() > 0 && stopCrossValidationEarly(lossMoments) == false) { @@ -972,13 +974,12 @@ double CBoostedTreeImpl::minimumTestLoss() const { TMinAccumulator minimumTestLoss; for (std::size_t round = 0; round < m_CurrentRound - 1; ++round) { TMeanVarAccumulator roundLossMoments; - for (std::size_t fold = 0; fold < this->numberFolds(); ++fold) { + for (std::size_t fold = 0; fold < m_NumberFolds; ++fold) { if (m_FoldRoundTestLosses[fold][round] != boost::none) { roundLossMoments.add(*m_FoldRoundTestLosses[fold][round]); } } - if (static_cast(CBasicStatistics::count(roundLossMoments)) == - this->numberFolds()) { + if (static_cast(CBasicStatistics::count(roundLossMoments)) == m_NumberFolds) { minimumTestLoss.add(CBasicStatistics::mean(roundLossMoments)); } } @@ -1027,7 +1028,7 @@ CBoostedTreeImpl::estimateMissingTestLosses(const TSizeVec& missing) const { // where the indices range over the folds for which we have errors in the // current round. - TSizeVec present(this->numberFolds()); + TSizeVec present(m_NumberFolds); std::iota(present.begin(), present.end(), 0); TSizeVec ordered{missing}; std::sort(ordered.begin(), ordered.end()); @@ -1478,10 +1479,6 @@ void CBoostedTreeImpl::scaleRegularizers(double scale) { } } -std::size_t CBoostedTreeImpl::numberFolds() const { - return static_cast(std::ceil(m_FractionalFolds)); -} - std::size_t CBoostedTreeImpl::numberHyperparametersToTune() const { return m_RegularizationOverride.countNotSet() + (m_DownsampleFactorOverride != boost::none ? 0 : 1) + @@ -1504,7 +1501,8 @@ void CBoostedTreeImpl::recordHyperparameters() { m_Instrumentation->hyperparameters().s_Eta = m_Eta; m_Instrumentation->hyperparameters().s_ClassAssignmentObjective = m_ClassAssignmentObjective; m_Instrumentation->hyperparameters().s_DownsampleFactor = m_DownsampleFactor; - m_Instrumentation->hyperparameters().s_NumFolds = m_FractionalFolds; + m_Instrumentation->hyperparameters().s_NumFolds = m_NumberFolds; + m_Instrumentation->hyperparameters().s_TrainFractionPerFold = m_TrainFractionPerFold; m_Instrumentation->hyperparameters().s_MaxTrees = m_MaximumNumberTrees; m_Instrumentation->hyperparameters().s_FeatureBagFraction = m_FeatureBagFraction; m_Instrumentation->hyperparameters().s_EtaGrowthRatePerTree = m_EtaGrowthRatePerTree; @@ -1587,13 +1585,13 @@ void CBoostedTreeImpl::startProgressMonitoringFineTuneHyperparameters() { m_Instrumentation->startNewProgressMonitoredTask(CBoostedTreeFactory::FINE_TUNING_PARAMETERS); - std::size_t totalNumberSteps{m_NumberRounds * m_MaximumNumberTrees * this->numberFolds()}; + std::size_t totalNumberSteps{m_NumberRounds * m_MaximumNumberTrees * m_NumberFolds}; LOG_TRACE(<< "main loop total number steps = " << totalNumberSteps); m_TrainingProgress = core::CLoopProgress{ totalNumberSteps, m_Instrumentation->progressCallback(), 1.0, 1024}; // Make sure progress starts where it left off. 
- m_TrainingProgress.increment(m_CurrentRound * m_MaximumNumberTrees * this->numberFolds()); + m_TrainingProgress.increment(m_CurrentRound * m_MaximumNumberTrees * m_NumberFolds); } void CBoostedTreeImpl::startProgressMonitoringFinalTrain() { @@ -1633,7 +1631,6 @@ const std::string FEATURE_BAG_FRACTION_TAG{"feature_bag_fraction"}; const std::string FEATURE_DATA_TYPES_TAG{"feature_data_types"}; const std::string FEATURE_SAMPLE_PROBABILITIES_TAG{"feature_sample_probabilities"}; const std::string FOLD_ROUND_TEST_LOSSES_TAG{"fold_round_test_losses"}; -const std::string FRACTIONAL_FOLDS_TAG{"number_folds"}; const std::string INITIALIZATION_STAGE_TAG{"initialization_progress"}; const std::string LOSS_TAG{"loss"}; const std::string LOSS_NAME_TAG{"loss_name"}; @@ -1646,6 +1643,7 @@ const std::string MEAN_FOREST_SIZE_ACCUMULATOR_TAG{"mean_forest_size"}; const std::string MEAN_LOSS_ACCUMULATOR_TAG{"mean_loss"}; const std::string MISSING_FEATURE_ROW_MASKS_TAG{"missing_feature_row_masks"}; const std::string NUMBER_FOLDS_OVERRIDE_TAG{"number_folds_override"}; +const std::string NUMBER_FOLDS_TAG{"number_folds"}; const std::string NUMBER_ROUNDS_TAG{"number_rounds"}; const std::string NUMBER_SPLITS_PER_FEATURE_TAG{"number_splits_per_feature"}; const std::string NUMBER_THREADS_TAG{"number_threads"}; @@ -1656,6 +1654,8 @@ const std::string ROWS_PER_FEATURE_TAG{"rows_per_feature"}; const std::string STOP_CROSS_VALIDATION_EARLY_TAG{"stop_cross_validation_eraly"}; const std::string TESTING_ROW_MASKS_TAG{"testing_row_masks"}; const std::string TRAINING_ROW_MASKS_TAG{"training_row_masks"}; +const std::string TRAIN_FRACTION_PER_FOLD_TAG{"train_fraction_per_folds"}; +const std::string TRAIN_FRACTION_PER_FOLD_OVERRIDE_TAG{"train_fraction_per_folds_override"}; const std::string NUMBER_TOP_SHAP_VALUES_TAG{"top_shap_values"}; const std::string STOP_HYPERPARAMETER_OPTIMIZATION_EARLY_TAG{"stop_hyperparameter_optimization_early"}; } @@ -1708,7 +1708,6 @@ void CBoostedTreeImpl::acceptPersistInserter(core::CStatePersistInserter& insert core::CPersistUtils::persist(FEATURE_SAMPLE_PROBABILITIES_TAG, m_FeatureSampleProbabilities, inserter); core::CPersistUtils::persist(FOLD_ROUND_TEST_LOSSES_TAG, m_FoldRoundTestLosses, inserter); - core::CPersistUtils::persist(FRACTIONAL_FOLDS_TAG, m_FractionalFolds, inserter); core::CPersistUtils::persist(INITIALIZATION_STAGE_TAG, static_cast(m_InitializationStage), inserter); if (m_Loss != nullptr) { @@ -1729,6 +1728,7 @@ void CBoostedTreeImpl::acceptPersistInserter(core::CStatePersistInserter& insert core::CPersistUtils::persist(MISSING_FEATURE_ROW_MASKS_TAG, m_MissingFeatureRowMasks, inserter); core::CPersistUtils::persist(NUMBER_FOLDS_OVERRIDE_TAG, m_NumberFoldsOverride, inserter); + core::CPersistUtils::persist(NUMBER_FOLDS_TAG, m_NumberFolds, inserter); core::CPersistUtils::persist(NUMBER_ROUNDS_TAG, m_NumberRounds, inserter); core::CPersistUtils::persist(NUMBER_SPLITS_PER_FEATURE_TAG, m_NumberSplitsPerFeature, inserter); @@ -1743,6 +1743,9 @@ void CBoostedTreeImpl::acceptPersistInserter(core::CStatePersistInserter& insert m_StopCrossValidationEarly, inserter); core::CPersistUtils::persist(TESTING_ROW_MASKS_TAG, m_TestingRowMasks, inserter); core::CPersistUtils::persist(TRAINING_ROW_MASKS_TAG, m_TrainingRowMasks, inserter); + core::CPersistUtils::persist(TRAIN_FRACTION_PER_FOLD_OVERRIDE_TAG, + m_TrainFractionPerFoldOverride, inserter); + core::CPersistUtils::persist(TRAIN_FRACTION_PER_FOLD_TAG, m_TrainFractionPerFold, inserter); 
core::CPersistUtils::persist(STOP_HYPERPARAMETER_OPTIMIZATION_EARLY_TAG, m_StopHyperparameterOptimizationEarly, inserter); // m_TunableHyperparameters is not persisted explicitly, it is restored from overriden hyperparameters @@ -1824,8 +1827,6 @@ bool CBoostedTreeImpl::acceptRestoreTraverser(core::CStateRestoreTraverser& trav RESTORE(FOLD_ROUND_TEST_LOSSES_TAG, core::CPersistUtils::restore(FOLD_ROUND_TEST_LOSSES_TAG, m_FoldRoundTestLosses, traverser)) - RESTORE(FRACTIONAL_FOLDS_TAG, - core::CPersistUtils::restore(FRACTIONAL_FOLDS_TAG, m_FractionalFolds, traverser)) RESTORE(INITIALIZATION_STAGE_TAG, core::CPersistUtils::restore(INITIALIZATION_STAGE_TAG, initializationStage, traverser)) @@ -1855,6 +1856,8 @@ bool CBoostedTreeImpl::acceptRestoreTraverser(core::CStateRestoreTraverser& trav RESTORE(NUMBER_FOLDS_OVERRIDE_TAG, core::CPersistUtils::restore(NUMBER_FOLDS_OVERRIDE_TAG, m_NumberFoldsOverride, traverser)) + RESTORE(NUMBER_FOLDS_TAG, + core::CPersistUtils::restore(NUMBER_FOLDS_TAG, m_NumberFolds, traverser)) RESTORE(NUMBER_ROUNDS_TAG, core::CPersistUtils::restore(NUMBER_ROUNDS_TAG, m_NumberRounds, traverser)) RESTORE(NUMBER_SPLITS_PER_FEATURE_TAG, @@ -1880,6 +1883,12 @@ bool CBoostedTreeImpl::acceptRestoreTraverser(core::CStateRestoreTraverser& trav core::CPersistUtils::restore(TESTING_ROW_MASKS_TAG, m_TestingRowMasks, traverser)) RESTORE(TRAINING_ROW_MASKS_TAG, core::CPersistUtils::restore(TRAINING_ROW_MASKS_TAG, m_TrainingRowMasks, traverser)) + RESTORE(TRAIN_FRACTION_PER_FOLD_OVERRIDE_TAG, + core::CPersistUtils::restore(TRAIN_FRACTION_PER_FOLD_OVERRIDE_TAG, + m_TrainFractionPerFoldOverride, traverser)) + RESTORE(TRAIN_FRACTION_PER_FOLD_TAG, + core::CPersistUtils::restore(TRAIN_FRACTION_PER_FOLD_TAG, + m_TrainFractionPerFold, traverser)) RESTORE(STOP_HYPERPARAMETER_OPTIMIZATION_EARLY_TAG, core::CPersistUtils::restore(STOP_HYPERPARAMETER_OPTIMIZATION_EARLY_TAG, m_StopHyperparameterOptimizationEarly, traverser)) @@ -1913,7 +1922,7 @@ void CBoostedTreeImpl::checkRestoredInvariants() const { VIOLATES_INVARIANT(m_TunableHyperparameters.size(), ==, samples.size()); } if (m_FoldRoundTestLosses.size() > 0) { - VIOLATES_INVARIANT(m_FoldRoundTestLosses.size(), ==, this->numberFolds()); + VIOLATES_INVARIANT(m_FoldRoundTestLosses.size(), ==, m_NumberFolds); for (const auto& losses : m_FoldRoundTestLosses) { VIOLATES_INVARIANT(losses.size(), ==, m_NumberRounds); } @@ -2023,7 +2032,7 @@ CBoostedTreeImpl::hyperparameterImportance() const { double hyperparameterValue; SHyperparameterImportance::EType hyperparameterType{ boosted_tree_detail::SHyperparameterImportance::E_Double}; - switch (i) { + switch (static_cast(i)) { case E_Alpha: hyperparameterValue = m_Regularization.depthPenaltyMultiplier(); break; diff --git a/lib/maths/CDataFrameUtils.cc b/lib/maths/CDataFrameUtils.cc index fed947f074..4b5cc3cd94 100644 --- a/lib/maths/CDataFrameUtils.cc +++ b/lib/maths/CDataFrameUtils.cc @@ -494,23 +494,24 @@ CDataFrameUtils::stratifiedCrossValidationRowMasks(std::size_t numberThreads, const core::CDataFrame& frame, std::size_t targetColumn, CPRNG::CXorOShiro128Plus rng, - double numberFolds, + std::size_t numberFolds, + double trainFractionPerFold, std::size_t numberBuckets, const core::CPackedBitVector& allTrainingRowsMask) { TDoubleVec frequencies; TStratifiedSamplerPtr sampler; - double numberRows{allTrainingRowsMask.manhattan()}; - if (numberRows < std::max(numberFolds, 2.0)) { + double numberTrainingRows{allTrainingRowsMask.manhattan()}; + if (static_cast(numberTrainingRows) < numberFolds) { 
HANDLE_FATAL(<< "Input error: unsufficient training data provided."); return {{}, {}, {}}; } - // We sample the smaller of the test/train sets in the loop. - std::size_t numberTrainingRows{static_cast( - 1.0 - (numberFolds - 1.0) / numberFolds * numberRows + 0.5)}; - std::size_t numberTestingRows{static_cast(numberRows) - numberTrainingRows}; - std::size_t sampleSize{std::min(numberTrainingRows, numberTestingRows)}; + // We sample the smaller of the test or train set in the loop. + std::size_t sampleSize{static_cast( + std::min(trainFractionPerFold, 1.0 - trainFractionPerFold) * numberTrainingRows + 0.5)}; + double minimumSizeToSample{static_cast(sampleSize + numberFolds)}; + LOG_TRACE(<< "sample size = " << sampleSize); if (frame.columnIsCategorical()[targetColumn]) { std::tie(sampler, frequencies) = classifierStratifiedCrossValidationRowSampler( @@ -523,13 +524,14 @@ CDataFrameUtils::stratifiedCrossValidationRowMasks(std::size_t numberThreads, LOG_TRACE(<< "number training rows = " << allTrainingRowsMask.manhattan()); - TPackedBitVectorVec testingRowMasks(static_cast(std::ceil(numberFolds))); + TPackedBitVectorVec testingRowMasks(numberFolds); TSizeVec rowIndices; core::CPackedBitVector candidateTestingRowsMask{allTrainingRowsMask}; for (std::size_t fold = 0; fold < testingRowMasks.size(); ++fold) { - if (candidateTestingRowsMask.manhattan() < - static_cast(sampleSize - numberFolds)) { + if (candidateTestingRowsMask.manhattan() < minimumSizeToSample) { + testingRowMasks[fold] = candidateTestingRowsMask; + } else { frame.readRows(1, 0, frame.numberRows(), [&](const TRowItr& beginRows, const TRowItr& endRows) { for (auto row = beginRows; row != endRows; ++row) { @@ -547,8 +549,6 @@ CDataFrameUtils::stratifiedCrossValidationRowMasks(std::size_t numberThreads, } testingRowMasks[fold].extend(false, allTrainingRowsMask.size() - testingRowMasks[fold].size()); - } else { - testingRowMasks[fold] = candidateTestingRowsMask; } // We exclusive or here to remove the rows we've selected for the current @@ -558,7 +558,7 @@ CDataFrameUtils::stratifiedCrossValidationRowMasks(std::size_t numberThreads, TPackedBitVectorVec trainingRowMasks{complementRowMasks(testingRowMasks, allTrainingRowsMask)}; - if (numberTrainingRows < numberTestingRows) { + if (trainFractionPerFold < 0.5) { std::swap(trainingRowMasks, testingRowMasks); } diff --git a/lib/maths/unittest/CBoostedTreeTest.cc b/lib/maths/unittest/CBoostedTreeTest.cc index a05128368f..77bdbc19e8 100644 --- a/lib/maths/unittest/CBoostedTreeTest.cc +++ b/lib/maths/unittest/CBoostedTreeTest.cc @@ -692,6 +692,70 @@ BOOST_AUTO_TEST_CASE(testMsle) { // TODO #1744 test quality of MSLE on data with log-normal errors. } +BOOST_AUTO_TEST_CASE(testLowTrainFractionPerFold) { + + // Test regression using a very low train fraction per fold. This should + // run in seconds, but we don't assert on the runtime because we don't + // run CI on bare metal, and produce a good quality solution because the + // final train is still on the full training set. 
+ + test::CRandomNumbers rng; + double noiseVariance{100.0}; + std::size_t trainRows{10000}; + std::size_t testRows{200}; + std::size_t rows{trainRows + testRows}; + std::size_t cols{6}; + + auto target = [&] { + TDoubleVec m; + TDoubleVec s; + rng.generateUniformSamples(0.0, 10.0, cols - 1, m); + rng.generateUniformSamples(-10.0, 10.0, cols - 1, s); + return [=](const TRowRef& row) { + double result{0.0}; + for (std::size_t i = 0; i < cols - 1; ++i) { + result += m[i] + s[i] * row[i]; + } + return result; + }; + }(); + + auto frame = core::makeMainStorageDataFrame(cols, rows).first; + + TDoubleVecVec x(cols - 1); + for (std::size_t i = 0; i < cols - 1; ++i) { + rng.generateUniformSamples(0.0, 10.0, rows, x[i]); + } + + TDoubleVec noise; + rng.generateNormalSamples(0.0, noiseVariance, rows, noise); + + fillDataFrame(trainRows, testRows, cols, x, noise, target, *frame); + + auto regression = maths::CBoostedTreeFactory::constructFromParameters( + 1, std::make_unique()) + .trainFractionPerFold(0.05) + .buildFor(*frame, cols - 1); + + core::CStopWatch timer{true}; + regression->train(); + regression->predict(); + LOG_DEBUG(<< "train duration " << timer.stop() << "ms"); + + double bias; + double rSquared; + std::tie(bias, rSquared) = computeEvaluationMetrics( + *frame, trainRows, rows, + [&](const TRowRef& row_) { return regression->readPrediction(row_)[0]; }, + target, noiseVariance / static_cast(rows)); + + // Unbiased... + BOOST_REQUIRE_CLOSE_ABSOLUTE( + 0.0, bias, 4.0 * std::sqrt(noiseVariance / static_cast(trainRows))); + // Good R^2... + BOOST_TEST_REQUIRE(rSquared > 0.98); +} + BOOST_AUTO_TEST_CASE(testThreading) { // Test we get the same results whether we run with multiple threads or not. diff --git a/lib/maths/unittest/CDataFrameUtilsTest.cc b/lib/maths/unittest/CDataFrameUtilsTest.cc index 6dc7c8b09d..07394afae1 100644 --- a/lib/maths/unittest/CDataFrameUtilsTest.cc +++ b/lib/maths/unittest/CDataFrameUtilsTest.cc @@ -507,7 +507,9 @@ BOOST_AUTO_TEST_CASE(testStratifiedCrossValidationRowMasks) { maths::CDataFrameUtils::TPackedBitVectorVec testingRowMasks; std::tie(trainingRowMasks, testingRowMasks, std::ignore) = maths::CDataFrameUtils::stratifiedCrossValidationRowMasks( - 1, *frame, 0, rng, numberFolds[0], numberBins, allTrainingRowsMask); + 1, *frame, 0, rng, numberFolds[0], + 1.0 - 1.0 / static_cast(numberFolds[0]), numberBins, + allTrainingRowsMask); BOOST_REQUIRE_EQUAL(numberFolds[0], trainingRowMasks.size()); BOOST_REQUIRE_EQUAL(numberFolds[0], testingRowMasks.size()); @@ -564,7 +566,9 @@ BOOST_AUTO_TEST_CASE(testStratifiedCrossValidationRowMasks) { maths::CDataFrameUtils::TPackedBitVectorVec testingRowMasks; std::tie(std::ignore, testingRowMasks, std::ignore) = maths::CDataFrameUtils::stratifiedCrossValidationRowMasks( - 1, *frame, 0, rng, numberFolds[0], numberBins, allTrainingRowsMask); + 1, *frame, 0, rng, numberFolds[0], + 1.0 - 1.0 / static_cast(numberFolds[0]), numberBins, + allTrainingRowsMask); TDoubleVecVec targetDecile(numberFolds[0], TDoubleVec(numberBins)); From 04248eeb94700f76044f99bcb25124700f31d340 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Fri, 2 Jul 2021 10:37:36 +0100 Subject: [PATCH 05/35] Adjust the validation loss variance estimate to remove affects of sampling bias --- include/maths/CBayesianOptimisation.h | 9 ++++++-- include/maths/CBoostedTreeFactory.h | 2 +- include/maths/CBoostedTreeImpl.h | 3 +++ lib/maths/CBayesianOptimisation.cc | 14 +++++++++---- lib/maths/CBoostedTreeImpl.cc | 29 ++++++++++++++++++++++++-- 
lib/maths/unittest/CBoostedTreeTest.cc | 5 ++--- 6 files changed, 50 insertions(+), 12 deletions(-) diff --git a/include/maths/CBayesianOptimisation.h b/include/maths/CBayesianOptimisation.h index 36f9bc891a..bbec6489f9 100644 --- a/include/maths/CBayesianOptimisation.h +++ b/include/maths/CBayesianOptimisation.h @@ -72,6 +72,10 @@ class MATHS_EXPORT CBayesianOptimisation { //! variance in the error in \p fx w.r.t. the true value is \p vx. void add(TVector x, double fx, double vx); + //! Any portion of the variance of the function error which is explained and + //! shouldn't be included in the kernel. + void explainedErrorVariance(double vx); + //! Get the bounding box (in the function domain) in which we're minimizing. std::pair boundingBox() const; @@ -170,8 +174,9 @@ class MATHS_EXPORT CBayesianOptimisation { private: CPRNG::CXorOShiro128Plus m_Rng; std::size_t m_Restarts; - double m_RangeShift = 0.0; - double m_RangeScale = 1.0; + double m_RangeShift{0.0}; + double m_RangeScale{1.0}; + double m_ExplainedErrorVariance{0.0}; TVector m_MinBoundary; TVector m_MaxBoundary; TVectorDoublePrVec m_FunctionMeanValues; diff --git a/include/maths/CBoostedTreeFactory.h b/include/maths/CBoostedTreeFactory.h index ca06cdf0f2..4c18a61354 100644 --- a/include/maths/CBoostedTreeFactory.h +++ b/include/maths/CBoostedTreeFactory.h @@ -279,7 +279,7 @@ class MATHS_EXPORT CBoostedTreeFactory final { TOptionalSize m_BayesianOptimisationRestarts; bool m_StratifyRegressionCrossValidation = true; double m_InitialDownsampleRowsPerFeature = 200.0; - std::size_t m_MaximumNumberOfTrainRows = 750000; + std::size_t m_MaximumNumberOfTrainRows = 500000; double m_GainPerNode1stPercentile = 0.0; double m_GainPerNode50thPercentile = 0.0; double m_GainPerNode90thPercentile = 0.0; diff --git a/include/maths/CBoostedTreeImpl.h b/include/maths/CBoostedTreeImpl.h index a2d948aa33..e102529607 100644 --- a/include/maths/CBoostedTreeImpl.h +++ b/include/maths/CBoostedTreeImpl.h @@ -290,6 +290,9 @@ class MATHS_EXPORT CBoostedTreeImpl final { //! Compute the mean of the loss function on the masked rows of \p frame. double meanLoss(const core::CDataFrame& frame, const core::CPackedBitVector& rowMask) const; + //! Compute the overall variance of the error we see between folds. + double betweenFoldTestLossVariance() const; + //! Get the root node of \p tree. 
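A condensed sketch of how these two additions fit together, assuming the TMeanAccumulator/TMeanVarAccumulator aliases used throughout this patch and ignoring the boost::optional wrapping of per-round losses: each fold's mean test loss is averaged over the rounds so far, the variance of those per-fold means is reported as explained error variance, and the Bayesian optimisation subtracts it (never more than 99% of the total) from the error variance its Gaussian Process kernel sees.

// Sketch only; the shipped implementations are in the .cc diffs below.
double betweenFoldTestLossVarianceSketch(const std::vector<std::vector<double>>& foldRoundTestLosses) {
    TMeanVarAccumulator acrossFolds;
    for (const auto& losses : foldRoundTestLosses) {
        TMeanAccumulator meanForFold;
        meanForFold.add(losses); // mean test loss of one fold over the rounds so far
        acrossFolds.add(CBasicStatistics::mean(meanForFold));
    }
    return CBasicStatistics::maximumLikelihoodVariance(acrossFolds);
}

double effectiveErrorVariance(double meanErrorVariance, double explainedErrorVariance) {
    // Remove the explained part but keep at least 1% of the total.
    return meanErrorVariance - std::min(explainedErrorVariance, 0.99 * meanErrorVariance);
}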
static const CBoostedTreeNode& root(const TNodeVec& tree); diff --git a/lib/maths/CBayesianOptimisation.cc b/lib/maths/CBayesianOptimisation.cc index 2d6705be10..457249c03d 100644 --- a/lib/maths/CBayesianOptimisation.cc +++ b/lib/maths/CBayesianOptimisation.cc @@ -30,8 +30,9 @@ namespace ml { namespace maths { namespace { -const std::string VERSION_7_5_TAG{"7.5"}; +using TMeanAccumulator = CBasicStatistics::SSampleMean::TAccumulator; +const std::string VERSION_7_5_TAG{"7.5"}; const std::string MIN_BOUNDARY_TAG{"min_boundary"}; const std::string MAX_BOUNDARY_TAG{"max_boundary"}; const std::string ERROR_VARIANCES_TAG{"error_variances"}; @@ -106,6 +107,10 @@ void CBayesianOptimisation::add(TVector x, double fx, double vx) { m_ErrorVariances.push_back(CTools::pow2(m_RangeScale) * vx); } +void CBayesianOptimisation::explainedErrorVariance(double vx) { + m_ExplainedErrorVariance = CTools::pow2(m_RangeScale) * vx; +} + std::pair CBayesianOptimisation::boundingBox() const { return {m_MinBoundary, m_MaxBoundary}; @@ -114,7 +119,6 @@ CBayesianOptimisation::boundingBox() const { std::pair CBayesianOptimisation::maximumExpectedImprovement() { - using TMeanAccumulator = CBasicStatistics::SSampleMean::TAccumulator; using TMinAccumulator = CBasicStatistics::COrderStatisticsHeap>; @@ -583,6 +587,7 @@ void CBayesianOptimisation::precondition() { for (auto& variance : m_ErrorVariances) { variance /= CTools::pow2(m_RangeScale); } + m_ExplainedErrorVariance /= CTools::pow2(m_RangeScale); TMeanVarAccumulator rangeMoments; for (const auto& value : m_FunctionMeanValues) { @@ -599,6 +604,7 @@ void CBayesianOptimisation::precondition() { for (auto& variance : m_ErrorVariances) { variance *= CTools::pow2(m_RangeScale); } + m_ExplainedErrorVariance *= CTools::pow2(m_RangeScale); } CBayesianOptimisation::TVector CBayesianOptimisation::function() const { @@ -610,10 +616,10 @@ CBayesianOptimisation::TVector CBayesianOptimisation::function() const { } double CBayesianOptimisation::meanErrorVariance() const { - using TMeanAccumulator = CBasicStatistics::SSampleMean::TAccumulator; TMeanAccumulator variance; variance.add(m_ErrorVariances); - return CBasicStatistics::mean(variance); + return CBasicStatistics::mean(variance) - + std::min(m_ExplainedErrorVariance, 0.99 * CBasicStatistics::mean(variance)); } CBayesianOptimisation::TMatrix CBayesianOptimisation::dKerneld(const TVector& a, int k) const { diff --git a/lib/maths/CBoostedTreeImpl.cc b/lib/maths/CBoostedTreeImpl.cc index 93d2560434..0340b4f47f 100644 --- a/lib/maths/CBoostedTreeImpl.cc +++ b/lib/maths/CBoostedTreeImpl.cc @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -1278,6 +1279,20 @@ double CBoostedTreeImpl::meanLoss(const core::CDataFrame& frame, return CBasicStatistics::mean(loss); } +double CBoostedTreeImpl::betweenFoldTestLossVariance() const { + TMeanVarAccumulator result; + for (const auto& testLosses : m_FoldRoundTestLosses) { + TMeanAccumulator meanTestLoss; + for (std::size_t i = 0; i <= m_CurrentRound; ++i) { + if (testLosses[i] != boost::none) { + meanTestLoss.add(*testLosses[i]); + } + } + result.add(CBasicStatistics::mean(meanTestLoss)); + } + return CBasicStatistics::maximumLikelihoodVariance(result); +} + CBoostedTreeNode& CBoostedTreeImpl::root(TNodeVec& tree) { return tree[0]; } @@ -1358,13 +1373,23 @@ bool CBoostedTreeImpl::selectNextHyperparameters(const TMeanVarAccumulator& loss double meanLoss{CBasicStatistics::mean(lossMoments)}; double lossVariance{CBasicStatistics::variance(lossMoments)}; - LOG_TRACE(<< "round 
= " << m_CurrentRound << " loss = " << meanLoss << " variance = " - << lossVariance << ": regularization = " << m_Regularization.print() + LOG_TRACE(<< "round = " << m_CurrentRound << ", loss = " << meanLoss + << ", total variance = " << lossVariance + << ", explained variance = " << this->betweenFoldTestLossVariance()); + LOG_TRACE(<< "regularization = " << m_Regularization.print() << ", downsample factor = " << m_DownsampleFactor << ", eta = " << m_Eta << ", eta growth rate per tree = " << m_EtaGrowthRatePerTree << ", feature bag fraction = " << m_FeatureBagFraction); bopt.add(parameters, meanLoss, lossVariance); + // One fold might have examples which are harder to predict on average than + // another fold, particularly if the sample size is small. What we really care + // about is the variation between fold loss values after accounting for any + // systematic effect due to sampling. Running for multiple rounds allows us + // to estimate this effect and we remove it when characterising the uncertainty + // in the loss values in the Gaussian Process. + bopt.explainedErrorVariance(this->betweenFoldTestLossVariance()); + if (m_CurrentRound < m_HyperparameterSamples.size()) { std::copy(m_HyperparameterSamples[m_CurrentRound].begin(), m_HyperparameterSamples[m_CurrentRound].end(), parameters.data()); diff --git a/lib/maths/unittest/CBoostedTreeTest.cc b/lib/maths/unittest/CBoostedTreeTest.cc index 77bdbc19e8..077f16329a 100644 --- a/lib/maths/unittest/CBoostedTreeTest.cc +++ b/lib/maths/unittest/CBoostedTreeTest.cc @@ -704,7 +704,7 @@ BOOST_AUTO_TEST_CASE(testLowTrainFractionPerFold) { std::size_t trainRows{10000}; std::size_t testRows{200}; std::size_t rows{trainRows + testRows}; - std::size_t cols{6}; + std::size_t cols{8}; auto target = [&] { TDoubleVec m; @@ -720,8 +720,6 @@ BOOST_AUTO_TEST_CASE(testLowTrainFractionPerFold) { }; }(); - auto frame = core::makeMainStorageDataFrame(cols, rows).first; - TDoubleVecVec x(cols - 1); for (std::size_t i = 0; i < cols - 1; ++i) { rng.generateUniformSamples(0.0, 10.0, rows, x[i]); @@ -730,6 +728,7 @@ BOOST_AUTO_TEST_CASE(testLowTrainFractionPerFold) { TDoubleVec noise; rng.generateNormalSamples(0.0, noiseVariance, rows, noise); + auto frame = core::makeMainStorageDataFrame(cols, rows).first; fillDataFrame(trainRows, testRows, cols, x, noise, target, *frame); auto regression = maths::CBoostedTreeFactory::constructFromParameters( From f72dd4cdf189d5564b1c9f17cb6423c6c5dbd35e Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Fri, 2 Jul 2021 10:58:13 +0100 Subject: [PATCH 06/35] Formatting --- lib/maths/unittest/CLowessTest.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lib/maths/unittest/CLowessTest.cc b/lib/maths/unittest/CLowessTest.cc index cd3073949b..7c3a44634b 100644 --- a/lib/maths/unittest/CLowessTest.cc +++ b/lib/maths/unittest/CLowessTest.cc @@ -53,8 +53,7 @@ BOOST_AUTO_TEST_CASE(testInvariants) { [&](double x) { return scale[0] * x / 10.0; }, [&](double x) { return scale[0] * (x - offset[0]) * (x - offset[0]) / 100.0; - } - }; + }}; for (std::size_t i = 0; i < 100; ++i) { From fc0a3bca50c4c43b31fb47dba13e06dc290c53ff Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Fri, 2 Jul 2021 12:05:18 +0100 Subject: [PATCH 07/35] Docs --- docs/CHANGELOG.asciidoc | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/CHANGELOG.asciidoc b/docs/CHANGELOG.asciidoc index 173a2ab48e..a414fc85c8 100644 --- a/docs/CHANGELOG.asciidoc +++ b/docs/CHANGELOG.asciidoc @@ -47,6 +47,13 @@ * Ensure bucket `event_count` is calculated for jobs 
with 1 second bucket spans. (See {ml-pull}1908[#1908].) +== {es} version 7.15.0 + +=== Enhancements + +* Speed up training of regression and classification models on very large data sets. + (See {ml-pull}1941[#1941].) + == {es} version 7.14.0 === Enhancements From 78f6e379f3ec0c7adbe7839650cbee02e6088893 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Fri, 2 Jul 2021 13:35:06 +0100 Subject: [PATCH 08/35] Avoid infinite loop --- lib/maths/CBoostedTreeImpl.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lib/maths/CBoostedTreeImpl.cc b/lib/maths/CBoostedTreeImpl.cc index 0340b4f47f..7cc581cd0b 100644 --- a/lib/maths/CBoostedTreeImpl.cc +++ b/lib/maths/CBoostedTreeImpl.cc @@ -716,6 +716,9 @@ CBoostedTreeImpl::downsample(const core::CPackedBitVector& trainingRowMask) cons // curvatures for each tree we train. The sampling scheme should minimize // the correlation with previous trees for fixed sample size so randomly // sampling without replacement is appropriate. + if (trainingRowMask.manhattan() == 0.0) { + return trainingRowMask; + } core::CPackedBitVector result; do { result = core::CPackedBitVector{}; From caa7c8241485b5c93d8d025026da1f183202cdfd Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Fri, 2 Jul 2021 15:48:27 +0100 Subject: [PATCH 09/35] Correct handling of eta growth rate per tree --- lib/maths/CBoostedTreeFactory.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/maths/CBoostedTreeFactory.cc b/lib/maths/CBoostedTreeFactory.cc index 821991d496..b52706c6b2 100644 --- a/lib/maths/CBoostedTreeFactory.cc +++ b/lib/maths/CBoostedTreeFactory.cc @@ -460,6 +460,8 @@ void CBoostedTreeFactory::initializeHyperparametersSetup(core::CDataFrame& frame if (m_TreeImpl->m_EtaOverride == boost::none) { m_TreeImpl->m_Eta = computeEta(frame.numberColumns() - this->numberExtraColumnsForTrain()); + } + if (m_TreeImpl->m_EtaGrowthRatePerTreeOverride == boost::none) { m_TreeImpl->m_EtaGrowthRatePerTree = 1.0 + m_TreeImpl->m_Eta / 2.0; } From b46c76ed2abb4988b03a166c7b0ee9a5f6bf5e4a Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Fri, 2 Jul 2021 15:53:39 +0100 Subject: [PATCH 10/35] Correct edge case test --- lib/maths/unittest/CBoostedTreeTest.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/maths/unittest/CBoostedTreeTest.cc b/lib/maths/unittest/CBoostedTreeTest.cc index 077f16329a..2e62a00020 100644 --- a/lib/maths/unittest/CBoostedTreeTest.cc +++ b/lib/maths/unittest/CBoostedTreeTest.cc @@ -431,8 +431,8 @@ BOOST_AUTO_TEST_CASE(testEdgeCases) { auto frame = core::makeMainStorageDataFrame(cols).first; - fillDataFrame(2, 0, 2, {{1.0}, {1.0}}, {0.0, 0.0}, - [](const TRowRef&) { return 1.0; }, *frame); + fillDataFrame(5, 0, 2, {{1.0}, {1.0}, {1.0}, {1.0}, {1.0}}, + {0.0, 0.0, 0.0, 0.0, 0.0}, [](const TRowRef&) { return 1.0; }, *frame); try { auto regression = maths::CBoostedTreeFactory::constructFromParameters( From 7318193fa66ef6a87c54dc721003f7c1fb7876f5 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Fri, 2 Jul 2021 16:08:11 +0100 Subject: [PATCH 11/35] Test threshold --- lib/maths/unittest/CBoostedTreeTest.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/maths/unittest/CBoostedTreeTest.cc b/lib/maths/unittest/CBoostedTreeTest.cc index 2e62a00020..e1c13a0abd 100644 --- a/lib/maths/unittest/CBoostedTreeTest.cc +++ b/lib/maths/unittest/CBoostedTreeTest.cc @@ -1417,7 +1417,7 @@ BOOST_AUTO_TEST_CASE(testBinomialLogisticRegression) { LOG_DEBUG(<< "log relative error = " << maths::CBasicStatistics::mean(logRelativeError)); - 
BOOST_TEST_REQUIRE(maths::CBasicStatistics::mean(logRelativeError) < 0.70); + BOOST_TEST_REQUIRE(maths::CBasicStatistics::mean(logRelativeError) < 0.71); meanLogRelativeError.add(maths::CBasicStatistics::mean(logRelativeError)); } From e55ea4175f4b3991cec9a064697c2cb9328053e5 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Mon, 5 Jul 2021 10:51:09 +0100 Subject: [PATCH 12/35] Handle the case we can't sample train/test folds without replacement and unit test --- lib/maths/CDataFrameUtils.cc | 101 +++++++++++++--------- lib/maths/unittest/CDataFrameUtilsTest.cc | 37 +++++++- 2 files changed, 94 insertions(+), 44 deletions(-) diff --git a/lib/maths/CDataFrameUtils.cc b/lib/maths/CDataFrameUtils.cc index 4b5cc3cd94..10a43a9b53 100644 --- a/lib/maths/CDataFrameUtils.cc +++ b/lib/maths/CDataFrameUtils.cc @@ -122,10 +122,10 @@ class CStratifiedSampler { TRowSamplerVec m_Samplers; TSamplerSelector m_Selector; }; -using TStratifiedSamplerPtr = std::unique_ptr; +using TStratifiedSamplerUPtr = std::unique_ptr; //! Get a classifier stratified row sampler for cross fold validation. -std::pair +std::pair classifierStratifiedCrossValidationRowSampler(std::size_t numberThreads, const core::CDataFrame& frame, std::size_t targetColumn, @@ -153,7 +153,7 @@ classifierStratifiedCrossValidationRowSampler(std::size_t numberThreads, } //! Get a regression stratified row sampler for cross fold validation. -TStratifiedSamplerPtr +TStratifiedSamplerUPtr regressionStratifiedCrossValiationRowSampler(std::size_t numberThreads, const core::CDataFrame& frame, std::size_t targetColumn, @@ -498,8 +498,6 @@ CDataFrameUtils::stratifiedCrossValidationRowMasks(std::size_t numberThreads, double trainFractionPerFold, std::size_t numberBuckets, const core::CPackedBitVector& allTrainingRowsMask) { - TDoubleVec frequencies; - TStratifiedSamplerPtr sampler; double numberTrainingRows{allTrainingRowsMask.manhattan()}; if (static_cast(numberTrainingRows) < numberFolds) { @@ -507,53 +505,72 @@ CDataFrameUtils::stratifiedCrossValidationRowMasks(std::size_t numberThreads, return {{}, {}, {}}; } + double sampleFraction{std::min(trainFractionPerFold, 1.0 - trainFractionPerFold)}; + double excessSampleFraction{ + std::max(sampleFraction - 1.0 / static_cast(numberFolds), 0.0)}; + // We sample the smaller of the test or train set in the loop. 
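+    // For example, with numberFolds = 3 and trainFractionPerFold = 0.4 (one of
+    // the cases covered by the unit test) sampleFraction = min(0.4, 0.6) = 0.4
+    // and excessSampleFraction = 0.4 - 1/3 ~= 0.067, so the mask sampled for
+    // each fold below comprises roughly 0.33 * numberTrainingRows rows which
+    // are disjoint across folds plus roughly 0.067 * numberTrainingRows rows
+    // which may be shared between folds.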
+ std::size_t excessSampleSize{static_cast( + std::ceil(excessSampleFraction * numberTrainingRows))}; std::size_t sampleSize{static_cast( - std::min(trainFractionPerFold, 1.0 - trainFractionPerFold) * numberTrainingRows + 0.5)}; - double minimumSizeToSample{static_cast(sampleSize + numberFolds)}; - LOG_TRACE(<< "sample size = " << sampleSize); + (sampleFraction - excessSampleFraction) * numberTrainingRows)}; + LOG_TRACE(<< "excess sample size = " << excessSampleSize + << ", sample size = " << sampleSize); - if (frame.columnIsCategorical()[targetColumn]) { - std::tie(sampler, frequencies) = classifierStratifiedCrossValidationRowSampler( - numberThreads, frame, targetColumn, rng, sampleSize, allTrainingRowsMask); - } else { - sampler = regressionStratifiedCrossValiationRowSampler( - numberThreads, frame, targetColumn, rng, sampleSize, numberBuckets, - allTrainingRowsMask); - } + TDoubleVec frequencies; + + auto makeSampler = [&](std::size_t size) { + TStratifiedSamplerUPtr result; + if (size > 0) { + if (frame.columnIsCategorical()[targetColumn]) { + std::tie(result, frequencies) = classifierStratifiedCrossValidationRowSampler( + numberThreads, frame, targetColumn, rng, size, allTrainingRowsMask); + } else { + result = regressionStratifiedCrossValiationRowSampler( + numberThreads, frame, targetColumn, rng, size, + numberBuckets, allTrainingRowsMask); + } + } + return result; + }; + + auto excessSampler = makeSampler(excessSampleSize); + auto sampler = makeSampler(sampleSize); LOG_TRACE(<< "number training rows = " << allTrainingRowsMask.manhattan()); TPackedBitVectorVec testingRowMasks(numberFolds); TSizeVec rowIndices; - core::CPackedBitVector candidateTestingRowsMask{allTrainingRowsMask}; - for (std::size_t fold = 0; fold < testingRowMasks.size(); ++fold) { - if (candidateTestingRowsMask.manhattan() < minimumSizeToSample) { - testingRowMasks[fold] = candidateTestingRowsMask; - } else { - frame.readRows(1, 0, frame.numberRows(), - [&](const TRowItr& beginRows, const TRowItr& endRows) { - for (auto row = beginRows; row != endRows; ++row) { - sampler->sample(*row); - } - }, - &candidateTestingRowsMask); - sampler->finishSampling(rng, rowIndices); - std::sort(rowIndices.begin(), rowIndices.end()); - LOG_TRACE(<< "# row indices = " << rowIndices.size()); - - for (auto row : rowIndices) { - testingRowMasks[fold].extend(false, row - testingRowMasks[fold].size()); - testingRowMasks[fold].extend(true); - } - testingRowMasks[fold].extend(false, allTrainingRowsMask.size() - - testingRowMasks[fold].size()); + auto sample = [&](const TStratifiedSamplerUPtr& sampler_, + const core::CPackedBitVector& candidateTestingRowsMask) { + frame.readRows(1, 0, frame.numberRows(), + [&](const TRowItr& beginRows, const TRowItr& endRows) { + for (auto row = beginRows; row != endRows; ++row) { + sampler_->sample(*row); + } + }, + &candidateTestingRowsMask); + sampler_->finishSampling(rng, rowIndices); + std::sort(rowIndices.begin(), rowIndices.end()); + LOG_TRACE(<< "# row indices = " << rowIndices.size()); + + core::CPackedBitVector result; + for (auto row : rowIndices) { + result.extend(false, row - result.size()); + result.extend(true); } + result.extend(false, allTrainingRowsMask.size() - result.size()); + return result; + }; - // We exclusive or here to remove the rows we've selected for the current - // test/train fold. This is equivalent to sampling without replacement. 
- candidateTestingRowsMask ^= testingRowMasks[fold]; + core::CPackedBitVector candidateTestingRowsMask{allTrainingRowsMask}; + for (auto& testingRowMask : testingRowMasks) { + testingRowMask = sample(sampler, candidateTestingRowsMask); + candidateTestingRowsMask ^= testingRowMask; + if (excessSampler != nullptr) { + testingRowMask |= sample(excessSampler, allTrainingRowsMask ^ testingRowMask); + } } TPackedBitVectorVec trainingRowMasks{complementRowMasks(testingRowMasks, allTrainingRowsMask)}; @@ -1091,7 +1108,7 @@ CDataFrameUtils::maximizeMinimumRecallForMulticlass(std::size_t numberThreads, // No need to sample if were going to use every row we've been given. if (numberSamples < static_cast(rowMask.manhattan())) { - TStratifiedSamplerPtr sampler; + TStratifiedSamplerUPtr sampler; std::tie(sampler, std::ignore) = classifierStratifiedCrossValidationRowSampler( numberThreads, frame, targetColumn, rng, numberSamples, rowMask); diff --git a/lib/maths/unittest/CDataFrameUtilsTest.cc b/lib/maths/unittest/CDataFrameUtilsTest.cc index 07394afae1..748a5aedc4 100644 --- a/lib/maths/unittest/CDataFrameUtilsTest.cc +++ b/lib/maths/unittest/CDataFrameUtilsTest.cc @@ -467,8 +467,10 @@ BOOST_AUTO_TEST_CASE(testStratifiedCrossValidationRowMasks) { // 2) The test masks are disjoint for each fold, // 3) The train and test masks are disjoint for a given fold, // 4) They're all subsets of the initial mask supplied, - // 5) The number of examples in each category per fold is proportional to - // their overall frequency. + // 5) The number of examples in each category per fold is proportional + // to their overall frequency. + // 6) Test we get the correct size masks if we are using more or less + // training data than implied by k-fold cross-validation. using TDoubleDoubleUMap = boost::unordered_map; @@ -609,6 +611,37 @@ BOOST_AUTO_TEST_CASE(testStratifiedCrossValidationRowMasks) { BOOST_TEST_REQUIRE(maths::CBasicStatistics::variance(testTargetDecileMoments) < 0.02); } } + + for (auto fraction : {0.1, 0.4}) { + TDoubleVec categories; + testRng.generateNormalSamples(0.0, 3.0, numberRows, categories); + + auto frame = core::makeMainStorageDataFrame(numberCols).first; + frame->categoricalColumns(TBoolVec{true}); + for (std::size_t i = 0; i < numberRows; ++i) { + frame->writeRow([&](core::CDataFrame::TFloatVecItr column, std::int32_t&) { + *column = std::floor(std::fabs(categories[i])); + }); + } + frame->finishWritingRows(); + + core::CPackedBitVector allTrainingRowsMask{numberRows, true}; + + maths::CDataFrameUtils::TPackedBitVectorVec trainingRowMasks; + maths::CDataFrameUtils::TPackedBitVectorVec testingRowMasks; + std::tie(trainingRowMasks, testingRowMasks, std::ignore) = + maths::CDataFrameUtils::stratifiedCrossValidationRowMasks( + 1, *frame, 0, rng, 3, fraction, numberBins, allTrainingRowsMask); + + BOOST_REQUIRE_EQUAL(trainingRowMasks.size(), testingRowMasks.size()); + for (std::size_t i = 0; i < trainingRowMasks.size(); ++i) { + BOOST_REQUIRE_EQUAL( + numberRows, static_cast( + (trainingRowMasks[i] | testingRowMasks[i]).manhattan())); + BOOST_REQUIRE_EQUAL(fraction, trainingRowMasks[i].manhattan() / + static_cast(numberRows)); + } + } } BOOST_AUTO_TEST_CASE(testMicWithColumn) { From dd002c338e873de41e7e618e879764474dd4f659 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Wed, 7 Jul 2021 10:17:02 +0100 Subject: [PATCH 13/35] Handle edge case creating train/test splits with very little data --- lib/maths/CDataFrameUtils.cc | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git 
a/lib/maths/CDataFrameUtils.cc b/lib/maths/CDataFrameUtils.cc index 10a43a9b53..dd074eacc3 100644 --- a/lib/maths/CDataFrameUtils.cc +++ b/lib/maths/CDataFrameUtils.cc @@ -501,7 +501,7 @@ CDataFrameUtils::stratifiedCrossValidationRowMasks(std::size_t numberThreads, double numberTrainingRows{allTrainingRowsMask.manhattan()}; if (static_cast(numberTrainingRows) < numberFolds) { - HANDLE_FATAL(<< "Input error: unsufficient training data provided."); + HANDLE_FATAL(<< "Input error: insufficient training data provided."); return {{}, {}, {}}; } @@ -512,8 +512,8 @@ CDataFrameUtils::stratifiedCrossValidationRowMasks(std::size_t numberThreads, // We sample the smaller of the test or train set in the loop. std::size_t excessSampleSize{static_cast( std::ceil(excessSampleFraction * numberTrainingRows))}; - std::size_t sampleSize{static_cast( - (sampleFraction - excessSampleFraction) * numberTrainingRows)}; + std::size_t sampleSize{static_cast(std::max( + (1.0 + 1e-8) * (sampleFraction - excessSampleFraction) * numberTrainingRows, 1.0))}; LOG_TRACE(<< "excess sample size = " << excessSampleSize << ", sample size = " << sampleSize); @@ -536,6 +536,10 @@ CDataFrameUtils::stratifiedCrossValidationRowMasks(std::size_t numberThreads, auto excessSampler = makeSampler(excessSampleSize); auto sampler = makeSampler(sampleSize); + if (sampler == nullptr) { + HANDLE_FATAL(<< "Internal error: failed to create train/test splits."); + return {{}, {}, {}}; + } LOG_TRACE(<< "number training rows = " << allTrainingRowsMask.manhattan()); From 37d469085955abea89ed6da3631557d873930721 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Wed, 7 Jul 2021 13:00:49 +0100 Subject: [PATCH 14/35] Slightly relax tests to pass on all platforms --- ...CDataFrameAnalyzerFeatureImportanceTest.cc | 48 +++++++++++++------ lib/maths/unittest/CBoostedTreeTest.cc | 4 +- 2 files changed, 36 insertions(+), 16 deletions(-) diff --git a/lib/api/unittest/CDataFrameAnalyzerFeatureImportanceTest.cc b/lib/api/unittest/CDataFrameAnalyzerFeatureImportanceTest.cc index adc367a4f7..d22a808c88 100644 --- a/lib/api/unittest/CDataFrameAnalyzerFeatureImportanceTest.cc +++ b/lib/api/unittest/CDataFrameAnalyzerFeatureImportanceTest.cc @@ -544,9 +544,14 @@ BOOST_FIXTURE_TEST_CASE(testRegressionFeatureImportanceAllShap, SFixture) { TMeanAccumulator c2TotalShapExpected; TMeanAccumulator c3TotalShapExpected; TMeanAccumulator c4TotalShapExpected; - double c1Sum{0.0}, c2Sum{0.0}, c3Sum{0.0}, c4Sum{0.0}; - double c1TotalShapActual{0.0}, c2TotalShapActual{0.0}, - c3TotalShapActual{0.0}, c4TotalShapActual{0.0}; + double c1Sum{0.0}; + double c2Sum{0.0}; + double c3Sum{0.0}; + double c4Sum{0.0}; + double c1TotalShapActual{0.0}; + double c2TotalShapActual{0.0}; + double c3TotalShapActual{0.0}; + double c4TotalShapActual{0.0}; bool hasTotalFeatureImportance{false}; double baseline{readBaselineValue(results)}; for (const auto& result : results.GetArray()) { @@ -581,9 +586,12 @@ BOOST_FIXTURE_TEST_CASE(testRegressionFeatureImportanceAllShap, SFixture) { } } - // since target is generated using the linear model - // 50 c1 + 150 c2 + 50 c3 - 50 c4, with c1 categorical {-10,10} - // we expect c2 > c1 > c3 \approx c4 + // Since the target is generated using the linear model + // + // 50 c1 + 150 c2 + 50 c3 - 50 c4, with c1 categorical {-10,10} + // + // we expect c2 > c1 > c3 \approx c4. + BOOST_TEST_REQUIRE(c2Sum > c1Sum); // since c1 is categorical -10 or 10, it's influence is generally higher than that of c3 and c4 which are sampled // randomly on [-10, 10]. 
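+    // Concretely, |50 c1| is always 500 while |50 c3| and |50 c4| are uniform
+    // on [0, 500] with mean 250, so on average c1's contribution is roughly
+    // twice theirs.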
@@ -642,15 +650,20 @@ BOOST_FIXTURE_TEST_CASE(testClassificationFeatureImportanceAllShap, SFixture) { // values are indeed a local approximation of the predicted log-odds. std::size_t topShapValues{4}; - auto resultsPair{runBinaryClassification(topShapValues, {0.5, -0.7, 0.2, -0.2})}; + auto resultsPair{runBinaryClassification(topShapValues, {0.5, -0.7, 0.3, -0.3})}; auto results{std::move(resultsPair.first)}; TMeanAccumulator c1TotalShapExpected; TMeanAccumulator c2TotalShapExpected; TMeanAccumulator c3TotalShapExpected; TMeanAccumulator c4TotalShapExpected; - double c1Sum{0.0}, c2Sum{0.0}, c3Sum{0.0}, c4Sum{0.0}; - double c1TotalShapActual[2], c2TotalShapActual[2], c3TotalShapActual[2], - c4TotalShapActual[2]; + double c1Sum{0.0}; + double c2Sum{0.0}; + double c3Sum{0.0}; + double c4Sum{0.0}; + double c1TotalShapActual[2]; + double c2TotalShapActual[2]; + double c3TotalShapActual[2]; + double c4TotalShapActual[2]; bool hasTotalFeatureImportance{false}; double baselineFoo{readBaselineValue(results, "foo")}; double baselineBar{readBaselineValue(results, "bar")}; @@ -698,13 +711,20 @@ BOOST_FIXTURE_TEST_CASE(testClassificationFeatureImportanceAllShap, SFixture) { } } - // since the target using a linear model - // 0.5 c1 + 0.7 c2 + 0.25 c3 - 0.25 c4 - // to generate the log odds we expect c2 > c1 > c3 \approx c4 + // Since the target is using the linear model + // + // 0.5 c1 - 0.7 c2 + 0.2 c3 - 0.2 c4 + // + // to generate the log odds we expect c2 > c1 > c3 \approx c4. + + LOG_DEBUG(<< "c1Sum = " << c1Sum << ", c2Sum = " << c2Sum + << ", c3Sum = " << c3Sum << ", c4Sum = " << c4Sum); + BOOST_TEST_REQUIRE(c2Sum > c1Sum); BOOST_TEST_REQUIRE(c1Sum > c3Sum); BOOST_TEST_REQUIRE(c1Sum > c4Sum); - BOOST_REQUIRE_CLOSE(c3Sum, c4Sum, 40.0); // c3 and c4 within 40% of each other + BOOST_REQUIRE_CLOSE(c3Sum, c4Sum, 20.0); // c3 and c4 within 20% of each other + BOOST_TEST_REQUIRE(hasTotalFeatureImportance); for (std::size_t i = 0; i < classes.size(); ++i) { if (c1TotalShapActual[i] == 0 || c2TotalShapActual[i] == 0 || diff --git a/lib/maths/unittest/CBoostedTreeTest.cc b/lib/maths/unittest/CBoostedTreeTest.cc index e1c13a0abd..3bc9167602 100644 --- a/lib/maths/unittest/CBoostedTreeTest.cc +++ b/lib/maths/unittest/CBoostedTreeTest.cc @@ -548,7 +548,7 @@ BOOST_AUTO_TEST_CASE(testLinear) { // Unbiased... BOOST_REQUIRE_CLOSE_ABSOLUTE( 0.0, modelBias[i][0], - 4.0 * std::sqrt(noiseVariance / static_cast(trainRows))); + 6.0 * std::sqrt(noiseVariance / static_cast(trainRows))); // Good R^2... BOOST_TEST_REQUIRE(modelRSquared[i][0] > 0.97); @@ -1417,7 +1417,7 @@ BOOST_AUTO_TEST_CASE(testBinomialLogisticRegression) { LOG_DEBUG(<< "log relative error = " << maths::CBasicStatistics::mean(logRelativeError)); - BOOST_TEST_REQUIRE(maths::CBasicStatistics::mean(logRelativeError) < 0.71); + BOOST_TEST_REQUIRE(maths::CBasicStatistics::mean(logRelativeError) < 0.77); meanLogRelativeError.add(maths::CBasicStatistics::mean(logRelativeError)); } From 28f22f43582dbb52b22b70e0ac252f642d383588 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Thu, 8 Jul 2021 14:33:48 +0100 Subject: [PATCH 15/35] Review comments --- include/maths/CLowess.h | 17 ++++-------- include/maths/CLowessDetail.h | 35 ----------------------- lib/maths/unittest/CLowessTest.cc | 46 ++++++++++--------------------- 3 files changed, 20 insertions(+), 78 deletions(-) diff --git a/include/maths/CLowess.h b/include/maths/CLowess.h index c440f497f8..16691d029b 100644 --- a/include/maths/CLowess.h +++ b/include/maths/CLowess.h @@ -34,7 +34,8 @@ class CLowess { //! 
//! \param[in] data The training data. //! \param[in] numberFolds The number of folds to use in cross-validation to - // compute the best weight function from the family exp(-k |xi - xj|). + //! compute the best weight function from the family exp(-k |xi - xj|) with + //! k a free parameter which determines the amount of smoothing to use. void fit(TDoubleDoublePrVec data, std::size_t numberFolds); //! Predict the value at \p x. @@ -52,15 +53,6 @@ class CLowess { //! \note Defined as zero if no data have been fit. double residualVariance() const; - //! Compute the sublevel set of \p f containing \p xmin. - //! - //! \param[in] xmin The argument of the minimum of the interpolated function. - //! \param[in] fmin The value of the minimum of the function. - //! \param[in] f The value of the function for which to compute the sublevel set. - //! \note \p f should be greater than fmin. - //! \note Defined as (0,0) if no data have been fit. - TDoubleDoublePr sublevelSet(double xmin, double fmin, double f) const; - //! Get how far we are prepared to extrapolate as the interval we will search //! in the minimum and sublevelSet functions. TDoubleDoublePr extrapolationInterval() const; @@ -81,7 +73,10 @@ class CLowess { private: TDoubleDoublePrVec m_Data; TSizeVec m_Mask; - double m_K = 0.0; + //! The weight to assign to data points when fitting polynomial at x is given + //! by exp(-k |xi - xj|). This can therefore be thought of as the inverse of + //! the amount of smoothing. + double m_K{0.0}; }; } } diff --git a/include/maths/CLowessDetail.h b/include/maths/CLowessDetail.h index 1c6e5be6ed..c9cd96e9bb 100644 --- a/include/maths/CLowessDetail.h +++ b/include/maths/CLowessDetail.h @@ -165,41 +165,6 @@ double CLowess::residualVariance() const { return CBasicStatistics::variance(moments); } -template -typename CLowess::TDoubleDoublePr -CLowess::sublevelSet(double xmin, double fmin, double f) const { - - if (m_Data.empty()) { - return {0.0, 0.0}; - } - if (f <= fmin) { - return {xmin, xmin}; - } - - auto solve = [&](double n, double stop) { - double fx{fmin}; - for (double i = 1.0; i <= n; i += 1.0) { - double xlast{((i - 1.0) * stop + (n - i + 1.0) * xmin) / n}; - double x{(i * stop + (n - i) * xmin) / n}; - double flast{fx}; - fx = this->predict(x); - if (fx > f) { - return CTools::linearlyInterpolate(flast, fx, xlast, x, f); - } - } - return stop; - }; - - double xa, xb; - std::tie(xa, xb) = this->extrapolationInterval(); - double alpha{(xmin - xa) / (xb - xa)}; - double beta{1.0 - alpha}; - LOG_TRACE(<< "alpha = " << alpha << ", beta = " << beta); - - return {solve(std::ceil(alpha * 40.0), xa), - solve(std::ceil((1.0 - alpha) * 40.0), xb)}; -} - template typename CLowess::TDoubleDoublePr CLowess::extrapolationInterval() const { double xa{m_Data.front().first}; diff --git a/lib/maths/unittest/CLowessTest.cc b/lib/maths/unittest/CLowessTest.cc index 7c3a44634b..61c4bec0fc 100644 --- a/lib/maths/unittest/CLowessTest.cc +++ b/lib/maths/unittest/CLowessTest.cc @@ -80,18 +80,6 @@ BOOST_AUTO_TEST_CASE(testInvariants) { BOOST_TEST_REQUIRE(fmin <= lowess.predict(std::max(xmin - 0.1, xea))); BOOST_TEST_REQUIRE(fmin <= lowess.predict(std::min(xmin + 0.1, xeb))); - double xa, xb; - std::tie(xa, xb) = lowess.sublevelSet(xmin, fmin, fmin + 0.1); - BOOST_TEST_REQUIRE(xa <= xmin); - BOOST_TEST_REQUIRE(xb >= xmin); - - BOOST_TEST_REQUIRE(xmin >= xea); - BOOST_TEST_REQUIRE(xmin <= xeb); - BOOST_TEST_REQUIRE(xa >= xea); - BOOST_TEST_REQUIRE(xb <= xeb); - BOOST_TEST_REQUIRE(xa >= xea); - BOOST_TEST_REQUIRE(xb <= xeb); 
- TMeanVarAccumulator residualMoments; for (const auto& x : data) { residualMoments.add(x.second - lowess.predict(x.first)); @@ -202,6 +190,7 @@ BOOST_AUTO_TEST_CASE(testTrainingLossCurves) { // 2. Car-parts // 3. Boston + using TMeanAccumulator = maths::CBasicStatistics::SSampleMean::TAccumulator; using TDoubleDoublePrVecVec = std::vector::TDoubleDoublePrVec>; // clang-format off @@ -232,30 +221,23 @@ BOOST_AUTO_TEST_CASE(testTrainingLossCurves) { {-0.1069061, 9.885219}}, {{-3.800451, 8.317797}, {-3.738576, 8.053429}, {-3.403612, 8.338234}, {-2.801874, 8.890816}, {-2.333564, 8.705093}, {-2.208987, 10.69139}, {-1.803296, 9.234116}, {-1.002829, 10.67219}, {-0.9090844, 12.46085}, {-0.804719, 13.98731}}}; + + // Check against judged minimum for each curve. + TDoubleVec preferredXmin{9.5, 1.7, -0.64, 3.6, -0.1, -0.25, 11.0, 5.0, 2.0, -0.1, -0.2, -2.3, -0.8, 5.5, 3.2, -2.3, 2.0, -0.57, -3.6}; // clang-format on - for (const auto& curve : curves) { + TMeanAccumulator meanRelativeError; + for (std::size_t i = 0; i < curves.size(); ++i) { maths::CLowess<2> lowess; - lowess.fit(curve, curve.size()); - double xmin, fmin; - std::tie(xmin, fmin) = lowess.minimum(); - double variance{lowess.residualVariance()}; - - double xa, xb; - double ftarget{fmin + std::sqrt(variance)}; - std::tie(xa, xb) = lowess.sublevelSet(xmin, fmin, ftarget); - - if (xa <= curve.front().first) { - BOOST_TEST_REQUIRE(lowess.predict(xa) <= 1.01 * ftarget); - } else { - BOOST_REQUIRE_CLOSE(lowess.predict(xa), ftarget, 1.0); // 1.0% - } - if (xb >= curve.back().first) { - BOOST_TEST_REQUIRE(lowess.predict(xb) <= 1.01 * ftarget); - } else { - BOOST_REQUIRE_CLOSE(lowess.predict(xb), ftarget, 1.0); // 1.0% - } + lowess.fit(curves[i], curves[i].size()); + double xmin; + std::tie(xmin, std::ignore) = lowess.minimum(); + + meanRelativeError.add(std::fabs(xmin - preferredXmin[i]) / + std::fabs(preferredXmin[i])); } + + BOOST_REQUIRE_CLOSE_ABSOLUTE(0.0, maths::CBasicStatistics::mean(meanRelativeError), 0.25); } BOOST_AUTO_TEST_SUITE_END() From 4f3e3f9e2e547eb9758297500a31029e6910eac1 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Thu, 8 Jul 2021 14:34:56 +0100 Subject: [PATCH 16/35] Review comments --- include/maths/CLowess.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/maths/CLowess.h b/include/maths/CLowess.h index 16691d029b..598e8d46f9 100644 --- a/include/maths/CLowess.h +++ b/include/maths/CLowess.h @@ -7,9 +7,8 @@ #ifndef INCLUDED_ml_maths_CLowess_h #define INCLUDED_ml_maths_CLowess_h -#include - #include +#include #include #include From 5d4edbae395d0d3cb18c48c0ae0dc7857dd9c124 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Thu, 8 Jul 2021 14:43:02 +0100 Subject: [PATCH 17/35] Explain p. --- include/maths/CLowessDetail.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/maths/CLowessDetail.h b/include/maths/CLowessDetail.h index c9cd96e9bb..64b9e3ce14 100644 --- a/include/maths/CLowessDetail.h +++ b/include/maths/CLowessDetail.h @@ -49,8 +49,10 @@ void CLowess::fit(TDoubleDoublePrVec data, std::size_t numberFolds) { // // p^*(x) = argmin_p{ sum_i{ w_i (Y_i - poly(X_i | p))^2 } } // - // where w = exp(-k (x - X_i)) and (X, Y) are the data to fit. We determine k by - // solving + // where w = exp(-k (x - X_i)), (X, Y) are the data to fit and p is the vector of + // parameters for the polynomial function, i.e. the coefficients p_0 + p_1 x + ... 
+ // + // We determine k by solving // // k^* = argmin_k{ sum_{Yi in H}{ L(Yi | f(x | p^*)) } } // From 5748ce1cbb0aebd9ce8593160957ae30218d7161 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Thu, 8 Jul 2021 14:49:04 +0100 Subject: [PATCH 18/35] Explain poly --- include/maths/CLowessDetail.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/include/maths/CLowessDetail.h b/include/maths/CLowessDetail.h index 64b9e3ce14..270c57596c 100644 --- a/include/maths/CLowessDetail.h +++ b/include/maths/CLowessDetail.h @@ -47,10 +47,12 @@ void CLowess::fit(TDoubleDoublePrVec data, std::size_t numberFolds) { // // f(x | p^*) = poly(x | p^*(x)) // - // p^*(x) = argmin_p{ sum_i{ w_i (Y_i - poly(X_i | p))^2 } } + // p^*(x) = argmin_p{ sum_i{ w_i (Y_i - poly(X_i | p))^2 } } (2) // - // where w = exp(-k (x - X_i)), (X, Y) are the data to fit and p is the vector of - // parameters for the polynomial function, i.e. the coefficients p_0 + p_1 x + ... + // where w = exp(-k (x - X_i)), (X, Y) are the data to fit and p is the vector + // of parameters for the polynomial function poly(. | p), i.e. the coefficients + // p_0 + p_1 x + p_2 x^2 ... (which are determined by minimizing the weighted + // least square prediction errors as in (2)). // // We determine k by solving // From c252b2475bb2d78e3c3da179f5b5936c2e244c39 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Thu, 8 Jul 2021 14:58:12 +0100 Subject: [PATCH 19/35] Add explanation of mechanics of fit --- include/maths/CLowessDetail.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/maths/CLowessDetail.h b/include/maths/CLowessDetail.h index 270c57596c..3c6ecbad83 100644 --- a/include/maths/CLowessDetail.h +++ b/include/maths/CLowessDetail.h @@ -68,6 +68,12 @@ void CLowess::fit(TDoubleDoublePrVec data, std::size_t numberFolds) { TSizeVecVec testingMasks; this->setupMasks(numberFolds, trainingMasks, testingMasks); + // Here, we line search different values of m_K. We aim to cover the case we have + // a lot of smoothing, m_K is 0, to the case m_K is large compared to the data + // range so most points have very low weight and don't constrain the polynomial + // parameters. We finish up by polishing up the minimum on the best candidate + // interval using Brent's method. See CSolvers::globalMaximize for details. + TDoubleVec K(17); double range{m_Data.back().first - m_Data.front().first}; for (std::size_t i = 0; i < K.size(); ++i) { From 9a7feea7af5dbe58ade1c40434396be07a8375ab Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Thu, 8 Jul 2021 15:02:28 +0100 Subject: [PATCH 20/35] Make k dependency clear --- include/maths/CLowessDetail.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/include/maths/CLowessDetail.h b/include/maths/CLowessDetail.h index 3c6ecbad83..d10b083f46 100644 --- a/include/maths/CLowessDetail.h +++ b/include/maths/CLowessDetail.h @@ -56,10 +56,11 @@ void CLowess::fit(TDoubleDoublePrVec data, std::size_t numberFolds) { // // We determine k by solving // - // k^* = argmin_k{ sum_{Yi in H}{ L(Yi | f(x | p^*)) } } + // k^* = argmin_k{ sum_{Yi in H}{ L(Yi | f(x | p^*(k))) } } // - // where H is a hold out set and we assume Y_i ~ N(poly(X_i | p^*), sigma) with - // sigma estimated from the training data prediction residuals. + // where H is a hold out set and we assume Y_i ~ N(poly(X_i | p^*(k)), sigma) + // with sigma estimated from the training data prediction residuals to compute + // the likelihood function L(Yi | f(x | p^*(k))). 
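+    //
+    // In practice, for each fold we fit a normal distribution to the training
+    // data's leave-one-out prediction residuals and score the held out points
+    // by their joint log marginal likelihood under that distribution (see the
+    // likelihood member function below).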
m_Mask.resize(m_Data.size()); std::iota(m_Mask.begin(), m_Mask.end(), 0); From 5b1a0183573b3c1663f0ea9a2ffe9fd5a32f42dd Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Thu, 8 Jul 2021 15:05:35 +0100 Subject: [PATCH 21/35] Document test interface --- include/maths/CLowess.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/maths/CLowess.h b/include/maths/CLowess.h index 598e8d46f9..b79b1a3c11 100644 --- a/include/maths/CLowess.h +++ b/include/maths/CLowess.h @@ -47,6 +47,8 @@ class CLowess { //! \note Defined as (0,0) if no data have been fit. TDoubleDoublePr minimum() const; + //! \name Test Functions + //@{ //! Get an estimate of residual variance at the observed values. //! //! \note Defined as zero if no data have been fit. @@ -55,6 +57,7 @@ class CLowess { //! Get how far we are prepared to extrapolate as the interval we will search //! in the minimum and sublevelSet functions. TDoubleDoublePr extrapolationInterval() const; + //@} private: using TDoubleVec = std::vector; From 93d3264aeef8aefd8ec79fc28161b7089d106a4d Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Thu, 8 Jul 2021 16:04:15 +0100 Subject: [PATCH 22/35] Names, explanation and coding style guideline fixes --- include/maths/CLowessDetail.h | 45 ++++++++++++++++++++++------------- 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/include/maths/CLowessDetail.h b/include/maths/CLowessDetail.h index d10b083f46..80d914126a 100644 --- a/include/maths/CLowessDetail.h +++ b/include/maths/CLowessDetail.h @@ -50,7 +50,7 @@ void CLowess::fit(TDoubleDoublePrVec data, std::size_t numberFolds) { // p^*(x) = argmin_p{ sum_i{ w_i (Y_i - poly(X_i | p))^2 } } (2) // // where w = exp(-k (x - X_i)), (X, Y) are the data to fit and p is the vector - // of parameters for the polynomial function poly(. | p), i.e. the coefficients + // of parameters for the polynomial function poly(. | p), i.e. the coefficients // p_0 + p_1 x + p_2 x^2 ... (which are determined by minimizing the weighted // least square prediction errors as in (2)). // @@ -119,17 +119,19 @@ typename CLowess::TDoubleDoublePr CLowess::minimum() const { TDoubleVec X; - double xa, xb; + double xa; + double xb; std::tie(xa, xb) = this->extrapolationInterval(); // Coarse. X.reserve(m_Data.size() + 2); X.push_back(xa); - for (std::size_t i = 0; i < m_Data.size(); ++i) { - X.push_back(m_Data[i].first); + for (const auto& xi : m_Data) { + X.push_back(xi.first); } X.push_back(xb); - double xmin, fmin; + double xmin; + double fmin; CSolvers::globalMinimize(X, [&](double x) { return this->predict(x); }, xmin, fmin); // Refine. 
@@ -141,7 +143,8 @@ typename CLowess::TDoubleDoublePr CLowess::minimum() const { for (double x = xa; x < xb; x += dx) { X.push_back(x); } - double xcand, fcand; + double xcand; + double fcand; CSolvers::globalMinimize(X, [&](double x) { return this->predict(x); }, xcand, fcand); if (fcand < fmin) { @@ -166,11 +169,12 @@ double CLowess::residualVariance() const { TSizeVec mask(n); std::iota(mask.begin(), mask.end(), 1); for (std::size_t i = 0; i < n; ++i) { - double xi, yi; + double xi; + double yi; std::tie(xi, yi) = m_Data[i]; auto poly = this->fit(mask.begin(), mask.begin() + n - 1, m_K, xi); moments.add(yi - poly.predict(xi)); - mask[i] = i; + mask[i] -= 1; } return CBasicStatistics::variance(moments); @@ -249,7 +253,7 @@ double CLowess::likelihood(TSizeVecVec& trainingMasks, TSizeVecVec& testingMa double result{0.0}; - CNormalMeanPrecConjugate::TDouble1Vec samples; + CNormalMeanPrecConjugate::TDouble1Vec testResiduals; CNormalMeanPrecConjugate::TDoubleWeightsAry1Vec weights; for (std::size_t i = 0; i < trainingMasks.size(); ++i) { @@ -260,8 +264,13 @@ double CLowess::likelihood(TSizeVecVec& trainingMasks, TSizeVecVec& testingMa std::size_t last{trainingMasks[i].size() - 1}; for (auto& j : trainingMasks[i]) { - double xj, yj; + double xj; + double yj; std::tie(xj, yj) = m_Data[j]; + // Here we wish to leave out the j'th fold training mask. Since this + // is a vector we do this efficiently by temporarily swaping to the + // back of the collection so we can pass the masks as a contiguous + // range. std::swap(j, trainingMasks[i][last]); auto poly = this->fit(trainingMasks[i].cbegin(), trainingMasks[i].cbegin() + last, k, xj); @@ -270,20 +279,21 @@ double CLowess::likelihood(TSizeVecVec& trainingMasks, TSizeVecVec& testingMa } LOG_TRACE(<< "residual distribution = " << residuals.print()); - samples.clear(); - samples.reserve(testingMasks[i].size()); + testResiduals.clear(); + testResiduals.reserve(testingMasks[i].size()); for (auto j : testingMasks[i]) { - double xj, yj; + double xj; + double yj; std::tie(xj, yj) = m_Data[j]; auto poly = this->fit(trainingMasks[i].cbegin(), trainingMasks[i].cend(), k, xj); - samples.push_back(yj - poly.predict(xj)); + testResiduals.push_back(yj - poly.predict(xj)); } weights.assign(testingMasks[i].size(), maths_t::CUnitWeights::UNIT); - LOG_TRACE(<< "samples = " << samples); + LOG_TRACE(<< "test residuals = " << testResiduals); double likelihood; - residuals.jointLogMarginalLikelihood(samples, weights, likelihood); + residuals.jointLogMarginalLikelihood(testResiduals, weights, likelihood); result += likelihood; } LOG_TRACE(<< "k = " << k << ", likelihood = " << result); @@ -296,7 +306,8 @@ typename CLowess::TPolynomial CLowess::fit(TSizeVecCItr beginMask, TSizeVecCItr endMask, double k, double x) const { TPolynomial poly; for (auto i = beginMask; i != endMask; ++i) { - double xi, yi; + double xi; + double yi; std::tie(xi, yi) = m_Data[*i]; poly.add(xi, yi, this->weight(k, xi, x)); } From ae45379852e4c1642efdcddc08b322fe222494e8 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Thu, 8 Jul 2021 16:14:47 +0100 Subject: [PATCH 23/35] Explicit capture --- include/maths/CSolvers.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/maths/CSolvers.h b/include/maths/CSolvers.h index 8d55b162dd..ebe8efbb1d 100644 --- a/include/maths/CSolvers.h +++ b/include/maths/CSolvers.h @@ -859,7 +859,7 @@ class MATHS_EXPORT CSolvers { //! \param[out] fx Set to the value of f at \p x. 
template static bool globalMaximize(const T& p, const F& f, double& x, double& fx) { - auto minusF = [&](double x_) { return -f(x_); }; + auto minusF = [&f](double x_) { return -f(x_); }; bool result{globalMinimize(p, minusF, x, fx)}; fx = -fx; return result; From efdadc0ffdda2d7604ecec687c86b988d341dfba Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Thu, 8 Jul 2021 16:17:12 +0100 Subject: [PATCH 24/35] Typo --- lib/maths/CBoostedTreeFactory.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/maths/CBoostedTreeFactory.cc b/lib/maths/CBoostedTreeFactory.cc index b52706c6b2..2ff19ccc80 100644 --- a/lib/maths/CBoostedTreeFactory.cc +++ b/lib/maths/CBoostedTreeFactory.cc @@ -332,8 +332,8 @@ void CBoostedTreeFactory::initializeNumberFolds(core::CDataFrame& frame) const { // // In addition, we want to constrain the maximum amount of training data we'll // use during hyperparameter search to avoid very long run times. To do this - // we use less than the implied 1 - 1/k : 1/k train : test split when it results - // in more train rows than the defined maximum. + // we use less than the implied 1 - 1/k : 1/k for the train : test split when + // it results in more train rows than the defined maximum. double initialDownsampleFraction{(m_InitialDownsampleRowsPerFeature * static_cast(frame.numberColumns() - 1)) / From 59c9addbe294616de9aefaaa53264a667f894e56 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Thu, 8 Jul 2021 16:23:53 +0100 Subject: [PATCH 25/35] Capture by reference --- include/maths/CSolvers.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/maths/CSolvers.h b/include/maths/CSolvers.h index ebe8efbb1d..4fd45acaf2 100644 --- a/include/maths/CSolvers.h +++ b/include/maths/CSolvers.h @@ -909,7 +909,8 @@ class MATHS_EXPORT CSolvers { std::swap(fa, fb); } - double x, fx; + double x; + double fx; { std::size_t n = maxIterations; minimize(a, b, fa, fb, f, 0.0, n, fc, x, fx); @@ -922,7 +923,7 @@ class MATHS_EXPORT CSolvers { // [a, x] and [b, r] bracket the sublevel set end points. 
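+        // Capture f by reference: the lambda only needs to call it, so there is
+        // no reason to copy the function object, which could be expensive for a
+        // stateful functor; fc is a double and cheap to capture by value.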
- auto fMinusFc = [=](double x_) { return f(x_) - fc; }; + auto fMinusFc = [&f, fc](double x_) { return f(x_) - fc; }; LOG_TRACE(<< "a = " << a << ", x = " << x << ", b = " << b); LOG_TRACE(<< "f_(a) = " << fa - fc << ", f_(x) = " << fx - fc From 74c27f924e91559d1edc0db7e809b61f181b2b98 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Thu, 8 Jul 2021 16:25:58 +0100 Subject: [PATCH 26/35] Rename --- lib/api/CDataFrameTrainBoostedTreeRunner.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/api/CDataFrameTrainBoostedTreeRunner.cc b/lib/api/CDataFrameTrainBoostedTreeRunner.cc index b10932a5b2..0d6a62cac8 100644 --- a/lib/api/CDataFrameTrainBoostedTreeRunner.cc +++ b/lib/api/CDataFrameTrainBoostedTreeRunner.cc @@ -32,7 +32,7 @@ namespace ml { namespace api { namespace { -const std::size_t UNSET_NUMBER_ROUNDS_PER_HYPERPARAMETER{ +const std::size_t NUMBER_ROUNDS_PER_HYPERPARAMETER_IS_UNSET{ std::numeric_limits::max()}; } @@ -106,7 +106,7 @@ CDataFrameTrainBoostedTreeRunner::CDataFrameTrainBoostedTreeRunner( double trainFractionPerFold{parameters[TRAIN_FRACTION_PER_FOLD].fallback(-1.0)}; std::size_t numberRoundsPerHyperparameter{ parameters[MAX_OPTIMIZATION_ROUNDS_PER_HYPERPARAMETER].fallback( - UNSET_NUMBER_ROUNDS_PER_HYPERPARAMETER)}; + NUMBER_ROUNDS_PER_HYPERPARAMETER_IS_UNSET)}; std::size_t bayesianOptimisationRestarts{ parameters[BAYESIAN_OPTIMISATION_RESTARTS].fallback(std::size_t{0})}; bool stopCrossValidationEarly{parameters[STOP_CROSS_VALIDATION_EARLY].fallback(true)}; @@ -205,7 +205,7 @@ CDataFrameTrainBoostedTreeRunner::CDataFrameTrainBoostedTreeRunner( if (trainFractionPerFold > 0.0) { m_BoostedTreeFactory->trainFractionPerFold(trainFractionPerFold); } - if (numberRoundsPerHyperparameter != UNSET_NUMBER_ROUNDS_PER_HYPERPARAMETER) { + if (numberRoundsPerHyperparameter != NUMBER_ROUNDS_PER_HYPERPARAMETER_IS_UNSET) { m_BoostedTreeFactory->maximumOptimisationRoundsPerHyperparameter(numberRoundsPerHyperparameter); } if (bayesianOptimisationRestarts > 0) { From ca1d910ec35041c75e9340fd37675a0b290fef81 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Thu, 8 Jul 2021 16:37:56 +0100 Subject: [PATCH 27/35] Update comment to reflect the current behaviour --- include/maths/CBoostedTreeFactory.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/include/maths/CBoostedTreeFactory.h b/include/maths/CBoostedTreeFactory.h index 4c18a61354..ec19a654d3 100644 --- a/include/maths/CBoostedTreeFactory.h +++ b/include/maths/CBoostedTreeFactory.h @@ -209,9 +209,10 @@ class MATHS_EXPORT CBoostedTreeFactory final { TDoubleDoublePrVec estimateTreeGainAndCurvature(core::CDataFrame& frame, const TDoubleVec& percentiles) const; - //! Perform a line search for the test loss w.r.t. a single regularization - //! hyperparameter and apply Newton's method to find the minimum. The plan - //! is to find a value near where the model starts to overfit. + //! Perform a line search for the test loss w.r.t. a single hyperparameter. + //! At the end we use a smooth curve fit through all test loss values (using + //! LOWESS regression) and use this to get a best estimate of where the true + //! minimum occurs. //! //! \return The interval to search during the main hyperparameter optimisation //! loop or null if this couldn't be found. 
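A minimal sketch of the post-processing described in the comment above, using the CLowess<2> interface added earlier in this series: the (hyperparameter value, test loss) pairs collected by the line search are smoothed and the location of the minimum of the smoothed curve is used as the best estimate of the true minimum. The helper name smoothedTestLossMinimum and the choice of one cross-validation fold per point are illustrative assumptions, not the exact wiring inside CBoostedTreeFactory.

#include <maths/CLowess.h>
#include <maths/CLowessDetail.h>

#include <cstddef>
#include <utility>

using TLowess = ml::maths::CLowess<2>;

// Smooth the (hyperparameter value, test loss) pairs from a line search and
// return the (argmin, value) of the smoothed curve; (0, 0) if fewer than four
// points were supplied.
TLowess::TDoubleDoublePr smoothedTestLossMinimum(TLowess::TDoubleDoublePrVec testLosses) {
    std::size_t numberFolds{testLosses.size()};
    TLowess lowess;
    // One fold per point, as the training loss curve unit test does.
    lowess.fit(std::move(testLosses), numberFolds);
    return lowess.minimum();
}
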
From 40eae57f73d43f8845f49c4c784ebb2ebe90af24 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Thu, 8 Jul 2021 17:31:44 +0100 Subject: [PATCH 28/35] Name variable for readability --- lib/maths/unittest/CLowessTest.cc | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/lib/maths/unittest/CLowessTest.cc b/lib/maths/unittest/CLowessTest.cc index 61c4bec0fc..f59227aee4 100644 --- a/lib/maths/unittest/CLowessTest.cc +++ b/lib/maths/unittest/CLowessTest.cc @@ -40,6 +40,8 @@ BOOST_AUTO_TEST_CASE(testInvariants) { test::CRandomNumbers rng; + std::size_t numberFolds{5}; + TDoubleVec scale; TDoubleVec offset; TDoubleVec noise; @@ -69,12 +71,14 @@ BOOST_AUTO_TEST_CASE(testInvariants) { } maths::CLowess<2> lowess; - lowess.fit(data, 5); + lowess.fit(data, numberFolds); - double xea, xeb; + double xea; + double xeb; std::tie(xea, xeb) = lowess.extrapolationInterval(); - double xmin, fmin; + double xmin; + double fmin; std::tie(xmin, fmin) = lowess.minimum(); BOOST_REQUIRE_EQUAL(fmin, lowess.predict(xmin)); BOOST_TEST_REQUIRE(fmin <= lowess.predict(std::max(xmin - 0.1, xea))); @@ -96,6 +100,8 @@ BOOST_AUTO_TEST_CASE(testSmooth) { test::CRandomNumbers rng; + std::size_t numberFolds{5}; + auto trend = [](double x) { return 8.0 * std::sin(boost::math::double_constants::two_pi / 20.0 * x); }; @@ -107,7 +113,7 @@ BOOST_AUTO_TEST_CASE(testSmooth) { } maths::CLowess<2> lowess; - lowess.fit(data, 5); + lowess.fit(data, numberFolds); TMeanVarAccumulator errorMoments; for (std::size_t i = 0; i < 20; ++i) { @@ -125,6 +131,8 @@ BOOST_AUTO_TEST_CASE(testSmoothPlusNoise) { test::CRandomNumbers rng; + std::size_t numberFolds{5}; + TDoubleVec noise; rng.generateNormalSamples(0.0, 4.0, 20, noise); @@ -139,7 +147,7 @@ BOOST_AUTO_TEST_CASE(testSmoothPlusNoise) { } maths::CLowess<2> lowess; - lowess.fit(data, 5); + lowess.fit(data, numberFolds); TMeanVarAccumulator errorMoments; for (std::size_t i = 0; i < 20; ++i) { @@ -159,6 +167,8 @@ BOOST_AUTO_TEST_CASE(testMinimum) { test::CRandomNumbers rng; + std::size_t numberFolds{5}; + auto trend = [](double x) { return 8.0 * std::sin(boost::math::double_constants::two_pi / 20.0 * x); }; @@ -170,9 +180,10 @@ BOOST_AUTO_TEST_CASE(testMinimum) { } maths::CLowess<2> lowess; - lowess.fit(data, 5); + lowess.fit(data, numberFolds); - double x, fx; + double x; + double fx; std::tie(x, fx) = lowess.minimum(); // Expect minimum at ((3 / 2) * pi) / (2 pi / 20) = 15 and a value of around -8.0; From 92de10f0bf5efc0473f325dc080d3b8dc02fadc8 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Thu, 8 Jul 2021 17:33:23 +0100 Subject: [PATCH 29/35] Typedef --- lib/maths/unittest/CLowessTest.cc | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/lib/maths/unittest/CLowessTest.cc b/lib/maths/unittest/CLowessTest.cc index f59227aee4..57a2a4acbd 100644 --- a/lib/maths/unittest/CLowessTest.cc +++ b/lib/maths/unittest/CLowessTest.cc @@ -24,6 +24,7 @@ using namespace ml; using TDoubleVec = std::vector; using TDoubleVecVec = std::vector; +using TDoubleDoublePrVec = maths::CLowess<2>::TDoubleDoublePrVec; using TMeanVarAccumulator = maths::CBasicStatistics::SSampleMeanVar::TAccumulator; BOOST_AUTO_TEST_CASE(testInvariants) { @@ -45,7 +46,7 @@ BOOST_AUTO_TEST_CASE(testInvariants) { TDoubleVec scale; TDoubleVec offset; TDoubleVec noise; - maths::CLowess<2>::TDoubleDoublePrVec data; + TDoubleDoublePrVec data; std::function trends[]{ [&](double x) { @@ -106,7 +107,7 @@ BOOST_AUTO_TEST_CASE(testSmooth) { return 8.0 * 
std::sin(boost::math::double_constants::two_pi / 20.0 * x); }; - maths::CLowess<2>::TDoubleDoublePrVec data; + TDoubleDoublePrVec data; for (std::size_t i = 0; i < 20; ++i) { double x{static_cast(i)}; data.emplace_back(x, trend(x)); @@ -140,7 +141,7 @@ BOOST_AUTO_TEST_CASE(testSmoothPlusNoise) { return 8.0 * std::sin(boost::math::double_constants::two_pi / 20.0 * x); }; - maths::CLowess<2>::TDoubleDoublePrVec data; + TDoubleDoublePrVec data; for (std::size_t i = 0; i < noise.size(); ++i) { double x{static_cast(i)}; data.emplace_back(x, trend(x) + noise[i]); @@ -173,7 +174,7 @@ BOOST_AUTO_TEST_CASE(testMinimum) { return 8.0 * std::sin(boost::math::double_constants::two_pi / 20.0 * x); }; - maths::CLowess<2>::TDoubleDoublePrVec data; + TDoubleDoublePrVec data; for (std::size_t i = 0; i < 20; ++i) { double x{static_cast(i)}; data.emplace_back(x, trend(x)); @@ -202,7 +203,7 @@ BOOST_AUTO_TEST_CASE(testTrainingLossCurves) { // 3. Boston using TMeanAccumulator = maths::CBasicStatistics::SSampleMean::TAccumulator; - using TDoubleDoublePrVecVec = std::vector::TDoubleDoublePrVec>; + using TDoubleDoublePrVecVec = std::vector; // clang-format off TDoubleDoublePrVecVec curves{ From d0be22f7f510fc797548d0f994b0ce62e5d3bedc Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Thu, 8 Jul 2021 17:41:09 +0100 Subject: [PATCH 30/35] Define small constant used to prefer fast training if test error is similar --- lib/maths/CBoostedTreeFactory.cc | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/lib/maths/CBoostedTreeFactory.cc b/lib/maths/CBoostedTreeFactory.cc index 2ff19ccc80..5b6822e47a 100644 --- a/lib/maths/CBoostedTreeFactory.cc +++ b/lib/maths/CBoostedTreeFactory.cc @@ -39,6 +39,7 @@ const std::size_t BEST_PARAMETER_INDEX{1}; const std::size_t MAX_PARAMETER_INDEX{2}; const std::size_t MAX_LINE_SEARCH_ITERATIONS{10}; const double LINE_SEARCH_MINIMUM_RELATIVE_EI_TO_CONTINUE{0.01}; +const double SMALL_RELATIVE_TEST_LOSS_INCREASE{0.01}; const double MIN_ROWS_PER_FEATURE{20.0}; const double MIN_SOFT_DEPTH_LIMIT{2.0}; const double MIN_SOFT_DEPTH_LIMIT_TOLERANCE{0.05}; @@ -801,7 +802,8 @@ void CBoostedTreeFactory::initializeUnsetDownsampleFactor(core::CDataFrame& fram double minTestLoss, double testLoss) { return testLoss + CTools::linearlyInterpolate( logMinDownsampleFactor, logMaxDownsampleFactor, - 0.0, 0.01 * minTestLoss, logDownsampleFactor); + 0.0, SMALL_RELATIVE_TEST_LOSS_INCREASE * minTestLoss, + logDownsampleFactor); }; TVector fallback; @@ -865,10 +867,10 @@ void CBoostedTreeFactory::initializeUnsetFeatureBagFraction(core::CDataFrame& fr // larger than the minimum. 
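+    // For example, at the maximum feature bag fraction the penalty is
+    // SMALL_RELATIVE_TEST_LOSS_INCREASE * minTestLoss, i.e. 1% of the minimum
+    // test loss, so a larger fraction is only chosen if it improves the test
+    // loss by more than roughly this amount.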
auto adjustTestLoss = [=](double logFeatureBagFraction, double minTestLoss, double testLoss) { - return testLoss + - CTools::linearlyInterpolate( - logMinFeatureBagFraction, logMaxFeatureBagFraction, - 0.0, 0.01 * minTestLoss, logFeatureBagFraction); + return testLoss + CTools::linearlyInterpolate( + logMinFeatureBagFraction, logMaxFeatureBagFraction, + 0.0, SMALL_RELATIVE_TEST_LOSS_INCREASE * minTestLoss, + logFeatureBagFraction); }; TVector fallback; From a380b204ca8d3ebf6d5c12dbd472ecd608c9ae38 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Fri, 9 Jul 2021 09:49:48 +0100 Subject: [PATCH 31/35] We should record the fraction and number of training rows in the model meta data --- include/api/CInferenceModelMetadata.h | 15 ++++++++--- include/maths/CBoostedTree.h | 8 +++++- include/maths/CBoostedTreeFactory.h | 18 ++++++------- include/maths/CBoostedTreeImpl.h | 12 ++++++--- ...ataFrameAnalysisInstrumentationInterface.h | 2 +- include/maths/CDataFramePredictiveModel.h | 6 +++++ include/maths/CLowess.h | 2 +- lib/api/CDataFrameAnalysisInstrumentation.cc | 9 ++++--- ...taFrameTrainBoostedTreeClassifierRunner.cc | 6 +++-- ...taFrameTrainBoostedTreeRegressionRunner.cc | 5 ++-- lib/api/CInferenceModelMetadata.cc | 25 +++++++++++++++++++ lib/maths/CBoostedTree.cc | 8 ++++++ lib/maths/CBoostedTreeImpl.cc | 21 +++++++++++++--- 13 files changed, 107 insertions(+), 30 deletions(-) diff --git a/include/api/CInferenceModelMetadata.h b/include/api/CInferenceModelMetadata.h index 099a374096..d2f124431c 100644 --- a/include/api/CInferenceModelMetadata.h +++ b/include/api/CInferenceModelMetadata.h @@ -40,8 +40,10 @@ class API_EXPORT CInferenceModelMetadata { static const std::string JSON_MEAN_MAGNITUDE_TAG; static const std::string JSON_MIN_TAG; static const std::string JSON_MODEL_METADATA_TAG; + static const std::string JSON_NUM_TRAINING_ROWS_TAG; static const std::string JSON_RELATIVE_IMPORTANCE_TAG; static const std::string JSON_TOTAL_FEATURE_IMPORTANCE_TAG; + static const std::string JSON_TRAIN_PARAMETERS_TAG; public: using TVector = maths::CDenseVector; @@ -64,6 +66,10 @@ class API_EXPORT CInferenceModelMetadata { //! to the baseline value). void featureImportanceBaseline(TVector&& baseline); void hyperparameterImportance(const maths::CBoostedTree::THyperparameterImportanceVec& hyperparameterImportance); + //! Set the number of rows used to train the model. + void numberTrainingRows(std::size_t numberRows); + //! Set the fraction of data per fold used for training when tuning hyperparameters. 
+ void trainFractionPerFold(double fraction); private: struct SHyperparameterImportance { @@ -86,8 +92,9 @@ class API_EXPORT CInferenceModelMetadata { private: void writeTotalFeatureImportance(TRapidJsonWriter& writer) const; - void writeHyperparameterImportance(TRapidJsonWriter& writer) const; void writeFeatureImportanceBaseline(TRapidJsonWriter& writer) const; + void writeHyperparameterImportance(TRapidJsonWriter& writer) const; + void writeTrainParameters(TRapidJsonWriter& writer) const; private: TSizeMeanAccumulatorUMap m_TotalShapValuesMean; @@ -95,11 +102,13 @@ class API_EXPORT CInferenceModelMetadata { TOptionalVector m_ShapBaseline; TStrVec m_ColumnNames; TStrVec m_ClassValues; - TPredictionFieldTypeResolverWriter m_PredictionFieldTypeResolverWriter = + TPredictionFieldTypeResolverWriter m_PredictionFieldTypeResolverWriter{ [](const std::string& value, TRapidJsonWriter& writer) { writer.String(value); - }; + }}; THyperparametersVec m_HyperparameterImportance; + std::size_t m_NumberTrainingRows{0}; + double m_TrainFractionPerFold{0.0}; }; } } diff --git a/include/maths/CBoostedTree.h b/include/maths/CBoostedTree.h index 028c92cbde..a1a3898f4c 100644 --- a/include/maths/CBoostedTree.h +++ b/include/maths/CBoostedTree.h @@ -208,7 +208,7 @@ class MATHS_EXPORT CBoostedTree final : public CDataFramePredictiveModel { class MATHS_EXPORT CVisitor : public CDataFrameCategoryEncoder::CVisitor, public CBoostedTreeNode::CVisitor { public: - virtual ~CVisitor() = default; + ~CVisitor() override = default; virtual void addTree() = 0; virtual void addClassificationWeights(TDoubleVec weights) = 0; virtual void addLossFunction(const TLossFunction& lossFunction) = 0; @@ -236,6 +236,12 @@ class MATHS_EXPORT CBoostedTree final : public CDataFramePredictiveModel { //! Get the vector of hyperparameter importances. THyperparameterImportanceVec hyperparameterImportance() const; + //! Get the number of rows used to train the model. + std::size_t numberTrainingRows() const override; + + //! Get the fraction of data per fold used for training when tuning hyperparameters. + double trainFractionPerFold() const override; + //! Get the column containing the dependent variable. 
std::size_t columnHoldingDependentVariable() const override; diff --git a/include/maths/CBoostedTreeFactory.h b/include/maths/CBoostedTreeFactory.h index ec19a654d3..537c6eb933 100644 --- a/include/maths/CBoostedTreeFactory.h +++ b/include/maths/CBoostedTreeFactory.h @@ -278,14 +278,14 @@ class MATHS_EXPORT CBoostedTreeFactory final { private: TOptionalDouble m_MinimumFrequencyToOneHotEncode; TOptionalSize m_BayesianOptimisationRestarts; - bool m_StratifyRegressionCrossValidation = true; - double m_InitialDownsampleRowsPerFeature = 200.0; - std::size_t m_MaximumNumberOfTrainRows = 500000; - double m_GainPerNode1stPercentile = 0.0; - double m_GainPerNode50thPercentile = 0.0; - double m_GainPerNode90thPercentile = 0.0; - double m_TotalCurvaturePerNode1stPercentile = 0.0; - double m_TotalCurvaturePerNode90thPercentile = 0.0; + bool m_StratifyRegressionCrossValidation{true}; + double m_InitialDownsampleRowsPerFeature{200.0}; + std::size_t m_MaximumNumberOfTrainRows{500000}; + double m_GainPerNode1stPercentile{0.0}; + double m_GainPerNode50thPercentile{0.0}; + double m_GainPerNode90thPercentile{0.0}; + double m_TotalCurvaturePerNode1stPercentile{0.0}; + double m_TotalCurvaturePerNode90thPercentile{0.0}; std::size_t m_NumberThreads; TBoostedTreeImplUPtr m_TreeImpl; TVector m_LogDownsampleFactorSearchInterval; @@ -295,7 +295,7 @@ class MATHS_EXPORT CBoostedTreeFactory final { TVector m_LogLeafWeightPenaltyMultiplierSearchInterval; TVector m_SoftDepthLimitSearchInterval; TVector m_LogEtaSearchInterval; - TTrainingStateCallback m_RecordTrainingState = noopRecordTrainingState; + TTrainingStateCallback m_RecordTrainingState{noopRecordTrainingState}; }; } } diff --git a/include/maths/CBoostedTreeImpl.h b/include/maths/CBoostedTreeImpl.h index e102529607..b76c996b0d 100644 --- a/include/maths/CBoostedTreeImpl.h +++ b/include/maths/CBoostedTreeImpl.h @@ -150,6 +150,13 @@ class MATHS_EXPORT CBoostedTreeImpl final { //! \return The best hyperparameters for validation error found so far. const CBoostedTreeHyperparameters& bestHyperparameters() const; + //! \return The fraction of data we use for train per fold when tuning hyperparameters. + double trainFractionPerFold() const; + + //! \return The full training set data mask, i.e. all rows which aren't missing + //! the dependent variable. + core::CPackedBitVector allTrainingRowsMask() const; + //!\ name Test Only //@{ //! The name of the object holding the best hyperaparameters in the state document. @@ -203,9 +210,8 @@ class MATHS_EXPORT CBoostedTreeImpl final { //! Check if we can train a model. bool canTrain() const; - //! Get the full training set data mask, i.e. all rows which aren't missing - //! the dependent variable. - core::CPackedBitVector allTrainingRowsMask() const; + //! Get the mean number of training examples which are used in each fold. + double meanNumberTrainingRowsPerFold() const; //! Compute the \p percentile percentile gain per split and the sum of row //! curvatures per internal node of \p forest. 
diff --git a/include/maths/CDataFrameAnalysisInstrumentationInterface.h b/include/maths/CDataFrameAnalysisInstrumentationInterface.h index f6b35916b0..bd2685d24a 100644 --- a/include/maths/CDataFrameAnalysisInstrumentationInterface.h +++ b/include/maths/CDataFrameAnalysisInstrumentationInterface.h @@ -116,7 +116,7 @@ class MATHS_EXPORT CDataFrameTrainBoostedTreeInstrumentationInterface SRegularization s_Regularization; double s_DownsampleFactor{-1.0}; std::size_t s_NumFolds{0}; - double s_TrainFractionPerFold{0.0}; + double s_NumTrainingRows{0}; std::size_t s_MaxTrees{0}; double s_FeatureBagFraction{-1.0}; double s_EtaGrowthRatePerTree{-1.0}; diff --git a/include/maths/CDataFramePredictiveModel.h b/include/maths/CDataFramePredictiveModel.h index 57ac26cb31..df22fe6d0a 100644 --- a/include/maths/CDataFramePredictiveModel.h +++ b/include/maths/CDataFramePredictiveModel.h @@ -61,6 +61,12 @@ class MATHS_EXPORT CDataFramePredictiveModel { //! \warning Will return a nullptr if a trained model isn't available. virtual CTreeShapFeatureImportance* shap() const = 0; + //! Get the number of rows used to train the model. + virtual std::size_t numberTrainingRows() const = 0; + + //! Get the fraction of data per fold used for training when tuning hyperparameters. + virtual double trainFractionPerFold() const = 0; + //! Get the column containing the dependent variable. virtual std::size_t columnHoldingDependentVariable() const = 0; diff --git a/include/maths/CLowess.h b/include/maths/CLowess.h index b79b1a3c11..07cf5de50c 100644 --- a/include/maths/CLowess.h +++ b/include/maths/CLowess.h @@ -47,7 +47,7 @@ class CLowess { //! \note Defined as (0,0) if no data have been fit. TDoubleDoublePr minimum() const; - //! \name Test Functions + //! \name Test Only //@{ //! Get an estimate of residual variance at the observed values. //! diff --git a/lib/api/CDataFrameAnalysisInstrumentation.cc b/lib/api/CDataFrameAnalysisInstrumentation.cc index 5d059cea08..9beb495ee3 100644 --- a/lib/api/CDataFrameAnalysisInstrumentation.cc +++ b/lib/api/CDataFrameAnalysisInstrumentation.cc @@ -483,10 +483,11 @@ void CDataFrameTrainBoostedTreeInstrumentation::writeHyperparameters(rapidjson:: rapidjson::Value(static_cast(this->m_Hyperparameters.s_NumFolds)) .Move(), parentObject); - writer->addMember( - CDataFrameTrainBoostedTreeRunner::TRAIN_FRACTION_PER_FOLD, - rapidjson::Value(this->m_Hyperparameters.s_TrainFractionPerFold).Move(), - parentObject); + // TODO enable with Java changes. 
+ //writer->addMember( + // CDataFrameTrainBoostedTreeRunner::TRAIN_FRACTION_PER_FOLD, + // rapidjson::Value(this->m_Hyperparameters.s_TrainFractionPerFold).Move(), + // parentObject); writer->addMember( CDataFrameTrainBoostedTreeRunner::MAX_TREES, rapidjson::Value(static_cast(this->m_Hyperparameters.s_MaxTrees)) diff --git a/lib/api/CDataFrameTrainBoostedTreeClassifierRunner.cc b/lib/api/CDataFrameTrainBoostedTreeClassifierRunner.cc index bdf46cb515..8831ab5d7a 100644 --- a/lib/api/CDataFrameTrainBoostedTreeClassifierRunner.cc +++ b/lib/api/CDataFrameTrainBoostedTreeClassifierRunner.cc @@ -322,12 +322,14 @@ CDataFrameTrainBoostedTreeClassifierRunner::inferenceModelDefinition( CDataFrameAnalysisRunner::TOptionalInferenceModelMetadata CDataFrameTrainBoostedTreeClassifierRunner::inferenceModelMetadata() const { - const auto& featureImportance = this->boostedTree().shap(); - if (featureImportance) { + auto* featureImportance = this->boostedTree().shap(); + if (featureImportance != nullptr) { m_InferenceModelMetadata.featureImportanceBaseline(featureImportance->baseline()); } m_InferenceModelMetadata.hyperparameterImportance( this->boostedTree().hyperparameterImportance()); + m_InferenceModelMetadata.numberTrainingRows(this->boostedTree().numberTrainingRows()); + m_InferenceModelMetadata.trainFractionPerFold(this->boostedTree().trainFractionPerFold()); return m_InferenceModelMetadata; } diff --git a/lib/api/CDataFrameTrainBoostedTreeRegressionRunner.cc b/lib/api/CDataFrameTrainBoostedTreeRegressionRunner.cc index 04613276b5..37e563bddf 100644 --- a/lib/api/CDataFrameTrainBoostedTreeRegressionRunner.cc +++ b/lib/api/CDataFrameTrainBoostedTreeRegressionRunner.cc @@ -156,12 +156,13 @@ CDataFrameTrainBoostedTreeRegressionRunner::inferenceModelDefinition( CDataFrameAnalysisRunner::TOptionalInferenceModelMetadata CDataFrameTrainBoostedTreeRegressionRunner::inferenceModelMetadata() const { - const auto& featureImportance = this->boostedTree().shap(); - if (featureImportance) { + auto* featureImportance = this->boostedTree().shap(); + if (featureImportance != nullptr) { m_InferenceModelMetadata.featureImportanceBaseline(featureImportance->baseline()); } m_InferenceModelMetadata.hyperparameterImportance( this->boostedTree().hyperparameterImportance()); + m_InferenceModelMetadata.trainFractionPerFold(this->boostedTree().trainFractionPerFold()); return m_InferenceModelMetadata; } diff --git a/lib/api/CInferenceModelMetadata.cc b/lib/api/CInferenceModelMetadata.cc index c7a8af2a50..2d0948445e 100644 --- a/lib/api/CInferenceModelMetadata.cc +++ b/lib/api/CInferenceModelMetadata.cc @@ -19,6 +19,7 @@ void CInferenceModelMetadata::write(TRapidJsonWriter& writer) const { this->writeTotalFeatureImportance(writer); this->writeFeatureImportanceBaseline(writer); this->writeHyperparameterImportance(writer); + this->writeTrainParameters(writer); } void CInferenceModelMetadata::writeTotalFeatureImportance(TRapidJsonWriter& writer) const { @@ -171,6 +172,20 @@ void CInferenceModelMetadata::writeHyperparameterImportance(TRapidJsonWriter& wr writer.EndArray(); } +void CInferenceModelMetadata::writeTrainParameters(TRapidJsonWriter& writer) const { + // TODO enable with Java changes. + // Only write out if it has been set. 
+ //if (m_TrainingFractionPerFold > 0.0) { + // writer.Key(JSON_TRAIN_PARAMETERS_TAG); + // writer.StartObject(); + // writer.Key(JSON_NUM_TRAINING_ROWS_TAG); + // writer.Uint64(m_NumberRowsUsedForTrain); + // writer.Key(CDataFrameTrainBoostedTreeRunner::TRAIN_FRACTION_PER_FOLD); + // writer.Double(m_TrainingFractionPerFold); + // writer.EndObject(); + //} +} + const std::string& CInferenceModelMetadata::typeString() { return JSON_MODEL_METADATA_TAG; } @@ -260,6 +275,14 @@ void CInferenceModelMetadata::hyperparameterImportance( }); } +void CInferenceModelMetadata::numberTrainingRows(std::size_t numberRows) { + m_NumberTrainingRows = numberRows; +} + +void CInferenceModelMetadata::trainFractionPerFold(double fraction) { + m_TrainFractionPerFold = fraction; +} + // clang-format off const std::string CInferenceModelMetadata::JSON_ABSOLUTE_IMPORTANCE_TAG{"absolute_importance"}; const std::string CInferenceModelMetadata::JSON_BASELINE_TAG{"baseline"}; @@ -276,8 +299,10 @@ const std::string CInferenceModelMetadata::JSON_MAX_TAG{"max"}; const std::string CInferenceModelMetadata::JSON_MEAN_MAGNITUDE_TAG{"mean_magnitude"}; const std::string CInferenceModelMetadata::JSON_MIN_TAG{"min"}; const std::string CInferenceModelMetadata::JSON_MODEL_METADATA_TAG{"model_metadata"}; +const std::string CInferenceModelMetadata::JSON_NUM_TRAINING_ROWS_TAG{"num_training_rows"}; const std::string CInferenceModelMetadata::JSON_RELATIVE_IMPORTANCE_TAG{"relative_importance"}; const std::string CInferenceModelMetadata::JSON_TOTAL_FEATURE_IMPORTANCE_TAG{"total_feature_importance"}; +const std::string CInferenceModelMetadata::JSON_TRAIN_PARAMETERS_TAG{"train_parameters"}; // clang-format on } } diff --git a/lib/maths/CBoostedTree.cc b/lib/maths/CBoostedTree.cc index 2af5a5ab07..aa67fad1d1 100644 --- a/lib/maths/CBoostedTree.cc +++ b/lib/maths/CBoostedTree.cc @@ -167,6 +167,14 @@ CBoostedTree::THyperparameterImportanceVec CBoostedTree::hyperparameterImportanc return m_Impl->hyperparameterImportance(); } +std::size_t CBoostedTree::numberTrainingRows() const { + return static_cast(m_Impl->allTrainingRowsMask().manhattan()); +} + +double CBoostedTree::trainFractionPerFold() const { + return m_Impl->trainFractionPerFold(); +} + std::size_t CBoostedTree::columnHoldingDependentVariable() const { return m_Impl->columnHoldingDependentVariable(); } diff --git a/lib/maths/CBoostedTreeImpl.cc b/lib/maths/CBoostedTreeImpl.cc index 7cc581cd0b..921d1df5cf 100644 --- a/lib/maths/CBoostedTreeImpl.cc +++ b/lib/maths/CBoostedTreeImpl.cc @@ -262,7 +262,7 @@ void CBoostedTreeImpl::train(core::CDataFrame& frame, this->restoreBestHyperparameters(); this->scaleRegularizers(allTrainingRowsMask.manhattan() / - m_TrainingRowMasks[0].manhattan()); + this->meanNumberTrainingRowsPerFold()); this->startProgressMonitoringFinalTrain(); // reinitialize random number generator for reproducible results // TODO #1866 introduce accept randomize_seed configuration parameter @@ -404,8 +404,12 @@ bool CBoostedTreeImpl::canTrain() const { m_FeatureSampleProbabilities.end(), 0.0) > 0.0; } -core::CPackedBitVector CBoostedTreeImpl::allTrainingRowsMask() const { - return ~m_MissingFeatureRowMasks[m_DependentVariable]; +double CBoostedTreeImpl::meanNumberTrainingRowsPerFold() const { + TMeanAccumulator result; + for (const auto& mask : m_TrainingRowMasks) { + result.add(mask.manhattan()); + } + return CBasicStatistics::mean(result); } CBoostedTreeImpl::TDoubleDoublePr @@ -1530,7 +1534,8 @@ void CBoostedTreeImpl::recordHyperparameters() { 
m_Instrumentation->hyperparameters().s_ClassAssignmentObjective = m_ClassAssignmentObjective; m_Instrumentation->hyperparameters().s_DownsampleFactor = m_DownsampleFactor; m_Instrumentation->hyperparameters().s_NumFolds = m_NumberFolds; - m_Instrumentation->hyperparameters().s_TrainFractionPerFold = m_TrainFractionPerFold; + m_Instrumentation->hyperparameters().s_NumTrainingRows = + this->meanNumberTrainingRowsPerFold(); m_Instrumentation->hyperparameters().s_MaxTrees = m_MaximumNumberTrees; m_Instrumentation->hyperparameters().s_FeatureBagFraction = m_FeatureBagFraction; m_Instrumentation->hyperparameters().s_EtaGrowthRatePerTree = m_EtaGrowthRatePerTree; @@ -2128,6 +2133,14 @@ const CBoostedTreeImpl::TVector& CBoostedTreeImpl::classificationWeights() const return m_ClassificationWeights; } +double CBoostedTreeImpl::trainFractionPerFold() const { + return m_TrainFractionPerFold; +} + +core::CPackedBitVector CBoostedTreeImpl::allTrainingRowsMask() const { + return ~m_MissingFeatureRowMasks[m_DependentVariable]; +} + const double CBoostedTreeImpl::MINIMUM_RELATIVE_GAIN_PER_SPLIT{1e-7}; } } From 06460e666852a71b8dd27b581b378646fdc9b181 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Fri, 9 Jul 2021 11:59:19 +0100 Subject: [PATCH 32/35] Handle case we don't need to sample for last fold --- include/maths/CLowessDetail.h | 4 ++-- lib/maths/CDataFrameUtils.cc | 9 +++++++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/include/maths/CLowessDetail.h b/include/maths/CLowessDetail.h index 80d914126a..30cd67d0ff 100644 --- a/include/maths/CLowessDetail.h +++ b/include/maths/CLowessDetail.h @@ -47,12 +47,12 @@ void CLowess::fit(TDoubleDoublePrVec data, std::size_t numberFolds) { // // f(x | p^*) = poly(x | p^*(x)) // - // p^*(x) = argmin_p{ sum_i{ w_i (Y_i - poly(X_i | p))^2 } } (2) + // p^*(x) = argmin_p{ sum_i{ w_i (Y_i - poly(X_i | p))^2 } } (1) // // where w = exp(-k (x - X_i)), (X, Y) are the data to fit and p is the vector // of parameters for the polynomial function poly(. | p), i.e. the coefficients // p_0 + p_1 x + p_2 x^2 ... (which are determined by minimizing the weighted - // least square prediction errors as in (2)). + // least square prediction errors as in (1)). 
// // We determine k by solving // diff --git a/lib/maths/CDataFrameUtils.cc b/lib/maths/CDataFrameUtils.cc index dd074eacc3..203ce3ef78 100644 --- a/lib/maths/CDataFrameUtils.cc +++ b/lib/maths/CDataFrameUtils.cc @@ -570,8 +570,13 @@ CDataFrameUtils::stratifiedCrossValidationRowMasks(std::size_t numberThreads, core::CPackedBitVector candidateTestingRowsMask{allTrainingRowsMask}; for (auto& testingRowMask : testingRowMasks) { - testingRowMask = sample(sampler, candidateTestingRowsMask); - candidateTestingRowsMask ^= testingRowMask; + if (static_cast(candidateTestingRowsMask.manhattan()) <= sampleSize) { + testingRowMask = std::move(candidateTestingRowsMask); + candidateTestingRowsMask = core::CPackedBitVector{testingRowMask.size(), false}; + } else { + testingRowMask = sample(sampler, candidateTestingRowsMask); + candidateTestingRowsMask ^= testingRowMask; + } if (excessSampler != nullptr) { testingRowMask |= sample(excessSampler, allTrainingRowsMask ^ testingRowMask); } From ad037ec87029264a23fabab82505798bc9cba05c Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Fri, 9 Jul 2021 15:02:35 +0100 Subject: [PATCH 33/35] Add an explanation of variance treatment in BO --- lib/maths/CBayesianOptimisation.cc | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/lib/maths/CBayesianOptimisation.cc b/lib/maths/CBayesianOptimisation.cc index 457249c03d..b4296d125d 100644 --- a/lib/maths/CBayesianOptimisation.cc +++ b/lib/maths/CBayesianOptimisation.cc @@ -616,6 +616,35 @@ CBayesianOptimisation::TVector CBayesianOptimisation::function() const { } double CBayesianOptimisation::meanErrorVariance() const { + + // So what are we doing here? When we supply function values we also supply their + // error variance. Typically these might be the mean test loss function across + // folds and their variance for a particular choice of hyperparameters. Sticking + // with this example, the variance allows us to estimate the error w.r.t. the + // true generalisation error due to finite sample size. We can think of the source + // of this variance as being due to two effects: one which shifts the loss values + // in each fold (this might be due to some folds simply having more hard examples) + // and another which permutes the order of loss values. A shift in the loss function + // is not something we wish to capture in the GP: it shouldn't materially affect + // where to choose points to test since any sensible optimisation strategy should + // only care about the difference in loss between points, which is unaffected by a + // shift. More formally, if we assume the shift and permutation errors are independent + // we have for losses l_i, mean loss per fold m_i and mean loss for a given set of + // hyperparameters m that the variance is + // + // sum_i{ (l_i - m)^2 } = sum_i{ (l_i - m_i + m_i - m)^2 } + // = sum_i{ (l_i - m_i)^2 } + sum_i{ (m_i - m)^2 } + // = "permutation variance" + "shift variance" (1) + // + // with the cross-term expected to be small by independence. (Note, the independence + // assumption is reasonable if one assumes that the shift is due to mismatch in hard + // examples since the we choose folds independently at random.) We can estimate the + // shift variance by looking at mean loss over all distinct hyperparameter settings + // and we assume it is supplied as the parameter m_ExplainedErrorVariance. It should + // also be smaller than the variance by construction although for numerical stability + // we prevent the difference becoming too small. 
As discussed, here we wish return + // the permutation variance which we get by rearranging (1). + TMeanAccumulator variance; variance.add(m_ErrorVariances); return CBasicStatistics::mean(variance) - From e58ed73a77373d6db59663cfa73f5680a8f32280 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Fri, 9 Jul 2021 15:27:48 +0100 Subject: [PATCH 34/35] Comments --- lib/maths/unittest/CLowessTest.cc | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/lib/maths/unittest/CLowessTest.cc b/lib/maths/unittest/CLowessTest.cc index 57a2a4acbd..7c1343a918 100644 --- a/lib/maths/unittest/CLowessTest.cc +++ b/lib/maths/unittest/CLowessTest.cc @@ -31,14 +31,6 @@ BOOST_AUTO_TEST_CASE(testInvariants) { // Test invariants are satisfied on random input. - // We check: - // 1. Minimum is a local minimum. - // 2. The sublevel set contains the minimum. - // 3. The minimum is within 10% of the training data interval. - // 4. The ends of the sublevel set is within 10% of the training data interval. - // 5. The variance is greater than or equal to the variance of the residuals at - // the training data. - test::CRandomNumbers rng; std::size_t numberFolds{5}; @@ -58,6 +50,8 @@ BOOST_AUTO_TEST_CASE(testInvariants) { return scale[0] * (x - offset[0]) * (x - offset[0]) / 100.0; }}; + // We check... + for (std::size_t i = 0; i < 100; ++i) { for (const auto& trend : trends) { @@ -78,6 +72,7 @@ BOOST_AUTO_TEST_CASE(testInvariants) { double xeb; std::tie(xea, xeb) = lowess.extrapolationInterval(); + // 1. The minimum is a local minimum. double xmin; double fmin; std::tie(xmin, fmin) = lowess.minimum(); @@ -85,6 +80,12 @@ BOOST_AUTO_TEST_CASE(testInvariants) { BOOST_TEST_REQUIRE(fmin <= lowess.predict(std::max(xmin - 0.1, xea))); BOOST_TEST_REQUIRE(fmin <= lowess.predict(std::min(xmin + 0.1, xeb))); + // 2. The minimum is within the maximum extrapolation interval. + BOOST_TEST_REQUIRE(xmin >= xea); + BOOST_TEST_REQUIRE(xmin <= xeb); + + // 3. The variance is greater than the variance of the residual at the + // training data. TMeanVarAccumulator residualMoments; for (const auto& x : data) { residualMoments.add(x.second - lowess.predict(x.first)); From e0a61bfe43abdac2c4a82e97e29301c2a412398c Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Mon, 12 Jul 2021 11:43:33 +0100 Subject: [PATCH 35/35] Move fraction of training data into its own section in instrumentation --- .../api/CDataFrameAnalysisInstrumentation.h | 26 ++++++++++-------- ...ataFrameAnalysisInstrumentationInterface.h | 19 ++++++------- lib/api/CDataFrameAnalysisInstrumentation.cc | 27 ++++++++++++++----- lib/maths/CBoostedTreeImpl.cc | 3 +-- 4 files changed, 47 insertions(+), 28 deletions(-) diff --git a/include/api/CDataFrameAnalysisInstrumentation.h b/include/api/CDataFrameAnalysisInstrumentation.h index e5c7fefd22..e0247c5e80 100644 --- a/include/api/CDataFrameAnalysisInstrumentation.h +++ b/include/api/CDataFrameAnalysisInstrumentation.h @@ -184,17 +184,19 @@ class API_EXPORT CDataFrameTrainBoostedTreeInstrumentation final CDataFrameTrainBoostedTreeInstrumentation(const std::string& jobId, std::size_t memoryLimit) : CDataFrameAnalysisInstrumentation(jobId, memoryLimit) {} - //! Supervised learning job \p type, can be E_Regression or E_Classification. + //! Set the supervised learning job \p type, can be E_Regression or E_Classification. void type(EStatsType type) override; - //! Current \p iteration number. + //! Set the current \p iteration number. void iteration(std::size_t iteration) override; - //! Run time of the iteration. 
+ //! Set the run time of the current iteration. void iterationTime(std::uint64_t delta) override; - //! Type of the validation loss result, e.g. "mse". + //! Set the type of the validation loss result, e.g. "mse". void lossType(const std::string& lossType) override; - //! List of \p lossValues of validation error for the given \p fold. + //! Set the validation loss values for \p fold for each forest size to \p lossValues. void lossValues(std::size_t fold, TDoubleVec&& lossValues) override; - //! \return Structure contains hyperparameters. + //! Set the fraction of data used for training per fold. + void trainingFractionPerFold(double fraction) override; + //! \return A writable object containing the training hyperparameters. SHyperparameters& hyperparameters() override { return m_Hyperparameters; } protected: @@ -206,19 +208,21 @@ class API_EXPORT CDataFrameTrainBoostedTreeInstrumentation final private: void writeAnalysisStats(std::int64_t timestamp) override; + void writeMetaData(rapidjson::Value& parentObject); void writeHyperparameters(rapidjson::Value& parentObject); void writeValidationLoss(rapidjson::Value& parentObject); void writeTimingStats(rapidjson::Value& parentObject); void reset(); private: - EStatsType m_Type = E_Regression; - std::size_t m_Iteration = 0; - std::uint64_t m_IterationTime = 0; - std::uint64_t m_ElapsedTime = 0; - bool m_AnalysisStatsInitialized = false; + EStatsType m_Type{E_Regression}; + std::size_t m_Iteration{0}; + std::uint64_t m_IterationTime{0}; + std::uint64_t m_ElapsedTime{0}; + bool m_AnalysisStatsInitialized{false}; std::string m_LossType; TLossVec m_LossValues; + double m_TrainingFractionPerFold{0.0}; SHyperparameters m_Hyperparameters; }; } diff --git a/include/maths/CDataFrameAnalysisInstrumentationInterface.h b/include/maths/CDataFrameAnalysisInstrumentationInterface.h index bd2685d24a..294648237f 100644 --- a/include/maths/CDataFrameAnalysisInstrumentationInterface.h +++ b/include/maths/CDataFrameAnalysisInstrumentationInterface.h @@ -33,7 +33,7 @@ class MATHS_EXPORT CDataFrameAnalysisInstrumentationInterface { //! Adds \p delta to the memory usage statistics. virtual void updateMemoryUsage(std::int64_t delta) = 0; - //! Start progress monitoring for \p phase. + //! Start progress monitoring of \p task. //! //! \note This resets the current progress to zero. virtual void startNewProgressMonitoredTask(const std::string& task) = 0; @@ -116,7 +116,6 @@ class MATHS_EXPORT CDataFrameTrainBoostedTreeInstrumentationInterface SRegularization s_Regularization; double s_DownsampleFactor{-1.0}; std::size_t s_NumFolds{0}; - double s_NumTrainingRows{0}; std::size_t s_MaxTrees{0}; double s_FeatureBagFraction{-1.0}; double s_EtaGrowthRatePerTree{-1.0}; @@ -127,18 +126,19 @@ class MATHS_EXPORT CDataFrameTrainBoostedTreeInstrumentationInterface using TDoubleVec = std::vector; public: - virtual ~CDataFrameTrainBoostedTreeInstrumentationInterface() = default; - //! Supervised learning job \p type, can be E_Regression or E_Classification. + //! Set the supervised learning job \p type, can be E_Regression or E_Classification. virtual void type(EStatsType type) = 0; - //! Current \p iteration number. + //! Set the current \p iteration number. virtual void iteration(std::size_t iteration) = 0; - //! Run time of the iteration. + //! Set the run time of the current iteration. virtual void iterationTime(std::uint64_t delta) = 0; - //! Type of the validation loss result, e.g. "mse". + //! Set the type of the validation loss result, e.g. "mse". 
virtual void lossType(const std::string& lossType) = 0; - //! List of \p lossValues of validation error for the given \p fold. + //! Set the validation loss values for \p fold for each forest size to \p lossValues. virtual void lossValues(std::size_t fold, TDoubleVec&& lossValues) = 0; - //! \return Structure contains hyperparameters. + //! Set the fraction of data used for training per fold. + virtual void trainingFractionPerFold(double fraction) = 0; + //! \return A writable object containing the training hyperparameters. virtual SHyperparameters& hyperparameters() = 0; }; @@ -168,6 +168,7 @@ class MATHS_EXPORT CDataFrameTrainBoostedTreeInstrumentationStub void iterationTime(std::uint64_t /* delta */) override {} void lossType(const std::string& /* lossType */) override {} void lossValues(std::size_t /* fold */, TDoubleVec&& /* lossValues */) override {} + void trainingFractionPerFold(double /* fraction */) override {} SHyperparameters& hyperparameters() override { return m_Hyperparameters; } private: diff --git a/lib/api/CDataFrameAnalysisInstrumentation.cc b/lib/api/CDataFrameAnalysisInstrumentation.cc index 9beb495ee3..c552ad4ebb 100644 --- a/lib/api/CDataFrameAnalysisInstrumentation.cc +++ b/lib/api/CDataFrameAnalysisInstrumentation.cc @@ -46,6 +46,7 @@ const std::string MEMORY_STATUS_HARD_LIMIT_TAG{"hard_limit"}; const std::string MEMORY_STATUS_OK_TAG{"ok"}; const std::string MEMORY_STATUS_TAG{"status"}; const std::string MEMORY_TYPE_TAG{"analytics_memory_usage"}; +const std::string META_DATA_TAG{"meta_data"}; const std::string OUTLIER_DETECTION_STATS{"outlier_detection_stats"}; const std::string PARAMETERS_TAG{"parameters"}; const std::string PEAK_MEMORY_USAGE_TAG{"peak_usage_bytes"}; @@ -387,7 +388,11 @@ void CDataFrameTrainBoostedTreeInstrumentation::lossType(const std::string& loss void CDataFrameTrainBoostedTreeInstrumentation::lossValues(std::size_t fold, TDoubleVec&& lossValues) { - m_LossValues.emplace_back(std::move(fold), std::move(lossValues)); + m_LossValues.emplace_back(fold, std::move(lossValues)); +} + +void CDataFrameTrainBoostedTreeInstrumentation::trainingFractionPerFold(double fraction) { + m_TrainingFractionPerFold = fraction; } void CDataFrameTrainBoostedTreeInstrumentation::writeAnalysisStats(std::int64_t timestamp) { @@ -424,6 +429,12 @@ void CDataFrameTrainBoostedTreeInstrumentation::writeAnalysisStats(std::int64_t writer->Key(TIMING_STATS_TAG); writer->write(timingStatsObject); + // TODO enable with Java changes. + //rapidjson::Value metaDataObject{writer->makeObject()}; + //this->writeMetaData(metaDataObject); + //writer->Key(META_DATA_TAG); + //writer->write(metaDataObject); + writer->EndObject(); } this->reset(); @@ -434,6 +445,14 @@ void CDataFrameTrainBoostedTreeInstrumentation::reset() { m_LossValues.clear(); } +void CDataFrameTrainBoostedTreeInstrumentation::writeMetaData(rapidjson::Value& parentObject) { + auto* writer = this->writer(); + if (writer != nullptr) { + writer->addMember(CDataFrameTrainBoostedTreeRunner::TRAIN_FRACTION_PER_FOLD, + rapidjson::Value(m_TrainingFractionPerFold).Move(), parentObject); + } +} + void CDataFrameTrainBoostedTreeInstrumentation::writeHyperparameters(rapidjson::Value& parentObject) { auto* writer = this->writer(); @@ -483,11 +502,6 @@ void CDataFrameTrainBoostedTreeInstrumentation::writeHyperparameters(rapidjson:: rapidjson::Value(static_cast(this->m_Hyperparameters.s_NumFolds)) .Move(), parentObject); - // TODO enable with Java changes. 
- //writer->addMember( - // CDataFrameTrainBoostedTreeRunner::TRAIN_FRACTION_PER_FOLD, - // rapidjson::Value(this->m_Hyperparameters.s_TrainFractionPerFold).Move(), - // parentObject); writer->addMember( CDataFrameTrainBoostedTreeRunner::MAX_TREES, rapidjson::Value(static_cast(this->m_Hyperparameters.s_MaxTrees)) @@ -539,6 +553,7 @@ void CDataFrameTrainBoostedTreeInstrumentation::writeValidationLoss(rapidjson::V writer->addMember(VALIDATION_FOLD_VALUES_TAG, lossValuesArray, parentObject); } } + void CDataFrameTrainBoostedTreeInstrumentation::writeTimingStats(rapidjson::Value& parentObject) { auto* writer = this->writer(); if (writer != nullptr) { diff --git a/lib/maths/CBoostedTreeImpl.cc b/lib/maths/CBoostedTreeImpl.cc index 921d1df5cf..c53594ff91 100644 --- a/lib/maths/CBoostedTreeImpl.cc +++ b/lib/maths/CBoostedTreeImpl.cc @@ -1530,12 +1530,11 @@ std::size_t CBoostedTreeImpl::maximumTreeSize(std::size_t numberRows) const { } void CBoostedTreeImpl::recordHyperparameters() { + m_Instrumentation->trainingFractionPerFold(m_TrainFractionPerFold); m_Instrumentation->hyperparameters().s_Eta = m_Eta; m_Instrumentation->hyperparameters().s_ClassAssignmentObjective = m_ClassAssignmentObjective; m_Instrumentation->hyperparameters().s_DownsampleFactor = m_DownsampleFactor; m_Instrumentation->hyperparameters().s_NumFolds = m_NumberFolds; - m_Instrumentation->hyperparameters().s_NumTrainingRows = - this->meanNumberTrainingRowsPerFold(); m_Instrumentation->hyperparameters().s_MaxTrees = m_MaximumNumberTrees; m_Instrumentation->hyperparameters().s_FeatureBagFraction = m_FeatureBagFraction; m_Instrumentation->hyperparameters().s_EtaGrowthRatePerTree = m_EtaGrowthRatePerTree;
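Once the Java-side changes referenced by the TODOs land, the analysis stats document is expected to gain a small meta_data object (META_DATA_TAG) alongside the existing hyperparameters, validation loss and timing stats sections, carrying the train fraction as a property of the run rather than as a tuned hyperparameter. Below is a minimal rapidjson sketch of that intended shape; it assumes the runner's TRAIN_FRACTION_PER_FOLD constant maps to the JSON key "train_fraction_per_fold" and uses 0.85 purely as an example value, so it is not the ml-cpp writer code.

    // Sketch of the expected meta_data output only; not the ml-cpp writer code.
    // The key "train_fraction_per_fold" and the value 0.85 are assumptions.
    #include <iostream>

    #include <rapidjson/stringbuffer.h>
    #include <rapidjson/writer.h>

    int main() {
        rapidjson::StringBuffer buffer;
        rapidjson::Writer<rapidjson::StringBuffer> writer{buffer};

        writer.StartObject();
        writer.Key("meta_data"); // META_DATA_TAG
        writer.StartObject();
        writer.Key("train_fraction_per_fold"); // assumed value of TRAIN_FRACTION_PER_FOLD
        writer.Double(0.85);                   // m_TrainingFractionPerFold
        writer.EndObject();
        writer.EndObject();

        std::cout << buffer.GetString() << '\n'; // {"meta_data":{"train_fraction_per_fold":0.85}}
        return 0;
    }

Keeping the fraction out of the hyperparameters object matches the commit's intent of reporting it in its own section of the instrumentation output.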