[ML] Stop cross-validation early if the parameters have high predicted test loss #915

Merged
13 commits merged on Jan 10, 2020
@@ -251,7 +251,7 @@ BOOST_FIXTURE_TEST_CASE(testRunBoostedTreeRegressionFeatureImportanceNoImportanc
double c4{result["row_results"]["results"]["ml"][maths::CDataFrameRegressionModel::SHAP_PREFIX + "c4"]
.GetDouble()};
double prediction{
result["row_results"]["results"]["ml"]["c5_prediction"].GetDouble()};
result["row_results"]["results"]["ml"]["target_prediction"].GetDouble()};
// c1 explains 95% of the prediction value.
BOOST_REQUIRE_CLOSE(c1, prediction, 5.0);
BOOST_REQUIRE_SMALL(c2, 2.0);
49 changes: 32 additions & 17 deletions lib/maths/CBoostedTreeFactory.cc
@@ -46,7 +46,10 @@ const double MIN_DOWNSAMPLE_LINE_SEARCH_RANGE{2.0};
const double MAX_DOWNSAMPLE_LINE_SEARCH_RANGE{144.0};
const double MIN_DOWNSAMPLE_FACTOR_SCALE{0.3};
const double MAX_DOWNSAMPLE_FACTOR_SCALE{3.0};
const std::size_t MAX_NUMBER_FOLDS{5};
// This isn't a hard limit but we increase the number of default training folds
Contributor comment: I think something is wrong with this sentence.

// if the initial downsample fraction would be larger than this.
const double MAX_DESIRED_INITIAL_DOWNSAMPLE_FRACTION{0.5};
const double MAX_NUMBER_FOLDS{5.0};
const std::size_t MAX_NUMBER_TREES{static_cast<std::size_t>(2.0 / MIN_ETA + 0.5)};

double computeEta(std::size_t numberRegressors) {
@@ -250,20 +253,32 @@ void CBoostedTreeFactory::initializeNumberFolds(core::CDataFrame& frame) const {
}
LOG_TRACE(<< "total number training rows = " << totalNumberTrainingRows);

// We require at least twice the number of rows we'll sample in a bag per
// fold if possible. In order to estimate this we use the number of input
// features as a proxy for the number of features we'll actually use after
// feature selection.
double desiredTrainingFraction{(m_InitialDownsampleRowsPerFeature *
static_cast<double>(frame.numberColumns() - 1)) /
static_cast<double>(totalNumberTrainingRows)};
if (2.0 * desiredTrainingFraction >= 1.0 - 1.0 / static_cast<double>(MAX_NUMBER_FOLDS)) {
m_TreeImpl->m_NumberFolds = MAX_NUMBER_FOLDS;
} else {
m_TreeImpl->m_NumberFolds = static_cast<std::size_t>(
std::ceil(1.0 / (1.0 - 2.0 * desiredTrainingFraction)));
}
LOG_TRACE(<< "desired training fraction = " << desiredTrainingFraction
// We want to choose the number of folds so we'll have enough training data
// after leaving out one fold. We choose the initial downsample size based
// on the same sort of criterion. So we require that leaving out one fold
// shouldn't mean that we have fewer rows than constant * desired downsample
// # rows if possible. We choose the constant to be two for no particularly
// good reason except that:
// 1. it isn't too large
// 2. it still means we'll have plenty of variation between random bags.
//
// In order to estimate this we use the number of input features as a proxy
// for the number of features we'll actually use after feature selection.
//
// So how does the following work? We'd like "c * f * # rows" training rows.
// For k folds we'll have "(1 - 1 / k) * # rows" training rows. So we want
// to find the smallest integer k s.t. c * f * # rows <= (1 - 1 / k) * # rows.
// This gives k = ceil(1 / (1 - c * f)). However, we also upper bound this
// by MAX_NUMBER_FOLDS.
Comment on lines +268 to +272
Contributor comment: This is a very nice explanation!

double initialDownsampleFraction{(m_InitialDownsampleRowsPerFeature *
static_cast<double>(frame.numberColumns() - 1)) /
static_cast<double>(totalNumberTrainingRows)};

m_TreeImpl->m_NumberFolds = static_cast<std::size_t>(
std::ceil(1.0 / std::max(1.0 - initialDownsampleFraction / MAX_DESIRED_INITIAL_DOWNSAMPLE_FRACTION,
1.0 / MAX_NUMBER_FOLDS)));
LOG_TRACE(<< "initial downsample fraction = " << initialDownsampleFraction
<< " # folds = " << m_TreeImpl->m_NumberFolds);
} else {
m_TreeImpl->m_NumberFolds = *m_TreeImpl->m_NumberFoldsOverride;
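
To make the fold-count arithmetic above concrete, here is a small standalone sketch of the same calculation, k = ceil(1 / (1 - c * f)) capped at MAX_NUMBER_FOLDS. The numeric inputs (rows per feature, feature count, row count) are invented for illustration and are not the library's defaults.

```cpp
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <iostream>

int main() {
    // Illustrative values only; not the library's defaults.
    double maxNumberFolds{5.0};
    double maxDesiredInitialDownsampleFraction{0.5}; // i.e. the constant c = 2
    double rowsPerFeature{200.0};
    double numberFeatures{4.0};
    double totalTrainingRows{4000.0};

    // f: fraction of the training data the initial downsample would use.
    double f{rowsPerFeature * numberFeatures / totalTrainingRows}; // 0.2

    // k = ceil(1 / (1 - c * f)), with the argument floored at 1 / maxNumberFolds
    // so that k never exceeds maxNumberFolds.
    auto k = static_cast<std::size_t>(std::ceil(
        1.0 / std::max(1.0 - f / maxDesiredInitialDownsampleFraction,
                       1.0 / maxNumberFolds)));

    std::cout << "number of folds = " << k << "\n"; // prints 2
}
```

With these numbers, c * f = 0.4, so leaving out one of k = 2 folds still keeps half the rows, which satisfies the "twice the desired downsample" requirement described in the comment.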
@@ -387,10 +402,10 @@ void CBoostedTreeFactory::initializeHyperparameters(core::CDataFrame& frame) {
}

double numberFeatures{static_cast<double>(m_TreeImpl->m_Encoder->numberEncodedColumns())};
double downSampleFactor{m_InitialDownsampleRowsPerFeature * numberFeatures /
double downsampleFactor{m_InitialDownsampleRowsPerFeature * numberFeatures /
m_TreeImpl->m_TrainingRowMasks[0].manhattan()};
m_TreeImpl->m_DownsampleFactor = m_TreeImpl->m_DownsampleFactorOverride.value_or(
CTools::truncate(downSampleFactor, 0.05, 0.5));
CTools::truncate(downsampleFactor, 0.05, 0.5));

m_TreeImpl->m_Regularization
.depthPenaltyMultiplier(
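For a rough sense of scale of the downsample factor computed above (the numbers are illustrative only, not the library's defaults): with 10 encoded features, 100 initial rows per feature and 50 000 rows in the training mask, downsampleFactor = 100 * 10 / 50000 = 0.02, which the truncation then lifts to the 0.05 floor.
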
35 changes: 26 additions & 9 deletions lib/maths/CBoostedTreeImpl.cc
@@ -1080,12 +1080,28 @@ CBoostedTreeImpl::estimateMissingTestLosses(const TSizeVec& missing) const {
// estimate the test loss we'll see for the remaining folds to decide if it
// is worthwhile to continue training with these parameters and to correct
// the loss value supplied to Bayesian Optimisation to account for the folds
// we haven't trained on. We achieve this by for each missing fold fitting an
// OLS to the data (x_i, loss(m_i)) where i ranges over the previous rounds
// and x_i is the i'th vector whose components comprise the losses for which
// we have values in the current round and indicators for whether they were
// missing in the i'th round. We only include a round if we've trained for at
// least one of the same folds in the current round.
// we haven't trained on. We tackle this problem as follows:
// 1. Find all previous rounds R which share at least one fold with the
// current round, i.e. one fold for which we've computed the actual
// loss for the current round's parameters.
// 2. For each fold f_i for which we haven't estimated the loss in this
// round, fit an OLS model m_i to R to predict the loss of f_i.
// 3. Compute l_i^, the predicted value for the test loss on each f_i, given
// the test losses we've computed so far this round, using m_i.
// 4. Estimate its uncertainty from the variance of the residuals from
// fitting the model m_i to R.
//
// The feature vector we use is defined as:
//
// | calculated fold error 1 |
// | calculated fold error 2 |
// | ... |
// | 1{fold error 1 is present} |
// | 1{fold error 2 is present} |
// | ... |
//
// where the indices range over the folds for which we have errors in the
// current round.

TSizeVec present(m_NumberFolds);
std::iota(present.begin(), present.end(), 0);
@@ -1094,7 +1110,7 @@
CSetTools::inplace_set_difference(present, ordered.begin(), ordered.end());
LOG_TRACE(<< "present = " << core::CContainerPrinter::print(present));

// Get the current round feature vector.
// Get the current round feature vector. This is fixed, so we compute it outside the loop.
TVector x(2 * present.size());
for (std::size_t col = 0; col < present.size(); ++col) {
x(col) = *m_FoldRoundTestLosses[present[col]][m_CurrentRound];
@@ -1142,11 +1158,12 @@ CBoostedTreeImpl::estimateMissingTestLosses(const TSizeVec& missing) const {
double predictedTestLoss{params.transpose() * x};
double predictedTestLossVariance{
CBasicStatistics::maximumLikelihoodVariance(residualMoments)};
predictedTestLosses.push_back(CBasicStatistics::momentsAccumulator(
1.0, predictedTestLoss, predictedTestLossVariance));
LOG_TRACE(<< "prediction(x = " << x.transpose() << ", fold = " << target
<< ") = (mean = " << predictedTestLoss
<< ", variance = " << predictedTestLossVariance << ")");

predictedTestLosses.push_back(CBasicStatistics::momentsAccumulator(
1.0, predictedTestLoss, predictedTestLossVariance));
}

return predictedTestLosses;
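
The missing-fold estimation above can be hard to picture from the diff alone, so here is a minimal self-contained sketch of the same idea using plain Eigen types and made-up losses. The real code uses the library's TVector type and CBasicStatistics moments accumulators; everything below (matrix sizes, values, solver choice) is an illustrative assumption.

```cpp
// Sketch: predict the test loss of a fold we skipped this round by regressing,
// over previous rounds, that fold's loss on (i) the losses of the folds we did
// compute this round and (ii) indicators for whether each of those losses was
// present in the previous round.
#include <Eigen/Dense>
#include <cmath>
#include <iostream>

int main() {
    // Six previous rounds; columns are [loss fold 0, loss fold 1,
    // 1{fold 0 present}, 1{fold 1 present}]. Values are made up.
    Eigen::MatrixXd X(6, 4);
    X << 0.52, 0.49, 1.0, 1.0,
         0.47, 0.45, 1.0, 1.0,
         0.60, 0.58, 1.0, 1.0,
         0.55, 0.00, 1.0, 0.0, // fold 1 was missing in that round
         0.50, 0.48, 1.0, 1.0,
         0.58, 0.00, 1.0, 0.0;
    // Losses of the fold we're missing this round, observed in those rounds.
    Eigen::VectorXd y(6);
    y << 0.50, 0.46, 0.59, 0.54, 0.49, 0.57;

    // Least-squares fit via a rank-revealing QR decomposition:
    // params minimises ||X * params - y||^2.
    Eigen::VectorXd params = X.colPivHouseholderQr().solve(y);

    // Current round feature vector: the losses we did compute plus presence
    // indicators, laid out as in the comment in the diff.
    Eigen::VectorXd x(4);
    x << 0.51, 0.48, 1.0, 1.0;
    double predictedTestLoss = params.dot(x);

    // Use the residual variance of the fit as a rough uncertainty.
    double variance = (X * params - y).squaredNorm() / 6.0;
    std::cout << predictedTestLoss << " +/- " << std::sqrt(variance) << "\n";
}
```

Including the presence indicators lets rounds that trained on different subsets of folds share one design matrix, which is why only previous rounds overlapping the current round's folds are used.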
6 changes: 2 additions & 4 deletions lib/maths/CTreeShapFeatureImportance.cc
@@ -126,12 +126,10 @@ void CTreeShapFeatureImportance::shapRecursive(const TTree& tree,
parentFractionOne, parentFeatureIndex);
if (tree[nodeIndex].isLeaf()) {
double leafValue = tree[nodeIndex].value();
for (std::size_t i = 1; i <= splitPath.depth(); ++i) {
for (int i = 1; i <= splitPath.depth(); ++i) {
double scale = CTreeShapFeatureImportance::sumUnwoundPath(splitPath, i);
std::size_t inputColumnIndex{
encoder
.encoding(static_cast<std::size_t>(splitPath.featureIndex(i)))
.inputColumnIndex()};
encoder.encoding(splitPath.featureIndex(i)).inputColumnIndex()};
// inputColumnIndex is obtained by looking up which feature sits at position i on the path to this leaf.
// fractionOnes(i) is an indicator variable which tells us whether, if we condition on this variable,
// we visit this path from that node or not; fractionZeros(i) tells us what proportion of