Merged
44 commits
a6fc867
write memory in the new format
valeriy42 Feb 24, 2020
2aceced
add unit test
valeriy42 Feb 24, 2020
ac75f2b
formatting
valeriy42 Feb 24, 2020
16c7147
reviewers comments
valeriy42 Feb 24, 2020
d429b69
variable renaming
valeriy42 Feb 24, 2020
4407fe6
additional interfaces in maths
valeriy42 Feb 24, 2020
5baa98f
methods added to train interface
valeriy42 Feb 25, 2020
a7d8c32
added unit test for training analysis stats
valeriy42 Feb 26, 2020
c722d7e
Add unit test with schema validation
valeriy42 Feb 27, 2020
513223c
Classification unit test
valeriy42 Feb 27, 2020
b624996
extract training setup boilerplate code
valeriy42 Feb 27, 2020
60e3556
minor refactorings
valeriy42 Feb 27, 2020
38032fd
minor refactorings, add reset
valeriy42 Feb 27, 2020
8ff041f
Formatting and adding docs
valeriy42 Feb 28, 2020
32af0d8
Add Enhancement note
valeriy42 Feb 28, 2020
5c11a61
Rename stub and formatting
valeriy42 Feb 28, 2020
fc7403b
Merge branch 'master' into analysis-stats
valeriy42 Feb 28, 2020
8f25c07
formatting
valeriy42 Feb 28, 2020
7a50fdc
fix missing header
valeriy42 Feb 28, 2020
fac222e
reviewers comments
valeriy42 Mar 10, 2020
245721b
Merge branch 'master' of https://github.com/elastic/ml-cpp into analy…
valeriy42 Mar 10, 2020
ef63e12
Fix merge conflicts
valeriy42 Mar 10, 2020
4d6cf90
fix unit test after merge
valeriy42 Mar 11, 2020
01e933e
Fix unit test build errors
valeriy42 Mar 12, 2020
635e487
add todos
valeriy42 Mar 12, 2020
7a42271
cleaning up
valeriy42 Mar 12, 2020
cc55af5
add debug output
valeriy42 Mar 12, 2020
3f691b4
fix unit test errors
valeriy42 Mar 13, 2020
acbe0fd
formatting
valeriy42 Mar 13, 2020
2e25b9f
Merge branch 'master' of https://github.com/elastic/ml-cpp into analy…
valeriy42 Mar 13, 2020
400695f
fix after merge
valeriy42 Mar 13, 2020
dda4d1b
Merge branch 'master' of https://github.com/elastic/ml-cpp into analy…
valeriy42 Mar 17, 2020
614962c
fix errors after merge
valeriy42 Mar 17, 2020
491ee9e
disable outputting instrumentation data
valeriy42 Mar 17, 2020
0a4276a
disable unit test for instrumentation
valeriy42 Mar 17, 2020
129030e
license text
valeriy42 Mar 18, 2020
1987e02
fix macosx compile errors
valeriy42 Mar 18, 2020
341c608
writestate uncommented
valeriy42 Mar 18, 2020
fa2c7cb
Fix validation loss schema
valeriy42 Mar 18, 2020
36470c3
deactivate writing state and unit test
valeriy42 Mar 18, 2020
57447f9
fix header
valeriy42 Mar 18, 2020
0216f5b
fix include error
valeriy42 Mar 18, 2020
6134faa
remove analysis_stats as key
valeriy42 Mar 18, 2020
7e7b952
deactivate unit test
valeriy42 Mar 18, 2020
2 changes: 2 additions & 0 deletions docs/CHANGELOG.asciidoc
@@ -62,6 +62,8 @@ the build from version 2.20 to 2.34. (See {ml-pull}1013[#1013].)
* Remove all memory overheads for computing tree SHAP values. (See {ml-pull}1023[#1023].)
* Distinguish between empty and missing categorical fields in classification and regression
model training. (See {ml-pull}1034[#1034].)
* Add instrumentation information for supervised learning data frame analytics jobs.
(See {ml-pull}1031[#1031].)

=== Bug Fixes

79 changes: 65 additions & 14 deletions include/api/CDataFrameAnalysisInstrumentation.h
@@ -14,9 +14,12 @@

#include <api/ImportExport.h>

#include <rapidjson/document.h>

#include <atomic>
#include <cstdint>
#include <memory>
#include <unordered_map>

namespace ml {
namespace api {
@@ -28,7 +31,7 @@ namespace api {
//! progress, parameters, quality of results. This also implements the functionality
//! to write the JSON statistics to a specified output stream in a thread safe manner.
class API_EXPORT CDataFrameAnalysisInstrumentation
: public maths::CDataFrameAnalysisInstrumentationInterface {
: virtual public maths::CDataFrameAnalysisInstrumentationInterface {
public:
//! \brief Set the output stream for the lifetime of this object.
class API_EXPORT CScopeSetOutputStream {
Expand All @@ -45,12 +48,13 @@ class API_EXPORT CDataFrameAnalysisInstrumentation
};

public:
//! Constructs an instrumentation object for an analytics job with a given \p jobId.
explicit CDataFrameAnalysisInstrumentation(const std::string& jobId);

//! Adds \p delta to the memory usage statistics.
void updateMemoryUsage(std::int64_t delta) override;

//! This adds \p fractionalProgess to the current progress.
//! This adds \p fractionalProgress to the current progress.
//!
//! \note The caller should try to ensure that the sum of the values added
//! at the end of the analysis is equal to one.
@@ -75,21 +79,27 @@ class API_EXPORT CDataFrameAnalysisInstrumentation

//! Trigger the next step of the job. This will initiate writing the job state
//! to the results pipe.
void nextStep(std::uint32_t step) override;
//! \todo use \p phase to tag different phases of the analysis job.
void nextStep(const std::string& phase = "") override;

//! \return The peak memory usage.
std::int64_t memory() const;

//! \return The id of the data frame analytics job.
const std::string& jobId() const;

protected:
virtual counter_t::ECounterTypes memoryCounterType() = 0;
using TWriter = core::CRapidJsonConcurrentLineWriter;
using TWriterUPtr = std::unique_ptr<TWriter>;

private:
using TWriterUPtr = std::unique_ptr<core::CRapidJsonConcurrentLineWriter>;
protected:
virtual counter_t::ECounterTypes memoryCounterType() = 0;
TWriter* writer();

private:
void writeProgress(std::uint32_t step);
void writeMemory(std::int64_t timestamp);
void writeState(std::uint32_t step);
virtual void writeAnalysisStats(std::int64_t timestamp) = 0;
virtual void writeState();

private:
std::string m_JobId;
@@ -99,26 +109,67 @@ class API_EXPORT CDataFrameAnalysisInstrumentation
TWriterUPtr m_Writer;
};

//! \brief Outlier instrumentation.
//! \brief Instrumentation class for Outlier Detection jobs.
class API_EXPORT CDataFrameOutliersInstrumentation final
: public CDataFrameAnalysisInstrumentation {
: public CDataFrameAnalysisInstrumentation,
public maths::CDataFrameOutliersInstrumentationInterface {
public:
explicit CDataFrameOutliersInstrumentation(const std::string& jobId)
: CDataFrameAnalysisInstrumentation(jobId) {}

private:
protected:
counter_t::ECounterTypes memoryCounterType() override;

private:
void writeAnalysisStats(std::int64_t timestamp) override;
};

//! \brief Predictive model training instrumentation.
//! \brief Instrumentation class for Supervised Learning jobs.
//!
//! DESCRIPTION:\n
//! This class extends CDataFrameAnalysisInstrumentation with setters
//! for hyperparameters, validation loss results, and job timing.
class API_EXPORT CDataFrameTrainBoostedTreeInstrumentation final
: public CDataFrameAnalysisInstrumentation {
: public CDataFrameAnalysisInstrumentation,
public maths::CDataFrameTrainBoostedTreeInstrumentationInterface {
public:
explicit CDataFrameTrainBoostedTreeInstrumentation(const std::string& jobId)
: CDataFrameAnalysisInstrumentation(jobId) {}

private:
//! Supervised learning job \p type, can be E_Regression or E_Classification.
void type(EStatsType type) override;
//! Current \p iteration number.
void iteration(std::size_t iteration) override;
//! Run time of the iteration.
void iterationTime(std::uint64_t delta) override;
//! Type of the validation loss result, e.g. "mse".
void lossType(const std::string& lossType) override;
//! List of \p lossValues of validation error for the given \p fold.
void lossValues(std::size_t fold, TDoubleVec&& lossValues) override;
//! \return Structure containing the hyperparameters.
SHyperparameters& hyperparameters() override { return m_Hyperparameters; }

protected:
counter_t::ECounterTypes memoryCounterType() override;

private:
using TLossVec = std::vector<std::pair<std::size_t, TDoubleVec>>;

private:
void writeAnalysisStats(std::int64_t timestamp) override;
void writeHyperparameters(rapidjson::Value& parentObject);
void writeValidationLoss(rapidjson::Value& parentObject);
void writeTimingStats(rapidjson::Value& parentObject);
void reset();

private:
EStatsType m_Type;
std::size_t m_Iteration;
std::uint64_t m_IterationTime;
std::uint64_t m_ElapsedTime = 0;
std::string m_LossType;
TLossVec m_LossValues;
SHyperparameters m_Hyperparameters;
};
}
}
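
Taken together, the declarations above suggest the following usage pattern for the new training instrumentation. This is a hedged sketch, not code from the PR: the job id, loss numbers, and phase name are invented, the calls go through the maths interface reference because the overrides are private on the concrete class, and no output stream is attached (writing the JSON documents would additionally require scoping in a stream via CScopeSetOutputStream, whose constructor arguments lie outside this diff).

#include <api/CDataFrameAnalysisInstrumentation.h>
#include <maths/CDataFrameAnalysisInstrumentationInterface.h>

#include <cstddef>
#include <vector>

void exerciseTrainInstrumentation() {
    // Concrete api object, driven through the maths interface it implements.
    ml::api::CDataFrameTrainBoostedTreeInstrumentation instrumentation{"job-42"};
    ml::maths::CDataFrameTrainBoostedTreeInstrumentationInterface& inst = instrumentation;

    inst.type(ml::maths::CDataFrameTrainBoostedTreeInstrumentationInterface::E_Regression);
    inst.lossType("mse");
    inst.hyperparameters().s_Eta = 0.1;

    for (std::size_t iteration = 1; iteration <= 3; ++iteration) {
        inst.iteration(iteration);
        inst.iterationTime(42);                                   // illustrative duration
        inst.lossValues(0, std::vector<double>{0.30, 0.24, 0.20}); // fold 0
        inst.updateProgress(1.0 / 3.0);
        inst.nextStep("train"); // tags the phase and triggers writing the job state
    }
}
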
2 changes: 1 addition & 1 deletion include/maths/CBoostedTreeFactory.h
@@ -99,7 +99,7 @@ class MATHS_EXPORT CBoostedTreeFactory final {

//! Set pointer to the analysis instrumentation.
CBoostedTreeFactory&
analysisInstrumentation(CDataFrameAnalysisInstrumentationInterface& instrumentation);
analysisInstrumentation(CDataFrameTrainBoostedTreeInstrumentationInterface& instrumentation);
//! Set the callback function for training state recording.
CBoostedTreeFactory& trainingStateCallback(TTrainingStateCallback callback);

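
A hedged sketch of how a caller might wire instrumentation into the factory. The factory construction itself is outside this diff, so it is taken by reference here, and the lifetime note is an inference from the setter taking a reference rather than a copy.

#include <maths/CBoostedTreeFactory.h>
#include <maths/CDataFrameAnalysisInstrumentationInterface.h>

// The instrumentation object is only referenced by the factory, so it should
// outlive the factory and the training it configures.
void attachInstrumentation(ml::maths::CBoostedTreeFactory& factory,
                           ml::maths::CDataFrameTrainBoostedTreeInstrumentationInterface& instrumentation) {
    factory.analysisInstrumentation(instrumentation);
}

// Callers that do not want telemetry can pass the stub declared in
// CDataFrameAnalysisInstrumentationInterface.h:
//     ml::maths::CDataFrameTrainBoostedTreeInstrumentationStub stub;
//     attachInstrumentation(factory, stub);
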
16 changes: 10 additions & 6 deletions include/maths/CBoostedTreeImpl.h
@@ -65,7 +65,7 @@ class MATHS_EXPORT CBoostedTreeImpl final {
using TRegularization = CBoostedTreeRegularization<double>;
using TSizeVec = std::vector<std::size_t>;
using TSizeRange = boost::integer_range<std::size_t>;
using TAnalysisInstrumentationPtr = CDataFrameAnalysisInstrumentationInterface*;
using TAnalysisInstrumentationPtr = CDataFrameTrainBoostedTreeInstrumentationInterface*;

public:
static const double MINIMUM_RELATIVE_GAIN_PER_SPLIT;
@@ -163,7 +163,7 @@ class MATHS_EXPORT CBoostedTreeImpl final {
using TOptionalSize = boost::optional<std::size_t>;
using TPackedBitVectorVec = std::vector<core::CPackedBitVector>;
using TImmutableRadixSetVec = std::vector<core::CImmutableRadixSet<double>>;
using TNodeVecVecDoublePr = std::pair<TNodeVecVec, double>;
using TNodeVecVecDoubleDoubleVecTuple = std::tuple<TNodeVecVec, double, TDoubleVec>;
using TDataFrameCategoryEncoderUPtr = std::unique_ptr<CDataFrameCategoryEncoder>;
using TDataTypeVec = CDataFrameUtils::TDataTypeVec;
using TRegularizationOverride = CBoostedTreeRegularization<TOptionalDouble>;
@@ -203,10 +203,11 @@ class MATHS_EXPORT CBoostedTreeImpl final {
const core::CPackedBitVector& testingRowMask) const;

//! Train one forest on the rows of \p frame in the mask \p trainingRowMask.
TNodeVecVecDoublePr trainForest(core::CDataFrame& frame,
const core::CPackedBitVector& trainingRowMask,
const core::CPackedBitVector& testingRowMask,
core::CLoopProgress& trainingProgress) const;
TNodeVecVecDoubleDoubleVecTuple
trainForest(core::CDataFrame& frame,
const core::CPackedBitVector& trainingRowMask,
const core::CPackedBitVector& testingRowMask,
core::CLoopProgress& trainingProgress) const;

//! Randomly downsamples the training row mask by the downsample factor.
core::CPackedBitVector downsample(const core::CPackedBitVector& trainingRowMask) const;
@@ -295,6 +296,9 @@ class MATHS_EXPORT CBoostedTreeImpl final {
//! Record the training state using the \p recordTrainState callback function
void recordState(const TTrainingStateCallback& recordTrainState) const;

//! Record hyperparameters for instrumentation.
void recordHyperparameters();

private:
mutable CPRNG::CXorOShiro128Plus m_Rng;
std::size_t m_NumberThreads;
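
The switch from TNodeVecVecDoublePr to TNodeVecVecDoubleDoubleVecTuple means trainForest() now also returns a vector of losses, presumably so the per-round validation curve can be forwarded to the new lossValues() instrumentation. A small illustrative sketch of unpacking the widened return type; the forest type is templated here because its definition is not part of the visible diff.

#include <tuple>
#include <utility>
#include <vector>

using TDoubleVec = std::vector<double>;

template<typename TNodeVecVec>
void consumeTrainResult(std::tuple<TNodeVecVec, double, TDoubleVec> result) {
    TNodeVecVec forest = std::move(std::get<0>(result));    // the trained trees
    double loss = std::get<1>(result);                      // overall test loss of the forest
    TDoubleVec lossValues = std::move(std::get<2>(result)); // per-round losses for instrumentation
    // ... use forest, loss and lossValues ...
}
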
98 changes: 91 additions & 7 deletions include/maths/CDataFrameAnalysisInstrumentationInterface.h
@@ -7,9 +7,13 @@
#ifndef INCLUDED_ml_maths_CDataFrameAnalysisInstrumentationInterface_h
#define INCLUDED_ml_maths_CDataFrameAnalysisInstrumentationInterface_h

#include <maths/CBoostedTree.h>
#include <maths/ImportExport.h>

#include <cstdint>
#include <functional>
#include <string>
#include <vector>

namespace ml {
namespace maths {
@@ -20,7 +24,7 @@ class MATHS_EXPORT CDataFrameAnalysisInstrumentationInterface {
public:
using TProgressCallback = std::function<void(double)>;
using TMemoryUsageCallback = std::function<void(std::int64_t)>;
using TStepCallback = std::function<void(std::uint32_t)>;
using TStepCallback = std::function<void(const std::string&)>;

public:
virtual ~CDataFrameAnalysisInstrumentationInterface() = default;
@@ -37,7 +41,7 @@ class MATHS_EXPORT CDataFrameAnalysisInstrumentationInterface {
virtual void updateProgress(double fractionalProgress) = 0;
//! Trigger the next step of the job. This will initiate writing the job state
//! to the results pipe.
virtual void nextStep(std::uint32_t step) = 0;
virtual void nextStep(const std::string& phase = "") = 0;
//! Factory for the updateProgress() callback function object.
TProgressCallback progressCallback() {
return [this](double fractionalProgress) {
@@ -50,16 +54,96 @@ class MATHS_EXPORT CDataFrameAnalysisInstrumentationInterface {
}
//! Factory for the nextStep() callback function object.
TStepCallback stepCallback() {
return [this](std::uint32_t step) { this->nextStep(step); };
return [this](const std::string& phase) { this->nextStep(phase); };
}
};

//! \brief Dummies out all instrumentation.
class MATHS_EXPORT CDataFrameAnalysisInstrumentationStub final
: public CDataFrameAnalysisInstrumentationInterface {
class MATHS_EXPORT CDataFrameOutliersInstrumentationInterface
: virtual public CDataFrameAnalysisInstrumentationInterface {};

//! \brief Instrumentation interface for Supervised Learning jobs.
//!
//! DESCRIPTION:\n
//! This interface extends CDataFrameAnalysisInstrumentationInterface with setters
//! for hyperparameters, validation loss results, and job timing.
class MATHS_EXPORT CDataFrameTrainBoostedTreeInstrumentationInterface
: virtual public CDataFrameAnalysisInstrumentationInterface {
public:
enum EStatsType { E_Regression, E_Classification };
struct SRegularization {
SRegularization() = default;
SRegularization(double depthPenaltyMultiplier,
double softTreeDepthLimit,
double softTreeDepthTolerance,
double treeSizePenaltyMultiplier,
double leafWeightPenaltyMultiplier)
: s_DepthPenaltyMultiplier{depthPenaltyMultiplier},
s_SoftTreeDepthLimit{softTreeDepthLimit}, s_SoftTreeDepthTolerance{softTreeDepthTolerance},
s_TreeSizePenaltyMultiplier{treeSizePenaltyMultiplier},
s_LeafWeightPenaltyMultiplier{leafWeightPenaltyMultiplier} {}
double s_DepthPenaltyMultiplier = -1.0;
double s_SoftTreeDepthLimit = -1.0;
double s_SoftTreeDepthTolerance = -1.0;
double s_TreeSizePenaltyMultiplier = -1.0;
double s_LeafWeightPenaltyMultiplier = -1.0;
};
struct SHyperparameters {
double s_Eta = -1.0;
CBoostedTree::EClassAssignmentObjective s_ClassAssignmentObjective =
CBoostedTree::E_MinimumRecall;
SRegularization s_Regularization;
double s_DownsampleFactor = -1.0;
std::size_t s_NumFolds = 0;
std::size_t s_MaxTrees = 0;
double s_FeatureBagFraction = -1.0;
double s_EtaGrowthRatePerTree = -1.0;
std::size_t s_MaxAttemptsToAddTree = 0;
std::size_t s_NumSplitsPerFeature = 0;
std::size_t s_MaxOptimizationRoundsPerHyperparameter = 0;
};
using TDoubleVec = std::vector<double>;

public:
virtual ~CDataFrameTrainBoostedTreeInstrumentationInterface() = default;
//! Supervised learning job \p type, can be E_Regression or E_Classification.
virtual void type(EStatsType type) = 0;
//! Current \p iteration number.
virtual void iteration(std::size_t iteration) = 0;
//! Run time of the iteration.
virtual void iterationTime(std::uint64_t delta) = 0;
//! Type of the validation loss result, e.g. "mse".
virtual void lossType(const std::string& lossType) = 0;
//! List of \p lossValues of validation error for the given \p fold.
virtual void lossValues(std::size_t fold, TDoubleVec&& lossValues) = 0;
//! \return Structure containing the hyperparameters.
virtual SHyperparameters& hyperparameters() = 0;
};

//! \brief Dummies out all instrumentation for outlier detection.
class MATHS_EXPORT CDataFrameOutliersInstrumentationStub
: public CDataFrameOutliersInstrumentationInterface {
public:
void updateMemoryUsage(std::int64_t) override {}
void updateProgress(double) override {}
void nextStep(std::uint32_t) override {}
void nextStep(const std::string& /* phase */) override {}
};

//! \brief Dummies out all instrumentation for supervised learning.
class MATHS_EXPORT CDataFrameTrainBoostedTreeInstrumentationStub
: public CDataFrameTrainBoostedTreeInstrumentationInterface {
public:
void updateMemoryUsage(std::int64_t) override {}
void updateProgress(double) override {}
void nextStep(const std::string& /* phase */) override {}
void type(EStatsType /* type */) override {}
void iteration(std::size_t /* iteration */) override {}
void iterationTime(std::uint64_t /* delta */) override {}
void lossType(const std::string& /* lossType */) override {}
void lossValues(std::size_t /* fold */, TDoubleVec&& /* lossValues */) override {}
SHyperparameters& hyperparameters() override { return m_Hyperparameters; }

private:
SHyperparameters m_Hyperparameters;
};
}
}
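
The interplay of the interface, its callback factories, and the stub can be illustrated with a sketch. None of this is code from the PR: the training routine, iteration and fold counts, phase name, and loss numbers are invented; only the members it calls are the ones declared above, and passing the stub turns every call into a no-op.

#include <maths/CDataFrameAnalysisInstrumentationInterface.h>

#include <cstddef>
#include <vector>

using TInstrumentation = ml::maths::CDataFrameTrainBoostedTreeInstrumentationInterface;

// Hypothetical training loop that only sees the interface, so it behaves the
// same whether it is handed the api implementation or the stub.
void runTraining(TInstrumentation& instrumentation, std::size_t numberFolds) {
    instrumentation.type(TInstrumentation::E_Regression);
    instrumentation.lossType("mse");

    auto& hyperparameters = instrumentation.hyperparameters();
    hyperparameters.s_Eta = 0.1;
    hyperparameters.s_NumFolds = numberFolds;
    hyperparameters.s_MaxTrees = 20;
    hyperparameters.s_Regularization = TInstrumentation::SRegularization{1.0, 5.0, 0.5, 1.0, 1.0};

    auto reportProgress = instrumentation.progressCallback();
    auto reportStep = instrumentation.stepCallback();

    for (std::size_t iteration = 1; iteration <= 10; ++iteration) {
        instrumentation.iteration(iteration);
        instrumentation.iterationTime(17); // illustrative duration
        for (std::size_t fold = 0; fold < numberFolds; ++fold) {
            instrumentation.lossValues(fold, std::vector<double>{0.40, 0.31, 0.27});
        }
        reportProgress(1.0 / 10.0);
        reportStep("coarse_parameter_search"); // phase name is made up
    }
}

// Passing the stub makes every call above a no-op:
//     ml::maths::CDataFrameTrainBoostedTreeInstrumentationStub stub;
//     runTraining(stub, 4);
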
2 changes: 1 addition & 1 deletion include/maths/COutliers.h
@@ -693,7 +693,7 @@ class MATHS_EXPORT COutliers : private core::CNonInstantiatable {
//! \param[in] instrumentation Manages writing out telemetry.
static void compute(const SComputeParameters& params,
core::CDataFrame& frame,
CDataFrameAnalysisInstrumentationInterface& instrumentation);
CDataFrameOutliersInstrumentationInterface& instrumentation);

//! Estimate the amount of memory that will be used computing outliers
//! for a data frame.
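
Correspondingly, outlier detection call sites now need an object satisfying the narrowed instrumentation interface. A hedged sketch, assuming SComputeParameters is COutliers' nested parameter struct referenced in the declaration and that building the parameters and the data frame happens elsewhere:

#include <maths/CDataFrameAnalysisInstrumentationInterface.h>
#include <maths/COutliers.h>

// Call sites that do not want telemetry can pass the no-op stub.
void computeOutliersWithoutTelemetry(const ml::maths::COutliers::SComputeParameters& params,
                                     ml::core::CDataFrame& frame) {
    ml::maths::CDataFrameOutliersInstrumentationStub instrumentation;
    ml::maths::COutliers::compute(params, frame, instrumentation);
}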