From a6fc86739de6d04a23a8537470e8a487d5d48428 Mon Sep 17 00:00:00 2001 From: Valeriy Khakhutskyy <1292899+valeriy42@users.noreply.github.com> Date: Mon, 24 Feb 2020 11:40:07 +0100 Subject: [PATCH 01/40] write memory in the new format --- .../api/CDataFrameAnalysisInstrumentation.h | 13 ++++++-- lib/api/CDataFrameAnalysisInstrumentation.cc | 33 ++++++++++++------- lib/api/CDataFrameOutliersRunner.cc | 3 +- lib/api/CDataFrameTrainBoostedTreeRunner.cc | 2 +- 4 files changed, 35 insertions(+), 16 deletions(-) diff --git a/include/api/CDataFrameAnalysisInstrumentation.h b/include/api/CDataFrameAnalysisInstrumentation.h index 4a0ccdedd4..5acebea9b0 100644 --- a/include/api/CDataFrameAnalysisInstrumentation.h +++ b/include/api/CDataFrameAnalysisInstrumentation.h @@ -30,7 +30,7 @@ class API_EXPORT CDataFrameAnalysisInstrumentation : public maths::CDataFrameAnalysisInstrumentationInterface { public: - CDataFrameAnalysisInstrumentation(); + explicit CDataFrameAnalysisInstrumentation(const std::string& jobId); //! Adds \p delta to the memory usage statistics. void updateMemoryUsage(std::int64_t delta) override; @@ -73,7 +73,7 @@ class API_EXPORT CDataFrameAnalysisInstrumentation private: void writeProgress(std::uint32_t step); - void writeMemory(std::uint32_t step); + void writeMemory(std::int64_t timestamp); void writeState(std::uint32_t step); private: @@ -81,16 +81,25 @@ class API_EXPORT CDataFrameAnalysisInstrumentation std::atomic_size_t m_FractionalProgress; std::atomic m_Memory; core::CRapidJsonConcurrentLineWriter* m_Writer; + std::string m_JobId; }; class API_EXPORT CDataFrameOutliersInstrumentation final : public CDataFrameAnalysisInstrumentation { +public: + explicit CDataFrameOutliersInstrumentation(const std::string& jobId) + : CDataFrameAnalysisInstrumentation(jobId){}; + protected: counter_t::ECounterTypes memoryCounterType() override; }; class API_EXPORT CDataFrameTrainBoostedTreeInstrumentation final : public CDataFrameAnalysisInstrumentation { +public: + explicit CDataFrameTrainBoostedTreeInstrumentation(const std::string& jobId) + : CDataFrameAnalysisInstrumentation(jobId){}; + protected: counter_t::ECounterTypes memoryCounterType() override; }; diff --git a/lib/api/CDataFrameAnalysisInstrumentation.cc b/lib/api/CDataFrameAnalysisInstrumentation.cc index 62502f83e8..d14abf5db8 100644 --- a/lib/api/CDataFrameAnalysisInstrumentation.cc +++ b/lib/api/CDataFrameAnalysisInstrumentation.cc @@ -3,16 +3,21 @@ * or more contributor license agreements. Licensed under the Elastic License; * you may not use this file except in compliance with the Elastic License. */ - #include +#include + namespace ml { namespace api { namespace { const std::string STEP_TAG{"step"}; const std::string PROGRESS_TAG{"progress"}; -const std::string PEAK_MEMORY_USAGE_TAG{"peak_memory_usage"}; +const std::string PEAK_MEMORY_USAGE_TAG{"peak_usage_bytes"}; +const std::string TYPE_TAG{"type"}; +const std::string JOB_ID_TAG{"job_id"}; +const std::string TIMESTAMP_TAG{"timestamp"}; +const std::string MEMORY_TYPE{"analytics_memory_usage"}; const std::size_t MAXIMUM_FRACTIONAL_PROGRESS{std::size_t{1} << ((sizeof(std::size_t) - 2) * 8)}; @@ -51,8 +56,8 @@ double CDataFrameAnalysisInstrumentation::progress() const { static_cast(MAXIMUM_FRACTIONAL_PROGRESS); } -CDataFrameAnalysisInstrumentation::CDataFrameAnalysisInstrumentation() - : m_Finished{false}, m_FractionalProgress{0}, m_Memory{0}, m_Writer{nullptr} { +CDataFrameAnalysisInstrumentation::CDataFrameAnalysisInstrumentation(const std::string& jobId) + : m_Finished{false}, m_FractionalProgress{0}, m_Memory{0}, m_Writer{nullptr}, m_JobId{jobId} { } void CDataFrameAnalysisInstrumentation::resetProgress() { @@ -64,14 +69,14 @@ void CDataFrameAnalysisInstrumentation::writer(core::CRapidJsonConcurrentLineWri m_Writer = writer; } -void CDataFrameAnalysisInstrumentation::nextStep(std::uint32_t /*step*/) { - // TODO reactivate state writing, once the Java backend can accept it - // this->writeState(step); +void CDataFrameAnalysisInstrumentation::nextStep(std::uint32_t step) { + this->writeState(step); } void CDataFrameAnalysisInstrumentation::writeState(std::uint32_t step) { - this->writeProgress(step); - this->writeMemory(step); + // this->writeProgress(step); + int64_t timestamp = core::CTimeUtils::toEpochMs(core::CTimeUtils::now()); + this->writeMemory(timestamp); } std::int64_t CDataFrameAnalysisInstrumentation::memory() const { @@ -89,11 +94,15 @@ void CDataFrameAnalysisInstrumentation::writeProgress(std::uint32_t step) { } } -void CDataFrameAnalysisInstrumentation::writeMemory(std::uint32_t step) { +void CDataFrameAnalysisInstrumentation::writeMemory(std::int64_t timestamp) { if (m_Writer != nullptr) { m_Writer->StartObject(); - m_Writer->Key(STEP_TAG); - m_Writer->Uint(step); + m_Writer->Key(TYPE_TAG); + m_Writer->String(MEMORY_TYPE); + m_Writer->Key(JOB_ID_TAG); + m_Writer->String(m_JobId); + m_Writer->Key(TIMESTAMP_TAG); + m_Writer->Int64(timestamp); m_Writer->Key(PEAK_MEMORY_USAGE_TAG); m_Writer->Int64(m_Memory.load()); m_Writer->EndObject(); diff --git a/lib/api/CDataFrameOutliersRunner.cc b/lib/api/CDataFrameOutliersRunner.cc index 406d23b7a8..7e5d698292 100644 --- a/lib/api/CDataFrameOutliersRunner.cc +++ b/lib/api/CDataFrameOutliersRunner.cc @@ -73,7 +73,8 @@ CDataFrameOutliersRunner::CDataFrameOutliersRunner(const CDataFrameAnalysisSpeci CDataFrameOutliersRunner::CDataFrameOutliersRunner(const CDataFrameAnalysisSpecification& spec) : CDataFrameAnalysisRunner{spec}, m_Method{static_cast( - maths::COutliers::E_Ensemble)} { + maths::COutliers::E_Ensemble)}, + m_Instrumentation{spec.jobId()} { } std::size_t CDataFrameOutliersRunner::numberExtraColumns() const { diff --git a/lib/api/CDataFrameTrainBoostedTreeRunner.cc b/lib/api/CDataFrameTrainBoostedTreeRunner.cc index d4ad4fd8fc..ef7d4fc59c 100644 --- a/lib/api/CDataFrameTrainBoostedTreeRunner.cc +++ b/lib/api/CDataFrameTrainBoostedTreeRunner.cc @@ -68,7 +68,7 @@ CDataFrameTrainBoostedTreeRunner::CDataFrameTrainBoostedTreeRunner( const CDataFrameAnalysisSpecification& spec, const CDataFrameAnalysisParameters& parameters, TLossFunctionUPtr loss) - : CDataFrameAnalysisRunner{spec} { + : CDataFrameAnalysisRunner{spec}, m_Instrumentation{spec.jobId()} { m_DependentVariableFieldName = parameters[DEPENDENT_VARIABLE_NAME].as(); From 2aceced4dc950d193aebd97ea131784fa102345c Mon Sep 17 00:00:00 2001 From: Valeriy Khakhutskyy <1292899+valeriy42@users.noreply.github.com> Date: Mon, 24 Feb 2020 12:24:57 +0100 Subject: [PATCH 02/40] add unit test --- .../CDataFrameAnalysisInstrumentationTest.cc | 46 +++++++++++++++++++ .../CDataFrameAnalyzerTrainingTest.cc | 2 +- .../unittest/CDataFrameMockAnalysisRunner.cc | 2 +- .../unittest/CDataFrameMockAnalysisRunner.h | 8 +++- 4 files changed, 54 insertions(+), 4 deletions(-) create mode 100644 lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc diff --git a/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc b/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc new file mode 100644 index 0000000000..96b129d5c4 --- /dev/null +++ b/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc @@ -0,0 +1,46 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ + +#include + +#include + +#include + +#include + +BOOST_AUTO_TEST_SUITE(CDataFrameAnalysisInstrumentationTest) + +using namespace ml; + +BOOST_AUTO_TEST_CASE(testMemoryState) { + std::string jobId("JOB123"); + std::int64_t memoryUsage = 1000; + std::int64_t timestamp = core::CTimeUtils::toEpochMs(core::CTimeUtils::now()); + std::stringstream s_Output; + { + core::CJsonOutputStreamWrapper streamWrapper(s_Output); + core::CRapidJsonConcurrentLineWriter writer(streamWrapper); + api::CDataFrameTrainBoostedTreeInstrumentation instrumentation(jobId); + instrumentation.updateMemoryUsage(memoryUsage); + instrumentation.writer(&writer); + instrumentation.nextStep(0); + s_Output.flush(); + } + + rapidjson::Document results; + rapidjson::ParseResult ok(results.Parse(s_Output.str())); + BOOST_TEST_REQUIRE(static_cast(ok) == true); + BOOST_TEST_REQUIRE(results.IsArray() == true); + + const auto& result{results[0]}; + BOOST_TEST_REQUIRE(result["job_id"].GetString() == jobId); + BOOST_TEST_REQUIRE(result["type"].GetString() == "analytics_memory_usage"); + BOOST_TEST_REQUIRE(result["peak_usage_bytes"].GetInt64() == memoryUsage); + BOOST_REQUIRE_SMALL(result["timestamp"].GetInt64() - timestamp, 10l); +} + +BOOST_AUTO_TEST_SUITE_END() diff --git a/lib/api/unittest/CDataFrameAnalyzerTrainingTest.cc b/lib/api/unittest/CDataFrameAnalyzerTrainingTest.cc index b80144af63..c2752a53a7 100644 --- a/lib/api/unittest/CDataFrameAnalyzerTrainingTest.cc +++ b/lib/api/unittest/CDataFrameAnalyzerTrainingTest.cc @@ -265,7 +265,7 @@ void addPredictionTestData(EPredictionType type, treeFactory.featureBagFraction(featureBagFraction); } - ml::api::CDataFrameTrainBoostedTreeInstrumentation instrumentation; + ml::api::CDataFrameTrainBoostedTreeInstrumentation instrumentation("testJob"); treeFactory.analysisInstrumentation(instrumentation); auto tree = treeFactory.buildFor(*frame, weights.size()); diff --git a/lib/api/unittest/CDataFrameMockAnalysisRunner.cc b/lib/api/unittest/CDataFrameMockAnalysisRunner.cc index b573e8299f..13bca6ec82 100644 --- a/lib/api/unittest/CDataFrameMockAnalysisRunner.cc +++ b/lib/api/unittest/CDataFrameMockAnalysisRunner.cc @@ -10,7 +10,7 @@ #include CDataFrameMockAnalysisRunner::CDataFrameMockAnalysisRunner(const ml::api::CDataFrameAnalysisSpecification& spec) - : ml::api::CDataFrameAnalysisRunner{spec} { + : ml::api::CDataFrameAnalysisRunner{spec}, m_Instrumentation{spec.jobId()} { } std::size_t CDataFrameMockAnalysisRunner::numberExtraColumns() const { diff --git a/lib/api/unittest/CDataFrameMockAnalysisRunner.h b/lib/api/unittest/CDataFrameMockAnalysisRunner.h index f74d24985a..cfa648abfd 100644 --- a/lib/api/unittest/CDataFrameMockAnalysisRunner.h +++ b/lib/api/unittest/CDataFrameMockAnalysisRunner.h @@ -16,8 +16,12 @@ #include class CDataFrameMockAnalysisState final : public ml::api::CDataFrameAnalysisInstrumentation { -protected: - ml::counter_t::ECounterTypes memoryCounterType() override; + public: + CDataFrameMockAnalysisState(const std::string& jobId) + : ml::api::CDataFrameAnalysisInstrumentation(jobId) {} + + protected: + ml::counter_t::ECounterTypes memoryCounterType() override; }; class CDataFrameMockAnalysisRunner final : public ml::api::CDataFrameAnalysisRunner { From ac75f2b1e30da53980ce6054ec040a380fa95b4b Mon Sep 17 00:00:00 2001 From: Valeriy Khakhutskyy <1292899+valeriy42@users.noreply.github.com> Date: Mon, 24 Feb 2020 12:47:51 +0100 Subject: [PATCH 03/40] formatting --- .../CDataFrameAnalysisInstrumentationTest.cc | 14 +++++++------- lib/api/unittest/CDataFrameMockAnalysisRunner.h | 10 +++++----- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc b/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc index 96b129d5c4..fbb75842ac 100644 --- a/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc +++ b/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc @@ -22,13 +22,13 @@ BOOST_AUTO_TEST_CASE(testMemoryState) { std::int64_t timestamp = core::CTimeUtils::toEpochMs(core::CTimeUtils::now()); std::stringstream s_Output; { - core::CJsonOutputStreamWrapper streamWrapper(s_Output); - core::CRapidJsonConcurrentLineWriter writer(streamWrapper); - api::CDataFrameTrainBoostedTreeInstrumentation instrumentation(jobId); - instrumentation.updateMemoryUsage(memoryUsage); - instrumentation.writer(&writer); - instrumentation.nextStep(0); - s_Output.flush(); + core::CJsonOutputStreamWrapper streamWrapper(s_Output); + core::CRapidJsonConcurrentLineWriter writer(streamWrapper); + api::CDataFrameTrainBoostedTreeInstrumentation instrumentation(jobId); + instrumentation.updateMemoryUsage(memoryUsage); + instrumentation.writer(&writer); + instrumentation.nextStep(0); + s_Output.flush(); } rapidjson::Document results; diff --git a/lib/api/unittest/CDataFrameMockAnalysisRunner.h b/lib/api/unittest/CDataFrameMockAnalysisRunner.h index cfa648abfd..b35f9e4a5f 100644 --- a/lib/api/unittest/CDataFrameMockAnalysisRunner.h +++ b/lib/api/unittest/CDataFrameMockAnalysisRunner.h @@ -16,12 +16,12 @@ #include class CDataFrameMockAnalysisState final : public ml::api::CDataFrameAnalysisInstrumentation { - public: - CDataFrameMockAnalysisState(const std::string& jobId) - : ml::api::CDataFrameAnalysisInstrumentation(jobId) {} +public: + CDataFrameMockAnalysisState(const std::string& jobId) + : ml::api::CDataFrameAnalysisInstrumentation(jobId) {} - protected: - ml::counter_t::ECounterTypes memoryCounterType() override; +protected: + ml::counter_t::ECounterTypes memoryCounterType() override; }; class CDataFrameMockAnalysisRunner final : public ml::api::CDataFrameAnalysisRunner { From 16c7147c1fad3e50b69602702d66d0bf21bcb394 Mon Sep 17 00:00:00 2001 From: Valeriy Khakhutskyy <1292899+valeriy42@users.noreply.github.com> Date: Mon, 24 Feb 2020 13:50:22 +0100 Subject: [PATCH 04/40] reviewers comments --- lib/api/CDataFrameAnalysisInstrumentation.cc | 2 +- .../unittest/CDataFrameAnalysisInstrumentationTest.cc | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/lib/api/CDataFrameAnalysisInstrumentation.cc b/lib/api/CDataFrameAnalysisInstrumentation.cc index d14abf5db8..1c36fcd8d1 100644 --- a/lib/api/CDataFrameAnalysisInstrumentation.cc +++ b/lib/api/CDataFrameAnalysisInstrumentation.cc @@ -75,7 +75,7 @@ void CDataFrameAnalysisInstrumentation::nextStep(std::uint32_t step) { void CDataFrameAnalysisInstrumentation::writeState(std::uint32_t step) { // this->writeProgress(step); - int64_t timestamp = core::CTimeUtils::toEpochMs(core::CTimeUtils::now()); + std::int64_t timestamp{core::CTimeUtils::toEpochMs(core::CTimeUtils::now())}; this->writeMemory(timestamp); } diff --git a/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc b/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc index fbb75842ac..842f2d2aa9 100644 --- a/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc +++ b/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc @@ -17,9 +17,9 @@ BOOST_AUTO_TEST_SUITE(CDataFrameAnalysisInstrumentationTest) using namespace ml; BOOST_AUTO_TEST_CASE(testMemoryState) { - std::string jobId("JOB123"); - std::int64_t memoryUsage = 1000; - std::int64_t timestamp = core::CTimeUtils::toEpochMs(core::CTimeUtils::now()); + std::string jobId{"JOB123"}; + std::int64_t memoryUsage{1000}; + std::int64_t timeBefore{core::CTimeUtils::toEpochMs(core::CTimeUtils::now())}; std::stringstream s_Output; { core::CJsonOutputStreamWrapper streamWrapper(s_Output); @@ -30,6 +30,7 @@ BOOST_AUTO_TEST_CASE(testMemoryState) { instrumentation.nextStep(0); s_Output.flush(); } + std::int64_t timeAfter{core::CTimeUtils::toEpochMs(core::CTimeUtils::now())}; rapidjson::Document results; rapidjson::ParseResult ok(results.Parse(s_Output.str())); @@ -40,7 +41,8 @@ BOOST_AUTO_TEST_CASE(testMemoryState) { BOOST_TEST_REQUIRE(result["job_id"].GetString() == jobId); BOOST_TEST_REQUIRE(result["type"].GetString() == "analytics_memory_usage"); BOOST_TEST_REQUIRE(result["peak_usage_bytes"].GetInt64() == memoryUsage); - BOOST_REQUIRE_SMALL(result["timestamp"].GetInt64() - timestamp, 10l); + BOOST_TEST_REQUIRE(result["timestamp"].GetInt64() >= timeBefore); + BOOST_TEST_REQUIRE(result["timestamp"].GetInt64() <= timeAfter); } BOOST_AUTO_TEST_SUITE_END() From d429b69e36e39442b0dcbdd4e6728640854e5dcb Mon Sep 17 00:00:00 2001 From: Valeriy Khakhutskyy <1292899+valeriy42@users.noreply.github.com> Date: Mon, 24 Feb 2020 13:51:25 +0100 Subject: [PATCH 05/40] variable renaming --- lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc b/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc index 842f2d2aa9..5e030c1d20 100644 --- a/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc +++ b/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc @@ -20,20 +20,20 @@ BOOST_AUTO_TEST_CASE(testMemoryState) { std::string jobId{"JOB123"}; std::int64_t memoryUsage{1000}; std::int64_t timeBefore{core::CTimeUtils::toEpochMs(core::CTimeUtils::now())}; - std::stringstream s_Output; + std::stringstream outpustStream; { - core::CJsonOutputStreamWrapper streamWrapper(s_Output); + core::CJsonOutputStreamWrapper streamWrapper(outpustStream); core::CRapidJsonConcurrentLineWriter writer(streamWrapper); api::CDataFrameTrainBoostedTreeInstrumentation instrumentation(jobId); instrumentation.updateMemoryUsage(memoryUsage); instrumentation.writer(&writer); instrumentation.nextStep(0); - s_Output.flush(); + outpustStream.flush(); } std::int64_t timeAfter{core::CTimeUtils::toEpochMs(core::CTimeUtils::now())}; rapidjson::Document results; - rapidjson::ParseResult ok(results.Parse(s_Output.str())); + rapidjson::ParseResult ok(results.Parse(outpustStream.str())); BOOST_TEST_REQUIRE(static_cast(ok) == true); BOOST_TEST_REQUIRE(results.IsArray() == true); From 4407fe637a4b47efab0d07b2529a383ef6868cd7 Mon Sep 17 00:00:00 2001 From: Valeriy Khakhutskyy <1292899+valeriy42@users.noreply.github.com> Date: Mon, 24 Feb 2020 15:14:54 +0100 Subject: [PATCH 06/40] additional interfaces in maths --- .../api/CDataFrameAnalysisInstrumentation.h | 18 +++++-- include/maths/CBoostedTreeFactory.h | 2 +- include/maths/CBoostedTreeImpl.h | 2 +- ...ataFrameAnalysisInstrumentationInterface.h | 8 ++- include/maths/COutliers.h | 2 +- lib/api/CDataFrameAnalysisInstrumentation.cc | 50 ++++++++++++++++++- lib/maths/CBoostedTreeFactory.cc | 2 +- lib/maths/COutliers.cc | 2 +- 8 files changed, 75 insertions(+), 11 deletions(-) diff --git a/include/api/CDataFrameAnalysisInstrumentation.h b/include/api/CDataFrameAnalysisInstrumentation.h index 5acebea9b0..9102e5ff0d 100644 --- a/include/api/CDataFrameAnalysisInstrumentation.h +++ b/include/api/CDataFrameAnalysisInstrumentation.h @@ -27,7 +27,7 @@ namespace api { //! progress, parameters, quality of results. The class also implements the functionality to //! write the state at different iteration into the results pipe. class API_EXPORT CDataFrameAnalysisInstrumentation - : public maths::CDataFrameAnalysisInstrumentationInterface { + : virtual public maths::CDataFrameAnalysisInstrumentationInterface { public: explicit CDataFrameAnalysisInstrumentation(const std::string& jobId); @@ -68,12 +68,16 @@ class API_EXPORT CDataFrameAnalysisInstrumentation //! \return The peak memory usage. std::int64_t memory() const; + const std::string& jobId() const; + protected: virtual counter_t::ECounterTypes memoryCounterType() = 0; + core::CRapidJsonConcurrentLineWriter* writer(); private: void writeProgress(std::uint32_t step); void writeMemory(std::int64_t timestamp); + virtual void writeAnalysisStats(std::int64_t timestamp, std::uint32_t step) = 0; void writeState(std::uint32_t step); private: @@ -85,23 +89,31 @@ class API_EXPORT CDataFrameAnalysisInstrumentation }; class API_EXPORT CDataFrameOutliersInstrumentation final - : public CDataFrameAnalysisInstrumentation { + : public CDataFrameAnalysisInstrumentation, + public maths::CDataFrameOutliersInstrumentationInterface { public: explicit CDataFrameOutliersInstrumentation(const std::string& jobId) : CDataFrameAnalysisInstrumentation(jobId){}; protected: counter_t::ECounterTypes memoryCounterType() override; + +private: + void writeAnalysisStats(std::int64_t timestamp, std::uint32_t step) override; }; class API_EXPORT CDataFrameTrainBoostedTreeInstrumentation final - : public CDataFrameAnalysisInstrumentation { + : public CDataFrameAnalysisInstrumentation, + public maths::CDataFrameTrainBoostedTreeInstrumentationInterface { public: explicit CDataFrameTrainBoostedTreeInstrumentation(const std::string& jobId) : CDataFrameAnalysisInstrumentation(jobId){}; protected: counter_t::ECounterTypes memoryCounterType() override; + +private: + void writeAnalysisStats(std::int64_t timestamp, std::uint32_t step) override; }; } } diff --git a/include/maths/CBoostedTreeFactory.h b/include/maths/CBoostedTreeFactory.h index 25695d6e6f..be2abfad29 100644 --- a/include/maths/CBoostedTreeFactory.h +++ b/include/maths/CBoostedTreeFactory.h @@ -99,7 +99,7 @@ class MATHS_EXPORT CBoostedTreeFactory final { //! Set pointer to the analysis instrumentation. CBoostedTreeFactory& - analysisInstrumentation(CDataFrameAnalysisInstrumentationInterface& instrumentation); + analysisInstrumentation(CDataFrameTrainBoostedTreeInstrumentationInterface& instrumentation); //! Set the callback function for training state recording. CBoostedTreeFactory& trainingStateCallback(TTrainingStateCallback callback); diff --git a/include/maths/CBoostedTreeImpl.h b/include/maths/CBoostedTreeImpl.h index bec3da63d0..c6f88b0aae 100644 --- a/include/maths/CBoostedTreeImpl.h +++ b/include/maths/CBoostedTreeImpl.h @@ -61,7 +61,7 @@ class MATHS_EXPORT CBoostedTreeImpl final { using TRegularization = CBoostedTreeRegularization; using TSizeVec = std::vector; using TSizeRange = boost::integer_range; - using TAnalysisInstrumentationPtr = CDataFrameAnalysisInstrumentationInterface*; + using TAnalysisInstrumentationPtr = CDataFrameTrainBoostedTreeInstrumentationInterface*; public: static const double MINIMUM_RELATIVE_GAIN_PER_SPLIT; diff --git a/include/maths/CDataFrameAnalysisInstrumentationInterface.h b/include/maths/CDataFrameAnalysisInstrumentationInterface.h index 416882d424..6373471dcb 100644 --- a/include/maths/CDataFrameAnalysisInstrumentationInterface.h +++ b/include/maths/CDataFrameAnalysisInstrumentationInterface.h @@ -54,9 +54,15 @@ class MATHS_EXPORT CDataFrameAnalysisInstrumentationInterface { } }; +class MATHS_EXPORT CDataFrameOutliersInstrumentationInterface + : virtual public CDataFrameAnalysisInstrumentationInterface {}; + +class MATHS_EXPORT CDataFrameTrainBoostedTreeInstrumentationInterface + : virtual public CDataFrameAnalysisInstrumentationInterface {}; + //! \brief Dummies out all instrumentation. class MATHS_EXPORT CDataFrameAnalysisInstrumentationStub final - : public CDataFrameAnalysisInstrumentationInterface { + : public CDataFrameTrainBoostedTreeInstrumentationInterface { void updateMemoryUsage(std::int64_t) override {} void updateProgress(double) override {} void nextStep(std::uint32_t) override {} diff --git a/include/maths/COutliers.h b/include/maths/COutliers.h index 227b898a23..f0c764871b 100644 --- a/include/maths/COutliers.h +++ b/include/maths/COutliers.h @@ -693,7 +693,7 @@ class MATHS_EXPORT COutliers : private core::CNonInstantiatable { //! \param[in] instrumentation Manages writing out telemetry. static void compute(const SComputeParameters& params, core::CDataFrame& frame, - CDataFrameAnalysisInstrumentationInterface& instrumentation); + CDataFrameOutliersInstrumentationInterface& instrumentation); //! Estimate the amount of memory that will be used computing outliers //! for a data frame. diff --git a/lib/api/CDataFrameAnalysisInstrumentation.cc b/lib/api/CDataFrameAnalysisInstrumentation.cc index 1c36fcd8d1..250a402937 100644 --- a/lib/api/CDataFrameAnalysisInstrumentation.cc +++ b/lib/api/CDataFrameAnalysisInstrumentation.cc @@ -5,6 +5,7 @@ */ #include +#include #include namespace ml { @@ -17,7 +18,18 @@ const std::string PEAK_MEMORY_USAGE_TAG{"peak_usage_bytes"}; const std::string TYPE_TAG{"type"}; const std::string JOB_ID_TAG{"job_id"}; const std::string TIMESTAMP_TAG{"timestamp"}; -const std::string MEMORY_TYPE{"analytics_memory_usage"}; +const std::string MEMORY_TYPE_TAG{"analytics_memory_usage"}; +const std::string ANALYSIS_TYPE_TAG{"analysis_stats"}; +const std::string REGRESSION_STATS_TAG{"regression_stats"}; +const std::string ITERATION_TAG{"iteration"}; +const std::string HYPERPARAMETERS_TAG{"hyperparameters"}; +const std::string VALIDATION_LOSS_TAG{"validation_loss"}; +const std::string TIMING_STATS_TAG{"timing_stats"}; +const std::string VALIDATION_LOSS_TYPE_TAG{"loss_type"}; +const std::string VALIDATION_LOSS_VALUES_TAG{"values"}; +const std::string VALIDATION_NUM_FOLDS_TAG{"num_folds"}; +const std::string TIMING_ELAPSED_TIME_TAG{"elapsed_time"}; +const std::string TIMING_ITERATION_TIME_TAG{"iteration_time"}; const std::size_t MAXIMUM_FRACTIONAL_PROGRESS{std::size_t{1} << ((sizeof(std::size_t) - 2) * 8)}; @@ -98,7 +110,7 @@ void CDataFrameAnalysisInstrumentation::writeMemory(std::int64_t timestamp) { if (m_Writer != nullptr) { m_Writer->StartObject(); m_Writer->Key(TYPE_TAG); - m_Writer->String(MEMORY_TYPE); + m_Writer->String(MEMORY_TYPE_TAG); m_Writer->Key(JOB_ID_TAG); m_Writer->String(m_JobId); m_Writer->Key(TIMESTAMP_TAG); @@ -109,6 +121,14 @@ void CDataFrameAnalysisInstrumentation::writeMemory(std::int64_t timestamp) { } } +const std::string& CDataFrameAnalysisInstrumentation::jobId() const { + return m_JobId; +} + +core::CRapidJsonConcurrentLineWriter* CDataFrameAnalysisInstrumentation::writer() { + return m_Writer; +} + counter_t::ECounterTypes CDataFrameOutliersInstrumentation::memoryCounterType() { return counter_t::E_DFOPeakMemoryUsage; } @@ -116,5 +136,31 @@ counter_t::ECounterTypes CDataFrameOutliersInstrumentation::memoryCounterType() counter_t::ECounterTypes CDataFrameTrainBoostedTreeInstrumentation::memoryCounterType() { return counter_t::E_DFTPMPeakMemoryUsage; } + +void CDataFrameOutliersInstrumentation::writeAnalysisStats(std::int64_t timestamp, + std::uint32_t /*step*/) { + auto* writer{this->writer()}; + if (writer != nullptr) { + writer->StartObject(); + writer->Key(JOB_ID_TAG); + writer->String(this->jobId()); + writer->Key(TIMESTAMP_TAG); + writer->Int64(timestamp); + writer->EndObject(); + } +} + +void CDataFrameTrainBoostedTreeInstrumentation::writeAnalysisStats(std::int64_t timestamp, + std::uint32_t step) { + auto* writer{this->writer()}; + if (writer != nullptr) { + writer->StartObject(); + writer->Key(JOB_ID_TAG); + writer->String(this->jobId()); + writer->Key(TIMESTAMP_TAG); + writer->Int64(timestamp); + writer->EndObject(); + } +} } } diff --git a/lib/maths/CBoostedTreeFactory.cc b/lib/maths/CBoostedTreeFactory.cc index bf12646bd8..8da0e6cd09 100644 --- a/lib/maths/CBoostedTreeFactory.cc +++ b/lib/maths/CBoostedTreeFactory.cc @@ -1131,7 +1131,7 @@ CBoostedTreeFactory& CBoostedTreeFactory::topShapValues(std::size_t topShapValue } CBoostedTreeFactory& CBoostedTreeFactory::analysisInstrumentation( - CDataFrameAnalysisInstrumentationInterface& instrumentation) { + CDataFrameTrainBoostedTreeInstrumentationInterface& instrumentation) { m_TreeImpl->m_Instrumentation = &instrumentation; return *this; } diff --git a/lib/maths/COutliers.cc b/lib/maths/COutliers.cc index b5bc28acb9..825b2cc687 100644 --- a/lib/maths/COutliers.cc +++ b/lib/maths/COutliers.cc @@ -1050,7 +1050,7 @@ bool computeOutliersPartitioned(const COutliers::SComputeParameters& params, void COutliers::compute(const SComputeParameters& params, core::CDataFrame& frame, - CDataFrameAnalysisInstrumentationInterface& instrumentation) { + CDataFrameOutliersInstrumentationInterface& instrumentation) { if (params.s_StandardizeColumns) { CDataFrameUtils::standardizeColumns(params.s_NumberThreads, frame); From 5baa98ff12f059f49f8f4e01278d0ed84ef36834 Mon Sep 17 00:00:00 2001 From: Valeriy Khakhutskyy <1292899+valeriy42@users.noreply.github.com> Date: Tue, 25 Feb 2020 12:05:07 +0100 Subject: [PATCH 07/40] methods added to train interface --- .../api/CDataFrameAnalysisInstrumentation.h | 13 ++++ include/maths/CBoostedTreeImpl.h | 13 ++-- ...ataFrameAnalysisInstrumentationInterface.h | 66 ++++++++++++++++++- lib/maths/CBoostedTreeFactory.cc | 6 +- lib/maths/CBoostedTreeImpl.cc | 49 ++++++++++++-- 5 files changed, 132 insertions(+), 15 deletions(-) diff --git a/include/api/CDataFrameAnalysisInstrumentation.h b/include/api/CDataFrameAnalysisInstrumentation.h index 9102e5ff0d..86f9e765bd 100644 --- a/include/api/CDataFrameAnalysisInstrumentation.h +++ b/include/api/CDataFrameAnalysisInstrumentation.h @@ -109,11 +109,24 @@ class API_EXPORT CDataFrameTrainBoostedTreeInstrumentation final explicit CDataFrameTrainBoostedTreeInstrumentation(const std::string& jobId) : CDataFrameAnalysisInstrumentation(jobId){}; + void type(EStatsType /* type */) override{}; + void iteration(std::size_t /* iteration */) override{}; + void startTime(std::uint64_t /* timestamp */) override{}; + void iterationTime(std::uint64_t /* delta */) override{}; + void lossType(const std::string& /* lossType */) override{}; + void lossValues(std::size_t /* fold */, TDoubleVec&& /* lossValues */) override{}; + void numFolds(std::size_t /* numFolds */) override{}; + void hyperparameters(const SHyperparameters& /* hyperparameters */) override{}; + SHyperparameters& hyperparameters() override { return m_Hyperparameters; }; + protected: counter_t::ECounterTypes memoryCounterType() override; private: void writeAnalysisStats(std::int64_t timestamp, std::uint32_t step) override; + +private: + SHyperparameters m_Hyperparameters; }; } } diff --git a/include/maths/CBoostedTreeImpl.h b/include/maths/CBoostedTreeImpl.h index c6f88b0aae..567741c36c 100644 --- a/include/maths/CBoostedTreeImpl.h +++ b/include/maths/CBoostedTreeImpl.h @@ -162,7 +162,7 @@ class MATHS_EXPORT CBoostedTreeImpl final { using TVector = CDenseVector; using TPackedBitVectorVec = std::vector; using TImmutableRadixSetVec = std::vector>; - using TNodeVecVecDoublePr = std::pair; + using TNodeVecVecDoubleDoubleVecTuple = std::tuple; using TDataFrameCategoryEncoderUPtr = std::unique_ptr; using TDataTypeVec = CDataFrameUtils::TDataTypeVec; using TRegularizationOverride = CBoostedTreeRegularization; @@ -198,10 +198,11 @@ class MATHS_EXPORT CBoostedTreeImpl final { const core::CPackedBitVector& testingRowMask) const; //! Train one forest on the rows of \p frame in the mask \p trainingRowMask. - TNodeVecVecDoublePr trainForest(core::CDataFrame& frame, - const core::CPackedBitVector& trainingRowMask, - const core::CPackedBitVector& testingRowMask, - core::CLoopProgress& trainingProgress) const; + TNodeVecVecDoubleDoubleVecTuple + trainForest(core::CDataFrame& frame, + const core::CPackedBitVector& trainingRowMask, + const core::CPackedBitVector& testingRowMask, + core::CLoopProgress& trainingProgress) const; //! Randomly downsamples the training row mask by the downsample factor. core::CPackedBitVector downsample(const core::CPackedBitVector& trainingRowMask) const; @@ -293,6 +294,8 @@ class MATHS_EXPORT CBoostedTreeImpl final { //! Populate numberSamples field in the m_BestForest void computeNumberSamples(const core::CDataFrame& frame); + void recordHyperparameters(); + private: mutable CPRNG::CXorOShiro128Plus m_Rng; std::size_t m_NumberThreads; diff --git a/include/maths/CDataFrameAnalysisInstrumentationInterface.h b/include/maths/CDataFrameAnalysisInstrumentationInterface.h index 6373471dcb..0f98234b3b 100644 --- a/include/maths/CDataFrameAnalysisInstrumentationInterface.h +++ b/include/maths/CDataFrameAnalysisInstrumentationInterface.h @@ -7,9 +7,13 @@ #ifndef INCLUDED_ml_maths_CDataFrameAnalysisInstrumentationInterface_h #define INCLUDED_ml_maths_CDataFrameAnalysisInstrumentationInterface_h +#include #include #include +#include +#include +#include namespace ml { namespace maths { @@ -58,14 +62,74 @@ class MATHS_EXPORT CDataFrameOutliersInstrumentationInterface : virtual public CDataFrameAnalysisInstrumentationInterface {}; class MATHS_EXPORT CDataFrameTrainBoostedTreeInstrumentationInterface - : virtual public CDataFrameAnalysisInstrumentationInterface {}; + : virtual public CDataFrameAnalysisInstrumentationInterface { +public: + enum EStatsType { E_Regression, E_Classification }; + struct SRegularization { + SRegularization() = default; + SRegularization(double depthPenaltyMultiplier, + double softTreeDepthLimit, + double softTreeDepthTolerance, + double treeSizePenaltyMultiplier, + double leafWeightPenaltyMultiplier) + : s_DepthPenaltyMultiplier{depthPenaltyMultiplier}, + s_SoftTreeDepthLimit{softTreeDepthLimit}, s_SoftTreeDepthTolerance{softTreeDepthTolerance}, + s_TreeSizePenaltyMultiplier{treeSizePenaltyMultiplier}, + s_LeafWeightPenaltyMultiplier{leafWeightPenaltyMultiplier} {}; + double s_DepthPenaltyMultiplier = 0.0; + double s_SoftTreeDepthLimit = 0.0; + double s_SoftTreeDepthTolerance = 0.0; + double s_TreeSizePenaltyMultiplier = 0.0; + double s_LeafWeightPenaltyMultiplier = 0.0; + }; + struct SHyperparameters { + double s_Eta = 0.1; + CBoostedTree::EClassAssignmentObjective s_ClassAssignmentObjective = + CBoostedTree::E_MinimumRecall; + SRegularization s_Regularization; + double s_DownsampleFactor = 0.5; + std::size_t s_NumFolds = 4; + std::size_t s_MaxTrees = 20; + double s_FeatureBagFraction = 0.5; + double s_EtaGrowthRatePerTree = 1.05; + std::size_t s_MaxAttemptsToAddTree = 3; + std::size_t s_NumSplitsPerFeature = 75; + std::size_t s_MaxOptimizationRoundsPerHyperparameter = 2; + }; + using TDoubleVec = std::vector; + +public: + virtual ~CDataFrameTrainBoostedTreeInstrumentationInterface() = default; + virtual void type(EStatsType type) = 0; + virtual void iteration(std::size_t iteration) = 0; + virtual void startTime(std::uint64_t timestamp) = 0; + virtual void iterationTime(std::uint64_t delta) = 0; + virtual void lossType(const std::string& lossType) = 0; + virtual void lossValues(std::size_t fold, TDoubleVec&& lossValues) = 0; + virtual void numFolds(std::size_t numFolds) = 0; + virtual void hyperparameters(const SHyperparameters& hyperparameters) = 0; + virtual SHyperparameters& hyperparameters() = 0; +}; //! \brief Dummies out all instrumentation. class MATHS_EXPORT CDataFrameAnalysisInstrumentationStub final : public CDataFrameTrainBoostedTreeInstrumentationInterface { +public: void updateMemoryUsage(std::int64_t) override {} void updateProgress(double) override {} void nextStep(std::uint32_t) override {} + void type(EStatsType /* type */) override{}; + void iteration(std::size_t /* iteration */) override{}; + void startTime(std::uint64_t /* timestamp */) override{}; + void iterationTime(std::uint64_t /* delta */) override{}; + void lossType(const std::string& /* lossType */) override{}; + void lossValues(std::size_t /* fold */, TDoubleVec&& /* lossValues */) override{}; + void numFolds(std::size_t /* numFolds */) override{}; + void hyperparameters(const SHyperparameters& /* hyperparameters */) override{}; + SHyperparameters& hyperparameters() override { return m_Hyperparameters; }; + +private: + SHyperparameters m_Hyperparameters; }; } } diff --git a/lib/maths/CBoostedTreeFactory.cc b/lib/maths/CBoostedTreeFactory.cc index 8da0e6cd09..a5b1974172 100644 --- a/lib/maths/CBoostedTreeFactory.cc +++ b/lib/maths/CBoostedTreeFactory.cc @@ -752,7 +752,7 @@ CBoostedTreeFactory::estimateTreeGainAndCurvature(core::CDataFrame& frame, std::size_t maximumNumberOfTrees{1}; std::swap(maximumNumberOfTrees, m_TreeImpl->m_MaximumNumberTrees); CBoostedTreeImpl::TNodeVecVec forest; - std::tie(forest, std::ignore) = m_TreeImpl->trainForest( + std::tie(forest, std::ignore, std::ignore) = m_TreeImpl->trainForest( frame, m_TreeImpl->m_TrainingRowMasks[0], m_TreeImpl->m_TestingRowMasks[0], m_TreeImpl->m_TrainingProgress); std::swap(maximumNumberOfTrees, m_TreeImpl->m_MaximumNumberTrees); @@ -820,7 +820,7 @@ CBoostedTreeFactory::testLossLineSearch(core::CDataFrame& frame, CBoostedTreeImpl::TNodeVecVec forest; double testLoss; - std::tie(forest, testLoss) = m_TreeImpl->trainForest( + std::tie(forest, testLoss, std::ignore) = m_TreeImpl->trainForest( frame, m_TreeImpl->m_TrainingRowMasks[0], m_TreeImpl->m_TestingRowMasks[0], m_TreeImpl->m_TrainingProgress); bopt.add(boptVector(regularizer), testLoss, 0.0); @@ -841,7 +841,7 @@ CBoostedTreeFactory::testLossLineSearch(core::CDataFrame& frame, } CBoostedTreeImpl::TNodeVecVec forest; double testLoss; - std::tie(forest, testLoss) = m_TreeImpl->trainForest( + std::tie(forest, testLoss, std::ignore) = m_TreeImpl->trainForest( frame, m_TreeImpl->m_TrainingRowMasks[0], m_TreeImpl->m_TestingRowMasks[0], m_TreeImpl->m_TrainingProgress); bopt.add(regularizer, testLoss, 0.0); diff --git a/lib/maths/CBoostedTreeImpl.cc b/lib/maths/CBoostedTreeImpl.cc index f2d261942d..06f9db22da 100644 --- a/lib/maths/CBoostedTreeImpl.cc +++ b/lib/maths/CBoostedTreeImpl.cc @@ -4,6 +4,7 @@ * you may not use this file except in compliance with the Elastic License. */ +#include #include #include @@ -185,6 +186,7 @@ void CBoostedTreeImpl::train(core::CDataFrame& frame, core::CStopWatch stopWatch; stopWatch.start(); std::uint64_t lastLap{stopWatch.lap()}; + m_Instrumentation->startTime(lastLap); // Hyperparameter optimisation loop. @@ -193,6 +195,9 @@ void CBoostedTreeImpl::train(core::CDataFrame& frame, while (m_CurrentRound < m_NumberRounds) { LOG_TRACE(<< "Optimisation round = " << m_CurrentRound + 1); + m_Instrumentation->iteration(m_CurrentRound + 1); + + this->recordHyperparameters(); TMeanVarAccumulator lossMoments; std::size_t maximumNumberTrees; @@ -216,7 +221,10 @@ void CBoostedTreeImpl::train(core::CDataFrame& frame, LOG_TRACE(<< "Round " << m_CurrentRound << " state recording finished"); std::uint64_t currentLap{stopWatch.lap()}; - timeAccumulator.add(static_cast(currentLap - lastLap)); + std::uint64_t delta = currentLap - lastLap; + m_Instrumentation->iterationTime(delta); + + timeAccumulator.add(static_cast(delta)); lastLap = currentLap; m_Instrumentation->nextStep(static_cast(m_CurrentRound)); } @@ -224,7 +232,7 @@ void CBoostedTreeImpl::train(core::CDataFrame& frame, LOG_TRACE(<< "Test loss = " << m_BestForestTestLoss); this->restoreBestHyperparameters(); - std::tie(m_BestForest, std::ignore) = this->trainForest( + std::tie(m_BestForest, std::ignore, std::ignore) = this->trainForest( frame, allTrainingRowsMask, allTrainingRowsMask, m_TrainingProgress); m_Instrumentation->nextStep(static_cast(m_CurrentRound)); @@ -449,13 +457,15 @@ CBoostedTreeImpl::crossValidateForest(core::CDataFrame& frame) { folds.pop_back(); TNodeVecVec forest; double loss; - std::tie(forest, loss) = this->trainForest( + TDoubleVec lossValues; + std::tie(forest, loss, lossValues) = this->trainForest( frame, m_TrainingRowMasks[fold], m_TestingRowMasks[fold], m_TrainingProgress); LOG_TRACE(<< "fold = " << fold << " forest size = " << forest.size() << " test set loss = " << loss); lossMoments.add(loss); m_FoldRoundTestLosses[fold][m_CurrentRound] = loss; numberTrees.push_back(static_cast(forest.size())); + m_Instrumentation->lossValues(fold, std::move(lossValues)); } m_TrainingProgress.increment(m_MaximumNumberTrees * folds.size()); LOG_TRACE(<< "skipped " << folds.size() << " folds"); @@ -496,7 +506,7 @@ CBoostedTreeImpl::TNodeVec CBoostedTreeImpl::initializePredictionsAndLossDerivat return tree; } -CBoostedTreeImpl::TNodeVecVecDoublePr +CBoostedTreeImpl::TNodeVecVecDoubleDoubleVecTuple CBoostedTreeImpl::trainForest(core::CDataFrame& frame, const core::CPackedBitVector& trainingRowMask, const core::CPackedBitVector& testingRowMask, @@ -537,6 +547,9 @@ CBoostedTreeImpl::trainForest(core::CDataFrame& frame, scopeMemoryUsage.add(candidateSplits); std::size_t retries{0}; + + TDoubleVec losses; + losses.reserve(m_MaximumNumberTrees); CTrainForestStoppingCondition stoppingCondition{m_MaximumNumberTrees}; do { auto tree = this->trainTree(frame, downsampledRowMask, candidateSplits, maximumTreeSize); @@ -570,7 +583,10 @@ CBoostedTreeImpl::trainForest(core::CDataFrame& frame, std::max(0.5 / eta, MINIMUM_SPLIT_REFRESH_INTERVAL)); } } while (stoppingCondition.shouldStop(forest.size(), [&]() { - return this->meanLoss(frame, testingRowMask); + // TODO store loss values here somewhere??? + double loss = this->meanLoss(frame, testingRowMask); + losses.push_back(loss); + return loss; }) == false); LOG_TRACE(<< "Stopped at " << forest.size() - 1 << "/" << m_MaximumNumberTrees); @@ -582,7 +598,7 @@ CBoostedTreeImpl::trainForest(core::CDataFrame& frame, LOG_TRACE(<< "Trained one forest"); - return {forest, stoppingCondition.bestLoss()}; + return {forest, stoppingCondition.bestLoss(), losses}; } core::CPackedBitVector @@ -1195,6 +1211,27 @@ std::size_t CBoostedTreeImpl::maximumTreeSize(std::size_t numberRows) const { std::ceil(10.0 * std::sqrt(static_cast(numberRows)))); } +void CBoostedTreeImpl::recordHyperparameters() { + m_Instrumentation->hyperparameters().s_Eta = m_Eta; + m_Instrumentation->hyperparameters().s_ClassAssignmentObjective = m_ClassAssignmentObjective; + m_Instrumentation->hyperparameters().s_DownsampleFactor = m_DownsampleFactor; + m_Instrumentation->hyperparameters().s_NumFolds = m_NumberFolds; + m_Instrumentation->hyperparameters().s_MaxTrees = m_MaximumNumberTrees; + m_Instrumentation->hyperparameters().s_FeatureBagFraction = m_FeatureBagFraction; + m_Instrumentation->hyperparameters().s_EtaGrowthRatePerTree = m_EtaGrowthRatePerTree; + m_Instrumentation->hyperparameters().s_MaxAttemptsToAddTree = m_MaximumAttemptsToAddTree; + m_Instrumentation->hyperparameters().s_NumSplitsPerFeature = m_NumberSplitsPerFeature; + m_Instrumentation->hyperparameters().s_MaxOptimizationRoundsPerHyperparameter = + m_MaximumOptimisationRoundsPerHyperparameter; + m_Instrumentation->hyperparameters().s_Regularization = + CDataFrameTrainBoostedTreeInstrumentationInterface::SRegularization{ + m_Regularization.depthPenaltyMultiplier(), + m_Regularization.softTreeDepthLimit(), + m_Regularization.softTreeDepthTolerance(), + m_Regularization.treeSizePenaltyMultiplier(), + m_Regularization.leafWeightPenaltyMultiplier()}; +} + namespace { const std::string VERSION_7_7_TAG{"7.7"}; const TStrVec SUPPORTED_VERSIONS{VERSION_7_7_TAG}; From a7d8c32f57562ad1436718bddb3b9cd982b05f71 Mon Sep 17 00:00:00 2001 From: Valeriy Khakhutskyy <1292899+valeriy42@users.noreply.github.com> Date: Wed, 26 Feb 2020 10:23:36 +0100 Subject: [PATCH 08/40] added unit test for training analysis stats --- .../api/CDataFrameAnalysisInstrumentation.h | 17 ++- ...ataFrameAnalysisInstrumentationInterface.h | 9 ++ lib/api/CDataFrameAnalysisInstrumentation.cc | 106 ++++++++++++++++-- .../CDataFrameAnalysisInstrumentationTest.cc | 28 +++++ .../unittest/CDataFrameAnalyzerOutlierTest.cc | 2 +- lib/api/unittest/Makefile | 1 + 6 files changed, 144 insertions(+), 19 deletions(-) diff --git a/include/api/CDataFrameAnalysisInstrumentation.h b/include/api/CDataFrameAnalysisInstrumentation.h index 86f9e765bd..b5d56d1747 100644 --- a/include/api/CDataFrameAnalysisInstrumentation.h +++ b/include/api/CDataFrameAnalysisInstrumentation.h @@ -16,6 +16,7 @@ #include #include +#include namespace ml { namespace api { @@ -28,6 +29,8 @@ namespace api { //! write the state at different iteration into the results pipe. class API_EXPORT CDataFrameAnalysisInstrumentation : virtual public maths::CDataFrameAnalysisInstrumentationInterface { +public: + using TRapidJsonWriter = core::CRapidJsonConcurrentLineWriter; public: explicit CDataFrameAnalysisInstrumentation(const std::string& jobId); @@ -59,7 +62,7 @@ class API_EXPORT CDataFrameAnalysisInstrumentation void resetProgress(); //! Set pointer to the writer object. - void writer(core::CRapidJsonConcurrentLineWriter* writer); + void writer(TRapidJsonWriter* writer); //! Trigger the next step of the job. This will initiate writing the job state //! to the results pipe. @@ -72,19 +75,18 @@ class API_EXPORT CDataFrameAnalysisInstrumentation protected: virtual counter_t::ECounterTypes memoryCounterType() = 0; - core::CRapidJsonConcurrentLineWriter* writer(); + TRapidJsonWriter* writer(); private: - void writeProgress(std::uint32_t step); void writeMemory(std::int64_t timestamp); - virtual void writeAnalysisStats(std::int64_t timestamp, std::uint32_t step) = 0; - void writeState(std::uint32_t step); + virtual void writeAnalysisStats(std::int64_t /* timestamp */, std::uint32_t /* step */) {}; + virtual void writeState(std::uint32_t step); private: std::atomic_bool m_Finished; std::atomic_size_t m_FractionalProgress; std::atomic m_Memory; - core::CRapidJsonConcurrentLineWriter* m_Writer; + TRapidJsonWriter* m_Writer; std::string m_JobId; }; @@ -124,6 +126,9 @@ class API_EXPORT CDataFrameTrainBoostedTreeInstrumentation final private: void writeAnalysisStats(std::int64_t timestamp, std::uint32_t step) override; + void writeHyperparameters(rapidjson::Value& parentObject); + void writeValidationLoss(rapidjson::Value& parentObject); + void writeTimingStats(rapidjson::Value& parentObject); private: SHyperparameters m_Hyperparameters; diff --git a/include/maths/CDataFrameAnalysisInstrumentationInterface.h b/include/maths/CDataFrameAnalysisInstrumentationInterface.h index 0f98234b3b..99734f6b04 100644 --- a/include/maths/CDataFrameAnalysisInstrumentationInterface.h +++ b/include/maths/CDataFrameAnalysisInstrumentationInterface.h @@ -111,6 +111,15 @@ class MATHS_EXPORT CDataFrameTrainBoostedTreeInstrumentationInterface virtual SHyperparameters& hyperparameters() = 0; }; +//! \brief Dummies out all instrumentation. +class MATHS_EXPORT CDataFrameOutliersInstrumentationStub final + : public CDataFrameOutliersInstrumentationInterface { +public: + void updateMemoryUsage(std::int64_t) override {} + void updateProgress(double) override {} + void nextStep(std::uint32_t) override {} +}; + //! \brief Dummies out all instrumentation. class MATHS_EXPORT CDataFrameAnalysisInstrumentationStub final : public CDataFrameTrainBoostedTreeInstrumentationInterface { diff --git a/lib/api/CDataFrameAnalysisInstrumentation.cc b/lib/api/CDataFrameAnalysisInstrumentation.cc index 250a402937..985c03db83 100644 --- a/lib/api/CDataFrameAnalysisInstrumentation.cc +++ b/lib/api/CDataFrameAnalysisInstrumentation.cc @@ -7,6 +7,7 @@ #include #include +#include namespace ml { namespace api { @@ -31,6 +32,27 @@ const std::string VALIDATION_NUM_FOLDS_TAG{"num_folds"}; const std::string TIMING_ELAPSED_TIME_TAG{"elapsed_time"}; const std::string TIMING_ITERATION_TIME_TAG{"iteration_time"}; +// Hyperparameters +const std::string ETA_TAG{"eta"}; +const std::string CLASS_ASSIGNMENT_OBJECTIVE_TAG{"class_assignment_objective"}; +const std::string REGULARIZATION_DEPTH_PENALTY_MULTIPLIER_TAG{"regularization_depth_penalty_multiplier"}; +const std::string REGULARIZATION_SOFT_TREE_DEPTH_LIMIT_TAG{"regularization_soft_tree_depth_limit"}; +const std::string REGULARIZATION_SOFT_TREE_DEPTH_TOLERANCE_TAG{ + "regularization_soft_tree_depth_tolerance"}; +const std::string REGULARIZATION_TREE_SIZE_PENALTY_MULTIPLIER_TAG{ + "regularization_tree_size_penalty_multiplier"}; +const std::string REGULARIZATION_LEAF_WEIGHT_PENALTY_MULTIPLIER_TAG{ + "regularization_leaf_weight_penalty_multiplier"}; +const std::string DOWNSAMPLE_FACTOR_TAG{"downsample_factor"}; +const std::string NUM_FOLDS_TAG{"num_folds"}; +const std::string MAX_TREES_TAG{"max_trees"}; +const std::string FEATURE_BAG_FRACTION_TAG{"feature_bag_fraction"}; +const std::string ETA_GROWTH_RATE_PER_TREE_TAG{"eta_growth_rate_per_tree"}; +const std::string MAX_ATTEMPTS_TO_ADD_TREE_TAG{"max_attempts_to_add_tree"}; +const std::string NUM_SPLITS_PER_FEATURE_TAG{"num_splits_per_feature"}; +const std::string MAX_OPTIMIZATION_ROUNDS_PER_HYPERPARAMETER_TAG{ + "max_optimization_rounds_per_hyperparameter"}; + const std::size_t MAXIMUM_FRACTIONAL_PROGRESS{std::size_t{1} << ((sizeof(std::size_t) - 2) * 8)}; } @@ -86,26 +108,15 @@ void CDataFrameAnalysisInstrumentation::nextStep(std::uint32_t step) { } void CDataFrameAnalysisInstrumentation::writeState(std::uint32_t step) { - // this->writeProgress(step); std::int64_t timestamp{core::CTimeUtils::toEpochMs(core::CTimeUtils::now())}; this->writeMemory(timestamp); + this->writeAnalysisStats(timestamp, step); } std::int64_t CDataFrameAnalysisInstrumentation::memory() const { return m_Memory.load(); } -void CDataFrameAnalysisInstrumentation::writeProgress(std::uint32_t step) { - if (m_Writer != nullptr) { - m_Writer->StartObject(); - m_Writer->Key(STEP_TAG); - m_Writer->Uint(step); - m_Writer->Key(PROGRESS_TAG); - m_Writer->Double(this->progress()); - m_Writer->EndObject(); - } -} - void CDataFrameAnalysisInstrumentation::writeMemory(std::int64_t timestamp) { if (m_Writer != nullptr) { m_Writer->StartObject(); @@ -159,8 +170,79 @@ void CDataFrameTrainBoostedTreeInstrumentation::writeAnalysisStats(std::int64_t writer->String(this->jobId()); writer->Key(TIMESTAMP_TAG); writer->Int64(timestamp); + rapidjson::Value hyperparametersObject{writer->makeObject()}; + this->writeHyperparameters(hyperparametersObject); + writer->Key(HYPERPARAMETERS_TAG); + writer->write(hyperparametersObject); writer->EndObject(); } } + +void CDataFrameTrainBoostedTreeInstrumentation::writeHyperparameters(rapidjson::Value& parentObject) { + if (this->writer() != nullptr) { + + this->writer()->addMember( + ETA_TAG, rapidjson::Value(this->m_Hyperparameters.s_Eta).Move(), parentObject); + // TODO convert from ENUM to String + this->writer()->addMember( + CLASS_ASSIGNMENT_OBJECTIVE_TAG, + rapidjson::Value(this->m_Hyperparameters.s_ClassAssignmentObjective).Move(), + parentObject); + this->writer()->addMember( + REGULARIZATION_DEPTH_PENALTY_MULTIPLIER_TAG, + rapidjson::Value(this->m_Hyperparameters.s_Regularization.s_DepthPenaltyMultiplier) + .Move(), + parentObject); + this->writer()->addMember( + REGULARIZATION_SOFT_TREE_DEPTH_LIMIT_TAG, + rapidjson::Value(this->m_Hyperparameters.s_Regularization.s_SoftTreeDepthLimit) + .Move(), + parentObject); + this->writer()->addMember( + REGULARIZATION_SOFT_TREE_DEPTH_TOLERANCE_TAG, + rapidjson::Value(this->m_Hyperparameters.s_Regularization.s_SoftTreeDepthTolerance) + .Move(), + parentObject); + this->writer()->addMember( + REGULARIZATION_TREE_SIZE_PENALTY_MULTIPLIER_TAG, + rapidjson::Value(this->m_Hyperparameters.s_Regularization.s_TreeSizePenaltyMultiplier) + .Move(), + parentObject); + this->writer()->addMember( + REGULARIZATION_LEAF_WEIGHT_PENALTY_MULTIPLIER_TAG, + rapidjson::Value(this->m_Hyperparameters.s_Regularization.s_LeafWeightPenaltyMultiplier) + .Move(), + parentObject); + this->writer()->addMember( + DOWNSAMPLE_FACTOR_TAG, + rapidjson::Value(this->m_Hyperparameters.s_DownsampleFactor).Move(), parentObject); + this->writer()->addMember( + NUM_FOLDS_TAG, rapidjson::Value(this->m_Hyperparameters.s_NumFolds).Move(), parentObject); + this->writer()->addMember( + MAX_TREES_TAG, rapidjson::Value(this->m_Hyperparameters.s_MaxTrees).Move(), parentObject); + this->writer()->addMember( + FEATURE_BAG_FRACTION_TAG, + rapidjson::Value(this->m_Hyperparameters.s_FeatureBagFraction).Move(), parentObject); + this->writer()->addMember( + ETA_GROWTH_RATE_PER_TREE_TAG, + rapidjson::Value(this->m_Hyperparameters.s_EtaGrowthRatePerTree).Move(), + parentObject); + this->writer()->addMember( + MAX_ATTEMPTS_TO_ADD_TREE_TAG, + rapidjson::Value(this->m_Hyperparameters.s_MaxAttemptsToAddTree).Move(), + parentObject); + this->writer()->addMember( + NUM_SPLITS_PER_FEATURE_TAG, + rapidjson::Value(this->m_Hyperparameters.s_NumSplitsPerFeature).Move(), parentObject); + this->writer()->addMember(MAX_OPTIMIZATION_ROUNDS_PER_HYPERPARAMETER_TAG, + rapidjson::Value(this->m_Hyperparameters.s_MaxOptimizationRoundsPerHyperparameter) + .Move(), + parentObject); + } +} +void CDataFrameTrainBoostedTreeInstrumentation::writeValidationLoss(rapidjson::Value& /* parentObject */) { +} +void CDataFrameTrainBoostedTreeInstrumentation::writeTimingStats(rapidjson::Value& /* parentObject */) { +} } } diff --git a/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc b/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc index 5e030c1d20..37e768c4ad 100644 --- a/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc +++ b/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc @@ -45,4 +45,32 @@ BOOST_AUTO_TEST_CASE(testMemoryState) { BOOST_TEST_REQUIRE(result["timestamp"].GetInt64() <= timeAfter); } +BOOST_AUTO_TEST_CASE(testAnalysisTrainState) { + std::string jobId{"JOB123"}; + std::int64_t memoryUsage{1000}; + std::int64_t timeBefore{core::CTimeUtils::toEpochMs(core::CTimeUtils::now())}; + std::stringstream outputStream; + { + core::CJsonOutputStreamWrapper streamWrapper(outputStream); + core::CRapidJsonConcurrentLineWriter writer(streamWrapper); + api::CDataFrameTrainBoostedTreeInstrumentation instrumentation(jobId); + instrumentation.writer(&writer); + instrumentation.nextStep(0); + outputStream.flush(); + } + std::int64_t timeAfter{core::CTimeUtils::toEpochMs(core::CTimeUtils::now())}; + LOG_DEBUG(<(ok) == true); + BOOST_TEST_REQUIRE(results.IsArray() == true); + + const auto& result{results[0]}; + BOOST_TEST_REQUIRE(result["job_id"].GetString() == jobId); + BOOST_TEST_REQUIRE(result["type"].GetString() == "analytics_memory_usage"); + BOOST_TEST_REQUIRE(result["timestamp"].GetInt64() >= timeBefore); + BOOST_TEST_REQUIRE(result["timestamp"].GetInt64() <= timeAfter); +} + BOOST_AUTO_TEST_SUITE_END() diff --git a/lib/api/unittest/CDataFrameAnalyzerOutlierTest.cc b/lib/api/unittest/CDataFrameAnalyzerOutlierTest.cc index 7bfaa8a4cc..0ee663fcf7 100644 --- a/lib/api/unittest/CDataFrameAnalyzerOutlierTest.cc +++ b/lib/api/unittest/CDataFrameAnalyzerOutlierTest.cc @@ -84,7 +84,7 @@ void addOutlierTestData(TStrVec fieldNames, } frame->finishWritingRows(); - maths::CDataFrameAnalysisInstrumentationStub instrumentation; + maths::CDataFrameOutliersInstrumentationStub instrumentation; maths::COutliers::compute( {1, 1, true, method, numberNeighbours, computeFeatureInfluence, 0.05}, *frame, instrumentation); diff --git a/lib/api/unittest/Makefile b/lib/api/unittest/Makefile index 18e64a2cd4..34bcc8ba4e 100644 --- a/lib/api/unittest/Makefile +++ b/lib/api/unittest/Makefile @@ -26,6 +26,7 @@ SRCS=\ CConfigUpdaterTest.cc \ CCsvInputParserTest.cc \ CCsvOutputWriterTest.cc \ + CDataFrameAnalysisInstrumentationTest.cc \ CDataFrameAnalysisRunnerTest.cc \ CDataFrameAnalysisSpecificationTest.cc \ CDataFrameAnalyzerFeatureImportanceTest.cc \ From c722d7ea79a79220896f4655d5b9dd98ed516f68 Mon Sep 17 00:00:00 2001 From: Valeriy Khakhutskyy <1292899+valeriy42@users.noreply.github.com> Date: Thu, 27 Feb 2020 11:40:27 +0100 Subject: [PATCH 09/40] Add unit test with schema validation --- .../api/CDataFrameAnalysisInstrumentation.h | 39 ++- ...ataFrameAnalysisInstrumentationInterface.h | 16 +- lib/api/CDataFrameAnalysisInstrumentation.cc | 281 ++++++++++++------ .../CDataFrameAnalysisInstrumentationTest.cc | 256 +++++++++++++++- .../analysis_stats.schema.json | 43 +++ .../instrumentation/memory_usage.schema.json | 25 ++ .../outlier_detection_stats.schema.json | 55 ++++ .../supervised_learning_stats.schema.json | 112 +++++++ lib/maths/CBoostedTreeFactory.cc | 2 + lib/maths/CBoostedTreeImpl.cc | 13 +- lib/maths/COutliers.cc | 8 +- 11 files changed, 722 insertions(+), 128 deletions(-) create mode 100644 lib/api/unittest/testfiles/instrumentation/analysis_stats.schema.json create mode 100644 lib/api/unittest/testfiles/instrumentation/memory_usage.schema.json create mode 100644 lib/api/unittest/testfiles/instrumentation/outlier_detection_stats.schema.json create mode 100644 lib/api/unittest/testfiles/instrumentation/supervised_learning_stats.schema.json diff --git a/include/api/CDataFrameAnalysisInstrumentation.h b/include/api/CDataFrameAnalysisInstrumentation.h index b5d56d1747..fe1b0b1487 100644 --- a/include/api/CDataFrameAnalysisInstrumentation.h +++ b/include/api/CDataFrameAnalysisInstrumentation.h @@ -14,9 +14,11 @@ #include +#include + #include #include -#include +#include namespace ml { namespace api { @@ -66,7 +68,7 @@ class API_EXPORT CDataFrameAnalysisInstrumentation //! Trigger the next step of the job. This will initiate writing the job state //! to the results pipe. - void nextStep(std::uint32_t step) override; + void nextStep(const std::string& phase = "") override; //! \return The peak memory usage. std::int64_t memory() const; @@ -79,8 +81,8 @@ class API_EXPORT CDataFrameAnalysisInstrumentation private: void writeMemory(std::int64_t timestamp); - virtual void writeAnalysisStats(std::int64_t /* timestamp */, std::uint32_t /* step */) {}; - virtual void writeState(std::uint32_t step); + virtual void writeAnalysisStats(std::int64_t /* timestamp */){}; + virtual void writeState(); private: std::atomic_bool m_Finished; @@ -101,7 +103,7 @@ class API_EXPORT CDataFrameOutliersInstrumentation final counter_t::ECounterTypes memoryCounterType() override; private: - void writeAnalysisStats(std::int64_t timestamp, std::uint32_t step) override; + void writeAnalysisStats(std::int64_t timestamp) override; }; class API_EXPORT CDataFrameTrainBoostedTreeInstrumentation final @@ -111,26 +113,35 @@ class API_EXPORT CDataFrameTrainBoostedTreeInstrumentation final explicit CDataFrameTrainBoostedTreeInstrumentation(const std::string& jobId) : CDataFrameAnalysisInstrumentation(jobId){}; - void type(EStatsType /* type */) override{}; - void iteration(std::size_t /* iteration */) override{}; - void startTime(std::uint64_t /* timestamp */) override{}; - void iterationTime(std::uint64_t /* delta */) override{}; - void lossType(const std::string& /* lossType */) override{}; - void lossValues(std::size_t /* fold */, TDoubleVec&& /* lossValues */) override{}; - void numFolds(std::size_t /* numFolds */) override{}; - void hyperparameters(const SHyperparameters& /* hyperparameters */) override{}; + void type(EStatsType type) override; + void iteration(std::size_t iteration) override; + void iterationTime(std::uint64_t delta) override; + void lossType(const std::string& lossType) override; + void lossValues(std::string fold, TDoubleVec&& lossValues) override; + void numFolds(std::size_t numFolds) override; + void hyperparameters(const SHyperparameters& hyperparameters) override; SHyperparameters& hyperparameters() override { return m_Hyperparameters; }; protected: counter_t::ECounterTypes memoryCounterType() override; private: - void writeAnalysisStats(std::int64_t timestamp, std::uint32_t step) override; + using TLossMap = std::unordered_map; + +private: + void writeAnalysisStats(std::int64_t timestamp) override; void writeHyperparameters(rapidjson::Value& parentObject); void writeValidationLoss(rapidjson::Value& parentObject); void writeTimingStats(rapidjson::Value& parentObject); private: + EStatsType m_Type; + std::size_t m_Iteration; + std::uint64_t m_IterationTime; + std::uint64_t m_ElapsedTime; + std::string m_LossType; + TLossMap m_LossValues; + std::size_t m_NumFolds; SHyperparameters m_Hyperparameters; }; } diff --git a/include/maths/CDataFrameAnalysisInstrumentationInterface.h b/include/maths/CDataFrameAnalysisInstrumentationInterface.h index 99734f6b04..8a9d1f796e 100644 --- a/include/maths/CDataFrameAnalysisInstrumentationInterface.h +++ b/include/maths/CDataFrameAnalysisInstrumentationInterface.h @@ -24,7 +24,7 @@ class MATHS_EXPORT CDataFrameAnalysisInstrumentationInterface { public: using TProgressCallback = std::function; using TMemoryUsageCallback = std::function; - using TStepCallback = std::function; + using TStepCallback = std::function; public: virtual ~CDataFrameAnalysisInstrumentationInterface() = default; @@ -41,7 +41,7 @@ class MATHS_EXPORT CDataFrameAnalysisInstrumentationInterface { virtual void updateProgress(double fractionalProgress) = 0; //! Trigger the next step of the job. This will initiate writing the job state //! to the results pipe. - virtual void nextStep(std::uint32_t step) = 0; + virtual void nextStep(const std::string& phase = "") = 0; //! Factory for the updateProgress() callback function object. TProgressCallback progressCallback() { return [this](double fractionalProgress) { @@ -54,7 +54,7 @@ class MATHS_EXPORT CDataFrameAnalysisInstrumentationInterface { } //! Factory for the nextStep() callback function object. TStepCallback stepCallback() { - return [this](std::uint32_t step) { this->nextStep(step); }; + return [this](const std::string& phase) { this->nextStep(phase); }; } }; @@ -102,10 +102,9 @@ class MATHS_EXPORT CDataFrameTrainBoostedTreeInstrumentationInterface virtual ~CDataFrameTrainBoostedTreeInstrumentationInterface() = default; virtual void type(EStatsType type) = 0; virtual void iteration(std::size_t iteration) = 0; - virtual void startTime(std::uint64_t timestamp) = 0; virtual void iterationTime(std::uint64_t delta) = 0; virtual void lossType(const std::string& lossType) = 0; - virtual void lossValues(std::size_t fold, TDoubleVec&& lossValues) = 0; + virtual void lossValues(std::string fold, TDoubleVec&& lossValues) = 0; virtual void numFolds(std::size_t numFolds) = 0; virtual void hyperparameters(const SHyperparameters& hyperparameters) = 0; virtual SHyperparameters& hyperparameters() = 0; @@ -117,7 +116,7 @@ class MATHS_EXPORT CDataFrameOutliersInstrumentationStub final public: void updateMemoryUsage(std::int64_t) override {} void updateProgress(double) override {} - void nextStep(std::uint32_t) override {} + void nextStep(const std::string& /* phase */) override {} }; //! \brief Dummies out all instrumentation. @@ -126,13 +125,12 @@ class MATHS_EXPORT CDataFrameAnalysisInstrumentationStub final public: void updateMemoryUsage(std::int64_t) override {} void updateProgress(double) override {} - void nextStep(std::uint32_t) override {} + void nextStep(const std::string& /* phase */) override {} void type(EStatsType /* type */) override{}; void iteration(std::size_t /* iteration */) override{}; - void startTime(std::uint64_t /* timestamp */) override{}; void iterationTime(std::uint64_t /* delta */) override{}; void lossType(const std::string& /* lossType */) override{}; - void lossValues(std::size_t /* fold */, TDoubleVec&& /* lossValues */) override{}; + void lossValues(std::string /* fold */, TDoubleVec&& /* lossValues */) override{}; void numFolds(std::size_t /* numFolds */) override{}; void hyperparameters(const SHyperparameters& /* hyperparameters */) override{}; SHyperparameters& hyperparameters() override { return m_Hyperparameters; }; diff --git a/lib/api/CDataFrameAnalysisInstrumentation.cc b/lib/api/CDataFrameAnalysisInstrumentation.cc index 985c03db83..e97bc6b4c7 100644 --- a/lib/api/CDataFrameAnalysisInstrumentation.cc +++ b/lib/api/CDataFrameAnalysisInstrumentation.cc @@ -3,55 +3,60 @@ * or more contributor license agreements. Licensed under the Elastic License; * you may not use this file except in compliance with the Elastic License. */ +#include "/usr/local/gcc73/include/boost-1_71/boost/iostreams/filter/zlib.hpp" #include -#include +#include #include + +#include #include +#include namespace ml { namespace api { namespace { -const std::string STEP_TAG{"step"}; -const std::string PROGRESS_TAG{"progress"}; -const std::string PEAK_MEMORY_USAGE_TAG{"peak_usage_bytes"}; -const std::string TYPE_TAG{"type"}; + +// clang-format off +const std::string ANALYSIS_TYPE_TAG{"analysis_stats"}; +const std::string CLASSIFICATION_STATS_TAG{"classification_stats"}; +const std::string HYPERPARAMETERS_TAG{"hyperparameters"}; +const std::string ITERATION_TAG{"iteration"}; const std::string JOB_ID_TAG{"job_id"}; -const std::string TIMESTAMP_TAG{"timestamp"}; const std::string MEMORY_TYPE_TAG{"analytics_memory_usage"}; -const std::string ANALYSIS_TYPE_TAG{"analysis_stats"}; +const std::string PEAK_MEMORY_USAGE_TAG{"peak_usage_bytes"}; +const std::string PROGRESS_TAG{"progress"}; const std::string REGRESSION_STATS_TAG{"regression_stats"}; -const std::string ITERATION_TAG{"iteration"}; -const std::string HYPERPARAMETERS_TAG{"hyperparameters"}; -const std::string VALIDATION_LOSS_TAG{"validation_loss"}; +const std::string STEP_TAG{"step"}; +const std::string TIMESTAMP_TAG{"timestamp"}; +const std::string TIMING_ELAPSED_TIME_TAG{"elapsed_time"}; +const std::string TIMING_ITERATION_TIME_TAG{"iteration_time"}; const std::string TIMING_STATS_TAG{"timing_stats"}; +const std::string TYPE_TAG{"type"}; +const std::string VALIDATION_LOSS_TAG{"validation_loss"}; const std::string VALIDATION_LOSS_TYPE_TAG{"loss_type"}; const std::string VALIDATION_LOSS_VALUES_TAG{"values"}; const std::string VALIDATION_NUM_FOLDS_TAG{"num_folds"}; -const std::string TIMING_ELAPSED_TIME_TAG{"elapsed_time"}; -const std::string TIMING_ITERATION_TIME_TAG{"iteration_time"}; // Hyperparameters -const std::string ETA_TAG{"eta"}; const std::string CLASS_ASSIGNMENT_OBJECTIVE_TAG{"class_assignment_objective"}; -const std::string REGULARIZATION_DEPTH_PENALTY_MULTIPLIER_TAG{"regularization_depth_penalty_multiplier"}; -const std::string REGULARIZATION_SOFT_TREE_DEPTH_LIMIT_TAG{"regularization_soft_tree_depth_limit"}; -const std::string REGULARIZATION_SOFT_TREE_DEPTH_TOLERANCE_TAG{ - "regularization_soft_tree_depth_tolerance"}; -const std::string REGULARIZATION_TREE_SIZE_PENALTY_MULTIPLIER_TAG{ - "regularization_tree_size_penalty_multiplier"}; -const std::string REGULARIZATION_LEAF_WEIGHT_PENALTY_MULTIPLIER_TAG{ - "regularization_leaf_weight_penalty_multiplier"}; +const std::string CLASS_ASSIGNMENT_OBJECTIVE[]{"accuracy", "minimum_recall"}; const std::string DOWNSAMPLE_FACTOR_TAG{"downsample_factor"}; -const std::string NUM_FOLDS_TAG{"num_folds"}; -const std::string MAX_TREES_TAG{"max_trees"}; -const std::string FEATURE_BAG_FRACTION_TAG{"feature_bag_fraction"}; const std::string ETA_GROWTH_RATE_PER_TREE_TAG{"eta_growth_rate_per_tree"}; +const std::string ETA_TAG{"eta"}; +const std::string FEATURE_BAG_FRACTION_TAG{"feature_bag_fraction"}; const std::string MAX_ATTEMPTS_TO_ADD_TREE_TAG{"max_attempts_to_add_tree"}; +const std::string MAX_OPTIMIZATION_ROUNDS_PER_HYPERPARAMETER_TAG{"max_optimization_rounds_per_hyperparameter"}; +const std::string MAX_TREES_TAG{"max_trees"}; +const std::string NUM_FOLDS_TAG{"num_folds"}; const std::string NUM_SPLITS_PER_FEATURE_TAG{"num_splits_per_feature"}; -const std::string MAX_OPTIMIZATION_ROUNDS_PER_HYPERPARAMETER_TAG{ - "max_optimization_rounds_per_hyperparameter"}; +const std::string REGULARIZATION_DEPTH_PENALTY_MULTIPLIER_TAG{"regularization_depth_penalty_multiplier"}; +const std::string REGULARIZATION_LEAF_WEIGHT_PENALTY_MULTIPLIER_TAG{"regularization_leaf_weight_penalty_multiplier"}; +const std::string REGULARIZATION_SOFT_TREE_DEPTH_LIMIT_TAG{"regularization_soft_tree_depth_limit"}; +const std::string REGULARIZATION_SOFT_TREE_DEPTH_TOLERANCE_TAG{"regularization_soft_tree_depth_tolerance"}; +const std::string REGULARIZATION_TREE_SIZE_PENALTY_MULTIPLIER_TAG{"regularization_tree_size_penalty_multiplier"}; +// clang-format on const std::size_t MAXIMUM_FRACTIONAL_PROGRESS{std::size_t{1} << ((sizeof(std::size_t) - 2) * 8)}; @@ -60,7 +65,7 @@ const std::size_t MAXIMUM_FRACTIONAL_PROGRESS{std::size_t{1} void CDataFrameAnalysisInstrumentation::updateMemoryUsage(std::int64_t delta) { std::int64_t memory{m_Memory.fetch_add(delta)}; if (memory >= 0) { - core::CProgramCounters::counter(this->memoryCounterType()).max(memory); + core::CProgramCounters::counter(this->memoryCounterType()).max(static_cast(memory)); } else { // Something has gone wrong with memory estimation. Trap this case // to avoid underflowing the peak memory usage statistic. @@ -103,14 +108,20 @@ void CDataFrameAnalysisInstrumentation::writer(core::CRapidJsonConcurrentLineWri m_Writer = writer; } -void CDataFrameAnalysisInstrumentation::nextStep(std::uint32_t step) { - this->writeState(step); +void CDataFrameAnalysisInstrumentation::nextStep(const std::string& /* phase */) { + this->writeState(); } -void CDataFrameAnalysisInstrumentation::writeState(std::uint32_t step) { +void CDataFrameAnalysisInstrumentation::writeState() { std::int64_t timestamp{core::CTimeUtils::toEpochMs(core::CTimeUtils::now())}; - this->writeMemory(timestamp); - this->writeAnalysisStats(timestamp, step); + if (m_Writer != nullptr) { + m_Writer->StartObject(); + m_Writer->Key(MEMORY_TYPE_TAG); + this->writeMemory(timestamp); + m_Writer->Key(ANALYSIS_TYPE_TAG); + this->writeAnalysisStats(timestamp); + m_Writer->EndObject(); + } } std::int64_t CDataFrameAnalysisInstrumentation::memory() const { @@ -148,8 +159,7 @@ counter_t::ECounterTypes CDataFrameTrainBoostedTreeInstrumentation::memoryCounte return counter_t::E_DFTPMPeakMemoryUsage; } -void CDataFrameOutliersInstrumentation::writeAnalysisStats(std::int64_t timestamp, - std::uint32_t /*step*/) { +void CDataFrameOutliersInstrumentation::writeAnalysisStats(std::int64_t timestamp) { auto* writer{this->writer()}; if (writer != nullptr) { writer->StartObject(); @@ -161,8 +171,37 @@ void CDataFrameOutliersInstrumentation::writeAnalysisStats(std::int64_t timestam } } -void CDataFrameTrainBoostedTreeInstrumentation::writeAnalysisStats(std::int64_t timestamp, - std::uint32_t step) { +void CDataFrameTrainBoostedTreeInstrumentation::type(EStatsType type) { + m_Type = type; +} + +void CDataFrameTrainBoostedTreeInstrumentation::iteration(std::size_t iteration) { + m_Iteration = iteration; +} + +void CDataFrameTrainBoostedTreeInstrumentation::iterationTime(std::uint64_t delta) { + m_IterationTime = delta; + m_ElapsedTime += delta; +} + +void CDataFrameTrainBoostedTreeInstrumentation::lossType(const std::string& lossType) { + m_LossType = lossType; +} + +void CDataFrameTrainBoostedTreeInstrumentation::lossValues(std::string fold, + TDoubleVec&& lossValues) { + m_LossValues.emplace(std::make_pair(fold, lossValues)); +} + +void CDataFrameTrainBoostedTreeInstrumentation::numFolds(std::size_t numFolds) { + m_NumFolds = numFolds; +} + +void CDataFrameTrainBoostedTreeInstrumentation::hyperparameters(const SHyperparameters& hyperparameters) { + m_Hyperparameters = hyperparameters; +} + +void CDataFrameTrainBoostedTreeInstrumentation::writeAnalysisStats(std::int64_t timestamp) { auto* writer{this->writer()}; if (writer != nullptr) { writer->StartObject(); @@ -170,79 +209,129 @@ void CDataFrameTrainBoostedTreeInstrumentation::writeAnalysisStats(std::int64_t writer->String(this->jobId()); writer->Key(TIMESTAMP_TAG); writer->Int64(timestamp); + if (m_Type == E_Regression) { + writer->Key(REGRESSION_STATS_TAG); + } else { + writer->Key(CLASSIFICATION_STATS_TAG); + } + writer->StartObject(); + writer->Key(ITERATION_TAG); + writer->Uint64(m_Iteration); + rapidjson::Value hyperparametersObject{writer->makeObject()}; this->writeHyperparameters(hyperparametersObject); writer->Key(HYPERPARAMETERS_TAG); writer->write(hyperparametersObject); + + rapidjson::Value validationLossObject{writer->makeObject()}; + this->writeValidationLoss(validationLossObject); + writer->Key(VALIDATION_LOSS_TAG); + writer->write(validationLossObject); + + rapidjson::Value timingStatsObject{writer->makeObject()}; + this->writeTimingStats(timingStatsObject); + writer->Key(TIMING_STATS_TAG); + writer->write(timingStatsObject); + + writer->EndObject(); writer->EndObject(); } } void CDataFrameTrainBoostedTreeInstrumentation::writeHyperparameters(rapidjson::Value& parentObject) { if (this->writer() != nullptr) { - - this->writer()->addMember( - ETA_TAG, rapidjson::Value(this->m_Hyperparameters.s_Eta).Move(), parentObject); - // TODO convert from ENUM to String - this->writer()->addMember( - CLASS_ASSIGNMENT_OBJECTIVE_TAG, - rapidjson::Value(this->m_Hyperparameters.s_ClassAssignmentObjective).Move(), - parentObject); - this->writer()->addMember( - REGULARIZATION_DEPTH_PENALTY_MULTIPLIER_TAG, - rapidjson::Value(this->m_Hyperparameters.s_Regularization.s_DepthPenaltyMultiplier) - .Move(), - parentObject); - this->writer()->addMember( - REGULARIZATION_SOFT_TREE_DEPTH_LIMIT_TAG, - rapidjson::Value(this->m_Hyperparameters.s_Regularization.s_SoftTreeDepthLimit) - .Move(), - parentObject); - this->writer()->addMember( - REGULARIZATION_SOFT_TREE_DEPTH_TOLERANCE_TAG, - rapidjson::Value(this->m_Hyperparameters.s_Regularization.s_SoftTreeDepthTolerance) - .Move(), - parentObject); - this->writer()->addMember( - REGULARIZATION_TREE_SIZE_PENALTY_MULTIPLIER_TAG, - rapidjson::Value(this->m_Hyperparameters.s_Regularization.s_TreeSizePenaltyMultiplier) - .Move(), - parentObject); - this->writer()->addMember( - REGULARIZATION_LEAF_WEIGHT_PENALTY_MULTIPLIER_TAG, - rapidjson::Value(this->m_Hyperparameters.s_Regularization.s_LeafWeightPenaltyMultiplier) - .Move(), - parentObject); - this->writer()->addMember( - DOWNSAMPLE_FACTOR_TAG, - rapidjson::Value(this->m_Hyperparameters.s_DownsampleFactor).Move(), parentObject); - this->writer()->addMember( - NUM_FOLDS_TAG, rapidjson::Value(this->m_Hyperparameters.s_NumFolds).Move(), parentObject); - this->writer()->addMember( - MAX_TREES_TAG, rapidjson::Value(this->m_Hyperparameters.s_MaxTrees).Move(), parentObject); - this->writer()->addMember( - FEATURE_BAG_FRACTION_TAG, - rapidjson::Value(this->m_Hyperparameters.s_FeatureBagFraction).Move(), parentObject); - this->writer()->addMember( - ETA_GROWTH_RATE_PER_TREE_TAG, - rapidjson::Value(this->m_Hyperparameters.s_EtaGrowthRatePerTree).Move(), - parentObject); - this->writer()->addMember( - MAX_ATTEMPTS_TO_ADD_TREE_TAG, - rapidjson::Value(this->m_Hyperparameters.s_MaxAttemptsToAddTree).Move(), - parentObject); - this->writer()->addMember( - NUM_SPLITS_PER_FEATURE_TAG, - rapidjson::Value(this->m_Hyperparameters.s_NumSplitsPerFeature).Move(), parentObject); - this->writer()->addMember(MAX_OPTIMIZATION_ROUNDS_PER_HYPERPARAMETER_TAG, - rapidjson::Value(this->m_Hyperparameters.s_MaxOptimizationRoundsPerHyperparameter) - .Move(), - parentObject); + + this->writer()->addMember( + ETA_TAG, rapidjson::Value(this->m_Hyperparameters.s_Eta).Move(), parentObject); + if (m_Type == E_Classification) { + this->writer()->addMember( + CLASS_ASSIGNMENT_OBJECTIVE_TAG, + CLASS_ASSIGNMENT_OBJECTIVE[this->m_Hyperparameters.s_ClassAssignmentObjective], + parentObject); + } + this->writer()->addMember( + REGULARIZATION_DEPTH_PENALTY_MULTIPLIER_TAG, + rapidjson::Value(this->m_Hyperparameters.s_Regularization.s_DepthPenaltyMultiplier) + .Move(), + parentObject); + this->writer()->addMember( + REGULARIZATION_SOFT_TREE_DEPTH_LIMIT_TAG, + rapidjson::Value(this->m_Hyperparameters.s_Regularization.s_SoftTreeDepthLimit) + .Move(), + parentObject); + this->writer()->addMember( + REGULARIZATION_SOFT_TREE_DEPTH_TOLERANCE_TAG, + rapidjson::Value(this->m_Hyperparameters.s_Regularization.s_SoftTreeDepthTolerance) + .Move(), + parentObject); + this->writer()->addMember( + REGULARIZATION_TREE_SIZE_PENALTY_MULTIPLIER_TAG, + rapidjson::Value(this->m_Hyperparameters.s_Regularization.s_TreeSizePenaltyMultiplier) + .Move(), + parentObject); + this->writer()->addMember( + REGULARIZATION_LEAF_WEIGHT_PENALTY_MULTIPLIER_TAG, + rapidjson::Value(this->m_Hyperparameters.s_Regularization.s_LeafWeightPenaltyMultiplier) + .Move(), + parentObject); + this->writer()->addMember( + DOWNSAMPLE_FACTOR_TAG, + rapidjson::Value(this->m_Hyperparameters.s_DownsampleFactor).Move(), + parentObject); + this->writer()->addMember( + NUM_FOLDS_TAG, + rapidjson::Value(this->m_Hyperparameters.s_NumFolds).Move(), parentObject); + this->writer()->addMember( + MAX_TREES_TAG, + rapidjson::Value(this->m_Hyperparameters.s_MaxTrees).Move(), parentObject); + this->writer()->addMember( + FEATURE_BAG_FRACTION_TAG, + rapidjson::Value(this->m_Hyperparameters.s_FeatureBagFraction).Move(), + parentObject); + this->writer()->addMember( + ETA_GROWTH_RATE_PER_TREE_TAG, + rapidjson::Value(this->m_Hyperparameters.s_EtaGrowthRatePerTree).Move(), + parentObject); + this->writer()->addMember( + MAX_ATTEMPTS_TO_ADD_TREE_TAG, + rapidjson::Value(this->m_Hyperparameters.s_MaxAttemptsToAddTree).Move(), + parentObject); + this->writer()->addMember( + NUM_SPLITS_PER_FEATURE_TAG, + rapidjson::Value(this->m_Hyperparameters.s_NumSplitsPerFeature).Move(), + parentObject); + this->writer()->addMember(MAX_OPTIMIZATION_ROUNDS_PER_HYPERPARAMETER_TAG, + rapidjson::Value(this->m_Hyperparameters.s_MaxOptimizationRoundsPerHyperparameter) + .Move(), + parentObject); } } -void CDataFrameTrainBoostedTreeInstrumentation::writeValidationLoss(rapidjson::Value& /* parentObject */) { +void CDataFrameTrainBoostedTreeInstrumentation::writeValidationLoss(rapidjson::Value& parentObject) { + auto* writer{this->writer()}; + if (writer != nullptr) { + writer->addMember(VALIDATION_LOSS_TYPE_TAG, m_LossType, parentObject); + rapidjson::Value lossValuesObject{writer->makeObject()}; + // writer->StartObject(); + for (auto& element : m_LossValues) { + rapidjson::Value array{writer->makeArray(element.second.size())}; + for (double lossValue : element.second) { + array.PushBack(rapidjson::Value(lossValue).Move(), + writer->getRawAllocator()); + } + writer->addMember(element.first, array, lossValuesObject); + } + writer->addMember(VALIDATION_LOSS_VALUES_TAG, lossValuesObject, parentObject); + // writer->EndObject(); + } } -void CDataFrameTrainBoostedTreeInstrumentation::writeTimingStats(rapidjson::Value& /* parentObject */) { +void CDataFrameTrainBoostedTreeInstrumentation::writeTimingStats(rapidjson::Value& parentObject) { + auto* writer{this->writer()}; + if (writer != nullptr) { + writer->addMember(TIMING_ELAPSED_TIME_TAG, + rapidjson::Value(m_ElapsedTime).Move(), parentObject); + writer->addMember(TIMING_ITERATION_TIME_TAG, + rapidjson::Value(m_IterationTime).Move(), parentObject); + } } } } diff --git a/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc b/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc index 37e768c4ad..d127ff24e7 100644 --- a/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc +++ b/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc @@ -4,18 +4,222 @@ * you may not use this file except in compliance with the Elastic License. */ +#include +#include +#include #include +#include +#include + #include +#include + +#include +#include +#include + +#include #include +#include +#include #include BOOST_AUTO_TEST_SUITE(CDataFrameAnalysisInstrumentationTest) using namespace ml; +namespace { + +enum EPredictionType { E_Regression, E_BinaryClassification }; +using TStrVec = std::vector; +using TDoubleVec = std::vector; +using TDataFrameUPtr = std::unique_ptr; +using TBoolVec = std::vector; +using TRowItr = core::CDataFrame::TRowItr; + +void appendPrediction(core::CDataFrame&, std::size_t, double prediction, double, TDoubleVec& predictions) { + predictions.push_back(prediction); +} + +void appendPrediction(core::CDataFrame& frame, + std::size_t columnHoldingPrediction, + double logOddsClass1, + double threshold, + TStrVec& predictions) { + predictions.push_back( + maths::CTools::logisticFunction(logOddsClass1) < threshold + ? frame.categoricalColumnValues()[columnHoldingPrediction][0] + : frame.categoricalColumnValues()[columnHoldingPrediction][1]); +} + +TDataFrameUPtr setupLinearRegressionData(const TStrVec& fieldNames, + TStrVec& fieldValues, + api::CDataFrameAnalyzer& analyzer, + const TDoubleVec& weights, + const TDoubleVec& regressors, + TStrVec& targets) { + + auto target = [&weights](const TDoubleVec& regressors_) { + double result{0.0}; + for (std::size_t i = 0; i < weights.size(); ++i) { + result += weights[i] * regressors_[i]; + } + return core::CStringUtils::typeToStringPrecise(result, core::CIEEE754::E_DoublePrecision); + }; + + auto frame = core::makeMainStorageDataFrame(weights.size() + 1).first; + + for (std::size_t i = 0; i < regressors.size(); i += weights.size()) { + TDoubleVec row(weights.size()); + for (std::size_t j = 0; j < weights.size(); ++j) { + row[j] = regressors[i + j]; + } + + for (std::size_t j = 0; j < row.size(); ++j) { + fieldValues[j] = core::CStringUtils::typeToStringPrecise( + row[j], core::CIEEE754::E_DoublePrecision); + } + fieldValues[weights.size()] = target(row); + targets.push_back(fieldValues[weights.size()]); + + analyzer.handleRecord(fieldNames, fieldValues); + frame->parseAndWriteRow( + core::CVectorRange(fieldValues, 0, weights.size() + 1)); + } + + frame->finishWritingRows(); + + return frame; +} + +TDataFrameUPtr setupBinaryClassificationData(const TStrVec& fieldNames, + TStrVec& fieldValues, + api::CDataFrameAnalyzer& analyzer, + const TDoubleVec& weights, + const TDoubleVec& regressors, + TStrVec& targets) { + TStrVec classes{"foo", "bar"}; + auto target = [&weights, &classes](const TDoubleVec& regressors_) { + double result{0.0}; + for (std::size_t i = 0; i < weights.size(); ++i) { + result += weights[i] * regressors_[i]; + } + return classes[result < 0.0 ? 0 : 1]; + }; + + auto frame = core::makeMainStorageDataFrame(weights.size() + 1).first; + TBoolVec categoricalFields(weights.size(), false); + categoricalFields.push_back(true); + frame->categoricalColumns(std::move(categoricalFields)); + + for (std::size_t i = 0; i < regressors.size(); i += weights.size()) { + TDoubleVec row(weights.size()); + for (std::size_t j = 0; j < weights.size(); ++j) { + row[j] = regressors[i + j]; + } + + for (std::size_t j = 0; j < row.size() - 1; ++j) { + fieldValues[j] = core::CStringUtils::typeToStringPrecise( + row[j], core::CIEEE754::E_DoublePrecision); + } + fieldValues[weights.size()] = target(row); + targets.push_back(fieldValues[weights.size()]); + + analyzer.handleRecord(fieldNames, fieldValues); + frame->parseAndWriteRow( + core::CVectorRange(fieldValues, 0, weights.size() + 1)); + } + + frame->finishWritingRows(); + + return frame; +} + +template +void addPredictionTestData(EPredictionType type, + const TStrVec& fieldNames, + TStrVec fieldValues, + api::CDataFrameAnalyzer& analyzer, + std::vector& expectedPredictions, + std::size_t numberExamples = 100, + double alpha = -1.0, + double lambda = -1.0, + double gamma = -1.0, + double softTreeDepthLimit = -1.0, + double softTreeDepthTolerance = -1.0, + double eta = 0.0, + std::size_t maximumNumberTrees = 0, + double featureBagFraction = 0.0) { + + test::CRandomNumbers rng; + + TDoubleVec weights; + rng.generateUniformSamples(-1.0, 1.0, fieldNames.size() - 3, weights); + TDoubleVec regressors; + rng.generateUniformSamples(-10.0, 10.0, weights.size() * numberExamples, regressors); + + TStrVec targets; + auto frame = type == E_Regression + ? setupLinearRegressionData(fieldNames, fieldValues, analyzer, + weights, regressors, targets) + : setupBinaryClassificationData(fieldNames, fieldValues, analyzer, + weights, regressors, targets); + + std::unique_ptr loss; + if (type == E_Regression) { + loss = std::make_unique(); + } else { + loss = std::make_unique(); + } + + maths::CBoostedTreeFactory treeFactory{ + maths::CBoostedTreeFactory::constructFromParameters(1, std::move(loss))}; + if (alpha >= 0.0) { + treeFactory.depthPenaltyMultiplier(alpha); + } + if (lambda >= 0.0) { + treeFactory.leafWeightPenaltyMultiplier(lambda); + } + if (gamma >= 0.0) { + treeFactory.treeSizePenaltyMultiplier(gamma); + } + if (softTreeDepthLimit >= 0.0) { + treeFactory.softTreeDepthLimit(softTreeDepthLimit); + } + if (softTreeDepthTolerance >= 0.0) { + treeFactory.softTreeDepthTolerance(softTreeDepthTolerance); + } + if (eta > 0.0) { + treeFactory.eta(eta); + } + if (maximumNumberTrees > 0) { + treeFactory.maximumNumberTrees(maximumNumberTrees); + } + if (featureBagFraction > 0.0) { + treeFactory.featureBagFraction(featureBagFraction); + } + + ml::api::CDataFrameTrainBoostedTreeInstrumentation instrumentation("testJob"); + treeFactory.analysisInstrumentation(instrumentation); + + auto tree = treeFactory.buildFor(*frame, weights.size()); + + tree->train(); + tree->predict(); + + frame->readRows(1, [&](TRowItr beginRows, TRowItr endRows) { + for (auto row = beginRows; row != endRows; ++row) { + double prediction{(*row)[tree->columnHoldingPrediction()]}; + appendPrediction(*frame, weights.size(), prediction, + tree->probabilityAtWhichToAssignClassOne(), expectedPredictions); + } + }); +} +} + BOOST_AUTO_TEST_CASE(testMemoryState) { std::string jobId{"JOB123"}; std::int64_t memoryUsage{1000}; @@ -59,7 +263,7 @@ BOOST_AUTO_TEST_CASE(testAnalysisTrainState) { outputStream.flush(); } std::int64_t timeAfter{core::CTimeUtils::toEpochMs(core::CTimeUtils::now())}; - LOG_DEBUG(<(output); + }; + + TDoubleVec expectedPredictions; + + TStrVec fieldNames{"f1", "f2", "f3", "f4", "target", ".", "."}; + TStrVec fieldValues{"", "", "", "", "", "0", ""}; + api::CDataFrameAnalyzer analyzer{ + test::CDataFrameAnalysisSpecificationFactory::predictionSpec( + test::CDataFrameAnalysisSpecificationFactory::regression(), "target"), + outputWriterFactory}; + addPredictionTestData(E_Regression, fieldNames, fieldValues, analyzer, expectedPredictions); + + analyzer.handleRecord(fieldNames, {"", "", "", "", "", "", "$"}); + + rapidjson::Document results; + rapidjson::ParseResult ok(results.Parse(output.str())); + BOOST_TEST_REQUIRE(static_cast(ok) == true); + LOG_DEBUG(<< output.str()); + + std::ifstream schemaFileStream("testfiles/instrumentation/supervised_learning_stats.schema.json"); + BOOST_REQUIRE_MESSAGE(schemaFileStream.is_open(), "Cannot open test file!"); + std::string schemaJson((std::istreambuf_iterator(schemaFileStream)), + std::istreambuf_iterator()); + rapidjson::Document schemaDocument; + BOOST_REQUIRE_MESSAGE(schemaDocument.Parse(schemaJson).HasParseError() == false, + "Cannot parse JSON schema!"); + rapidjson::SchemaDocument schema(schemaDocument); + rapidjson::SchemaValidator validator(schema); + + for (const auto& result : results.GetArray()) { + if (result.HasMember("analysis_stats")) { + BOOST_TEST_REQUIRE(result["analysis_stats"].HasMember("regression_stats")); + if (result["analysis_stats"]["regression_stats"].Accept(validator) == false) { + rapidjson::StringBuffer sb; + validator.GetInvalidSchemaPointer().StringifyUriFragment(sb); + LOG_ERROR(<< "Invalid schema: " << sb.GetString()); + LOG_ERROR(<< "Invalid keyword: " << validator.GetInvalidSchemaKeyword()); + sb.Clear(); + validator.GetInvalidDocumentPointer().StringifyUriFragment(sb); + LOG_ERROR(<< "Invalid document: " << sb.GetString()); + BOOST_FAIL("Schema validation failed"); + } + } + } +} + BOOST_AUTO_TEST_SUITE_END() diff --git a/lib/api/unittest/testfiles/instrumentation/analysis_stats.schema.json b/lib/api/unittest/testfiles/instrumentation/analysis_stats.schema.json new file mode 100644 index 0000000000..029bb8e93e --- /dev/null +++ b/lib/api/unittest/testfiles/instrumentation/analysis_stats.schema.json @@ -0,0 +1,43 @@ +{ + "$schema": "http://json-schema.org/draft-04/schema#", + "$id": "https://raw.githubusercontent.com/elastic/ml-json-schemas-private/master/schemas/instrumentation/analysis_stats.schema.json", + "title": "analysis_stats", + "type": "object", + "properties": { + "job_id": { + "type": "string", + "description": "Data Frame Analytics Job ID. Populated by Java." + }, + "timestamp": { + "type": "integer", + "description": "Milliseconds since Unix Epoch" + }, + "regression_stats": { + "$ref": "supervised_learning_stats.schema.json" + }, + "classification_stats": { + "$ref": "supervised_learning_stats.schema.json" + }, + "outlier_detection_stats": { + "$ref": "outlier_detection_stats.schema.json" + } + }, + "oneOf": [ + { + "required": [ + "regression_stats" + ] + }, + { + "required": [ + "classification_stats" + ] + }, + { + "required": [ + "outlier_detection_stats" + ] + } + ], + "additionalProperties": false +} \ No newline at end of file diff --git a/lib/api/unittest/testfiles/instrumentation/memory_usage.schema.json b/lib/api/unittest/testfiles/instrumentation/memory_usage.schema.json new file mode 100644 index 0000000000..9ef435ab81 --- /dev/null +++ b/lib/api/unittest/testfiles/instrumentation/memory_usage.schema.json @@ -0,0 +1,25 @@ +{ + "$schema": "http://json-schema.org/draft-04/schema#", + "$id": "https://raw.githubusercontent.com/elastic/ml-json-schemas-private/master/schemas/instrumentation/memory_usage.schema.json", + "description": "Data frame analytics peak memory usage", + "title": "analytics_memory_usage", + "type": "object", + "properties": { + "job_id": { + "description": "Data Frame Analytics Job ID. Populated by Java.", + "type": "string" + }, + "timestamp": { + "description": "Milliseconds since Unix Epoch", + "type": "integer" + }, + "peak_usage_bytes": { + "description": "Peak memory usage for the data frame analytics job in bytes", + "type": "integer" + } + }, + "required": [ + "peak_usage_bytes" + ], + "additionalProperties": false +} \ No newline at end of file diff --git a/lib/api/unittest/testfiles/instrumentation/outlier_detection_stats.schema.json b/lib/api/unittest/testfiles/instrumentation/outlier_detection_stats.schema.json new file mode 100644 index 0000000000..3bcab8c632 --- /dev/null +++ b/lib/api/unittest/testfiles/instrumentation/outlier_detection_stats.schema.json @@ -0,0 +1,55 @@ +{ + "$schema": "http://json-schema.org/draft-04/schema#", + "$id": "https://raw.githubusercontent.com/elastic/ml-json-schemas-private/master/schemas/instrumentation/outlier_detection_stats.schema.json", + "title": "outlier_detection_stats", + "description": "Instrumentation data specific to the outlier detection jobs.", + "type": "object", + "properties": { + "parameters": { + "type": "object", + "description": "List of job parameters specified by user or determined by algorithmic heuristics", + "properties": { + "n_neighbours": { + "description": "Defines the value for how many nearest neighbors each method of outlier detection will use to calculate its outlier score.", + "type": "integer" + }, + "methods": { + "description": "List of methods that outlier detection uses.", + "type": "array", + "items": [ + { + "type": "string" + } + ], + "uniqueItems": true + }, + "compute_feature_influence": { + "description": "If true, the feature influence calculation is enabled.", + "type": "boolean" + }, + "feature_influence_threshold": { + "description": "The minimum outlier score that a document needs to have in order to calculate its feature influence score.", + "type": "number" + }, + "outlier_fraction": { + "description": "The proportion of the data set that is assumed to be outlying prior to outlier detection.", + "type": "number" + }, + "standardization_enabled": { + "description": "If true, then the following operation is performed on the columns before computing outlier scores: (x_i - mean(x_i)) / sd(x_i).", + "type": "boolean" + } + }, + "additionalProperties": false + }, + "elapsed_time": { + "description": "Job runtime so far in ms.", + "type": "number" + } + }, + "required": [ + "parameters", + "elapsed_time" + ], + "additionalProperties": false +} diff --git a/lib/api/unittest/testfiles/instrumentation/supervised_learning_stats.schema.json b/lib/api/unittest/testfiles/instrumentation/supervised_learning_stats.schema.json new file mode 100644 index 0000000000..53b8383bb8 --- /dev/null +++ b/lib/api/unittest/testfiles/instrumentation/supervised_learning_stats.schema.json @@ -0,0 +1,112 @@ +{ + "$schema": "http://json-schema.org/draft-04/schema#", + "$id": "https://raw.githubusercontent.com/elastic/ml-json-schemas-private/master/schemas/instrumentation/supervised_learning_stats.schema.json", + "description": "Instrumentation data specific to the supervised learning jobs.", + "title": "supervised_learning_stats", + "definitions": { + "loss_values": { + "type": "array", + "items": { + "type": "number" + } + } + }, + "type": "object", + "properties": { + "iteration": { + "type": "integer" + }, + "hyperparameters": { + "type": "object", + "properties": { + "eta": { + "type": "number" + }, + "class_assignment_objective": { + "type": "string", + "enum": ["accuracy", "minimum_recall"] + }, + "regularization_depth_penalty_multiplier": { + "type": "number" + }, + "regularization_soft_tree_depth_limit": { + "type": "number" + }, + "regularization_soft_tree_depth_tolerance": { + "type": "number" + }, + "regularization_tree_size_penalty_multiplier": { + "type": "number" + }, + "regularization_leaf_weight_penalty_multiplier": { + "type": "number" + }, + "downsample_factor": { + "type": "number" + }, + "num_folds": { + "type": "integer" + }, + "max_trees": { + "type": "integer" + }, + "feature_bag_fraction": { + "type": "number" + }, + "eta_growth_rate_per_tree": { + "type": "number" + }, + "max_attempts_to_add_tree": { + "type": "integer" + }, + "num_splits_per_feature": { + "type": "integer" + }, + "max_optimization_rounds_per_hyperparameter": { + "type": "integer" + } + } + }, + "validation_loss": { + "type": "object", + "properties": { + "loss_type": { + "description": "Loss metric name", + "type": "string", + "enum": ["mse", "binomial_logistic"] + + }, + "values": { + "description": "Validation loss values for every added decision tree during forest growing procedure", + "type": "object", + "additionalProperties": { + "$ref": "#/definitions/loss_values" + } + } + }, + "additionalProperties": false, + "required": ["loss_type", "values"] + }, + "timing_stats": { + "type": "object", + "properties": { + "elapsed_time": { + "description": "Job runtime so far in ms.", + "type": "integer" + }, + "iteration_time": { + "description": "Runtime of the last iteration in ms.", + "type": "integer" + } + }, + "additionalProperties": false + } + }, + "additionalProperties": false, + "required": [ + "iteration", + "hyperparameters", + "validation_loss", + "timing_stats" + ] +} diff --git a/lib/maths/CBoostedTreeFactory.cc b/lib/maths/CBoostedTreeFactory.cc index a5b1974172..1662fc788b 100644 --- a/lib/maths/CBoostedTreeFactory.cc +++ b/lib/maths/CBoostedTreeFactory.cc @@ -98,6 +98,7 @@ CBoostedTreeFactory::buildFor(core::CDataFrame& frame, std::size_t dependentVari this->selectFeaturesAndEncodeCategories(frame); this->determineFeatureDataTypes(frame); m_TreeImpl->m_Instrumentation->updateMemoryUsage(core::CMemory::dynamicSize(m_TreeImpl)); + m_TreeImpl->m_Instrumentation->lossType(m_TreeImpl->m_Loss->name()); if (this->initializeFeatureSampleDistribution()) { this->initializeHyperparameters(frame); @@ -123,6 +124,7 @@ CBoostedTreeFactory::restoreFor(core::CDataFrame& frame, std::size_t dependentVa this->resumeRestoredTrainingProgressMonitoring(); this->resizeDataFrame(frame); m_TreeImpl->m_Instrumentation->updateMemoryUsage(core::CMemory::dynamicSize(m_TreeImpl)); + m_TreeImpl->m_Instrumentation->lossType(m_TreeImpl->m_Loss->name()); return TBoostedTreeUPtr{ new CBoostedTree{frame, m_RecordTrainingState, std::move(m_TreeImpl)}}; diff --git a/lib/maths/CBoostedTreeImpl.cc b/lib/maths/CBoostedTreeImpl.cc index 06f9db22da..9ce216c164 100644 --- a/lib/maths/CBoostedTreeImpl.cc +++ b/lib/maths/CBoostedTreeImpl.cc @@ -23,6 +23,7 @@ #include #include #include +#include namespace ml { namespace maths { @@ -40,6 +41,9 @@ namespace { // by only refreshing once every MINIMUM_SPLIT_REFRESH_INTERVAL trees we add. const double MINIMUM_SPLIT_REFRESH_INTERVAL{3.0}; +const std::string HYPERPARAMETER_OPTIMIZATION_PHASE{"hyperparameter_optimization"}; +const std::string TRAINING_FINAL_TREE_PHASE{"training_final_tree"}; + //! \brief Record the memory used by a supplied object using the RAII idiom. class CScopeRecordMemoryUsage { public: @@ -186,7 +190,6 @@ void CBoostedTreeImpl::train(core::CDataFrame& frame, core::CStopWatch stopWatch; stopWatch.start(); std::uint64_t lastLap{stopWatch.lap()}; - m_Instrumentation->startTime(lastLap); // Hyperparameter optimisation loop. @@ -226,7 +229,7 @@ void CBoostedTreeImpl::train(core::CDataFrame& frame, timeAccumulator.add(static_cast(delta)); lastLap = currentLap; - m_Instrumentation->nextStep(static_cast(m_CurrentRound)); + m_Instrumentation->nextStep(HYPERPARAMETER_OPTIMIZATION_PHASE); } LOG_TRACE(<< "Test loss = " << m_BestForestTestLoss); @@ -234,8 +237,8 @@ void CBoostedTreeImpl::train(core::CDataFrame& frame, this->restoreBestHyperparameters(); std::tie(m_BestForest, std::ignore, std::ignore) = this->trainForest( frame, allTrainingRowsMask, allTrainingRowsMask, m_TrainingProgress); - - m_Instrumentation->nextStep(static_cast(m_CurrentRound)); + m_Instrumentation->iteration(m_CurrentRound); + m_Instrumentation->nextStep(TRAINING_FINAL_TREE_PHASE); this->recordState(recordTrainStateCallback); timeAccumulator.add(static_cast(stopWatch.stop())); @@ -465,7 +468,7 @@ CBoostedTreeImpl::crossValidateForest(core::CDataFrame& frame) { lossMoments.add(loss); m_FoldRoundTestLosses[fold][m_CurrentRound] = loss; numberTrees.push_back(static_cast(forest.size())); - m_Instrumentation->lossValues(fold, std::move(lossValues)); + m_Instrumentation->lossValues(std::to_string(fold), std::move(lossValues)); } m_TrainingProgress.increment(m_MaximumNumberTrees * folds.size()); LOG_TRACE(<< "skipped " << folds.size() << " folds"); diff --git a/lib/maths/COutliers.cc b/lib/maths/COutliers.cc index 825b2cc687..a654b263da 100644 --- a/lib/maths/COutliers.cc +++ b/lib/maths/COutliers.cc @@ -28,8 +28,11 @@ namespace maths { using namespace outliers_detail; namespace { + +const std::string COMPUTE_OUTLIER_SCORES{"compute_outlier_scores"}; + using TRowItr = core::CDataFrame::TRowItr; -using TStepCallback = std::function; +using TStepCallback = std::function; double shift(double score) { return std::exp(-2.0) + score; @@ -377,10 +380,9 @@ CEnsemble::computeOutlierScores(const std::vector& points) const { TScorerVec scores(points.size()); m_RecordMemoryUsage(core::CMemory::dynamicSize(scores)); - std::uint32_t step{0}; for (const auto& model : m_Models) { model.addOutlierScores(points, scores, m_RecordMemoryUsage); - m_RecordStep(step++); + m_RecordStep(COMPUTE_OUTLIER_SCORES); } return scores; } From 513223c86dd3412ca3b7e8840e9b033ef2acf884 Mon Sep 17 00:00:00 2001 From: Valeriy Khakhutskyy <1292899+valeriy42@users.noreply.github.com> Date: Thu, 27 Feb 2020 12:16:01 +0100 Subject: [PATCH 10/40] Classification unit test --- .../CDataFrameAnalysisInstrumentationTest.cc | 55 ++++++++++++++++++- lib/maths/CBoostedTreeImpl.cc | 10 ++++ 2 files changed, 63 insertions(+), 2 deletions(-) diff --git a/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc b/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc index d127ff24e7..27292e2431 100644 --- a/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc +++ b/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc @@ -221,7 +221,7 @@ void addPredictionTestData(EPredictionType type, } BOOST_AUTO_TEST_CASE(testMemoryState) { - std::string jobId{"JOB123"}; + std::string jobId{"testJob"}; std::int64_t memoryUsage{1000}; std::int64_t timeBefore{core::CTimeUtils::toEpochMs(core::CTimeUtils::now())}; std::stringstream outpustStream; @@ -250,7 +250,7 @@ BOOST_AUTO_TEST_CASE(testMemoryState) { } BOOST_AUTO_TEST_CASE(testAnalysisTrainState) { - std::string jobId{"JOB123"}; + std::string jobId{"testJob"}; std::int64_t memoryUsage{1000}; std::int64_t timeBefore{core::CTimeUtils::toEpochMs(core::CTimeUtils::now())}; std::stringstream outputStream; @@ -295,6 +295,57 @@ BOOST_AUTO_TEST_CASE(testTrainingRegression) { analyzer.handleRecord(fieldNames, {"", "", "", "", "", "", "$"}); + rapidjson::Document results; + rapidjson::ParseResult ok(results.Parse(output.str())); + BOOST_TEST_REQUIRE(static_cast(ok) == true); + + std::ifstream schemaFileStream("testfiles/instrumentation/supervised_learning_stats.schema.json"); + BOOST_REQUIRE_MESSAGE(schemaFileStream.is_open(), "Cannot open test file!"); + std::string schemaJson((std::istreambuf_iterator(schemaFileStream)), + std::istreambuf_iterator()); + rapidjson::Document schemaDocument; + BOOST_REQUIRE_MESSAGE(schemaDocument.Parse(schemaJson).HasParseError() == false, + "Cannot parse JSON schema!"); + rapidjson::SchemaDocument schema(schemaDocument); + rapidjson::SchemaValidator validator(schema); + + for (const auto& result : results.GetArray()) { + if (result.HasMember("analysis_stats")) { + BOOST_TEST_REQUIRE(result["analysis_stats"].HasMember("classification_stats")); + if (result["analysis_stats"]["classification_stats"].Accept(validator) == false) { + rapidjson::StringBuffer sb; + validator.GetInvalidSchemaPointer().StringifyUriFragment(sb); + LOG_ERROR(<< "Invalid schema: " << sb.GetString()); + LOG_ERROR(<< "Invalid keyword: " << validator.GetInvalidSchemaKeyword()); + sb.Clear(); + validator.GetInvalidDocumentPointer().StringifyUriFragment(sb); + LOG_ERROR(<< "Invalid document: " << sb.GetString()); + BOOST_FAIL("Schema validation failed"); + } + } + } +} + +BOOST_AUTO_TEST_CASE(testTrainingClassification) { + std::stringstream output; + auto outputWriterFactory = [&output]() { + return std::make_unique(output); + }; + + TDoubleVec expectedPredictions; + + TStrVec fieldNames{"f1", "f2", "f3", "f4", "target", ".", "."}; + TStrVec fieldValues{"", "", "", "", "", "0", ""}; + api::CDataFrameAnalyzer analyzer{ + test::CDataFrameAnalysisSpecificationFactory::predictionSpec( + test::CDataFrameAnalysisSpecificationFactory::classification(), + "target", 100, 5, 6000000, 0, 0, {"target"}), + outputWriterFactory}; + addPredictionTestData(E_BinaryClassification, fieldNames, fieldValues, + analyzer, expectedPredictions); + + analyzer.handleRecord(fieldNames, {"", "", "", "", "", "", "$"}); + rapidjson::Document results; rapidjson::ParseResult ok(results.Parse(output.str())); BOOST_TEST_REQUIRE(static_cast(ok) == true); diff --git a/lib/maths/CBoostedTreeImpl.cc b/lib/maths/CBoostedTreeImpl.cc index 9ce216c164..415df828c5 100644 --- a/lib/maths/CBoostedTreeImpl.cc +++ b/lib/maths/CBoostedTreeImpl.cc @@ -4,6 +4,8 @@ * you may not use this file except in compliance with the Elastic License. */ +#include "maths/CBoostedTree.h" +#include "maths/CDataFrameAnalysisInstrumentationInterface.h" #include #include @@ -44,6 +46,8 @@ const double MINIMUM_SPLIT_REFRESH_INTERVAL{3.0}; const std::string HYPERPARAMETER_OPTIMIZATION_PHASE{"hyperparameter_optimization"}; const std::string TRAINING_FINAL_TREE_PHASE{"training_final_tree"}; +const std::array REGRESSION_LOSSES{CMse::NAME}; + //! \brief Record the memory used by a supplied object using the RAII idiom. class CScopeRecordMemoryUsage { public: @@ -147,6 +151,12 @@ CBoostedTreeImpl::CBoostedTreeImpl(std::size_t numberThreads, m_Regularization, m_DownsampleFactor, m_Eta, m_EtaGrowthRatePerTree, m_MaximumNumberTrees, m_FeatureBagFraction}, m_Instrumentation{instrumentation != nullptr ? instrumentation : &INSTRUMENTATION_STUB} { + if (std::find(REGRESSION_LOSSES.begin(), REGRESSION_LOSSES.end(), + m_Loss->name()) != REGRESSION_LOSSES.end()) { + m_Instrumentation->type(CDataFrameTrainBoostedTreeInstrumentationInterface::E_Regression); + } else { + m_Instrumentation->type(CDataFrameTrainBoostedTreeInstrumentationInterface::E_Classification); + } } CBoostedTreeImpl::CBoostedTreeImpl() = default; From b62499630becce817bb7e56cc71a6902dfb5eb63 Mon Sep 17 00:00:00 2001 From: Valeriy Khakhutskyy <1292899+valeriy42@users.noreply.github.com> Date: Thu, 27 Feb 2020 15:05:11 +0100 Subject: [PATCH 11/40] extract training setup bolier plate code --- .../test/CDataFrameAnalyzerTrainingFactory.h | 148 ++++++++++++ lib/api/CDataFrameAnalysisInstrumentation.cc | 2 - .../CDataFrameAnalysisInstrumentationTest.cc | 204 +--------------- .../CDataFrameAnalyzerTrainingTest.cc | 218 ++---------------- lib/maths/CBoostedTreeImpl.cc | 13 +- lib/test/CDataFrameAnalyzerTrainingFactory.cc | 116 ++++++++++ lib/test/Makefile | 1 + 7 files changed, 300 insertions(+), 402 deletions(-) create mode 100644 include/test/CDataFrameAnalyzerTrainingFactory.h create mode 100644 lib/test/CDataFrameAnalyzerTrainingFactory.cc diff --git a/include/test/CDataFrameAnalyzerTrainingFactory.h b/include/test/CDataFrameAnalyzerTrainingFactory.h new file mode 100644 index 0000000000..d70bf76c65 --- /dev/null +++ b/include/test/CDataFrameAnalyzerTrainingFactory.h @@ -0,0 +1,148 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ + +#ifndef INCLUDED_ml_test_CDataFrameAnalyzerTrainingFactory_h +#define INCLUDED_ml_test_CDataFrameAnalyzerTrainingFactory_h + +#include +#include + +#include + +#include +#include + +#include + +#include + +#include +#include + +namespace ml { +namespace test { + +class TEST_EXPORT CDataFrameAnalyzerTrainingFactory { +public: + enum EPredictionType { E_Regression, E_BinaryClassification }; + using TStrVec = std::vector; + using TDoubleVec = std::vector; + using TDataFrameUPtr = std::unique_ptr; + +public: + template + static void addPredictionTestData(EPredictionType type, + const TStrVec& fieldNames, + TStrVec fieldValues, + api::CDataFrameAnalyzer& analyzer, + std::vector& expectedPredictions, + std::size_t numberExamples = 100, + double alpha = -1.0, + double lambda = -1.0, + double gamma = -1.0, + double softTreeDepthLimit = -1.0, + double softTreeDepthTolerance = -1.0, + double eta = 0.0, + std::size_t maximumNumberTrees = 0, + double featureBagFraction = 0.0) { + + test::CRandomNumbers rng; + + TDoubleVec weights; + rng.generateUniformSamples(-1.0, 1.0, fieldNames.size() - 3, weights); + TDoubleVec regressors; + rng.generateUniformSamples(-10.0, 10.0, weights.size() * numberExamples, regressors); + + TStrVec targets; + auto frame = type == E_Regression + ? setupLinearRegressionData(fieldNames, fieldValues, analyzer, + weights, regressors, targets) + : setupBinaryClassificationData(fieldNames, fieldValues, analyzer, + weights, regressors, targets); + + std::unique_ptr loss; + if (type == E_Regression) { + loss = std::make_unique(); + } else { + loss = std::make_unique(); + } + + maths::CBoostedTreeFactory treeFactory{ + maths::CBoostedTreeFactory::constructFromParameters(1, std::move(loss))}; + if (alpha >= 0.0) { + treeFactory.depthPenaltyMultiplier(alpha); + } + if (lambda >= 0.0) { + treeFactory.leafWeightPenaltyMultiplier(lambda); + } + if (gamma >= 0.0) { + treeFactory.treeSizePenaltyMultiplier(gamma); + } + if (softTreeDepthLimit >= 0.0) { + treeFactory.softTreeDepthLimit(softTreeDepthLimit); + } + if (softTreeDepthTolerance >= 0.0) { + treeFactory.softTreeDepthTolerance(softTreeDepthTolerance); + } + if (eta > 0.0) { + treeFactory.eta(eta); + } + if (maximumNumberTrees > 0) { + treeFactory.maximumNumberTrees(maximumNumberTrees); + } + if (featureBagFraction > 0.0) { + treeFactory.featureBagFraction(featureBagFraction); + } + + ml::api::CDataFrameTrainBoostedTreeInstrumentation instrumentation("testJob"); + treeFactory.analysisInstrumentation(instrumentation); + + auto tree = treeFactory.buildFor(*frame, weights.size()); + + tree->train(); + tree->predict(); + + frame->readRows(1, [&](TRowItr beginRows, TRowItr endRows) { + for (auto row = beginRows; row != endRows; ++row) { + double prediction{(*row)[tree->columnHoldingPrediction()]}; + appendPrediction(*frame, weights.size(), prediction, + tree->probabilityAtWhichToAssignClassOne(), + expectedPredictions); + } + }); + } + + static TDataFrameUPtr setupBinaryClassificationData(const TStrVec& fieldNames, + TStrVec& fieldValues, + api::CDataFrameAnalyzer& analyzer, + const TDoubleVec& weights, + const TDoubleVec& regressors, + TStrVec& targets); + static TDataFrameUPtr setupLinearRegressionData(const TStrVec& fieldNames, + TStrVec& fieldValues, + api::CDataFrameAnalyzer& analyzer, + const TDoubleVec& weights, + const TDoubleVec& regressors, + TStrVec& targets); + +private: + using TBoolVec = std::vector; + using TRowItr = core::CDataFrame::TRowItr; + +private: + static void + appendPrediction(core::CDataFrame&, std::size_t, double prediction, double, TDoubleVec& predictions); + + static void appendPrediction(core::CDataFrame& frame, + std::size_t columnHoldingPrediction, + double logOddsClass1, + double threshold, + TStrVec& predictions); +}; +} +} + +#endif // INCLUDED_ml_test_CDataFrameAnalyzerTrainingFactory_h diff --git a/lib/api/CDataFrameAnalysisInstrumentation.cc b/lib/api/CDataFrameAnalysisInstrumentation.cc index e97bc6b4c7..a1952b2ebc 100644 --- a/lib/api/CDataFrameAnalysisInstrumentation.cc +++ b/lib/api/CDataFrameAnalysisInstrumentation.cc @@ -3,10 +3,8 @@ * or more contributor license agreements. Licensed under the Elastic License; * you may not use this file except in compliance with the Elastic License. */ -#include "/usr/local/gcc73/include/boost-1_71/boost/iostreams/filter/zlib.hpp" #include -#include #include #include diff --git a/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc b/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc index 27292e2431..74040c33b5 100644 --- a/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc +++ b/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc @@ -3,21 +3,13 @@ * or more contributor license agreements. Licensed under the Elastic License; * you may not use this file except in compliance with the Elastic License. */ - -#include -#include -#include #include -#include -#include - #include -#include #include #include -#include +#include #include @@ -32,192 +24,8 @@ BOOST_AUTO_TEST_SUITE(CDataFrameAnalysisInstrumentationTest) using namespace ml; namespace { - -enum EPredictionType { E_Regression, E_BinaryClassification }; using TStrVec = std::vector; using TDoubleVec = std::vector; -using TDataFrameUPtr = std::unique_ptr; -using TBoolVec = std::vector; -using TRowItr = core::CDataFrame::TRowItr; - -void appendPrediction(core::CDataFrame&, std::size_t, double prediction, double, TDoubleVec& predictions) { - predictions.push_back(prediction); -} - -void appendPrediction(core::CDataFrame& frame, - std::size_t columnHoldingPrediction, - double logOddsClass1, - double threshold, - TStrVec& predictions) { - predictions.push_back( - maths::CTools::logisticFunction(logOddsClass1) < threshold - ? frame.categoricalColumnValues()[columnHoldingPrediction][0] - : frame.categoricalColumnValues()[columnHoldingPrediction][1]); -} - -TDataFrameUPtr setupLinearRegressionData(const TStrVec& fieldNames, - TStrVec& fieldValues, - api::CDataFrameAnalyzer& analyzer, - const TDoubleVec& weights, - const TDoubleVec& regressors, - TStrVec& targets) { - - auto target = [&weights](const TDoubleVec& regressors_) { - double result{0.0}; - for (std::size_t i = 0; i < weights.size(); ++i) { - result += weights[i] * regressors_[i]; - } - return core::CStringUtils::typeToStringPrecise(result, core::CIEEE754::E_DoublePrecision); - }; - - auto frame = core::makeMainStorageDataFrame(weights.size() + 1).first; - - for (std::size_t i = 0; i < regressors.size(); i += weights.size()) { - TDoubleVec row(weights.size()); - for (std::size_t j = 0; j < weights.size(); ++j) { - row[j] = regressors[i + j]; - } - - for (std::size_t j = 0; j < row.size(); ++j) { - fieldValues[j] = core::CStringUtils::typeToStringPrecise( - row[j], core::CIEEE754::E_DoublePrecision); - } - fieldValues[weights.size()] = target(row); - targets.push_back(fieldValues[weights.size()]); - - analyzer.handleRecord(fieldNames, fieldValues); - frame->parseAndWriteRow( - core::CVectorRange(fieldValues, 0, weights.size() + 1)); - } - - frame->finishWritingRows(); - - return frame; -} - -TDataFrameUPtr setupBinaryClassificationData(const TStrVec& fieldNames, - TStrVec& fieldValues, - api::CDataFrameAnalyzer& analyzer, - const TDoubleVec& weights, - const TDoubleVec& regressors, - TStrVec& targets) { - TStrVec classes{"foo", "bar"}; - auto target = [&weights, &classes](const TDoubleVec& regressors_) { - double result{0.0}; - for (std::size_t i = 0; i < weights.size(); ++i) { - result += weights[i] * regressors_[i]; - } - return classes[result < 0.0 ? 0 : 1]; - }; - - auto frame = core::makeMainStorageDataFrame(weights.size() + 1).first; - TBoolVec categoricalFields(weights.size(), false); - categoricalFields.push_back(true); - frame->categoricalColumns(std::move(categoricalFields)); - - for (std::size_t i = 0; i < regressors.size(); i += weights.size()) { - TDoubleVec row(weights.size()); - for (std::size_t j = 0; j < weights.size(); ++j) { - row[j] = regressors[i + j]; - } - - for (std::size_t j = 0; j < row.size() - 1; ++j) { - fieldValues[j] = core::CStringUtils::typeToStringPrecise( - row[j], core::CIEEE754::E_DoublePrecision); - } - fieldValues[weights.size()] = target(row); - targets.push_back(fieldValues[weights.size()]); - - analyzer.handleRecord(fieldNames, fieldValues); - frame->parseAndWriteRow( - core::CVectorRange(fieldValues, 0, weights.size() + 1)); - } - - frame->finishWritingRows(); - - return frame; -} - -template -void addPredictionTestData(EPredictionType type, - const TStrVec& fieldNames, - TStrVec fieldValues, - api::CDataFrameAnalyzer& analyzer, - std::vector& expectedPredictions, - std::size_t numberExamples = 100, - double alpha = -1.0, - double lambda = -1.0, - double gamma = -1.0, - double softTreeDepthLimit = -1.0, - double softTreeDepthTolerance = -1.0, - double eta = 0.0, - std::size_t maximumNumberTrees = 0, - double featureBagFraction = 0.0) { - - test::CRandomNumbers rng; - - TDoubleVec weights; - rng.generateUniformSamples(-1.0, 1.0, fieldNames.size() - 3, weights); - TDoubleVec regressors; - rng.generateUniformSamples(-10.0, 10.0, weights.size() * numberExamples, regressors); - - TStrVec targets; - auto frame = type == E_Regression - ? setupLinearRegressionData(fieldNames, fieldValues, analyzer, - weights, regressors, targets) - : setupBinaryClassificationData(fieldNames, fieldValues, analyzer, - weights, regressors, targets); - - std::unique_ptr loss; - if (type == E_Regression) { - loss = std::make_unique(); - } else { - loss = std::make_unique(); - } - - maths::CBoostedTreeFactory treeFactory{ - maths::CBoostedTreeFactory::constructFromParameters(1, std::move(loss))}; - if (alpha >= 0.0) { - treeFactory.depthPenaltyMultiplier(alpha); - } - if (lambda >= 0.0) { - treeFactory.leafWeightPenaltyMultiplier(lambda); - } - if (gamma >= 0.0) { - treeFactory.treeSizePenaltyMultiplier(gamma); - } - if (softTreeDepthLimit >= 0.0) { - treeFactory.softTreeDepthLimit(softTreeDepthLimit); - } - if (softTreeDepthTolerance >= 0.0) { - treeFactory.softTreeDepthTolerance(softTreeDepthTolerance); - } - if (eta > 0.0) { - treeFactory.eta(eta); - } - if (maximumNumberTrees > 0) { - treeFactory.maximumNumberTrees(maximumNumberTrees); - } - if (featureBagFraction > 0.0) { - treeFactory.featureBagFraction(featureBagFraction); - } - - ml::api::CDataFrameTrainBoostedTreeInstrumentation instrumentation("testJob"); - treeFactory.analysisInstrumentation(instrumentation); - - auto tree = treeFactory.buildFor(*frame, weights.size()); - - tree->train(); - tree->predict(); - - frame->readRows(1, [&](TRowItr beginRows, TRowItr endRows) { - for (auto row = beginRows; row != endRows; ++row) { - double prediction{(*row)[tree->columnHoldingPrediction()]}; - appendPrediction(*frame, weights.size(), prediction, - tree->probabilityAtWhichToAssignClassOne(), expectedPredictions); - } - }); -} } BOOST_AUTO_TEST_CASE(testMemoryState) { @@ -251,7 +59,6 @@ BOOST_AUTO_TEST_CASE(testMemoryState) { BOOST_AUTO_TEST_CASE(testAnalysisTrainState) { std::string jobId{"testJob"}; - std::int64_t memoryUsage{1000}; std::int64_t timeBefore{core::CTimeUtils::toEpochMs(core::CTimeUtils::now())}; std::stringstream outputStream; { @@ -291,7 +98,9 @@ BOOST_AUTO_TEST_CASE(testTrainingRegression) { test::CDataFrameAnalysisSpecificationFactory::predictionSpec( test::CDataFrameAnalysisSpecificationFactory::regression(), "target"), outputWriterFactory}; - addPredictionTestData(E_Regression, fieldNames, fieldValues, analyzer, expectedPredictions); + test::CDataFrameAnalyzerTrainingFactory::addPredictionTestData( + test::CDataFrameAnalyzerTrainingFactory::E_Regression, fieldNames, + fieldValues, analyzer, expectedPredictions); analyzer.handleRecord(fieldNames, {"", "", "", "", "", "", "$"}); @@ -341,8 +150,9 @@ BOOST_AUTO_TEST_CASE(testTrainingClassification) { test::CDataFrameAnalysisSpecificationFactory::classification(), "target", 100, 5, 6000000, 0, 0, {"target"}), outputWriterFactory}; - addPredictionTestData(E_BinaryClassification, fieldNames, fieldValues, - analyzer, expectedPredictions); + test::CDataFrameAnalyzerTrainingFactory::addPredictionTestData( + test::CDataFrameAnalyzerTrainingFactory::E_BinaryClassification, + fieldNames, fieldValues, analyzer, expectedPredictions); analyzer.handleRecord(fieldNames, {"", "", "", "", "", "", "$"}); diff --git a/lib/api/unittest/CDataFrameAnalyzerTrainingTest.cc b/lib/api/unittest/CDataFrameAnalyzerTrainingTest.cc index c2752a53a7..2ecbcfd1aa 100644 --- a/lib/api/unittest/CDataFrameAnalyzerTrainingTest.cc +++ b/lib/api/unittest/CDataFrameAnalyzerTrainingTest.cc @@ -20,11 +20,11 @@ #include #include +#include #include +#include #include -#include - #include #include @@ -101,187 +101,6 @@ auto restoreTree(std::string persistedState, TDataFrameUPtr& frame, std::size_t *frame, dependentVariable); } -TDataFrameUPtr setupLinearRegressionData(const TStrVec& fieldNames, - TStrVec& fieldValues, - api::CDataFrameAnalyzer& analyzer, - const TDoubleVec& weights, - const TDoubleVec& regressors, - TStrVec& targets) { - - auto target = [&weights](const TDoubleVec& regressors_) { - double result{0.0}; - for (std::size_t i = 0; i < weights.size(); ++i) { - result += weights[i] * regressors_[i]; - } - return core::CStringUtils::typeToStringPrecise(result, core::CIEEE754::E_DoublePrecision); - }; - - auto frame = core::makeMainStorageDataFrame(weights.size() + 1).first; - - for (std::size_t i = 0; i < regressors.size(); i += weights.size()) { - TDoubleVec row(weights.size()); - for (std::size_t j = 0; j < weights.size(); ++j) { - row[j] = regressors[i + j]; - } - - for (std::size_t j = 0; j < row.size(); ++j) { - fieldValues[j] = core::CStringUtils::typeToStringPrecise( - row[j], core::CIEEE754::E_DoublePrecision); - } - fieldValues[weights.size()] = target(row); - targets.push_back(fieldValues[weights.size()]); - - analyzer.handleRecord(fieldNames, fieldValues); - frame->parseAndWriteRow( - core::CVectorRange(fieldValues, 0, weights.size() + 1)); - } - - frame->finishWritingRows(); - - return frame; -} - -TDataFrameUPtr setupBinaryClassificationData(const TStrVec& fieldNames, - TStrVec& fieldValues, - api::CDataFrameAnalyzer& analyzer, - const TDoubleVec& weights, - const TDoubleVec& regressors, - TStrVec& targets) { - TStrVec classes{"foo", "bar"}; - auto target = [&weights, &classes](const TDoubleVec& regressors_) { - double result{0.0}; - for (std::size_t i = 0; i < weights.size(); ++i) { - result += weights[i] * regressors_[i]; - } - return classes[result < 0.0 ? 0 : 1]; - }; - - auto frame = core::makeMainStorageDataFrame(weights.size() + 1).first; - TBoolVec categoricalFields(weights.size(), false); - categoricalFields.push_back(true); - frame->categoricalColumns(std::move(categoricalFields)); - - for (std::size_t i = 0; i < regressors.size(); i += weights.size()) { - TDoubleVec row(weights.size()); - for (std::size_t j = 0; j < weights.size(); ++j) { - row[j] = regressors[i + j]; - } - - for (std::size_t j = 0; j < row.size() - 1; ++j) { - fieldValues[j] = core::CStringUtils::typeToStringPrecise( - row[j], core::CIEEE754::E_DoublePrecision); - } - fieldValues[weights.size()] = target(row); - targets.push_back(fieldValues[weights.size()]); - - analyzer.handleRecord(fieldNames, fieldValues); - frame->parseAndWriteRow( - core::CVectorRange(fieldValues, 0, weights.size() + 1)); - } - - frame->finishWritingRows(); - - return frame; -} - -enum EPredictionType { E_Regression, E_BinaryClassification }; - -void appendPrediction(core::CDataFrame&, std::size_t, double prediction, double, TDoubleVec& predictions) { - predictions.push_back(prediction); -} - -void appendPrediction(core::CDataFrame& frame, - std::size_t columnHoldingPrediction, - double logOddsClass1, - double threshold, - TStrVec& predictions) { - predictions.push_back( - maths::CTools::logisticFunction(logOddsClass1) < threshold - ? frame.categoricalColumnValues()[columnHoldingPrediction][0] - : frame.categoricalColumnValues()[columnHoldingPrediction][1]); -} - -template -void addPredictionTestData(EPredictionType type, - const TStrVec& fieldNames, - TStrVec fieldValues, - api::CDataFrameAnalyzer& analyzer, - std::vector& expectedPredictions, - std::size_t numberExamples = 100, - double alpha = -1.0, - double lambda = -1.0, - double gamma = -1.0, - double softTreeDepthLimit = -1.0, - double softTreeDepthTolerance = -1.0, - double eta = 0.0, - std::size_t maximumNumberTrees = 0, - double featureBagFraction = 0.0) { - - test::CRandomNumbers rng; - - TDoubleVec weights; - rng.generateUniformSamples(-1.0, 1.0, fieldNames.size() - 3, weights); - TDoubleVec regressors; - rng.generateUniformSamples(-10.0, 10.0, weights.size() * numberExamples, regressors); - - TStrVec targets; - auto frame = type == E_Regression - ? setupLinearRegressionData(fieldNames, fieldValues, analyzer, - weights, regressors, targets) - : setupBinaryClassificationData(fieldNames, fieldValues, analyzer, - weights, regressors, targets); - - std::unique_ptr loss; - if (type == E_Regression) { - loss = std::make_unique(); - } else { - loss = std::make_unique(); - } - - maths::CBoostedTreeFactory treeFactory{ - maths::CBoostedTreeFactory::constructFromParameters(1, std::move(loss))}; - if (alpha >= 0.0) { - treeFactory.depthPenaltyMultiplier(alpha); - } - if (lambda >= 0.0) { - treeFactory.leafWeightPenaltyMultiplier(lambda); - } - if (gamma >= 0.0) { - treeFactory.treeSizePenaltyMultiplier(gamma); - } - if (softTreeDepthLimit >= 0.0) { - treeFactory.softTreeDepthLimit(softTreeDepthLimit); - } - if (softTreeDepthTolerance >= 0.0) { - treeFactory.softTreeDepthTolerance(softTreeDepthTolerance); - } - if (eta > 0.0) { - treeFactory.eta(eta); - } - if (maximumNumberTrees > 0) { - treeFactory.maximumNumberTrees(maximumNumberTrees); - } - if (featureBagFraction > 0.0) { - treeFactory.featureBagFraction(featureBagFraction); - } - - ml::api::CDataFrameTrainBoostedTreeInstrumentation instrumentation("testJob"); - treeFactory.analysisInstrumentation(instrumentation); - - auto tree = treeFactory.buildFor(*frame, weights.size()); - - tree->train(); - tree->predict(); - - frame->readRows(1, [&](TRowItr beginRows, TRowItr endRows) { - for (auto row = beginRows; row != endRows; ++row) { - double prediction{(*row)[tree->columnHoldingPrediction()]}; - appendPrediction(*frame, weights.size(), prediction, - tree->probabilityAtWhichToAssignClassOne(), expectedPredictions); - } - }); -} - template void testOneRunOfBoostedTreeTrainingWithStateRecovery(F makeSpec, std::size_t iterationToRestartFrom) { @@ -311,8 +130,8 @@ void testOneRunOfBoostedTreeTrainingWithStateRecovery(F makeSpec, std::size_t it fieldNames.begin()); TStrVec targets; - auto frame = setupLinearRegressionData(fieldNames, fieldValues, analyzer, - weights, regressors, targets); + auto frame = test::CDataFrameAnalyzerTrainingFactory::setupLinearRegressionData( + fieldNames, fieldValues, analyzer, weights, regressors, targets); analyzer.handleRecord(fieldNames, {"", "", "", "", "", "", "$"}); TStrVec persistedStates{ @@ -332,8 +151,8 @@ void testOneRunOfBoostedTreeTrainingWithStateRecovery(F makeSpec, std::size_t it makeSpec("target", numberExamples, persisterSupplier), outputWriterFactory}; targets.clear(); - setupLinearRegressionData(fieldNames, fieldValues, restoredAnalyzer, - weights, regressors, targets); + test::CDataFrameAnalyzerTrainingFactory::setupLinearRegressionData( + fieldNames, fieldValues, restoredAnalyzer, weights, regressors, targets); restoredAnalyzer.handleRecord(fieldNames, {"", "", "", "", "", "", "$"}); persistedStates = splitOnNull(std::stringstream{std::move(persistenceStream->str())}); @@ -386,7 +205,9 @@ BOOST_AUTO_TEST_CASE(testRunBoostedTreeRegressionTraining) { test::CDataFrameAnalysisSpecificationFactory::predictionSpec( test::CDataFrameAnalysisSpecificationFactory::regression(), "target"), outputWriterFactory}; - addPredictionTestData(E_Regression, fieldNames, fieldValues, analyzer, expectedPredictions); + test::CDataFrameAnalyzerTrainingFactory::addPredictionTestData( + test::CDataFrameAnalyzerTrainingFactory::E_Regression, fieldNames, + fieldValues, analyzer, expectedPredictions); core::CStopWatch watch{true}; analyzer.handleRecord(fieldNames, {"", "", "", "", "", "", "$"}); @@ -450,7 +271,9 @@ BOOST_AUTO_TEST_CASE(testRunBoostedTreeRegressionTrainingStateReport) { api::CDataFrameAnalyzer analyzer{ test::CDataFrameAnalysisSpecificationFactory::predictionSpec("regression", "c5"), outputWriterFactory}; - addPredictionTestData(E_Regression, fieldNames, fieldValues, analyzer, expectedPredictions); + test::CDataFrameAnalyzerTrainingFactory::addPredictionTestData( + test::CDataFrameAnalyzerTrainingFactory::E_Regression, fieldNames, + fieldValues, analyzer, expectedPredictions); analyzer.handleRecord(fieldNames, {"", "", "", "", "", "", "$"}); rapidjson::Document results; @@ -488,10 +311,10 @@ BOOST_AUTO_TEST_CASE(testRunBoostedTreeRegressionTrainingWithParams) { TStrVec fieldNames{"f1", "f2", "f3", "f4", "target", ".", "."}; TStrVec fieldValues{"", "", "", "", "", "0", ""}; - addPredictionTestData(E_Regression, fieldNames, fieldValues, analyzer, - expectedPredictions, 100, alpha, lambda, gamma, - softTreeDepthLimit, softTreeDepthTolerance, eta, - maximumNumberTrees, featureBagFraction); + test::CDataFrameAnalyzerTrainingFactory::addPredictionTestData( + test::CDataFrameAnalyzerTrainingFactory::E_Regression, fieldNames, fieldValues, + analyzer, expectedPredictions, 100, alpha, lambda, gamma, softTreeDepthLimit, + softTreeDepthTolerance, eta, maximumNumberTrees, featureBagFraction); analyzer.handleRecord(fieldNames, {"", "", "", "", "", "", "$"}); // Check best hyperparameters @@ -675,8 +498,9 @@ BOOST_AUTO_TEST_CASE(testRunBoostedTreeClassifierTraining) { test::CDataFrameAnalysisSpecificationFactory::classification(), "target", 100, 5, 6000000, 0, 0, {"target"}), outputWriterFactory}; - addPredictionTestData(E_BinaryClassification, fieldNames, fieldValues, - analyzer, expectedPredictions); + test::CDataFrameAnalyzerTrainingFactory::addPredictionTestData( + test::CDataFrameAnalyzerTrainingFactory::E_BinaryClassification, + fieldNames, fieldValues, analyzer, expectedPredictions); core::CStopWatch watch{true}; analyzer.handleRecord(fieldNames, {"", "", "", "", "", "", "$"}); @@ -763,8 +587,8 @@ BOOST_AUTO_TEST_CASE(testRunBoostedTreeClassifierImbalanced) { outputWriterFactory}; TStrVec actuals; - setupBinaryClassificationData(fieldNames, fieldValues, analyzer, weights, - regressors, actuals); + test::CDataFrameAnalyzerTrainingFactory::setupBinaryClassificationData( + fieldNames, fieldValues, analyzer, weights, regressors, actuals); analyzer.handleRecord(fieldNames, {"", "", "", "", "", "$"}); rapidjson::Document results; diff --git a/lib/maths/CBoostedTreeImpl.cc b/lib/maths/CBoostedTreeImpl.cc index 415df828c5..9d1a14db9e 100644 --- a/lib/maths/CBoostedTreeImpl.cc +++ b/lib/maths/CBoostedTreeImpl.cc @@ -151,12 +151,6 @@ CBoostedTreeImpl::CBoostedTreeImpl(std::size_t numberThreads, m_Regularization, m_DownsampleFactor, m_Eta, m_EtaGrowthRatePerTree, m_MaximumNumberTrees, m_FeatureBagFraction}, m_Instrumentation{instrumentation != nullptr ? instrumentation : &INSTRUMENTATION_STUB} { - if (std::find(REGRESSION_LOSSES.begin(), REGRESSION_LOSSES.end(), - m_Loss->name()) != REGRESSION_LOSSES.end()) { - m_Instrumentation->type(CDataFrameTrainBoostedTreeInstrumentationInterface::E_Regression); - } else { - m_Instrumentation->type(CDataFrameTrainBoostedTreeInstrumentationInterface::E_Classification); - } } CBoostedTreeImpl::CBoostedTreeImpl() = default; @@ -177,6 +171,13 @@ void CBoostedTreeImpl::train(core::CDataFrame& frame, HANDLE_FATAL(<< "Internal error: must supply a loss function. Please report this problem."); } + if (std::find(REGRESSION_LOSSES.begin(), REGRESSION_LOSSES.end(), + m_Loss->name()) != REGRESSION_LOSSES.end()) { + m_Instrumentation->type(CDataFrameTrainBoostedTreeInstrumentationInterface::E_Regression); + } else { + m_Instrumentation->type(CDataFrameTrainBoostedTreeInstrumentationInterface::E_Classification); + } + LOG_TRACE(<< "Main training loop..."); m_TrainingProgress.progressCallback(m_Instrumentation->progressCallback()); diff --git a/lib/test/CDataFrameAnalyzerTrainingFactory.cc b/lib/test/CDataFrameAnalyzerTrainingFactory.cc new file mode 100644 index 0000000000..984ae340e9 --- /dev/null +++ b/lib/test/CDataFrameAnalyzerTrainingFactory.cc @@ -0,0 +1,116 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ + +#include + +namespace ml { +namespace test { + +void CDataFrameAnalyzerTrainingFactory::appendPrediction(core::CDataFrame&, + std::size_t, + double prediction, + double, + TDoubleVec& predictions) { + predictions.push_back(prediction); +} + +void CDataFrameAnalyzerTrainingFactory::appendPrediction(core::CDataFrame& frame, + std::size_t columnHoldingPrediction, + double logOddsClass1, + double threshold, + TStrVec& predictions) { + predictions.push_back( + maths::CTools::logisticFunction(logOddsClass1) < threshold + ? frame.categoricalColumnValues()[columnHoldingPrediction][0] + : frame.categoricalColumnValues()[columnHoldingPrediction][1]); +} + +CDataFrameAnalyzerTrainingFactory::TDataFrameUPtr +CDataFrameAnalyzerTrainingFactory::setupLinearRegressionData(const TStrVec& fieldNames, + TStrVec& fieldValues, + api::CDataFrameAnalyzer& analyzer, + const TDoubleVec& weights, + const TDoubleVec& regressors, + TStrVec& targets) { + + auto target = [&weights](const TDoubleVec& regressors_) { + double result{0.0}; + for (std::size_t i = 0; i < weights.size(); ++i) { + result += weights[i] * regressors_[i]; + } + return core::CStringUtils::typeToStringPrecise(result, core::CIEEE754::E_DoublePrecision); + }; + + auto frame = core::makeMainStorageDataFrame(weights.size() + 1).first; + + for (std::size_t i = 0; i < regressors.size(); i += weights.size()) { + TDoubleVec row(weights.size()); + for (std::size_t j = 0; j < weights.size(); ++j) { + row[j] = regressors[i + j]; + } + + for (std::size_t j = 0; j < row.size(); ++j) { + fieldValues[j] = core::CStringUtils::typeToStringPrecise( + row[j], core::CIEEE754::E_DoublePrecision); + } + fieldValues[weights.size()] = target(row); + targets.push_back(fieldValues[weights.size()]); + + analyzer.handleRecord(fieldNames, fieldValues); + frame->parseAndWriteRow( + core::CVectorRange(fieldValues, 0, weights.size() + 1)); + } + + frame->finishWritingRows(); + + return frame; +} + +CDataFrameAnalyzerTrainingFactory::TDataFrameUPtr +CDataFrameAnalyzerTrainingFactory::setupBinaryClassificationData(const TStrVec& fieldNames, + TStrVec& fieldValues, + api::CDataFrameAnalyzer& analyzer, + const TDoubleVec& weights, + const TDoubleVec& regressors, + TStrVec& targets) { + TStrVec classes{"foo", "bar"}; + auto target = [&weights, &classes](const TDoubleVec& regressors_) { + double result{0.0}; + for (std::size_t i = 0; i < weights.size(); ++i) { + result += weights[i] * regressors_[i]; + } + return classes[result < 0.0 ? 0 : 1]; + }; + + auto frame = core::makeMainStorageDataFrame(weights.size() + 1).first; + TBoolVec categoricalFields(weights.size(), false); + categoricalFields.push_back(true); + frame->categoricalColumns(std::move(categoricalFields)); + + for (std::size_t i = 0; i < regressors.size(); i += weights.size()) { + TDoubleVec row(weights.size()); + for (std::size_t j = 0; j < weights.size(); ++j) { + row[j] = regressors[i + j]; + } + + for (std::size_t j = 0; j < row.size() - 1; ++j) { + fieldValues[j] = core::CStringUtils::typeToStringPrecise( + row[j], core::CIEEE754::E_DoublePrecision); + } + fieldValues[weights.size()] = target(row); + targets.push_back(fieldValues[weights.size()]); + + analyzer.handleRecord(fieldNames, fieldValues); + frame->parseAndWriteRow( + core::CVectorRange(fieldValues, 0, weights.size() + 1)); + } + + frame->finishWritingRows(); + + return frame; +} +} +} \ No newline at end of file diff --git a/lib/test/Makefile b/lib/test/Makefile index c3d1d3acdc..f94a0ca245 100644 --- a/lib/test/Makefile +++ b/lib/test/Makefile @@ -21,6 +21,7 @@ SRCS= \ $(OS_SRCS) \ CBoostTestXmlOutput.cc \ CDataFrameAnalysisSpecificationFactory.cc \ + CDataFrameAnalyzerTrainingFactory.cc \ CMultiFileDataAdder.cc \ CMultiFileSearcher.cc \ CRandomNumbers.cc \ From 60e3556849450db873cfb46d2444eee1f901eb3e Mon Sep 17 00:00:00 2001 From: Valeriy Khakhutskyy <1292899+valeriy42@users.noreply.github.com> Date: Thu, 27 Feb 2020 15:33:59 +0100 Subject: [PATCH 12/40] minor refactorings --- include/api/CDataFrameAnalysisInstrumentation.h | 3 +-- include/maths/CDataFrameAnalysisInstrumentationInterface.h | 2 -- lib/api/CDataFrameAnalysisInstrumentation.cc | 4 ---- 3 files changed, 1 insertion(+), 8 deletions(-) diff --git a/include/api/CDataFrameAnalysisInstrumentation.h b/include/api/CDataFrameAnalysisInstrumentation.h index fe1b0b1487..c87202fbcb 100644 --- a/include/api/CDataFrameAnalysisInstrumentation.h +++ b/include/api/CDataFrameAnalysisInstrumentation.h @@ -81,7 +81,7 @@ class API_EXPORT CDataFrameAnalysisInstrumentation private: void writeMemory(std::int64_t timestamp); - virtual void writeAnalysisStats(std::int64_t /* timestamp */){}; + virtual void writeAnalysisStats(std::int64_t timestamp) = 0; virtual void writeState(); private: @@ -119,7 +119,6 @@ class API_EXPORT CDataFrameTrainBoostedTreeInstrumentation final void lossType(const std::string& lossType) override; void lossValues(std::string fold, TDoubleVec&& lossValues) override; void numFolds(std::size_t numFolds) override; - void hyperparameters(const SHyperparameters& hyperparameters) override; SHyperparameters& hyperparameters() override { return m_Hyperparameters; }; protected: diff --git a/include/maths/CDataFrameAnalysisInstrumentationInterface.h b/include/maths/CDataFrameAnalysisInstrumentationInterface.h index 8a9d1f796e..2e2de4b04c 100644 --- a/include/maths/CDataFrameAnalysisInstrumentationInterface.h +++ b/include/maths/CDataFrameAnalysisInstrumentationInterface.h @@ -106,7 +106,6 @@ class MATHS_EXPORT CDataFrameTrainBoostedTreeInstrumentationInterface virtual void lossType(const std::string& lossType) = 0; virtual void lossValues(std::string fold, TDoubleVec&& lossValues) = 0; virtual void numFolds(std::size_t numFolds) = 0; - virtual void hyperparameters(const SHyperparameters& hyperparameters) = 0; virtual SHyperparameters& hyperparameters() = 0; }; @@ -132,7 +131,6 @@ class MATHS_EXPORT CDataFrameAnalysisInstrumentationStub final void lossType(const std::string& /* lossType */) override{}; void lossValues(std::string /* fold */, TDoubleVec&& /* lossValues */) override{}; void numFolds(std::size_t /* numFolds */) override{}; - void hyperparameters(const SHyperparameters& /* hyperparameters */) override{}; SHyperparameters& hyperparameters() override { return m_Hyperparameters; }; private: diff --git a/lib/api/CDataFrameAnalysisInstrumentation.cc b/lib/api/CDataFrameAnalysisInstrumentation.cc index a1952b2ebc..5d66273a3d 100644 --- a/lib/api/CDataFrameAnalysisInstrumentation.cc +++ b/lib/api/CDataFrameAnalysisInstrumentation.cc @@ -195,10 +195,6 @@ void CDataFrameTrainBoostedTreeInstrumentation::numFolds(std::size_t numFolds) { m_NumFolds = numFolds; } -void CDataFrameTrainBoostedTreeInstrumentation::hyperparameters(const SHyperparameters& hyperparameters) { - m_Hyperparameters = hyperparameters; -} - void CDataFrameTrainBoostedTreeInstrumentation::writeAnalysisStats(std::int64_t timestamp) { auto* writer{this->writer()}; if (writer != nullptr) { From 38032fd3892627d0680627f8ce21cbf7f144dabb Mon Sep 17 00:00:00 2001 From: Valeriy Khakhutskyy <1292899+valeriy42@users.noreply.github.com> Date: Thu, 27 Feb 2020 15:43:30 +0100 Subject: [PATCH 13/40] minor refactorings, add reset --- include/api/CDataFrameAnalysisInstrumentation.h | 3 ++- include/test/CDataFrameAnalyzerTrainingFactory.h | 7 +++---- lib/api/CDataFrameAnalysisInstrumentation.cc | 8 ++++++-- lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc | 1 + lib/maths/CBoostedTreeImpl.cc | 8 ++------ 5 files changed, 14 insertions(+), 13 deletions(-) diff --git a/include/api/CDataFrameAnalysisInstrumentation.h b/include/api/CDataFrameAnalysisInstrumentation.h index c87202fbcb..95e437b9a7 100644 --- a/include/api/CDataFrameAnalysisInstrumentation.h +++ b/include/api/CDataFrameAnalysisInstrumentation.h @@ -132,12 +132,13 @@ class API_EXPORT CDataFrameTrainBoostedTreeInstrumentation final void writeHyperparameters(rapidjson::Value& parentObject); void writeValidationLoss(rapidjson::Value& parentObject); void writeTimingStats(rapidjson::Value& parentObject); + void reset(); private: EStatsType m_Type; std::size_t m_Iteration; std::uint64_t m_IterationTime; - std::uint64_t m_ElapsedTime; + std::uint64_t m_ElapsedTime = 0; std::string m_LossType; TLossMap m_LossValues; std::size_t m_NumFolds; diff --git a/include/test/CDataFrameAnalyzerTrainingFactory.h b/include/test/CDataFrameAnalyzerTrainingFactory.h index d70bf76c65..97ce69ba63 100644 --- a/include/test/CDataFrameAnalyzerTrainingFactory.h +++ b/include/test/CDataFrameAnalyzerTrainingFactory.h @@ -7,16 +7,15 @@ #ifndef INCLUDED_ml_test_CDataFrameAnalyzerTrainingFactory_h #define INCLUDED_ml_test_CDataFrameAnalyzerTrainingFactory_h -#include -#include - #include #include #include -#include +#include +#include +#include #include #include diff --git a/lib/api/CDataFrameAnalysisInstrumentation.cc b/lib/api/CDataFrameAnalysisInstrumentation.cc index 5d66273a3d..f420724a48 100644 --- a/lib/api/CDataFrameAnalysisInstrumentation.cc +++ b/lib/api/CDataFrameAnalysisInstrumentation.cc @@ -230,6 +230,12 @@ void CDataFrameTrainBoostedTreeInstrumentation::writeAnalysisStats(std::int64_t writer->EndObject(); writer->EndObject(); } + this->reset(); +} + +void CDataFrameTrainBoostedTreeInstrumentation::reset() { + // Clear the map of loss values before the next iteration + m_LossValues.clear(); } void CDataFrameTrainBoostedTreeInstrumentation::writeHyperparameters(rapidjson::Value& parentObject) { @@ -305,7 +311,6 @@ void CDataFrameTrainBoostedTreeInstrumentation::writeValidationLoss(rapidjson::V if (writer != nullptr) { writer->addMember(VALIDATION_LOSS_TYPE_TAG, m_LossType, parentObject); rapidjson::Value lossValuesObject{writer->makeObject()}; - // writer->StartObject(); for (auto& element : m_LossValues) { rapidjson::Value array{writer->makeArray(element.second.size())}; for (double lossValue : element.second) { @@ -315,7 +320,6 @@ void CDataFrameTrainBoostedTreeInstrumentation::writeValidationLoss(rapidjson::V writer->addMember(element.first, array, lossValuesObject); } writer->addMember(VALIDATION_LOSS_VALUES_TAG, lossValuesObject, parentObject); - // writer->EndObject(); } } void CDataFrameTrainBoostedTreeInstrumentation::writeTimingStats(rapidjson::Value& parentObject) { diff --git a/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc b/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc index 74040c33b5..072c47d07c 100644 --- a/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc +++ b/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc @@ -133,6 +133,7 @@ BOOST_AUTO_TEST_CASE(testTrainingRegression) { } } } + // TODO add memory format test } BOOST_AUTO_TEST_CASE(testTrainingClassification) { diff --git a/lib/maths/CBoostedTreeImpl.cc b/lib/maths/CBoostedTreeImpl.cc index 9d1a14db9e..596cf22628 100644 --- a/lib/maths/CBoostedTreeImpl.cc +++ b/lib/maths/CBoostedTreeImpl.cc @@ -4,9 +4,6 @@ * you may not use this file except in compliance with the Elastic License. */ -#include "maths/CBoostedTree.h" -#include "maths/CDataFrameAnalysisInstrumentationInterface.h" -#include #include #include @@ -19,13 +16,14 @@ #include #include +#include #include +#include #include #include #include #include #include -#include namespace ml { namespace maths { @@ -42,10 +40,8 @@ namespace { // quantiles anyway. So we amortise their compute cost w.r.t. training trees // by only refreshing once every MINIMUM_SPLIT_REFRESH_INTERVAL trees we add. const double MINIMUM_SPLIT_REFRESH_INTERVAL{3.0}; - const std::string HYPERPARAMETER_OPTIMIZATION_PHASE{"hyperparameter_optimization"}; const std::string TRAINING_FINAL_TREE_PHASE{"training_final_tree"}; - const std::array REGRESSION_LOSSES{CMse::NAME}; //! \brief Record the memory used by a supplied object using the RAII idiom. From 8ff041f2e2514836cf18603de9b57b0f24ea9a02 Mon Sep 17 00:00:00 2001 From: Valeriy Khakhutskyy <1292899+valeriy42@users.noreply.github.com> Date: Fri, 28 Feb 2020 09:53:46 +0100 Subject: [PATCH 14/40] Formatting and adding docs --- include/api/CDataFrameAnalysisInstrumentation.h | 17 +++++++++++++++-- ...CDataFrameAnalysisInstrumentationInterface.h | 17 +++++++++++++---- .../test/CDataFrameAnalyzerTrainingFactory.h | 2 +- lib/api/CDataFrameAnalysisInstrumentation.cc | 4 ---- .../CDataFrameAnalysisInstrumentationTest.cc | 4 ++-- lib/api/unittest/CDataFrameMockAnalysisRunner.h | 1 + 6 files changed, 32 insertions(+), 13 deletions(-) diff --git a/include/api/CDataFrameAnalysisInstrumentation.h b/include/api/CDataFrameAnalysisInstrumentation.h index 95e437b9a7..6f6f8fba8f 100644 --- a/include/api/CDataFrameAnalysisInstrumentation.h +++ b/include/api/CDataFrameAnalysisInstrumentation.h @@ -35,6 +35,7 @@ class API_EXPORT CDataFrameAnalysisInstrumentation using TRapidJsonWriter = core::CRapidJsonConcurrentLineWriter; public: + //! Constructs an intrumentation object an analytics job with a given \p jobId. explicit CDataFrameAnalysisInstrumentation(const std::string& jobId); //! Adds \p delta to the memory usage statistics. @@ -68,11 +69,13 @@ class API_EXPORT CDataFrameAnalysisInstrumentation //! Trigger the next step of the job. This will initiate writing the job state //! to the results pipe. + //! \todo use \p phase to tag different phases of the analysis job. void nextStep(const std::string& phase = "") override; //! \return The peak memory usage. std::int64_t memory() const; + //! \return The id of the data frame analytics job. const std::string& jobId() const; protected: @@ -92,6 +95,7 @@ class API_EXPORT CDataFrameAnalysisInstrumentation std::string m_JobId; }; +//! \brief Instrumentation class for Outlier Detection jobs. class API_EXPORT CDataFrameOutliersInstrumentation final : public CDataFrameAnalysisInstrumentation, public maths::CDataFrameOutliersInstrumentationInterface { @@ -106,6 +110,11 @@ class API_EXPORT CDataFrameOutliersInstrumentation final void writeAnalysisStats(std::int64_t timestamp) override; }; +//! \brief Instrumentation class for Supervised Learning jobs. +//! +//! DESCRIPTION:\n +//! This class extends CDataFrameAnalysisInstrumentation with a setters +//! for hyperparameters, validatioin loss results, and job timing. class API_EXPORT CDataFrameTrainBoostedTreeInstrumentation final : public CDataFrameAnalysisInstrumentation, public maths::CDataFrameTrainBoostedTreeInstrumentationInterface { @@ -113,12 +122,17 @@ class API_EXPORT CDataFrameTrainBoostedTreeInstrumentation final explicit CDataFrameTrainBoostedTreeInstrumentation(const std::string& jobId) : CDataFrameAnalysisInstrumentation(jobId){}; + //! Supevised learning job \p type, can be E_Regression or E_Classification. void type(EStatsType type) override; + //! Current \p iteration number. void iteration(std::size_t iteration) override; + //! Run time of the iteration. void iterationTime(std::uint64_t delta) override; + //! Type of the validation loss result, e.g. "mse". void lossType(const std::string& lossType) override; + //! List of \p lossValues of validation error for the given \p fold. void lossValues(std::string fold, TDoubleVec&& lossValues) override; - void numFolds(std::size_t numFolds) override; + //! \return Strucutre contains hyperparameters. SHyperparameters& hyperparameters() override { return m_Hyperparameters; }; protected: @@ -141,7 +155,6 @@ class API_EXPORT CDataFrameTrainBoostedTreeInstrumentation final std::uint64_t m_ElapsedTime = 0; std::string m_LossType; TLossMap m_LossValues; - std::size_t m_NumFolds; SHyperparameters m_Hyperparameters; }; } diff --git a/include/maths/CDataFrameAnalysisInstrumentationInterface.h b/include/maths/CDataFrameAnalysisInstrumentationInterface.h index 2e2de4b04c..2ac58952df 100644 --- a/include/maths/CDataFrameAnalysisInstrumentationInterface.h +++ b/include/maths/CDataFrameAnalysisInstrumentationInterface.h @@ -61,6 +61,11 @@ class MATHS_EXPORT CDataFrameAnalysisInstrumentationInterface { class MATHS_EXPORT CDataFrameOutliersInstrumentationInterface : virtual public CDataFrameAnalysisInstrumentationInterface {}; +//! \brief Instrumentation interface for Supervised Learning jobs. +//! +//! DESCRIPTION:\n +//! This interface extends CDataFrameAnalysisInstrumentationInterface with a setters +//! for hyperparameters, validatioin loss results, and job timing. class MATHS_EXPORT CDataFrameTrainBoostedTreeInstrumentationInterface : virtual public CDataFrameAnalysisInstrumentationInterface { public: @@ -100,16 +105,21 @@ class MATHS_EXPORT CDataFrameTrainBoostedTreeInstrumentationInterface public: virtual ~CDataFrameTrainBoostedTreeInstrumentationInterface() = default; + //! Supevised learning job \p type, can be E_Regression or E_Classification. virtual void type(EStatsType type) = 0; + //! Current \p iteration number. virtual void iteration(std::size_t iteration) = 0; + //! Run time of the iteration. virtual void iterationTime(std::uint64_t delta) = 0; + //! Type of the validation loss result, e.g. "mse". virtual void lossType(const std::string& lossType) = 0; + //! List of \p lossValues of validation error for the given \p fold. virtual void lossValues(std::string fold, TDoubleVec&& lossValues) = 0; - virtual void numFolds(std::size_t numFolds) = 0; + //! \return Strucutre contains hyperparameters. virtual SHyperparameters& hyperparameters() = 0; }; -//! \brief Dummies out all instrumentation. +//! \brief Dummies out all instrumentation for outlier detection. class MATHS_EXPORT CDataFrameOutliersInstrumentationStub final : public CDataFrameOutliersInstrumentationInterface { public: @@ -118,7 +128,7 @@ class MATHS_EXPORT CDataFrameOutliersInstrumentationStub final void nextStep(const std::string& /* phase */) override {} }; -//! \brief Dummies out all instrumentation. +//! \brief Dummies out all instrumentation for supervised learning. class MATHS_EXPORT CDataFrameAnalysisInstrumentationStub final : public CDataFrameTrainBoostedTreeInstrumentationInterface { public: @@ -130,7 +140,6 @@ class MATHS_EXPORT CDataFrameAnalysisInstrumentationStub final void iterationTime(std::uint64_t /* delta */) override{}; void lossType(const std::string& /* lossType */) override{}; void lossValues(std::string /* fold */, TDoubleVec&& /* lossValues */) override{}; - void numFolds(std::size_t /* numFolds */) override{}; SHyperparameters& hyperparameters() override { return m_Hyperparameters; }; private: diff --git a/include/test/CDataFrameAnalyzerTrainingFactory.h b/include/test/CDataFrameAnalyzerTrainingFactory.h index 97ce69ba63..9dd9e33ceb 100644 --- a/include/test/CDataFrameAnalyzerTrainingFactory.h +++ b/include/test/CDataFrameAnalyzerTrainingFactory.h @@ -23,7 +23,7 @@ namespace ml { namespace test { - +//! \brief Collection of helping methods to create regression and classification data for tests. class TEST_EXPORT CDataFrameAnalyzerTrainingFactory { public: enum EPredictionType { E_Regression, E_BinaryClassification }; diff --git a/lib/api/CDataFrameAnalysisInstrumentation.cc b/lib/api/CDataFrameAnalysisInstrumentation.cc index f420724a48..8282826d53 100644 --- a/lib/api/CDataFrameAnalysisInstrumentation.cc +++ b/lib/api/CDataFrameAnalysisInstrumentation.cc @@ -191,10 +191,6 @@ void CDataFrameTrainBoostedTreeInstrumentation::lossValues(std::string fold, m_LossValues.emplace(std::make_pair(fold, lossValues)); } -void CDataFrameTrainBoostedTreeInstrumentation::numFolds(std::size_t numFolds) { - m_NumFolds = numFolds; -} - void CDataFrameTrainBoostedTreeInstrumentation::writeAnalysisStats(std::int64_t timestamp) { auto* writer{this->writer()}; if (writer != nullptr) { diff --git a/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc b/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc index 072c47d07c..668a8eefa0 100644 --- a/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc +++ b/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc @@ -174,8 +174,8 @@ BOOST_AUTO_TEST_CASE(testTrainingClassification) { for (const auto& result : results.GetArray()) { if (result.HasMember("analysis_stats")) { - BOOST_TEST_REQUIRE(result["analysis_stats"].HasMember("regression_stats")); - if (result["analysis_stats"]["regression_stats"].Accept(validator) == false) { + BOOST_TEST_REQUIRE(result["analysis_stats"].HasMember("classification_stats")); + if (result["analysis_stats"]["classification_stats"].Accept(validator) == false) { rapidjson::StringBuffer sb; validator.GetInvalidSchemaPointer().StringifyUriFragment(sb); LOG_ERROR(<< "Invalid schema: " << sb.GetString()); diff --git a/lib/api/unittest/CDataFrameMockAnalysisRunner.h b/lib/api/unittest/CDataFrameMockAnalysisRunner.h index b35f9e4a5f..b102d96569 100644 --- a/lib/api/unittest/CDataFrameMockAnalysisRunner.h +++ b/lib/api/unittest/CDataFrameMockAnalysisRunner.h @@ -19,6 +19,7 @@ class CDataFrameMockAnalysisState final : public ml::api::CDataFrameAnalysisInst public: CDataFrameMockAnalysisState(const std::string& jobId) : ml::api::CDataFrameAnalysisInstrumentation(jobId) {} + void writeAnalysisStats(std::int64_t /* timestamp */) override{}; protected: ml::counter_t::ECounterTypes memoryCounterType() override; From 32af0d8e424a9708e3a2ca63c9f5fdaf2a2329ab Mon Sep 17 00:00:00 2001 From: Valeriy Khakhutskyy <1292899+valeriy42@users.noreply.github.com> Date: Fri, 28 Feb 2020 09:56:34 +0100 Subject: [PATCH 15/40] Add Enhancement note --- docs/CHANGELOG.asciidoc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/CHANGELOG.asciidoc b/docs/CHANGELOG.asciidoc index 05c035cb0b..089c413b66 100644 --- a/docs/CHANGELOG.asciidoc +++ b/docs/CHANGELOG.asciidoc @@ -56,6 +56,8 @@ necessary. This will improve the allocation of data frame analyses to cluster no (See {ml-pull}1003[#1003].) * Upgrade the compiler used on Linux from gcc 7.3 to gcc 7.5, and the binutils used in the build from version 2.20 to 2.34. (See {ml-pull}1013[#1013].) +* Add instrumentation information for supervised learning data frame analytics jobs. +(See {ml-pull}1031[#1031].) === Bug Fixes From 5c11a61e3b785806b5e70889dfee1731e23977bc Mon Sep 17 00:00:00 2001 From: Valeriy Khakhutskyy <1292899+valeriy42@users.noreply.github.com> Date: Fri, 28 Feb 2020 11:30:57 +0100 Subject: [PATCH 16/40] Rename stub and formatting --- include/maths/CDataFrameAnalysisInstrumentationInterface.h | 2 +- lib/maths/CBoostedTreeImpl.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/maths/CDataFrameAnalysisInstrumentationInterface.h b/include/maths/CDataFrameAnalysisInstrumentationInterface.h index 2ac58952df..7a7dd6b9c8 100644 --- a/include/maths/CDataFrameAnalysisInstrumentationInterface.h +++ b/include/maths/CDataFrameAnalysisInstrumentationInterface.h @@ -129,7 +129,7 @@ class MATHS_EXPORT CDataFrameOutliersInstrumentationStub final }; //! \brief Dummies out all instrumentation for supervised learning. -class MATHS_EXPORT CDataFrameAnalysisInstrumentationStub final +class MATHS_EXPORT CDataFrameTrainBoostedTreeInstrumentationStub final : public CDataFrameTrainBoostedTreeInstrumentationInterface { public: void updateMemoryUsage(std::int64_t) override {} diff --git a/lib/maths/CBoostedTreeImpl.cc b/lib/maths/CBoostedTreeImpl.cc index 596cf22628..96b901a21c 100644 --- a/lib/maths/CBoostedTreeImpl.cc +++ b/lib/maths/CBoostedTreeImpl.cc @@ -136,7 +136,7 @@ double trace(std::size_t columns, const TMemoryMappedFloatVector& upperTriangle) return result; } -CDataFrameAnalysisInstrumentationStub INSTRUMENTATION_STUB; +CDataFrameTrainBoostedTreeInstrumentationStub INSTRUMENTATION_STUB; } CBoostedTreeImpl::CBoostedTreeImpl(std::size_t numberThreads, From 8f25c079acb3f5f3d93c83a10fab893e41797762 Mon Sep 17 00:00:00 2001 From: Valeriy Khakhutskyy <1292899+valeriy42@users.noreply.github.com> Date: Fri, 28 Feb 2020 11:47:45 +0100 Subject: [PATCH 17/40] formatting --- include/maths/CBoostedTreeImpl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/maths/CBoostedTreeImpl.h b/include/maths/CBoostedTreeImpl.h index 98264d1dc7..1f31a041f6 100644 --- a/include/maths/CBoostedTreeImpl.h +++ b/include/maths/CBoostedTreeImpl.h @@ -289,7 +289,7 @@ class MATHS_EXPORT CBoostedTreeImpl final { //! Record the training state using the \p recordTrainState callback function void recordState(const TTrainingStateCallback& recordTrainState) const; - + //! Record hyperparameters for instrumentation. void recordHyperparameters(); From 7a50fdc8e67a8592ff65dd80b8f2516ba2190db7 Mon Sep 17 00:00:00 2001 From: Valeriy Khakhutskyy <1292899+valeriy42@users.noreply.github.com> Date: Fri, 28 Feb 2020 15:09:35 +0100 Subject: [PATCH 18/40] fix missing header --- include/test/CDataFrameAnalyzerTrainingFactory.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/test/CDataFrameAnalyzerTrainingFactory.h b/include/test/CDataFrameAnalyzerTrainingFactory.h index 9dd9e33ceb..5d7708f1dd 100644 --- a/include/test/CDataFrameAnalyzerTrainingFactory.h +++ b/include/test/CDataFrameAnalyzerTrainingFactory.h @@ -10,6 +10,7 @@ #include #include +#include #include #include From fac222e8bb5cd2ab719597c94ecd68b7d047e4e9 Mon Sep 17 00:00:00 2001 From: Valeriy Khakhutskyy <1292899+valeriy42@users.noreply.github.com> Date: Tue, 10 Mar 2020 10:54:50 +0100 Subject: [PATCH 19/40] reviewers comments --- .../api/CDataFrameAnalysisInstrumentation.h | 8 +-- ...ataFrameAnalysisInstrumentationInterface.h | 42 +++++++------- lib/api/CDataFrameAnalysisInstrumentation.cc | 56 ++++++++++--------- lib/maths/CBoostedTreeImpl.cc | 2 +- lib/test/CDataFrameAnalyzerTrainingFactory.cc | 2 +- 5 files changed, 56 insertions(+), 54 deletions(-) diff --git a/include/api/CDataFrameAnalysisInstrumentation.h b/include/api/CDataFrameAnalysisInstrumentation.h index 6f6f8fba8f..58d3429edd 100644 --- a/include/api/CDataFrameAnalysisInstrumentation.h +++ b/include/api/CDataFrameAnalysisInstrumentation.h @@ -113,8 +113,8 @@ class API_EXPORT CDataFrameOutliersInstrumentation final //! \brief Instrumentation class for Supervised Learning jobs. //! //! DESCRIPTION:\n -//! This class extends CDataFrameAnalysisInstrumentation with a setters -//! for hyperparameters, validatioin loss results, and job timing. +//! This class extends CDataFrameAnalysisInstrumentation with setters +//! for hyperparameters, validation loss results, and job timing. class API_EXPORT CDataFrameTrainBoostedTreeInstrumentation final : public CDataFrameAnalysisInstrumentation, public maths::CDataFrameTrainBoostedTreeInstrumentationInterface { @@ -122,7 +122,7 @@ class API_EXPORT CDataFrameTrainBoostedTreeInstrumentation final explicit CDataFrameTrainBoostedTreeInstrumentation(const std::string& jobId) : CDataFrameAnalysisInstrumentation(jobId){}; - //! Supevised learning job \p type, can be E_Regression or E_Classification. + //! Supervised learning job \p type, can be E_Regression or E_Classification. void type(EStatsType type) override; //! Current \p iteration number. void iteration(std::size_t iteration) override; @@ -132,7 +132,7 @@ class API_EXPORT CDataFrameTrainBoostedTreeInstrumentation final void lossType(const std::string& lossType) override; //! List of \p lossValues of validation error for the given \p fold. void lossValues(std::string fold, TDoubleVec&& lossValues) override; - //! \return Strucutre contains hyperparameters. + //! \return Structure contains hyperparameters. SHyperparameters& hyperparameters() override { return m_Hyperparameters; }; protected: diff --git a/include/maths/CDataFrameAnalysisInstrumentationInterface.h b/include/maths/CDataFrameAnalysisInstrumentationInterface.h index 7a7dd6b9c8..951356406c 100644 --- a/include/maths/CDataFrameAnalysisInstrumentationInterface.h +++ b/include/maths/CDataFrameAnalysisInstrumentationInterface.h @@ -65,7 +65,7 @@ class MATHS_EXPORT CDataFrameOutliersInstrumentationInterface //! //! DESCRIPTION:\n //! This interface extends CDataFrameAnalysisInstrumentationInterface with a setters -//! for hyperparameters, validatioin loss results, and job timing. +//! for hyperparameters, validation loss results, and job timing. class MATHS_EXPORT CDataFrameTrainBoostedTreeInstrumentationInterface : virtual public CDataFrameAnalysisInstrumentationInterface { public: @@ -81,25 +81,25 @@ class MATHS_EXPORT CDataFrameTrainBoostedTreeInstrumentationInterface s_SoftTreeDepthLimit{softTreeDepthLimit}, s_SoftTreeDepthTolerance{softTreeDepthTolerance}, s_TreeSizePenaltyMultiplier{treeSizePenaltyMultiplier}, s_LeafWeightPenaltyMultiplier{leafWeightPenaltyMultiplier} {}; - double s_DepthPenaltyMultiplier = 0.0; - double s_SoftTreeDepthLimit = 0.0; - double s_SoftTreeDepthTolerance = 0.0; - double s_TreeSizePenaltyMultiplier = 0.0; - double s_LeafWeightPenaltyMultiplier = 0.0; + double s_DepthPenaltyMultiplier = -1.0; + double s_SoftTreeDepthLimit = -1.0; + double s_SoftTreeDepthTolerance = -1.0; + double s_TreeSizePenaltyMultiplier = -1.0; + double s_LeafWeightPenaltyMultiplier = -1.0; }; struct SHyperparameters { - double s_Eta = 0.1; + double s_Eta = -1.0; CBoostedTree::EClassAssignmentObjective s_ClassAssignmentObjective = CBoostedTree::E_MinimumRecall; SRegularization s_Regularization; - double s_DownsampleFactor = 0.5; - std::size_t s_NumFolds = 4; - std::size_t s_MaxTrees = 20; - double s_FeatureBagFraction = 0.5; - double s_EtaGrowthRatePerTree = 1.05; - std::size_t s_MaxAttemptsToAddTree = 3; - std::size_t s_NumSplitsPerFeature = 75; - std::size_t s_MaxOptimizationRoundsPerHyperparameter = 2; + double s_DownsampleFactor = -1.0; + std::size_t s_NumFolds = 0; + std::size_t s_MaxTrees = 0; + double s_FeatureBagFraction = -1.0; + double s_EtaGrowthRatePerTree = -1.0; + std::size_t s_MaxAttemptsToAddTree = 0; + std::size_t s_NumSplitsPerFeature = 0; + std::size_t s_MaxOptimizationRoundsPerHyperparameter = 0; }; using TDoubleVec = std::vector; @@ -135,12 +135,12 @@ class MATHS_EXPORT CDataFrameTrainBoostedTreeInstrumentationStub final void updateMemoryUsage(std::int64_t) override {} void updateProgress(double) override {} void nextStep(const std::string& /* phase */) override {} - void type(EStatsType /* type */) override{}; - void iteration(std::size_t /* iteration */) override{}; - void iterationTime(std::uint64_t /* delta */) override{}; - void lossType(const std::string& /* lossType */) override{}; - void lossValues(std::string /* fold */, TDoubleVec&& /* lossValues */) override{}; - SHyperparameters& hyperparameters() override { return m_Hyperparameters; }; + void type(EStatsType /* type */) override {} + void iteration(std::size_t /* iteration */) override {} + void iterationTime(std::uint64_t /* delta */) override {} + void lossType(const std::string& /* lossType */) override {} + void lossValues(std::string /* fold */, TDoubleVec&& /* lossValues */) override {} + SHyperparameters& hyperparameters() override { return m_Hyperparameters; } private: SHyperparameters m_Hyperparameters; diff --git a/lib/api/CDataFrameAnalysisInstrumentation.cc b/lib/api/CDataFrameAnalysisInstrumentation.cc index 8282826d53..1c47f9a535 100644 --- a/lib/api/CDataFrameAnalysisInstrumentation.cc +++ b/lib/api/CDataFrameAnalysisInstrumentation.cc @@ -235,71 +235,73 @@ void CDataFrameTrainBoostedTreeInstrumentation::reset() { } void CDataFrameTrainBoostedTreeInstrumentation::writeHyperparameters(rapidjson::Value& parentObject) { - if (this->writer() != nullptr) { + auto* writer{this->writer()}; + + if (writer != nullptr) { - this->writer()->addMember( - ETA_TAG, rapidjson::Value(this->m_Hyperparameters.s_Eta).Move(), parentObject); + writer->addMember(ETA_TAG, + rapidjson::Value(this->m_Hyperparameters.s_Eta).Move(), + parentObject); if (m_Type == E_Classification) { - this->writer()->addMember( - CLASS_ASSIGNMENT_OBJECTIVE_TAG, - CLASS_ASSIGNMENT_OBJECTIVE[this->m_Hyperparameters.s_ClassAssignmentObjective], - parentObject); + writer->addMember(CLASS_ASSIGNMENT_OBJECTIVE_TAG, + CLASS_ASSIGNMENT_OBJECTIVE[this->m_Hyperparameters.s_ClassAssignmentObjective], + parentObject); } - this->writer()->addMember( + writer->addMember( REGULARIZATION_DEPTH_PENALTY_MULTIPLIER_TAG, rapidjson::Value(this->m_Hyperparameters.s_Regularization.s_DepthPenaltyMultiplier) .Move(), parentObject); - this->writer()->addMember( + writer->addMember( REGULARIZATION_SOFT_TREE_DEPTH_LIMIT_TAG, rapidjson::Value(this->m_Hyperparameters.s_Regularization.s_SoftTreeDepthLimit) .Move(), parentObject); - this->writer()->addMember( + writer->addMember( REGULARIZATION_SOFT_TREE_DEPTH_TOLERANCE_TAG, rapidjson::Value(this->m_Hyperparameters.s_Regularization.s_SoftTreeDepthTolerance) .Move(), parentObject); - this->writer()->addMember( + writer->addMember( REGULARIZATION_TREE_SIZE_PENALTY_MULTIPLIER_TAG, rapidjson::Value(this->m_Hyperparameters.s_Regularization.s_TreeSizePenaltyMultiplier) .Move(), parentObject); - this->writer()->addMember( + writer->addMember( REGULARIZATION_LEAF_WEIGHT_PENALTY_MULTIPLIER_TAG, rapidjson::Value(this->m_Hyperparameters.s_Regularization.s_LeafWeightPenaltyMultiplier) .Move(), parentObject); - this->writer()->addMember( + writer->addMember( DOWNSAMPLE_FACTOR_TAG, rapidjson::Value(this->m_Hyperparameters.s_DownsampleFactor).Move(), parentObject); - this->writer()->addMember( - NUM_FOLDS_TAG, - rapidjson::Value(this->m_Hyperparameters.s_NumFolds).Move(), parentObject); - this->writer()->addMember( - MAX_TREES_TAG, - rapidjson::Value(this->m_Hyperparameters.s_MaxTrees).Move(), parentObject); - this->writer()->addMember( + writer->addMember(NUM_FOLDS_TAG, + rapidjson::Value(this->m_Hyperparameters.s_NumFolds).Move(), + parentObject); + writer->addMember(MAX_TREES_TAG, + rapidjson::Value(this->m_Hyperparameters.s_MaxTrees).Move(), + parentObject); + writer->addMember( FEATURE_BAG_FRACTION_TAG, rapidjson::Value(this->m_Hyperparameters.s_FeatureBagFraction).Move(), parentObject); - this->writer()->addMember( + writer->addMember( ETA_GROWTH_RATE_PER_TREE_TAG, rapidjson::Value(this->m_Hyperparameters.s_EtaGrowthRatePerTree).Move(), parentObject); - this->writer()->addMember( + writer->addMember( MAX_ATTEMPTS_TO_ADD_TREE_TAG, rapidjson::Value(this->m_Hyperparameters.s_MaxAttemptsToAddTree).Move(), parentObject); - this->writer()->addMember( + writer->addMember( NUM_SPLITS_PER_FEATURE_TAG, rapidjson::Value(this->m_Hyperparameters.s_NumSplitsPerFeature).Move(), parentObject); - this->writer()->addMember(MAX_OPTIMIZATION_ROUNDS_PER_HYPERPARAMETER_TAG, - rapidjson::Value(this->m_Hyperparameters.s_MaxOptimizationRoundsPerHyperparameter) - .Move(), - parentObject); + writer->addMember(MAX_OPTIMIZATION_ROUNDS_PER_HYPERPARAMETER_TAG, + rapidjson::Value(this->m_Hyperparameters.s_MaxOptimizationRoundsPerHyperparameter) + .Move(), + parentObject); } } void CDataFrameTrainBoostedTreeInstrumentation::writeValidationLoss(rapidjson::Value& parentObject) { diff --git a/lib/maths/CBoostedTreeImpl.cc b/lib/maths/CBoostedTreeImpl.cc index 05d9404235..e198d9fa38 100644 --- a/lib/maths/CBoostedTreeImpl.cc +++ b/lib/maths/CBoostedTreeImpl.cc @@ -582,7 +582,7 @@ CBoostedTreeImpl::trainForest(core::CDataFrame& frame, LOG_TRACE(<< "Trained one forest"); - return {forest, stoppingCondition.bestLoss(), losses}; + return {forest, stoppingCondition.bestLoss(), std::move(losses)}; } core::CPackedBitVector diff --git a/lib/test/CDataFrameAnalyzerTrainingFactory.cc b/lib/test/CDataFrameAnalyzerTrainingFactory.cc index 984ae340e9..e33945a8e1 100644 --- a/lib/test/CDataFrameAnalyzerTrainingFactory.cc +++ b/lib/test/CDataFrameAnalyzerTrainingFactory.cc @@ -113,4 +113,4 @@ CDataFrameAnalyzerTrainingFactory::setupBinaryClassificationData(const TStrVec& return frame; } } -} \ No newline at end of file +} From ef63e128b9bca5f6acdb73e49691bff3378f5a03 Mon Sep 17 00:00:00 2001 From: Valeriy Khakhutskyy <1292899+valeriy42@users.noreply.github.com> Date: Tue, 10 Mar 2020 12:55:01 +0100 Subject: [PATCH 20/40] Fix merge conflicts --- include/api/CDataFrameAnalysisInstrumentation.h | 17 +++++++---------- lib/api/CDataFrameAnalysisInstrumentation.cc | 16 ++++++++-------- 2 files changed, 15 insertions(+), 18 deletions(-) diff --git a/include/api/CDataFrameAnalysisInstrumentation.h b/include/api/CDataFrameAnalysisInstrumentation.h index 3ceba4a313..4e3f111a2b 100644 --- a/include/api/CDataFrameAnalysisInstrumentation.h +++ b/include/api/CDataFrameAnalysisInstrumentation.h @@ -79,9 +79,6 @@ class API_EXPORT CDataFrameAnalysisInstrumentation //! Reset variables related to the job progress. void resetProgress(); - //! Set pointer to the writer object. - void writer(TRapidJsonWriter* writer); - //! Trigger the next step of the job. This will initiate writing the job state //! to the results pipe. //! \todo use \p phase to tag different phases of the analysis job. @@ -94,11 +91,12 @@ class API_EXPORT CDataFrameAnalysisInstrumentation const std::string& jobId() const; protected: - virtual counter_t::ECounterTypes memoryCounterType() = 0; - TRapidJsonWriter* writer(); + using TWriter = core::CRapidJsonConcurrentLineWriter; + using TWriterUPtr = std::unique_ptr; -private: - using TWriterUPtr = std::unique_ptr; +protected: + virtual counter_t::ECounterTypes memoryCounterType() = 0; + TWriter* writer(); private: void writeMemory(std::int64_t timestamp); @@ -111,7 +109,6 @@ class API_EXPORT CDataFrameAnalysisInstrumentation std::atomic_size_t m_FractionalProgress; std::atomic m_Memory; TWriterUPtr m_Writer; - std::string m_JobId; }; //! \brief Instrumentation class for Outlier Detection jobs. @@ -120,7 +117,7 @@ class API_EXPORT CDataFrameOutliersInstrumentation final public maths::CDataFrameOutliersInstrumentationInterface { public: explicit CDataFrameOutliersInstrumentation(const std::string& jobId) - : CDataFrameAnalysisInstrumentation(jobId){} + : CDataFrameAnalysisInstrumentation(jobId) {} protected: counter_t::ECounterTypes memoryCounterType() override; @@ -139,7 +136,7 @@ class API_EXPORT CDataFrameTrainBoostedTreeInstrumentation final public maths::CDataFrameTrainBoostedTreeInstrumentationInterface { public: explicit CDataFrameTrainBoostedTreeInstrumentation(const std::string& jobId) - : CDataFrameAnalysisInstrumentation(jobId){} + : CDataFrameAnalysisInstrumentation(jobId) {} //! Supervised learning job \p type, can be E_Regression or E_Classification. void type(EStatsType type) override; diff --git a/lib/api/CDataFrameAnalysisInstrumentation.cc b/lib/api/CDataFrameAnalysisInstrumentation.cc index ef83ba43c6..6eeb998616 100644 --- a/lib/api/CDataFrameAnalysisInstrumentation.cc +++ b/lib/api/CDataFrameAnalysisInstrumentation.cc @@ -95,7 +95,7 @@ double CDataFrameAnalysisInstrumentation::progress() const { } CDataFrameAnalysisInstrumentation::CDataFrameAnalysisInstrumentation(const std::string& jobId) - : m_JobId{jobId}, m_Finished{false}, m_FractionalProgress{0}, m_Memory{0}, m_Writer{nullptr}, m_JobId{jobId} { + : m_JobId{jobId}, m_Finished{false}, m_FractionalProgress{0}, m_Memory{0}, m_Writer{nullptr} { } void CDataFrameAnalysisInstrumentation::resetProgress() { @@ -140,8 +140,8 @@ const std::string& CDataFrameAnalysisInstrumentation::jobId() const { return m_JobId; } -core::CRapidJsonConcurrentLineWriter* CDataFrameAnalysisInstrumentation::writer() { - return m_Writer; +CDataFrameAnalysisInstrumentation::TWriter* CDataFrameAnalysisInstrumentation::writer() { + return m_Writer.get(); } counter_t::ECounterTypes CDataFrameOutliersInstrumentation::memoryCounterType() { @@ -153,7 +153,7 @@ counter_t::ECounterTypes CDataFrameTrainBoostedTreeInstrumentation::memoryCounte } void CDataFrameOutliersInstrumentation::writeAnalysisStats(std::int64_t timestamp) { - auto* writer{this->writer()}; + auto writer = this->writer(); if (writer != nullptr) { writer->StartObject(); writer->Key(JOB_ID_TAG); @@ -187,7 +187,7 @@ void CDataFrameTrainBoostedTreeInstrumentation::lossValues(std::string fold, } void CDataFrameTrainBoostedTreeInstrumentation::writeAnalysisStats(std::int64_t timestamp) { - auto* writer{this->writer()}; + auto* writer = this->writer(); if (writer != nullptr) { writer->StartObject(); writer->Key(JOB_ID_TAG); @@ -230,7 +230,7 @@ void CDataFrameTrainBoostedTreeInstrumentation::reset() { } void CDataFrameTrainBoostedTreeInstrumentation::writeHyperparameters(rapidjson::Value& parentObject) { - auto* writer{this->writer()}; + auto* writer = this->writer(); if (writer != nullptr) { @@ -300,7 +300,7 @@ void CDataFrameTrainBoostedTreeInstrumentation::writeHyperparameters(rapidjson:: } } void CDataFrameTrainBoostedTreeInstrumentation::writeValidationLoss(rapidjson::Value& parentObject) { - auto* writer{this->writer()}; + auto* writer = this->writer(); if (writer != nullptr) { writer->addMember(VALIDATION_LOSS_TYPE_TAG, m_LossType, parentObject); rapidjson::Value lossValuesObject{writer->makeObject()}; @@ -316,7 +316,7 @@ void CDataFrameTrainBoostedTreeInstrumentation::writeValidationLoss(rapidjson::V } } void CDataFrameTrainBoostedTreeInstrumentation::writeTimingStats(rapidjson::Value& parentObject) { - auto* writer{this->writer()}; + auto* writer = this->writer(); if (writer != nullptr) { writer->addMember(TIMING_ELAPSED_TIME_TAG, rapidjson::Value(m_ElapsedTime).Move(), parentObject); From 4d6cf906d52d58b59016eca9254604e42412b047 Mon Sep 17 00:00:00 2001 From: Valeriy Khakhutskyy <1292899+valeriy42@users.noreply.github.com> Date: Wed, 11 Mar 2020 12:28:53 +0100 Subject: [PATCH 21/40] fix unit test after merge --- .../api/CDataFrameAnalysisInstrumentation.h | 9 ++- .../CDataFrameAnalysisInstrumentationTest.cc | 67 +++++++------------ 2 files changed, 30 insertions(+), 46 deletions(-) diff --git a/include/api/CDataFrameAnalysisInstrumentation.h b/include/api/CDataFrameAnalysisInstrumentation.h index 4e3f111a2b..8d57359c9c 100644 --- a/include/api/CDataFrameAnalysisInstrumentation.h +++ b/include/api/CDataFrameAnalysisInstrumentation.h @@ -33,7 +33,8 @@ namespace api { class API_EXPORT CDataFrameAnalysisInstrumentation : virtual public maths::CDataFrameAnalysisInstrumentationInterface { public: - using TRapidJsonWriter = core::CRapidJsonConcurrentLineWriter; + using TWriter = core::CRapidJsonConcurrentLineWriter; + using TWriterUPtr = std::unique_ptr; //! \brief Set the output stream for the lifetime of this object. class API_EXPORT CScopeSetOutputStream { @@ -90,13 +91,11 @@ class API_EXPORT CDataFrameAnalysisInstrumentation //! \return The id of the data frame analytics job. const std::string& jobId() const; -protected: - using TWriter = core::CRapidJsonConcurrentLineWriter; - using TWriterUPtr = std::unique_ptr; + // TODO move to protected + TWriter* writer(); protected: virtual counter_t::ECounterTypes memoryCounterType() = 0; - TWriter* writer(); private: void writeMemory(std::int64_t timestamp); diff --git a/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc b/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc index 668a8eefa0..f07645695f 100644 --- a/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc +++ b/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc @@ -32,56 +32,37 @@ BOOST_AUTO_TEST_CASE(testMemoryState) { std::string jobId{"testJob"}; std::int64_t memoryUsage{1000}; std::int64_t timeBefore{core::CTimeUtils::toEpochMs(core::CTimeUtils::now())}; - std::stringstream outpustStream; - { - core::CJsonOutputStreamWrapper streamWrapper(outpustStream); - core::CRapidJsonConcurrentLineWriter writer(streamWrapper); - api::CDataFrameTrainBoostedTreeInstrumentation instrumentation(jobId); - instrumentation.updateMemoryUsage(memoryUsage); - instrumentation.writer(&writer); - instrumentation.nextStep(0); - outpustStream.flush(); - } - std::int64_t timeAfter{core::CTimeUtils::toEpochMs(core::CTimeUtils::now())}; - - rapidjson::Document results; - rapidjson::ParseResult ok(results.Parse(outpustStream.str())); - BOOST_TEST_REQUIRE(static_cast(ok) == true); - BOOST_TEST_REQUIRE(results.IsArray() == true); - - const auto& result{results[0]}; - BOOST_TEST_REQUIRE(result["job_id"].GetString() == jobId); - BOOST_TEST_REQUIRE(result["type"].GetString() == "analytics_memory_usage"); - BOOST_TEST_REQUIRE(result["peak_usage_bytes"].GetInt64() == memoryUsage); - BOOST_TEST_REQUIRE(result["timestamp"].GetInt64() >= timeBefore); - BOOST_TEST_REQUIRE(result["timestamp"].GetInt64() <= timeAfter); -} - -BOOST_AUTO_TEST_CASE(testAnalysisTrainState) { - std::string jobId{"testJob"}; - std::int64_t timeBefore{core::CTimeUtils::toEpochMs(core::CTimeUtils::now())}; std::stringstream outputStream; { core::CJsonOutputStreamWrapper streamWrapper(outputStream); - core::CRapidJsonConcurrentLineWriter writer(streamWrapper); api::CDataFrameTrainBoostedTreeInstrumentation instrumentation(jobId); - instrumentation.writer(&writer); - instrumentation.nextStep(0); + api::CDataFrameTrainBoostedTreeInstrumentation::CScopeSetOutputStream setStream{ + instrumentation, streamWrapper}; + instrumentation.updateMemoryUsage(memoryUsage); + instrumentation.nextStep(); outputStream.flush(); } std::int64_t timeAfter{core::CTimeUtils::toEpochMs(core::CTimeUtils::now())}; - LOG_DEBUG(<< outputStream.str()); rapidjson::Document results; rapidjson::ParseResult ok(results.Parse(outputStream.str())); BOOST_TEST_REQUIRE(static_cast(ok) == true); BOOST_TEST_REQUIRE(results.IsArray() == true); - const auto& result{results[0]}; - BOOST_TEST_REQUIRE(result["job_id"].GetString() == jobId); - BOOST_TEST_REQUIRE(result["type"].GetString() == "analytics_memory_usage"); - BOOST_TEST_REQUIRE(result["timestamp"].GetInt64() >= timeBefore); - BOOST_TEST_REQUIRE(result["timestamp"].GetInt64() <= timeAfter); + bool hasMemoryUsage{false}; + for (const auto& result : results.GetArray()) { + if (result.HasMember("analytics_memory_usage")) { + BOOST_TEST_REQUIRE(result["analytics_memory_usage"].IsObject() == true); + BOOST_TEST_REQUIRE(result["analytics_memory_usage"]["job_id"].GetString() == jobId); + BOOST_TEST_REQUIRE( + result["analytics_memory_usage"]["peak_usage_bytes"].GetInt64() == memoryUsage); + BOOST_TEST_REQUIRE(result["analytics_memory_usage"]["timestamp"].GetInt64() >= + timeBefore); + BOOST_TEST_REQUIRE(result["analytics_memory_usage"]["timestamp"].GetInt64() <= timeAfter); + hasMemoryUsage = true; + } + } + BOOST_TEST_REQUIRE(hasMemoryUsage); } BOOST_AUTO_TEST_CASE(testTrainingRegression) { @@ -94,8 +75,9 @@ BOOST_AUTO_TEST_CASE(testTrainingRegression) { TStrVec fieldNames{"f1", "f2", "f3", "f4", "target", ".", "."}; TStrVec fieldValues{"", "", "", "", "", "0", ""}; + test::CDataFrameAnalysisSpecificationFactory specFactory; api::CDataFrameAnalyzer analyzer{ - test::CDataFrameAnalysisSpecificationFactory::predictionSpec( + specFactory.predictionSpec( test::CDataFrameAnalysisSpecificationFactory::regression(), "target"), outputWriterFactory}; test::CDataFrameAnalyzerTrainingFactory::addPredictionTestData( @@ -146,10 +128,13 @@ BOOST_AUTO_TEST_CASE(testTrainingClassification) { TStrVec fieldNames{"f1", "f2", "f3", "f4", "target", ".", "."}; TStrVec fieldValues{"", "", "", "", "", "0", ""}; + test::CDataFrameAnalysisSpecificationFactory specFactory; api::CDataFrameAnalyzer analyzer{ - test::CDataFrameAnalysisSpecificationFactory::predictionSpec( - test::CDataFrameAnalysisSpecificationFactory::classification(), - "target", 100, 5, 6000000, 0, 0, {"target"}), + specFactory.rows(100) + .memoryLimit(6000000) + .columns(5) + .predictionCategoricalFieldNames({"target"}) + .predictionSpec(test::CDataFrameAnalysisSpecificationFactory::classification(), "target"), outputWriterFactory}; test::CDataFrameAnalyzerTrainingFactory::addPredictionTestData( test::CDataFrameAnalyzerTrainingFactory::E_BinaryClassification, From 01e933e23f4988777cc4e385686cf0dc506b4f19 Mon Sep 17 00:00:00 2001 From: Valeriy Khakhutskyy <1292899+valeriy42@users.noreply.github.com> Date: Thu, 12 Mar 2020 10:27:25 +0100 Subject: [PATCH 22/40] Fix unit test build errors --- .../CDataFrameAnalysisInstrumentationInterface.h | 2 +- lib/maths/unittest/CBoostedTreeTest.cc | 11 +++++++++-- lib/maths/unittest/COutliersTest.cc | 4 ++-- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/include/maths/CDataFrameAnalysisInstrumentationInterface.h b/include/maths/CDataFrameAnalysisInstrumentationInterface.h index 951356406c..e64d14f8fe 100644 --- a/include/maths/CDataFrameAnalysisInstrumentationInterface.h +++ b/include/maths/CDataFrameAnalysisInstrumentationInterface.h @@ -115,7 +115,7 @@ class MATHS_EXPORT CDataFrameTrainBoostedTreeInstrumentationInterface virtual void lossType(const std::string& lossType) = 0; //! List of \p lossValues of validation error for the given \p fold. virtual void lossValues(std::string fold, TDoubleVec&& lossValues) = 0; - //! \return Strucutre contains hyperparameters. + //! \return Structure contains hyperparameters. virtual SHyperparameters& hyperparameters() = 0; }; diff --git a/lib/maths/unittest/CBoostedTreeTest.cc b/lib/maths/unittest/CBoostedTreeTest.cc index 2b10bf0cb7..3a8cb7fc84 100644 --- a/lib/maths/unittest/CBoostedTreeTest.cc +++ b/lib/maths/unittest/CBoostedTreeTest.cc @@ -47,7 +47,7 @@ using TMemoryMappedFloatVector = maths::boosted_tree::CLoss::TMemoryMappedFloatV namespace { -class CTestInstrumentation : public maths::CDataFrameAnalysisInstrumentationInterface { +class CTestInstrumentation : public maths::CDataFrameTrainBoostedTreeInstrumentationInterface { public: using TIntVec = std::vector; @@ -84,13 +84,20 @@ class CTestInstrumentation : public maths::CDataFrameAnalysisInstrumentationInte << ", high water mark = " << m_MaxMemoryUsage.load()); } - void nextStep(std::uint32_t) override {} + void nextStep(const std::string& /* phase */) override {} + void type(EStatsType /* type */) override {} + void iteration(std::size_t /* iteration */) override {} + void iterationTime(std::uint64_t /* delta */) override {} + void lossType(const std::string& /* lossType */) override {} + void lossValues(std::string /* fold */, TDoubleVec&& /* lossValues */) override {} + SHyperparameters& hyperparameters() override { return m_Hyperparamters; } private: std::atomic_int m_TotalFractionalProgress; TIntVec m_TenPercentProgressPoints; std::atomic m_MemoryUsage; std::atomic m_MaxMemoryUsage; + SHyperparameters m_Hyperparamters; }; template diff --git a/lib/maths/unittest/COutliersTest.cc b/lib/maths/unittest/COutliersTest.cc index 6e793f0680..c9ec0deace 100644 --- a/lib/maths/unittest/COutliersTest.cc +++ b/lib/maths/unittest/COutliersTest.cc @@ -42,7 +42,7 @@ using TPoint = maths::CDenseVector; using TPointVec = std::vector; using TFactoryFunc = std::function(const TPointVec&)>; -class CTestInstrumentation final : public maths::CDataFrameAnalysisInstrumentationInterface { +class CTestInstrumentation final : public maths::CDataFrameOutliersInstrumentationInterface { public: using TProgressCallbackOpt = boost::optional; using TMemoryUsageCallbackOpt = boost::optional; @@ -68,7 +68,7 @@ class CTestInstrumentation final : public maths::CDataFrameAnalysisInstrumentati m_MemoryUsageCallback = memoryUsageCallback; } - void nextStep(std::uint32_t /*uint32*/) override {} + void nextStep(const std::string& /*uint32*/) override {} private: TProgressCallbackOpt m_ProgressCallback; From 635e487110fd15744502180d324a11834ab4e826 Mon Sep 17 00:00:00 2001 From: Valeriy Khakhutskyy <1292899+valeriy42@users.noreply.github.com> Date: Thu, 12 Mar 2020 10:49:24 +0100 Subject: [PATCH 23/40] add todos --- lib/maths/CBoostedTreeImpl.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/maths/CBoostedTreeImpl.cc b/lib/maths/CBoostedTreeImpl.cc index 43ff5b82a9..2ba68c9453 100644 --- a/lib/maths/CBoostedTreeImpl.cc +++ b/lib/maths/CBoostedTreeImpl.cc @@ -43,6 +43,7 @@ namespace { const double MINIMUM_SPLIT_REFRESH_INTERVAL{3.0}; const std::string HYPERPARAMETER_OPTIMIZATION_PHASE{"hyperparameter_optimization"}; const std::string TRAINING_FINAL_TREE_PHASE{"training_final_tree"}; +// TODO add isRegression() to the loss functions hierarchy instead of this constant const std::array REGRESSION_LOSSES{CMse::NAME}; //! \brief Record the memory used by a supplied object using the RAII idiom. From 7a42271da71b9b3211de70f64a24fe6be1af3e7e Mon Sep 17 00:00:00 2001 From: Valeriy Khakhutskyy <1292899+valeriy42@users.noreply.github.com> Date: Thu, 12 Mar 2020 11:47:39 +0100 Subject: [PATCH 24/40] cleaning up --- include/api/CDataFrameAnalysisInstrumentation.h | 13 ++++++------- .../CDataFrameAnalysisInstrumentationInterface.h | 4 ++-- lib/api/CDataFrameAnalysisInstrumentation.cc | 7 +++++-- lib/maths/unittest/CBoostedTreeTest.cc | 11 +---------- lib/maths/unittest/COutliersTest.cc | 2 +- 5 files changed, 15 insertions(+), 22 deletions(-) diff --git a/include/api/CDataFrameAnalysisInstrumentation.h b/include/api/CDataFrameAnalysisInstrumentation.h index 8d57359c9c..e489c00f51 100644 --- a/include/api/CDataFrameAnalysisInstrumentation.h +++ b/include/api/CDataFrameAnalysisInstrumentation.h @@ -33,9 +33,6 @@ namespace api { class API_EXPORT CDataFrameAnalysisInstrumentation : virtual public maths::CDataFrameAnalysisInstrumentationInterface { public: - using TWriter = core::CRapidJsonConcurrentLineWriter; - using TWriterUPtr = std::unique_ptr; - //! \brief Set the output stream for the lifetime of this object. class API_EXPORT CScopeSetOutputStream { public: @@ -51,13 +48,13 @@ class API_EXPORT CDataFrameAnalysisInstrumentation }; public: - //! Constructs an intrumentation object an analytics job with a given \p jobId. + //! Constructs an instrumentation object an analytics job with a given \p jobId. explicit CDataFrameAnalysisInstrumentation(const std::string& jobId); //! Adds \p delta to the memory usage statistics. void updateMemoryUsage(std::int64_t delta) override; - //! This adds \p fractionalProgess to the current progress. + //! This adds \p fractionalProgress to the current progress. //! //! \note The caller should try to ensure that the sum of the values added //! at the end of the analysis is equal to one. @@ -91,11 +88,13 @@ class API_EXPORT CDataFrameAnalysisInstrumentation //! \return The id of the data frame analytics job. const std::string& jobId() const; - // TODO move to protected - TWriter* writer(); +protected: + using TWriter = core::CRapidJsonConcurrentLineWriter; + using TWriterUPtr = std::unique_ptr; protected: virtual counter_t::ECounterTypes memoryCounterType() = 0; + TWriter* writer(); private: void writeMemory(std::int64_t timestamp); diff --git a/include/maths/CDataFrameAnalysisInstrumentationInterface.h b/include/maths/CDataFrameAnalysisInstrumentationInterface.h index e64d14f8fe..acc176d4eb 100644 --- a/include/maths/CDataFrameAnalysisInstrumentationInterface.h +++ b/include/maths/CDataFrameAnalysisInstrumentationInterface.h @@ -120,7 +120,7 @@ class MATHS_EXPORT CDataFrameTrainBoostedTreeInstrumentationInterface }; //! \brief Dummies out all instrumentation for outlier detection. -class MATHS_EXPORT CDataFrameOutliersInstrumentationStub final +class MATHS_EXPORT CDataFrameOutliersInstrumentationStub : public CDataFrameOutliersInstrumentationInterface { public: void updateMemoryUsage(std::int64_t) override {} @@ -129,7 +129,7 @@ class MATHS_EXPORT CDataFrameOutliersInstrumentationStub final }; //! \brief Dummies out all instrumentation for supervised learning. -class MATHS_EXPORT CDataFrameTrainBoostedTreeInstrumentationStub final +class MATHS_EXPORT CDataFrameTrainBoostedTreeInstrumentationStub : public CDataFrameTrainBoostedTreeInstrumentationInterface { public: void updateMemoryUsage(std::int64_t) override {} diff --git a/lib/api/CDataFrameAnalysisInstrumentation.cc b/lib/api/CDataFrameAnalysisInstrumentation.cc index 6eeb998616..11d537e241 100644 --- a/lib/api/CDataFrameAnalysisInstrumentation.cc +++ b/lib/api/CDataFrameAnalysisInstrumentation.cc @@ -194,10 +194,13 @@ void CDataFrameTrainBoostedTreeInstrumentation::writeAnalysisStats(std::int64_t writer->String(this->jobId()); writer->Key(TIMESTAMP_TAG); writer->Int64(timestamp); - if (m_Type == E_Regression) { + switch (m_Type) { + case E_Regression: writer->Key(REGRESSION_STATS_TAG); - } else { + break; + case E_Classification: writer->Key(CLASSIFICATION_STATS_TAG); + break; } writer->StartObject(); writer->Key(ITERATION_TAG); diff --git a/lib/maths/unittest/CBoostedTreeTest.cc b/lib/maths/unittest/CBoostedTreeTest.cc index 3a8cb7fc84..0a7f864c20 100644 --- a/lib/maths/unittest/CBoostedTreeTest.cc +++ b/lib/maths/unittest/CBoostedTreeTest.cc @@ -47,7 +47,7 @@ using TMemoryMappedFloatVector = maths::boosted_tree::CLoss::TMemoryMappedFloatV namespace { -class CTestInstrumentation : public maths::CDataFrameTrainBoostedTreeInstrumentationInterface { +class CTestInstrumentation : public maths::CDataFrameTrainBoostedTreeInstrumentationStub { public: using TIntVec = std::vector; @@ -84,20 +84,11 @@ class CTestInstrumentation : public maths::CDataFrameTrainBoostedTreeInstrumenta << ", high water mark = " << m_MaxMemoryUsage.load()); } - void nextStep(const std::string& /* phase */) override {} - void type(EStatsType /* type */) override {} - void iteration(std::size_t /* iteration */) override {} - void iterationTime(std::uint64_t /* delta */) override {} - void lossType(const std::string& /* lossType */) override {} - void lossValues(std::string /* fold */, TDoubleVec&& /* lossValues */) override {} - SHyperparameters& hyperparameters() override { return m_Hyperparamters; } - private: std::atomic_int m_TotalFractionalProgress; TIntVec m_TenPercentProgressPoints; std::atomic m_MemoryUsage; std::atomic m_MaxMemoryUsage; - SHyperparameters m_Hyperparamters; }; template diff --git a/lib/maths/unittest/COutliersTest.cc b/lib/maths/unittest/COutliersTest.cc index c9ec0deace..776af37c07 100644 --- a/lib/maths/unittest/COutliersTest.cc +++ b/lib/maths/unittest/COutliersTest.cc @@ -42,7 +42,7 @@ using TPoint = maths::CDenseVector; using TPointVec = std::vector; using TFactoryFunc = std::function(const TPointVec&)>; -class CTestInstrumentation final : public maths::CDataFrameOutliersInstrumentationInterface { +class CTestInstrumentation final : public maths::CDataFrameOutliersInstrumentationStub { public: using TProgressCallbackOpt = boost::optional; using TMemoryUsageCallbackOpt = boost::optional; From cc55af5a6fd54044e3c26c25db47cb441479ffc4 Mon Sep 17 00:00:00 2001 From: Valeriy Khakhutskyy <1292899+valeriy42@users.noreply.github.com> Date: Thu, 12 Mar 2020 13:57:35 +0100 Subject: [PATCH 25/40] add debug output --- lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc b/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc index f07645695f..f46cc70c8f 100644 --- a/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc +++ b/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc @@ -43,6 +43,7 @@ BOOST_AUTO_TEST_CASE(testMemoryState) { outputStream.flush(); } std::int64_t timeAfter{core::CTimeUtils::toEpochMs(core::CTimeUtils::now())}; + LOG_DEBUG(<< outputStream.str()); rapidjson::Document results; rapidjson::ParseResult ok(results.Parse(outputStream.str())); @@ -145,7 +146,6 @@ BOOST_AUTO_TEST_CASE(testTrainingClassification) { rapidjson::Document results; rapidjson::ParseResult ok(results.Parse(output.str())); BOOST_TEST_REQUIRE(static_cast(ok) == true); - LOG_DEBUG(<< output.str()); std::ifstream schemaFileStream("testfiles/instrumentation/supervised_learning_stats.schema.json"); BOOST_REQUIRE_MESSAGE(schemaFileStream.is_open(), "Cannot open test file!"); From 3f691b49d80d30d1b6687968f478703daa9dbc29 Mon Sep 17 00:00:00 2001 From: Valeriy Khakhutskyy <1292899+valeriy42@users.noreply.github.com> Date: Fri, 13 Mar 2020 12:23:33 +0100 Subject: [PATCH 26/40] fix unit test errors --- .../api/CDataFrameAnalysisInstrumentation.h | 4 +- lib/api/CDataFrameAnalysisInstrumentation.cc | 6 +- .../CDataFrameAnalysisInstrumentationTest.cc | 62 ++++++++++++++----- lib/maths/CBoostedTreeImpl.cc | 2 +- 4 files changed, 54 insertions(+), 20 deletions(-) diff --git a/include/api/CDataFrameAnalysisInstrumentation.h b/include/api/CDataFrameAnalysisInstrumentation.h index e489c00f51..82977c87d7 100644 --- a/include/api/CDataFrameAnalysisInstrumentation.h +++ b/include/api/CDataFrameAnalysisInstrumentation.h @@ -153,7 +153,7 @@ class API_EXPORT CDataFrameTrainBoostedTreeInstrumentation final counter_t::ECounterTypes memoryCounterType() override; private: - using TLossMap = std::unordered_map; + using TLossVec = std::vector>; private: void writeAnalysisStats(std::int64_t timestamp) override; @@ -168,7 +168,7 @@ class API_EXPORT CDataFrameTrainBoostedTreeInstrumentation final std::uint64_t m_IterationTime; std::uint64_t m_ElapsedTime = 0; std::string m_LossType; - TLossMap m_LossValues; + TLossVec m_LossValues; SHyperparameters m_Hyperparameters; }; } diff --git a/lib/api/CDataFrameAnalysisInstrumentation.cc b/lib/api/CDataFrameAnalysisInstrumentation.cc index 11d537e241..02ee30847a 100644 --- a/lib/api/CDataFrameAnalysisInstrumentation.cc +++ b/lib/api/CDataFrameAnalysisInstrumentation.cc @@ -183,7 +183,7 @@ void CDataFrameTrainBoostedTreeInstrumentation::lossType(const std::string& loss void CDataFrameTrainBoostedTreeInstrumentation::lossValues(std::string fold, TDoubleVec&& lossValues) { - m_LossValues.emplace(std::make_pair(fold, lossValues)); + m_LossValues.emplace_back(std::move(fold), std::move(lossValues)); } void CDataFrameTrainBoostedTreeInstrumentation::writeAnalysisStats(std::int64_t timestamp) { @@ -201,6 +201,10 @@ void CDataFrameTrainBoostedTreeInstrumentation::writeAnalysisStats(std::int64_t case E_Classification: writer->Key(CLASSIFICATION_STATS_TAG); break; + default: + LOG_ERROR(<< "Supervised learning type unknown or not set."); + writer->EndObject(); + return; } writer->StartObject(); writer->Key(ITERATION_TAG); diff --git a/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc b/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc index f46cc70c8f..517cdae8d5 100644 --- a/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc +++ b/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc @@ -43,7 +43,6 @@ BOOST_AUTO_TEST_CASE(testMemoryState) { outputStream.flush(); } std::int64_t timeAfter{core::CTimeUtils::toEpochMs(core::CTimeUtils::now())}; - LOG_DEBUG(<< outputStream.str()); rapidjson::Document results; rapidjson::ParseResult ok(results.Parse(outputStream.str())); @@ -91,32 +90,63 @@ BOOST_AUTO_TEST_CASE(testTrainingRegression) { rapidjson::ParseResult ok(results.Parse(output.str())); BOOST_TEST_REQUIRE(static_cast(ok) == true); - std::ifstream schemaFileStream("testfiles/instrumentation/supervised_learning_stats.schema.json"); - BOOST_REQUIRE_MESSAGE(schemaFileStream.is_open(), "Cannot open test file!"); - std::string schemaJson((std::istreambuf_iterator(schemaFileStream)), - std::istreambuf_iterator()); - rapidjson::Document schemaDocument; - BOOST_REQUIRE_MESSAGE(schemaDocument.Parse(schemaJson).HasParseError() == false, - "Cannot parse JSON schema!"); - rapidjson::SchemaDocument schema(schemaDocument); - rapidjson::SchemaValidator validator(schema); + std::ifstream regressionSchemaFileStream( + "testfiles/instrumentation/supervised_learning_stats.schema.json"); + BOOST_REQUIRE_MESSAGE(regressionSchemaFileStream.is_open(), "Cannot open test file!"); + std::string regressionSchemaJson((std::istreambuf_iterator(regressionSchemaFileStream)), + std::istreambuf_iterator()); + rapidjson::Document regressionSchemaDocument; + BOOST_REQUIRE_MESSAGE( + regressionSchemaDocument.Parse(regressionSchemaJson).HasParseError() == false, + "Cannot parse JSON schema!"); + rapidjson::SchemaDocument regressionSchema(regressionSchemaDocument); + rapidjson::SchemaValidator regressionValidator(regressionSchema); for (const auto& result : results.GetArray()) { if (result.HasMember("analysis_stats")) { - BOOST_TEST_REQUIRE(result["analysis_stats"].HasMember("classification_stats")); - if (result["analysis_stats"]["classification_stats"].Accept(validator) == false) { + BOOST_TEST_REQUIRE(result["analysis_stats"].HasMember("regression_stats")); + if (result["analysis_stats"]["regression_stats"].Accept(regressionValidator) == false) { rapidjson::StringBuffer sb; - validator.GetInvalidSchemaPointer().StringifyUriFragment(sb); + regressionValidator.GetInvalidSchemaPointer().StringifyUriFragment(sb); LOG_ERROR(<< "Invalid schema: " << sb.GetString()); - LOG_ERROR(<< "Invalid keyword: " << validator.GetInvalidSchemaKeyword()); + LOG_ERROR(<< "Invalid keyword: " + << regressionValidator.GetInvalidSchemaKeyword()); sb.Clear(); - validator.GetInvalidDocumentPointer().StringifyUriFragment(sb); + regressionValidator.GetInvalidDocumentPointer().StringifyUriFragment(sb); + LOG_ERROR(<< "Invalid document: " << sb.GetString()); + BOOST_FAIL("Schema validation failed"); + } + } + } + + std::ifstream memorySchemaFileStream( + "testfiles/instrumentation/memory_usage.schema.json"); + BOOST_REQUIRE_MESSAGE(memorySchemaFileStream.is_open(), "Cannot open test file!"); + std::string memorySchemaJson((std::istreambuf_iterator(memorySchemaFileStream)), + std::istreambuf_iterator()); + rapidjson::Document memorySchemaDocument; + BOOST_REQUIRE_MESSAGE( + memorySchemaDocument.Parse(memorySchemaJson).HasParseError() == false, + "Cannot parse JSON schema!"); + rapidjson::SchemaDocument memorySchema(memorySchemaDocument); + rapidjson::SchemaValidator memoryValidator(memorySchema); + + for (const auto& result : results.GetArray()) { + if (result.HasMember("analytics_memory_usage")) { + BOOST_TEST_REQUIRE(result["analytics_memory_usage"].IsObject() == true); + if (result["analytics_memory_usage"].Accept(memoryValidator) == false) { + rapidjson::StringBuffer sb; + memoryValidator.GetInvalidSchemaPointer().StringifyUriFragment(sb); + LOG_ERROR(<< "Invalid schema: " << sb.GetString()); + LOG_ERROR(<< "Invalid keyword: " + << memoryValidator.GetInvalidSchemaKeyword()); + sb.Clear(); + memoryValidator.GetInvalidDocumentPointer().StringifyUriFragment(sb); LOG_ERROR(<< "Invalid document: " << sb.GetString()); BOOST_FAIL("Schema validation failed"); } } } - // TODO add memory format test } BOOST_AUTO_TEST_CASE(testTrainingClassification) { diff --git a/lib/maths/CBoostedTreeImpl.cc b/lib/maths/CBoostedTreeImpl.cc index 2ba68c9453..fddc1b6374 100644 --- a/lib/maths/CBoostedTreeImpl.cc +++ b/lib/maths/CBoostedTreeImpl.cc @@ -44,7 +44,7 @@ const double MINIMUM_SPLIT_REFRESH_INTERVAL{3.0}; const std::string HYPERPARAMETER_OPTIMIZATION_PHASE{"hyperparameter_optimization"}; const std::string TRAINING_FINAL_TREE_PHASE{"training_final_tree"}; // TODO add isRegression() to the loss functions hierarchy instead of this constant -const std::array REGRESSION_LOSSES{CMse::NAME}; +const std::array REGRESSION_LOSSES{"mse"}; //! \brief Record the memory used by a supplied object using the RAII idiom. class CScopeRecordMemoryUsage { From acbe0fd8fc470acb2639395292510506fb269022 Mon Sep 17 00:00:00 2001 From: Valeriy Khakhutskyy <1292899+valeriy42@users.noreply.github.com> Date: Fri, 13 Mar 2020 12:36:41 +0100 Subject: [PATCH 27/40] formatting --- .../unittest/CDataFrameAnalysisInstrumentationTest.cc | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc b/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc index 517cdae8d5..49215a0ea2 100644 --- a/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc +++ b/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc @@ -119,15 +119,13 @@ BOOST_AUTO_TEST_CASE(testTrainingRegression) { } } - std::ifstream memorySchemaFileStream( - "testfiles/instrumentation/memory_usage.schema.json"); + std::ifstream memorySchemaFileStream("testfiles/instrumentation/memory_usage.schema.json"); BOOST_REQUIRE_MESSAGE(memorySchemaFileStream.is_open(), "Cannot open test file!"); std::string memorySchemaJson((std::istreambuf_iterator(memorySchemaFileStream)), - std::istreambuf_iterator()); + std::istreambuf_iterator()); rapidjson::Document memorySchemaDocument; - BOOST_REQUIRE_MESSAGE( - memorySchemaDocument.Parse(memorySchemaJson).HasParseError() == false, - "Cannot parse JSON schema!"); + BOOST_REQUIRE_MESSAGE(memorySchemaDocument.Parse(memorySchemaJson).HasParseError() == false, + "Cannot parse JSON schema!"); rapidjson::SchemaDocument memorySchema(memorySchemaDocument); rapidjson::SchemaValidator memoryValidator(memorySchema); From 400695f8429842eb6e7ef4200c9d62290fe5d1e2 Mon Sep 17 00:00:00 2001 From: Valeriy Khakhutskyy <1292899+valeriy42@users.noreply.github.com> Date: Fri, 13 Mar 2020 14:14:26 +0100 Subject: [PATCH 28/40] fix after merge --- include/test/CDataFrameAnalyzerTrainingFactory.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/test/CDataFrameAnalyzerTrainingFactory.h b/include/test/CDataFrameAnalyzerTrainingFactory.h index 5d7708f1dd..2419c68015 100644 --- a/include/test/CDataFrameAnalyzerTrainingFactory.h +++ b/include/test/CDataFrameAnalyzerTrainingFactory.h @@ -67,7 +67,7 @@ class TEST_EXPORT CDataFrameAnalyzerTrainingFactory { if (type == E_Regression) { loss = std::make_unique(); } else { - loss = std::make_unique(); + loss = std::make_unique(); } maths::CBoostedTreeFactory treeFactory{ From 614962c62c914d787510aa440361eab1a9fba728 Mon Sep 17 00:00:00 2001 From: Valeriy Khakhutskyy <1292899+valeriy42@users.noreply.github.com> Date: Tue, 17 Mar 2020 16:44:03 +0100 Subject: [PATCH 29/40] fix errors after merge --- .../test/CDataFrameAnalyzerTrainingFactory.h | 48 +++++++++++++------ lib/test/CDataFrameAnalyzerTrainingFactory.cc | 13 ++--- 2 files changed, 38 insertions(+), 23 deletions(-) diff --git a/include/test/CDataFrameAnalyzerTrainingFactory.h b/include/test/CDataFrameAnalyzerTrainingFactory.h index 2419c68015..1f79bfd72a 100644 --- a/include/test/CDataFrameAnalyzerTrainingFactory.h +++ b/include/test/CDataFrameAnalyzerTrainingFactory.h @@ -27,7 +27,11 @@ namespace test { //! \brief Collection of helping methods to create regression and classification data for tests. class TEST_EXPORT CDataFrameAnalyzerTrainingFactory { public: - enum EPredictionType { E_Regression, E_BinaryClassification }; + enum EPredictionType { + E_Regression, + E_BinaryClassification, + E_MulticlassClassification + }; using TStrVec = std::vector; using TDoubleVec = std::vector; using TDataFrameUPtr = std::unique_ptr; @@ -57,11 +61,19 @@ class TEST_EXPORT CDataFrameAnalyzerTrainingFactory { rng.generateUniformSamples(-10.0, 10.0, weights.size() * numberExamples, regressors); TStrVec targets; - auto frame = type == E_Regression - ? setupLinearRegressionData(fieldNames, fieldValues, analyzer, - weights, regressors, targets) - : setupBinaryClassificationData(fieldNames, fieldValues, analyzer, - weights, regressors, targets); + auto frame = [&] { + switch (type) { + case E_Regression: + return setupLinearRegressionData(fieldNames, fieldValues, analyzer, + weights, regressors, targets); + case E_BinaryClassification: + return setupBinaryClassificationData(fieldNames, fieldValues, analyzer, + weights, regressors, targets); + case E_MulticlassClassification: + // TODO + return TDataFrameUPtr{}; + } + }(); std::unique_ptr loss; if (type == E_Regression) { @@ -107,10 +119,18 @@ class TEST_EXPORT CDataFrameAnalyzerTrainingFactory { frame->readRows(1, [&](TRowItr beginRows, TRowItr endRows) { for (auto row = beginRows; row != endRows; ++row) { - double prediction{(*row)[tree->columnHoldingPrediction()]}; - appendPrediction(*frame, weights.size(), prediction, - tree->probabilityAtWhichToAssignClassOne(), - expectedPredictions); + auto prediction = tree->readAndAdjustPrediction(*row); + switch (type) { + case E_Regression: + appendPrediction(*frame, weights.size(), prediction[0], expectedPredictions); + break; + case E_BinaryClassification: + appendPrediction(*frame, weights.size(), prediction[1], expectedPredictions); + break; + case E_MulticlassClassification: + // TODO. + break; + } } }); } @@ -133,13 +153,11 @@ class TEST_EXPORT CDataFrameAnalyzerTrainingFactory { using TRowItr = core::CDataFrame::TRowItr; private: - static void - appendPrediction(core::CDataFrame&, std::size_t, double prediction, double, TDoubleVec& predictions); + static void appendPrediction(core::CDataFrame&, std::size_t, double prediction, TDoubleVec& predictions); static void appendPrediction(core::CDataFrame& frame, - std::size_t columnHoldingPrediction, - double logOddsClass1, - double threshold, + std::size_t target, + double class1Score, TStrVec& predictions); }; } diff --git a/lib/test/CDataFrameAnalyzerTrainingFactory.cc b/lib/test/CDataFrameAnalyzerTrainingFactory.cc index e33945a8e1..fa28917bcd 100644 --- a/lib/test/CDataFrameAnalyzerTrainingFactory.cc +++ b/lib/test/CDataFrameAnalyzerTrainingFactory.cc @@ -12,20 +12,17 @@ namespace test { void CDataFrameAnalyzerTrainingFactory::appendPrediction(core::CDataFrame&, std::size_t, double prediction, - double, TDoubleVec& predictions) { predictions.push_back(prediction); } void CDataFrameAnalyzerTrainingFactory::appendPrediction(core::CDataFrame& frame, - std::size_t columnHoldingPrediction, - double logOddsClass1, - double threshold, + std::size_t target, + double class1Score, TStrVec& predictions) { - predictions.push_back( - maths::CTools::logisticFunction(logOddsClass1) < threshold - ? frame.categoricalColumnValues()[columnHoldingPrediction][0] - : frame.categoricalColumnValues()[columnHoldingPrediction][1]); + predictions.push_back(class1Score < 0.5 + ? frame.categoricalColumnValues()[target][0] + : frame.categoricalColumnValues()[target][1]); } CDataFrameAnalyzerTrainingFactory::TDataFrameUPtr From 491ee9e9fe08041ffa1a2557964a2c58dff3fff1 Mon Sep 17 00:00:00 2001 From: Valeriy Khakhutskyy <1292899+valeriy42@users.noreply.github.com> Date: Tue, 17 Mar 2020 21:59:25 +0100 Subject: [PATCH 30/40] disable outputting instrumentation data --- lib/api/CDataFrameAnalysisInstrumentation.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/api/CDataFrameAnalysisInstrumentation.cc b/lib/api/CDataFrameAnalysisInstrumentation.cc index 02ee30847a..e6ad57b719 100644 --- a/lib/api/CDataFrameAnalysisInstrumentation.cc +++ b/lib/api/CDataFrameAnalysisInstrumentation.cc @@ -110,12 +110,12 @@ void CDataFrameAnalysisInstrumentation::nextStep(const std::string& /* phase */) void CDataFrameAnalysisInstrumentation::writeState() { std::int64_t timestamp{core::CTimeUtils::toEpochMs(core::CTimeUtils::now())}; if (m_Writer != nullptr) { - m_Writer->StartObject(); - m_Writer->Key(MEMORY_TYPE_TAG); - this->writeMemory(timestamp); - m_Writer->Key(ANALYSIS_TYPE_TAG); - this->writeAnalysisStats(timestamp); - m_Writer->EndObject(); + // m_Writer->StartObject(); + // m_Writer->Key(MEMORY_TYPE_TAG); + // this->writeMemory(timestamp); + // m_Writer->Key(ANALYSIS_TYPE_TAG); + // this->writeAnalysisStats(timestamp); + // m_Writer->EndObject(); } } From 0a4276a1ee19e8ff8555d6d29ae6414d7ebd111d Mon Sep 17 00:00:00 2001 From: Valeriy Khakhutskyy <1292899+valeriy42@users.noreply.github.com> Date: Tue, 17 Mar 2020 22:03:03 +0100 Subject: [PATCH 31/40] disable unit test for unstrumentation --- .../CDataFrameAnalysisInstrumentationTest.cc | 410 +++++++++--------- 1 file changed, 205 insertions(+), 205 deletions(-) diff --git a/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc b/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc index 49215a0ea2..ba9bb9e9f7 100644 --- a/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc +++ b/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc @@ -1,205 +1,205 @@ -/* - * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one - * or more contributor license agreements. Licensed under the Elastic License; - * you may not use this file except in compliance with the Elastic License. - */ -#include - -#include - -#include -#include -#include - -#include - -#include - -#include -#include -#include - -BOOST_AUTO_TEST_SUITE(CDataFrameAnalysisInstrumentationTest) - -using namespace ml; - -namespace { -using TStrVec = std::vector; -using TDoubleVec = std::vector; -} - -BOOST_AUTO_TEST_CASE(testMemoryState) { - std::string jobId{"testJob"}; - std::int64_t memoryUsage{1000}; - std::int64_t timeBefore{core::CTimeUtils::toEpochMs(core::CTimeUtils::now())}; - std::stringstream outputStream; - { - core::CJsonOutputStreamWrapper streamWrapper(outputStream); - api::CDataFrameTrainBoostedTreeInstrumentation instrumentation(jobId); - api::CDataFrameTrainBoostedTreeInstrumentation::CScopeSetOutputStream setStream{ - instrumentation, streamWrapper}; - instrumentation.updateMemoryUsage(memoryUsage); - instrumentation.nextStep(); - outputStream.flush(); - } - std::int64_t timeAfter{core::CTimeUtils::toEpochMs(core::CTimeUtils::now())}; - - rapidjson::Document results; - rapidjson::ParseResult ok(results.Parse(outputStream.str())); - BOOST_TEST_REQUIRE(static_cast(ok) == true); - BOOST_TEST_REQUIRE(results.IsArray() == true); - - bool hasMemoryUsage{false}; - for (const auto& result : results.GetArray()) { - if (result.HasMember("analytics_memory_usage")) { - BOOST_TEST_REQUIRE(result["analytics_memory_usage"].IsObject() == true); - BOOST_TEST_REQUIRE(result["analytics_memory_usage"]["job_id"].GetString() == jobId); - BOOST_TEST_REQUIRE( - result["analytics_memory_usage"]["peak_usage_bytes"].GetInt64() == memoryUsage); - BOOST_TEST_REQUIRE(result["analytics_memory_usage"]["timestamp"].GetInt64() >= - timeBefore); - BOOST_TEST_REQUIRE(result["analytics_memory_usage"]["timestamp"].GetInt64() <= timeAfter); - hasMemoryUsage = true; - } - } - BOOST_TEST_REQUIRE(hasMemoryUsage); -} - -BOOST_AUTO_TEST_CASE(testTrainingRegression) { - std::stringstream output; - auto outputWriterFactory = [&output]() { - return std::make_unique(output); - }; - - TDoubleVec expectedPredictions; - - TStrVec fieldNames{"f1", "f2", "f3", "f4", "target", ".", "."}; - TStrVec fieldValues{"", "", "", "", "", "0", ""}; - test::CDataFrameAnalysisSpecificationFactory specFactory; - api::CDataFrameAnalyzer analyzer{ - specFactory.predictionSpec( - test::CDataFrameAnalysisSpecificationFactory::regression(), "target"), - outputWriterFactory}; - test::CDataFrameAnalyzerTrainingFactory::addPredictionTestData( - test::CDataFrameAnalyzerTrainingFactory::E_Regression, fieldNames, - fieldValues, analyzer, expectedPredictions); - - analyzer.handleRecord(fieldNames, {"", "", "", "", "", "", "$"}); - - rapidjson::Document results; - rapidjson::ParseResult ok(results.Parse(output.str())); - BOOST_TEST_REQUIRE(static_cast(ok) == true); - - std::ifstream regressionSchemaFileStream( - "testfiles/instrumentation/supervised_learning_stats.schema.json"); - BOOST_REQUIRE_MESSAGE(regressionSchemaFileStream.is_open(), "Cannot open test file!"); - std::string regressionSchemaJson((std::istreambuf_iterator(regressionSchemaFileStream)), - std::istreambuf_iterator()); - rapidjson::Document regressionSchemaDocument; - BOOST_REQUIRE_MESSAGE( - regressionSchemaDocument.Parse(regressionSchemaJson).HasParseError() == false, - "Cannot parse JSON schema!"); - rapidjson::SchemaDocument regressionSchema(regressionSchemaDocument); - rapidjson::SchemaValidator regressionValidator(regressionSchema); - - for (const auto& result : results.GetArray()) { - if (result.HasMember("analysis_stats")) { - BOOST_TEST_REQUIRE(result["analysis_stats"].HasMember("regression_stats")); - if (result["analysis_stats"]["regression_stats"].Accept(regressionValidator) == false) { - rapidjson::StringBuffer sb; - regressionValidator.GetInvalidSchemaPointer().StringifyUriFragment(sb); - LOG_ERROR(<< "Invalid schema: " << sb.GetString()); - LOG_ERROR(<< "Invalid keyword: " - << regressionValidator.GetInvalidSchemaKeyword()); - sb.Clear(); - regressionValidator.GetInvalidDocumentPointer().StringifyUriFragment(sb); - LOG_ERROR(<< "Invalid document: " << sb.GetString()); - BOOST_FAIL("Schema validation failed"); - } - } - } - - std::ifstream memorySchemaFileStream("testfiles/instrumentation/memory_usage.schema.json"); - BOOST_REQUIRE_MESSAGE(memorySchemaFileStream.is_open(), "Cannot open test file!"); - std::string memorySchemaJson((std::istreambuf_iterator(memorySchemaFileStream)), - std::istreambuf_iterator()); - rapidjson::Document memorySchemaDocument; - BOOST_REQUIRE_MESSAGE(memorySchemaDocument.Parse(memorySchemaJson).HasParseError() == false, - "Cannot parse JSON schema!"); - rapidjson::SchemaDocument memorySchema(memorySchemaDocument); - rapidjson::SchemaValidator memoryValidator(memorySchema); - - for (const auto& result : results.GetArray()) { - if (result.HasMember("analytics_memory_usage")) { - BOOST_TEST_REQUIRE(result["analytics_memory_usage"].IsObject() == true); - if (result["analytics_memory_usage"].Accept(memoryValidator) == false) { - rapidjson::StringBuffer sb; - memoryValidator.GetInvalidSchemaPointer().StringifyUriFragment(sb); - LOG_ERROR(<< "Invalid schema: " << sb.GetString()); - LOG_ERROR(<< "Invalid keyword: " - << memoryValidator.GetInvalidSchemaKeyword()); - sb.Clear(); - memoryValidator.GetInvalidDocumentPointer().StringifyUriFragment(sb); - LOG_ERROR(<< "Invalid document: " << sb.GetString()); - BOOST_FAIL("Schema validation failed"); - } - } - } -} - -BOOST_AUTO_TEST_CASE(testTrainingClassification) { - std::stringstream output; - auto outputWriterFactory = [&output]() { - return std::make_unique(output); - }; - - TDoubleVec expectedPredictions; - - TStrVec fieldNames{"f1", "f2", "f3", "f4", "target", ".", "."}; - TStrVec fieldValues{"", "", "", "", "", "0", ""}; - test::CDataFrameAnalysisSpecificationFactory specFactory; - api::CDataFrameAnalyzer analyzer{ - specFactory.rows(100) - .memoryLimit(6000000) - .columns(5) - .predictionCategoricalFieldNames({"target"}) - .predictionSpec(test::CDataFrameAnalysisSpecificationFactory::classification(), "target"), - outputWriterFactory}; - test::CDataFrameAnalyzerTrainingFactory::addPredictionTestData( - test::CDataFrameAnalyzerTrainingFactory::E_BinaryClassification, - fieldNames, fieldValues, analyzer, expectedPredictions); - - analyzer.handleRecord(fieldNames, {"", "", "", "", "", "", "$"}); - - rapidjson::Document results; - rapidjson::ParseResult ok(results.Parse(output.str())); - BOOST_TEST_REQUIRE(static_cast(ok) == true); - - std::ifstream schemaFileStream("testfiles/instrumentation/supervised_learning_stats.schema.json"); - BOOST_REQUIRE_MESSAGE(schemaFileStream.is_open(), "Cannot open test file!"); - std::string schemaJson((std::istreambuf_iterator(schemaFileStream)), - std::istreambuf_iterator()); - rapidjson::Document schemaDocument; - BOOST_REQUIRE_MESSAGE(schemaDocument.Parse(schemaJson).HasParseError() == false, - "Cannot parse JSON schema!"); - rapidjson::SchemaDocument schema(schemaDocument); - rapidjson::SchemaValidator validator(schema); - - for (const auto& result : results.GetArray()) { - if (result.HasMember("analysis_stats")) { - BOOST_TEST_REQUIRE(result["analysis_stats"].HasMember("classification_stats")); - if (result["analysis_stats"]["classification_stats"].Accept(validator) == false) { - rapidjson::StringBuffer sb; - validator.GetInvalidSchemaPointer().StringifyUriFragment(sb); - LOG_ERROR(<< "Invalid schema: " << sb.GetString()); - LOG_ERROR(<< "Invalid keyword: " << validator.GetInvalidSchemaKeyword()); - sb.Clear(); - validator.GetInvalidDocumentPointer().StringifyUriFragment(sb); - LOG_ERROR(<< "Invalid document: " << sb.GetString()); - BOOST_FAIL("Schema validation failed"); - } - } - } -} - -BOOST_AUTO_TEST_SUITE_END() +// /* +// * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +// * or more contributor license agreements. Licensed under the Elastic License; +// * you may not use this file except in compliance with the Elastic License. +// */ +// #include + +// #include + +// #include +// #include +// #include + +// #include + +// #include + +// #include +// #include +// #include + +// BOOST_AUTO_TEST_SUITE(CDataFrameAnalysisInstrumentationTest) + +// using namespace ml; + +// namespace { +// using TStrVec = std::vector; +// using TDoubleVec = std::vector; +// } + +// BOOST_AUTO_TEST_CASE(testMemoryState) { +// std::string jobId{"testJob"}; +// std::int64_t memoryUsage{1000}; +// std::int64_t timeBefore{core::CTimeUtils::toEpochMs(core::CTimeUtils::now())}; +// std::stringstream outputStream; +// { +// core::CJsonOutputStreamWrapper streamWrapper(outputStream); +// api::CDataFrameTrainBoostedTreeInstrumentation instrumentation(jobId); +// api::CDataFrameTrainBoostedTreeInstrumentation::CScopeSetOutputStream setStream{ +// instrumentation, streamWrapper}; +// instrumentation.updateMemoryUsage(memoryUsage); +// instrumentation.nextStep(); +// outputStream.flush(); +// } +// std::int64_t timeAfter{core::CTimeUtils::toEpochMs(core::CTimeUtils::now())}; + +// rapidjson::Document results; +// rapidjson::ParseResult ok(results.Parse(outputStream.str())); +// BOOST_TEST_REQUIRE(static_cast(ok) == true); +// BOOST_TEST_REQUIRE(results.IsArray() == true); + +// bool hasMemoryUsage{false}; +// for (const auto& result : results.GetArray()) { +// if (result.HasMember("analytics_memory_usage")) { +// BOOST_TEST_REQUIRE(result["analytics_memory_usage"].IsObject() == true); +// BOOST_TEST_REQUIRE(result["analytics_memory_usage"]["job_id"].GetString() == jobId); +// BOOST_TEST_REQUIRE( +// result["analytics_memory_usage"]["peak_usage_bytes"].GetInt64() == memoryUsage); +// BOOST_TEST_REQUIRE(result["analytics_memory_usage"]["timestamp"].GetInt64() >= +// timeBefore); +// BOOST_TEST_REQUIRE(result["analytics_memory_usage"]["timestamp"].GetInt64() <= timeAfter); +// hasMemoryUsage = true; +// } +// } +// BOOST_TEST_REQUIRE(hasMemoryUsage); +// } + +// BOOST_AUTO_TEST_CASE(testTrainingRegression) { +// std::stringstream output; +// auto outputWriterFactory = [&output]() { +// return std::make_unique(output); +// }; + +// TDoubleVec expectedPredictions; + +// TStrVec fieldNames{"f1", "f2", "f3", "f4", "target", ".", "."}; +// TStrVec fieldValues{"", "", "", "", "", "0", ""}; +// test::CDataFrameAnalysisSpecificationFactory specFactory; +// api::CDataFrameAnalyzer analyzer{ +// specFactory.predictionSpec( +// test::CDataFrameAnalysisSpecificationFactory::regression(), "target"), +// outputWriterFactory}; +// test::CDataFrameAnalyzerTrainingFactory::addPredictionTestData( +// test::CDataFrameAnalyzerTrainingFactory::E_Regression, fieldNames, +// fieldValues, analyzer, expectedPredictions); + +// analyzer.handleRecord(fieldNames, {"", "", "", "", "", "", "$"}); + +// rapidjson::Document results; +// rapidjson::ParseResult ok(results.Parse(output.str())); +// BOOST_TEST_REQUIRE(static_cast(ok) == true); + +// std::ifstream regressionSchemaFileStream( +// "testfiles/instrumentation/supervised_learning_stats.schema.json"); +// BOOST_REQUIRE_MESSAGE(regressionSchemaFileStream.is_open(), "Cannot open test file!"); +// std::string regressionSchemaJson((std::istreambuf_iterator(regressionSchemaFileStream)), +// std::istreambuf_iterator()); +// rapidjson::Document regressionSchemaDocument; +// BOOST_REQUIRE_MESSAGE( +// regressionSchemaDocument.Parse(regressionSchemaJson).HasParseError() == false, +// "Cannot parse JSON schema!"); +// rapidjson::SchemaDocument regressionSchema(regressionSchemaDocument); +// rapidjson::SchemaValidator regressionValidator(regressionSchema); + +// for (const auto& result : results.GetArray()) { +// if (result.HasMember("analysis_stats")) { +// BOOST_TEST_REQUIRE(result["analysis_stats"].HasMember("regression_stats")); +// if (result["analysis_stats"]["regression_stats"].Accept(regressionValidator) == false) { +// rapidjson::StringBuffer sb; +// regressionValidator.GetInvalidSchemaPointer().StringifyUriFragment(sb); +// LOG_ERROR(<< "Invalid schema: " << sb.GetString()); +// LOG_ERROR(<< "Invalid keyword: " +// << regressionValidator.GetInvalidSchemaKeyword()); +// sb.Clear(); +// regressionValidator.GetInvalidDocumentPointer().StringifyUriFragment(sb); +// LOG_ERROR(<< "Invalid document: " << sb.GetString()); +// BOOST_FAIL("Schema validation failed"); +// } +// } +// } + +// std::ifstream memorySchemaFileStream("testfiles/instrumentation/memory_usage.schema.json"); +// BOOST_REQUIRE_MESSAGE(memorySchemaFileStream.is_open(), "Cannot open test file!"); +// std::string memorySchemaJson((std::istreambuf_iterator(memorySchemaFileStream)), +// std::istreambuf_iterator()); +// rapidjson::Document memorySchemaDocument; +// BOOST_REQUIRE_MESSAGE(memorySchemaDocument.Parse(memorySchemaJson).HasParseError() == false, +// "Cannot parse JSON schema!"); +// rapidjson::SchemaDocument memorySchema(memorySchemaDocument); +// rapidjson::SchemaValidator memoryValidator(memorySchema); + +// for (const auto& result : results.GetArray()) { +// if (result.HasMember("analytics_memory_usage")) { +// BOOST_TEST_REQUIRE(result["analytics_memory_usage"].IsObject() == true); +// if (result["analytics_memory_usage"].Accept(memoryValidator) == false) { +// rapidjson::StringBuffer sb; +// memoryValidator.GetInvalidSchemaPointer().StringifyUriFragment(sb); +// LOG_ERROR(<< "Invalid schema: " << sb.GetString()); +// LOG_ERROR(<< "Invalid keyword: " +// << memoryValidator.GetInvalidSchemaKeyword()); +// sb.Clear(); +// memoryValidator.GetInvalidDocumentPointer().StringifyUriFragment(sb); +// LOG_ERROR(<< "Invalid document: " << sb.GetString()); +// BOOST_FAIL("Schema validation failed"); +// } +// } +// } +// } + +// BOOST_AUTO_TEST_CASE(testTrainingClassification) { +// std::stringstream output; +// auto outputWriterFactory = [&output]() { +// return std::make_unique(output); +// }; + +// TDoubleVec expectedPredictions; + +// TStrVec fieldNames{"f1", "f2", "f3", "f4", "target", ".", "."}; +// TStrVec fieldValues{"", "", "", "", "", "0", ""}; +// test::CDataFrameAnalysisSpecificationFactory specFactory; +// api::CDataFrameAnalyzer analyzer{ +// specFactory.rows(100) +// .memoryLimit(6000000) +// .columns(5) +// .predictionCategoricalFieldNames({"target"}) +// .predictionSpec(test::CDataFrameAnalysisSpecificationFactory::classification(), "target"), +// outputWriterFactory}; +// test::CDataFrameAnalyzerTrainingFactory::addPredictionTestData( +// test::CDataFrameAnalyzerTrainingFactory::E_BinaryClassification, +// fieldNames, fieldValues, analyzer, expectedPredictions); + +// analyzer.handleRecord(fieldNames, {"", "", "", "", "", "", "$"}); + +// rapidjson::Document results; +// rapidjson::ParseResult ok(results.Parse(output.str())); +// BOOST_TEST_REQUIRE(static_cast(ok) == true); + +// std::ifstream schemaFileStream("testfiles/instrumentation/supervised_learning_stats.schema.json"); +// BOOST_REQUIRE_MESSAGE(schemaFileStream.is_open(), "Cannot open test file!"); +// std::string schemaJson((std::istreambuf_iterator(schemaFileStream)), +// std::istreambuf_iterator()); +// rapidjson::Document schemaDocument; +// BOOST_REQUIRE_MESSAGE(schemaDocument.Parse(schemaJson).HasParseError() == false, +// "Cannot parse JSON schema!"); +// rapidjson::SchemaDocument schema(schemaDocument); +// rapidjson::SchemaValidator validator(schema); + +// for (const auto& result : results.GetArray()) { +// if (result.HasMember("analysis_stats")) { +// BOOST_TEST_REQUIRE(result["analysis_stats"].HasMember("classification_stats")); +// if (result["analysis_stats"]["classification_stats"].Accept(validator) == false) { +// rapidjson::StringBuffer sb; +// validator.GetInvalidSchemaPointer().StringifyUriFragment(sb); +// LOG_ERROR(<< "Invalid schema: " << sb.GetString()); +// LOG_ERROR(<< "Invalid keyword: " << validator.GetInvalidSchemaKeyword()); +// sb.Clear(); +// validator.GetInvalidDocumentPointer().StringifyUriFragment(sb); +// LOG_ERROR(<< "Invalid document: " << sb.GetString()); +// BOOST_FAIL("Schema validation failed"); +// } +// } +// } +// } + +// BOOST_AUTO_TEST_SUITE_END() From 129030ed2781cdd93b727c21aa20dc2980fac2bc Mon Sep 17 00:00:00 2001 From: Valeriy Khakhutskyy <1292899+valeriy42@users.noreply.github.com> Date: Wed, 18 Mar 2020 13:03:02 +0100 Subject: [PATCH 32/40] license text --- .../unittest/CDataFrameAnalysisInstrumentationTest.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc b/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc index ba9bb9e9f7..31254cb3c0 100644 --- a/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc +++ b/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc @@ -1,8 +1,8 @@ -// /* -// * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one -// * or more contributor license agreements. Licensed under the Elastic License; -// * you may not use this file except in compliance with the Elastic License. -// */ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ // #include // #include From 1987e02f60b0bbca1b5b93e92c8a8d35b116ca23 Mon Sep 17 00:00:00 2001 From: Valeriy Khakhutskyy <1292899+valeriy42@users.noreply.github.com> Date: Wed, 18 Mar 2020 14:31:00 +0100 Subject: [PATCH 33/40] fix macosx compile errors --- ...ataFrameAnalysisInstrumentationInterface.h | 2 +- lib/api/CDataFrameAnalysisInstrumentation.cc | 43 +++--- .../analysis_stats.schema.json | 6 +- .../classification_stats.schema.json | 124 ++++++++++++++++++ .../outlier_detection_stats.schema.json | 15 ++- ...hema.json => regression_stats.schema.json} | 44 ++++--- 6 files changed, 189 insertions(+), 45 deletions(-) create mode 100644 lib/api/unittest/testfiles/instrumentation/classification_stats.schema.json rename lib/api/unittest/testfiles/instrumentation/{supervised_learning_stats.schema.json => regression_stats.schema.json} (81%) diff --git a/include/maths/CDataFrameAnalysisInstrumentationInterface.h b/include/maths/CDataFrameAnalysisInstrumentationInterface.h index acc176d4eb..0cc050a181 100644 --- a/include/maths/CDataFrameAnalysisInstrumentationInterface.h +++ b/include/maths/CDataFrameAnalysisInstrumentationInterface.h @@ -80,7 +80,7 @@ class MATHS_EXPORT CDataFrameTrainBoostedTreeInstrumentationInterface : s_DepthPenaltyMultiplier{depthPenaltyMultiplier}, s_SoftTreeDepthLimit{softTreeDepthLimit}, s_SoftTreeDepthTolerance{softTreeDepthTolerance}, s_TreeSizePenaltyMultiplier{treeSizePenaltyMultiplier}, - s_LeafWeightPenaltyMultiplier{leafWeightPenaltyMultiplier} {}; + s_LeafWeightPenaltyMultiplier{leafWeightPenaltyMultiplier} {} double s_DepthPenaltyMultiplier = -1.0; double s_SoftTreeDepthLimit = -1.0; double s_SoftTreeDepthTolerance = -1.0; diff --git a/lib/api/CDataFrameAnalysisInstrumentation.cc b/lib/api/CDataFrameAnalysisInstrumentation.cc index e6ad57b719..6ee2e2c9c9 100644 --- a/lib/api/CDataFrameAnalysisInstrumentation.cc +++ b/lib/api/CDataFrameAnalysisInstrumentation.cc @@ -33,10 +33,11 @@ const std::string TIMING_ELAPSED_TIME_TAG{"elapsed_time"}; const std::string TIMING_ITERATION_TIME_TAG{"iteration_time"}; const std::string TIMING_STATS_TAG{"timing_stats"}; const std::string TYPE_TAG{"type"}; +const std::string VALIDATION_FOLD_TAG{"fold"}; +const std::string VALIDATION_FOLD_VALUES_TAG{"fold_values"}; const std::string VALIDATION_LOSS_TAG{"validation_loss"}; const std::string VALIDATION_LOSS_TYPE_TAG{"loss_type"}; const std::string VALIDATION_LOSS_VALUES_TAG{"values"}; -const std::string VALIDATION_NUM_FOLDS_TAG{"num_folds"}; // Hyperparameters const std::string CLASS_ASSIGNMENT_OBJECTIVE_TAG{"class_assignment_objective"}; @@ -278,12 +279,16 @@ void CDataFrameTrainBoostedTreeInstrumentation::writeHyperparameters(rapidjson:: DOWNSAMPLE_FACTOR_TAG, rapidjson::Value(this->m_Hyperparameters.s_DownsampleFactor).Move(), parentObject); - writer->addMember(NUM_FOLDS_TAG, - rapidjson::Value(this->m_Hyperparameters.s_NumFolds).Move(), - parentObject); - writer->addMember(MAX_TREES_TAG, - rapidjson::Value(this->m_Hyperparameters.s_MaxTrees).Move(), - parentObject); + writer->addMember( + NUM_FOLDS_TAG, + rapidjson::Value(static_cast(this->m_Hyperparameters.s_NumFolds)) + .Move(), + parentObject); + writer->addMember( + MAX_TREES_TAG, + rapidjson::Value(static_cast(this->m_Hyperparameters.s_MaxTrees)) + .Move(), + parentObject); writer->addMember( FEATURE_BAG_FRACTION_TAG, rapidjson::Value(this->m_Hyperparameters.s_FeatureBagFraction).Move(), @@ -294,32 +299,38 @@ void CDataFrameTrainBoostedTreeInstrumentation::writeHyperparameters(rapidjson:: parentObject); writer->addMember( MAX_ATTEMPTS_TO_ADD_TREE_TAG, - rapidjson::Value(this->m_Hyperparameters.s_MaxAttemptsToAddTree).Move(), + rapidjson::Value(static_cast(this->m_Hyperparameters.s_MaxAttemptsToAddTree)) + .Move(), parentObject); writer->addMember( NUM_SPLITS_PER_FEATURE_TAG, - rapidjson::Value(this->m_Hyperparameters.s_NumSplitsPerFeature).Move(), + rapidjson::Value(static_cast(this->m_Hyperparameters.s_NumSplitsPerFeature)) + .Move(), + parentObject); + writer->addMember( + MAX_OPTIMIZATION_ROUNDS_PER_HYPERPARAMETER_TAG, + rapidjson::Value(static_cast(this->m_Hyperparameters.s_MaxOptimizationRoundsPerHyperparameter)) + .Move(), parentObject); - writer->addMember(MAX_OPTIMIZATION_ROUNDS_PER_HYPERPARAMETER_TAG, - rapidjson::Value(this->m_Hyperparameters.s_MaxOptimizationRoundsPerHyperparameter) - .Move(), - parentObject); } } void CDataFrameTrainBoostedTreeInstrumentation::writeValidationLoss(rapidjson::Value& parentObject) { auto* writer = this->writer(); if (writer != nullptr) { writer->addMember(VALIDATION_LOSS_TYPE_TAG, m_LossType, parentObject); - rapidjson::Value lossValuesObject{writer->makeObject()}; + rapidjson::Value lossValuesArray{writer->makeArray()}; for (auto& element : m_LossValues) { + rapidjson::Value item{writer->makeObject()}; + writer->addMember(VALIDATION_FOLD_TAG, element.first, item); rapidjson::Value array{writer->makeArray(element.second.size())}; for (double lossValue : element.second) { array.PushBack(rapidjson::Value(lossValue).Move(), writer->getRawAllocator()); } - writer->addMember(element.first, array, lossValuesObject); + writer->addMember(VALIDATION_LOSS_VALUES_TAG, array, item); + lossValuesArray.PushBack(item, writer->getRawAllocator()); } - writer->addMember(VALIDATION_LOSS_VALUES_TAG, lossValuesObject, parentObject); + writer->addMember(VALIDATION_FOLD_VALUES_TAG, lossValuesArray, parentObject); } } void CDataFrameTrainBoostedTreeInstrumentation::writeTimingStats(rapidjson::Value& parentObject) { diff --git a/lib/api/unittest/testfiles/instrumentation/analysis_stats.schema.json b/lib/api/unittest/testfiles/instrumentation/analysis_stats.schema.json index 029bb8e93e..80adb2baaf 100644 --- a/lib/api/unittest/testfiles/instrumentation/analysis_stats.schema.json +++ b/lib/api/unittest/testfiles/instrumentation/analysis_stats.schema.json @@ -6,17 +6,17 @@ "properties": { "job_id": { "type": "string", - "description": "Data Frame Analytics Job ID. Populated by Java." + "description": "Data Frame Analytics Job ID." }, "timestamp": { "type": "integer", "description": "Milliseconds since Unix Epoch" }, "regression_stats": { - "$ref": "supervised_learning_stats.schema.json" + "$ref": "regression_stats.schema.json" }, "classification_stats": { - "$ref": "supervised_learning_stats.schema.json" + "$ref": "classification_stats.schema.json" }, "outlier_detection_stats": { "$ref": "outlier_detection_stats.schema.json" diff --git a/lib/api/unittest/testfiles/instrumentation/classification_stats.schema.json b/lib/api/unittest/testfiles/instrumentation/classification_stats.schema.json new file mode 100644 index 0000000000..a63011391c --- /dev/null +++ b/lib/api/unittest/testfiles/instrumentation/classification_stats.schema.json @@ -0,0 +1,124 @@ +{ + "$schema": "http://json-schema.org/draft-04/schema#", + "$id": "https://raw.githubusercontent.com/elastic/ml-json-schemas-private/master/schemas/instrumentation/supervised_learning_stats.schema.json", + "description": "Instrumentation data specific to the supervised learning jobs.", + "title": "classification_stats", + "type": "object", + "properties": { + "iteration": { + "type": "integer" + }, + "hyperparameters": { + "type": "object", + "properties": { + "eta": { + "type": "number" + }, + "class_assignment_objective": { + "type": "string", + "enum": [ + "accuracy", + "minimum_recall" + ] + }, + "regularization_depth_penalty_multiplier": { + "type": "number" + }, + "regularization_soft_tree_depth_limit": { + "type": "number" + }, + "regularization_soft_tree_depth_tolerance": { + "type": "number" + }, + "regularization_tree_size_penalty_multiplier": { + "type": "number" + }, + "regularization_leaf_weight_penalty_multiplier": { + "type": "number" + }, + "downsample_factor": { + "type": "number" + }, + "num_folds": { + "type": "integer" + }, + "max_trees": { + "type": "integer" + }, + "feature_bag_fraction": { + "type": "number" + }, + "eta_growth_rate_per_tree": { + "type": "number" + }, + "max_attempts_to_add_tree": { + "type": "integer" + }, + "num_splits_per_feature": { + "type": "integer" + }, + "max_optimization_rounds_per_hyperparameter": { + "type": "integer" + } + } + }, + "validation_loss": { + "type": "object", + "properties": { + "loss_type": { + "description": "Loss metric name", + "type": "string", + "enum": [ + "binomial_logistic" + ] + }, + "fold_values": { + "description": "Validation loss values for every added decision tree during forest growing procedure", + "type": "array", + "items": { + "type": "object", + "properties": { + "fold": { + "type": "integer" + }, + "values": { + "type": "array", + "items": { + "type": "number" + } + } + }, + "additionalProperties": false + } + } + }, + "additionalProperties": false, + "required": [ + "loss_type", + "fold_values" + ] + }, + "timing_stats": { + "type": "object", + "properties": { + "elapsed_time": { + "description": "Job runtime so far in ms.", + "type": "integer" + }, + "iteration_time": { + "description": "Runtime of the last iteration in ms.", + "type": "integer" + } + }, + "additionalProperties": false + } + }, + "additionalProperties": false, + "required": [ + "iteration", + "hyperparameters", + "validation_loss", + "timing_stats" + ] + } + \ No newline at end of file diff --git a/lib/api/unittest/testfiles/instrumentation/outlier_detection_stats.schema.json b/lib/api/unittest/testfiles/instrumentation/outlier_detection_stats.schema.json index 3bcab8c632..1ef5558ad3 100644 --- a/lib/api/unittest/testfiles/instrumentation/outlier_detection_stats.schema.json +++ b/lib/api/unittest/testfiles/instrumentation/outlier_detection_stats.schema.json @@ -9,7 +9,7 @@ "type": "object", "description": "List of job parameters specified by user or determined by algorithmic heuristics", "properties": { - "n_neighbours": { + "n_neighbors": { "description": "Defines the value for how many nearest neighbors each method of outlier detection will use to calculate its outlier score.", "type": "integer" }, @@ -42,14 +42,19 @@ }, "additionalProperties": false }, - "elapsed_time": { - "description": "Job runtime so far in ms.", - "type": "number" + "timing_stats": { + "type": "object", + "properties": { + "elapsed_time": { + "description": "Job runtime so far in ms.", + "type": "integer" + } + } } }, "required": [ "parameters", - "elapsed_time" + "timing_stats" ], "additionalProperties": false } diff --git a/lib/api/unittest/testfiles/instrumentation/supervised_learning_stats.schema.json b/lib/api/unittest/testfiles/instrumentation/regression_stats.schema.json similarity index 81% rename from lib/api/unittest/testfiles/instrumentation/supervised_learning_stats.schema.json rename to lib/api/unittest/testfiles/instrumentation/regression_stats.schema.json index 53b8383bb8..2a90566561 100644 --- a/lib/api/unittest/testfiles/instrumentation/supervised_learning_stats.schema.json +++ b/lib/api/unittest/testfiles/instrumentation/regression_stats.schema.json @@ -2,15 +2,7 @@ "$schema": "http://json-schema.org/draft-04/schema#", "$id": "https://raw.githubusercontent.com/elastic/ml-json-schemas-private/master/schemas/instrumentation/supervised_learning_stats.schema.json", "description": "Instrumentation data specific to the supervised learning jobs.", - "title": "supervised_learning_stats", - "definitions": { - "loss_values": { - "type": "array", - "items": { - "type": "number" - } - } - }, + "title": "regression_stats", "type": "object", "properties": { "iteration": { @@ -22,10 +14,6 @@ "eta": { "type": "number" }, - "class_assignment_objective": { - "type": "string", - "enum": ["accuracy", "minimum_recall"] - }, "regularization_depth_penalty_multiplier": { "type": "number" }, @@ -73,19 +61,35 @@ "loss_type": { "description": "Loss metric name", "type": "string", - "enum": ["mse", "binomial_logistic"] - + "enum": [ + "mse" + ] }, - "values": { + "fold_values": { "description": "Validation loss values for every added decision tree during forest growing procedure", - "type": "object", - "additionalProperties": { - "$ref": "#/definitions/loss_values" + "type": "array", + "items": { + "type": "object", + "properties": { + "fold": { + "type": "integer" + }, + "values": { + "type": "array", + "items": { + "type": "number" + } + } + }, + "additionalProperties": false } } }, "additionalProperties": false, - "required": ["loss_type", "values"] + "required": [ + "loss_type", + "fold_values" + ] }, "timing_stats": { "type": "object", From 341c60815f377f601cfd588fee55d516c5bbf080 Mon Sep 17 00:00:00 2001 From: Valeriy Khakhutskyy <1292899+valeriy42@users.noreply.github.com> Date: Wed, 18 Mar 2020 14:36:11 +0100 Subject: [PATCH 34/40] writestate uncommented --- lib/api/CDataFrameAnalysisInstrumentation.cc | 12 +- .../CDataFrameAnalysisInstrumentationTest.cc | 400 +++++++++--------- 2 files changed, 206 insertions(+), 206 deletions(-) diff --git a/lib/api/CDataFrameAnalysisInstrumentation.cc b/lib/api/CDataFrameAnalysisInstrumentation.cc index 6ee2e2c9c9..d565ae983b 100644 --- a/lib/api/CDataFrameAnalysisInstrumentation.cc +++ b/lib/api/CDataFrameAnalysisInstrumentation.cc @@ -111,12 +111,12 @@ void CDataFrameAnalysisInstrumentation::nextStep(const std::string& /* phase */) void CDataFrameAnalysisInstrumentation::writeState() { std::int64_t timestamp{core::CTimeUtils::toEpochMs(core::CTimeUtils::now())}; if (m_Writer != nullptr) { - // m_Writer->StartObject(); - // m_Writer->Key(MEMORY_TYPE_TAG); - // this->writeMemory(timestamp); - // m_Writer->Key(ANALYSIS_TYPE_TAG); - // this->writeAnalysisStats(timestamp); - // m_Writer->EndObject(); + m_Writer->StartObject(); + m_Writer->Key(MEMORY_TYPE_TAG); + this->writeMemory(timestamp); + m_Writer->Key(ANALYSIS_TYPE_TAG); + this->writeAnalysisStats(timestamp); + m_Writer->EndObject(); } } diff --git a/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc b/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc index 31254cb3c0..49215a0ea2 100644 --- a/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc +++ b/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc @@ -3,203 +3,203 @@ * or more contributor license agreements. Licensed under the Elastic License; * you may not use this file except in compliance with the Elastic License. */ -// #include - -// #include - -// #include -// #include -// #include - -// #include - -// #include - -// #include -// #include -// #include - -// BOOST_AUTO_TEST_SUITE(CDataFrameAnalysisInstrumentationTest) - -// using namespace ml; - -// namespace { -// using TStrVec = std::vector; -// using TDoubleVec = std::vector; -// } - -// BOOST_AUTO_TEST_CASE(testMemoryState) { -// std::string jobId{"testJob"}; -// std::int64_t memoryUsage{1000}; -// std::int64_t timeBefore{core::CTimeUtils::toEpochMs(core::CTimeUtils::now())}; -// std::stringstream outputStream; -// { -// core::CJsonOutputStreamWrapper streamWrapper(outputStream); -// api::CDataFrameTrainBoostedTreeInstrumentation instrumentation(jobId); -// api::CDataFrameTrainBoostedTreeInstrumentation::CScopeSetOutputStream setStream{ -// instrumentation, streamWrapper}; -// instrumentation.updateMemoryUsage(memoryUsage); -// instrumentation.nextStep(); -// outputStream.flush(); -// } -// std::int64_t timeAfter{core::CTimeUtils::toEpochMs(core::CTimeUtils::now())}; - -// rapidjson::Document results; -// rapidjson::ParseResult ok(results.Parse(outputStream.str())); -// BOOST_TEST_REQUIRE(static_cast(ok) == true); -// BOOST_TEST_REQUIRE(results.IsArray() == true); - -// bool hasMemoryUsage{false}; -// for (const auto& result : results.GetArray()) { -// if (result.HasMember("analytics_memory_usage")) { -// BOOST_TEST_REQUIRE(result["analytics_memory_usage"].IsObject() == true); -// BOOST_TEST_REQUIRE(result["analytics_memory_usage"]["job_id"].GetString() == jobId); -// BOOST_TEST_REQUIRE( -// result["analytics_memory_usage"]["peak_usage_bytes"].GetInt64() == memoryUsage); -// BOOST_TEST_REQUIRE(result["analytics_memory_usage"]["timestamp"].GetInt64() >= -// timeBefore); -// BOOST_TEST_REQUIRE(result["analytics_memory_usage"]["timestamp"].GetInt64() <= timeAfter); -// hasMemoryUsage = true; -// } -// } -// BOOST_TEST_REQUIRE(hasMemoryUsage); -// } - -// BOOST_AUTO_TEST_CASE(testTrainingRegression) { -// std::stringstream output; -// auto outputWriterFactory = [&output]() { -// return std::make_unique(output); -// }; - -// TDoubleVec expectedPredictions; - -// TStrVec fieldNames{"f1", "f2", "f3", "f4", "target", ".", "."}; -// TStrVec fieldValues{"", "", "", "", "", "0", ""}; -// test::CDataFrameAnalysisSpecificationFactory specFactory; -// api::CDataFrameAnalyzer analyzer{ -// specFactory.predictionSpec( -// test::CDataFrameAnalysisSpecificationFactory::regression(), "target"), -// outputWriterFactory}; -// test::CDataFrameAnalyzerTrainingFactory::addPredictionTestData( -// test::CDataFrameAnalyzerTrainingFactory::E_Regression, fieldNames, -// fieldValues, analyzer, expectedPredictions); - -// analyzer.handleRecord(fieldNames, {"", "", "", "", "", "", "$"}); - -// rapidjson::Document results; -// rapidjson::ParseResult ok(results.Parse(output.str())); -// BOOST_TEST_REQUIRE(static_cast(ok) == true); - -// std::ifstream regressionSchemaFileStream( -// "testfiles/instrumentation/supervised_learning_stats.schema.json"); -// BOOST_REQUIRE_MESSAGE(regressionSchemaFileStream.is_open(), "Cannot open test file!"); -// std::string regressionSchemaJson((std::istreambuf_iterator(regressionSchemaFileStream)), -// std::istreambuf_iterator()); -// rapidjson::Document regressionSchemaDocument; -// BOOST_REQUIRE_MESSAGE( -// regressionSchemaDocument.Parse(regressionSchemaJson).HasParseError() == false, -// "Cannot parse JSON schema!"); -// rapidjson::SchemaDocument regressionSchema(regressionSchemaDocument); -// rapidjson::SchemaValidator regressionValidator(regressionSchema); - -// for (const auto& result : results.GetArray()) { -// if (result.HasMember("analysis_stats")) { -// BOOST_TEST_REQUIRE(result["analysis_stats"].HasMember("regression_stats")); -// if (result["analysis_stats"]["regression_stats"].Accept(regressionValidator) == false) { -// rapidjson::StringBuffer sb; -// regressionValidator.GetInvalidSchemaPointer().StringifyUriFragment(sb); -// LOG_ERROR(<< "Invalid schema: " << sb.GetString()); -// LOG_ERROR(<< "Invalid keyword: " -// << regressionValidator.GetInvalidSchemaKeyword()); -// sb.Clear(); -// regressionValidator.GetInvalidDocumentPointer().StringifyUriFragment(sb); -// LOG_ERROR(<< "Invalid document: " << sb.GetString()); -// BOOST_FAIL("Schema validation failed"); -// } -// } -// } - -// std::ifstream memorySchemaFileStream("testfiles/instrumentation/memory_usage.schema.json"); -// BOOST_REQUIRE_MESSAGE(memorySchemaFileStream.is_open(), "Cannot open test file!"); -// std::string memorySchemaJson((std::istreambuf_iterator(memorySchemaFileStream)), -// std::istreambuf_iterator()); -// rapidjson::Document memorySchemaDocument; -// BOOST_REQUIRE_MESSAGE(memorySchemaDocument.Parse(memorySchemaJson).HasParseError() == false, -// "Cannot parse JSON schema!"); -// rapidjson::SchemaDocument memorySchema(memorySchemaDocument); -// rapidjson::SchemaValidator memoryValidator(memorySchema); - -// for (const auto& result : results.GetArray()) { -// if (result.HasMember("analytics_memory_usage")) { -// BOOST_TEST_REQUIRE(result["analytics_memory_usage"].IsObject() == true); -// if (result["analytics_memory_usage"].Accept(memoryValidator) == false) { -// rapidjson::StringBuffer sb; -// memoryValidator.GetInvalidSchemaPointer().StringifyUriFragment(sb); -// LOG_ERROR(<< "Invalid schema: " << sb.GetString()); -// LOG_ERROR(<< "Invalid keyword: " -// << memoryValidator.GetInvalidSchemaKeyword()); -// sb.Clear(); -// memoryValidator.GetInvalidDocumentPointer().StringifyUriFragment(sb); -// LOG_ERROR(<< "Invalid document: " << sb.GetString()); -// BOOST_FAIL("Schema validation failed"); -// } -// } -// } -// } - -// BOOST_AUTO_TEST_CASE(testTrainingClassification) { -// std::stringstream output; -// auto outputWriterFactory = [&output]() { -// return std::make_unique(output); -// }; - -// TDoubleVec expectedPredictions; - -// TStrVec fieldNames{"f1", "f2", "f3", "f4", "target", ".", "."}; -// TStrVec fieldValues{"", "", "", "", "", "0", ""}; -// test::CDataFrameAnalysisSpecificationFactory specFactory; -// api::CDataFrameAnalyzer analyzer{ -// specFactory.rows(100) -// .memoryLimit(6000000) -// .columns(5) -// .predictionCategoricalFieldNames({"target"}) -// .predictionSpec(test::CDataFrameAnalysisSpecificationFactory::classification(), "target"), -// outputWriterFactory}; -// test::CDataFrameAnalyzerTrainingFactory::addPredictionTestData( -// test::CDataFrameAnalyzerTrainingFactory::E_BinaryClassification, -// fieldNames, fieldValues, analyzer, expectedPredictions); - -// analyzer.handleRecord(fieldNames, {"", "", "", "", "", "", "$"}); - -// rapidjson::Document results; -// rapidjson::ParseResult ok(results.Parse(output.str())); -// BOOST_TEST_REQUIRE(static_cast(ok) == true); - -// std::ifstream schemaFileStream("testfiles/instrumentation/supervised_learning_stats.schema.json"); -// BOOST_REQUIRE_MESSAGE(schemaFileStream.is_open(), "Cannot open test file!"); -// std::string schemaJson((std::istreambuf_iterator(schemaFileStream)), -// std::istreambuf_iterator()); -// rapidjson::Document schemaDocument; -// BOOST_REQUIRE_MESSAGE(schemaDocument.Parse(schemaJson).HasParseError() == false, -// "Cannot parse JSON schema!"); -// rapidjson::SchemaDocument schema(schemaDocument); -// rapidjson::SchemaValidator validator(schema); - -// for (const auto& result : results.GetArray()) { -// if (result.HasMember("analysis_stats")) { -// BOOST_TEST_REQUIRE(result["analysis_stats"].HasMember("classification_stats")); -// if (result["analysis_stats"]["classification_stats"].Accept(validator) == false) { -// rapidjson::StringBuffer sb; -// validator.GetInvalidSchemaPointer().StringifyUriFragment(sb); -// LOG_ERROR(<< "Invalid schema: " << sb.GetString()); -// LOG_ERROR(<< "Invalid keyword: " << validator.GetInvalidSchemaKeyword()); -// sb.Clear(); -// validator.GetInvalidDocumentPointer().StringifyUriFragment(sb); -// LOG_ERROR(<< "Invalid document: " << sb.GetString()); -// BOOST_FAIL("Schema validation failed"); -// } -// } -// } -// } - -// BOOST_AUTO_TEST_SUITE_END() +#include + +#include + +#include +#include +#include + +#include + +#include + +#include +#include +#include + +BOOST_AUTO_TEST_SUITE(CDataFrameAnalysisInstrumentationTest) + +using namespace ml; + +namespace { +using TStrVec = std::vector; +using TDoubleVec = std::vector; +} + +BOOST_AUTO_TEST_CASE(testMemoryState) { + std::string jobId{"testJob"}; + std::int64_t memoryUsage{1000}; + std::int64_t timeBefore{core::CTimeUtils::toEpochMs(core::CTimeUtils::now())}; + std::stringstream outputStream; + { + core::CJsonOutputStreamWrapper streamWrapper(outputStream); + api::CDataFrameTrainBoostedTreeInstrumentation instrumentation(jobId); + api::CDataFrameTrainBoostedTreeInstrumentation::CScopeSetOutputStream setStream{ + instrumentation, streamWrapper}; + instrumentation.updateMemoryUsage(memoryUsage); + instrumentation.nextStep(); + outputStream.flush(); + } + std::int64_t timeAfter{core::CTimeUtils::toEpochMs(core::CTimeUtils::now())}; + + rapidjson::Document results; + rapidjson::ParseResult ok(results.Parse(outputStream.str())); + BOOST_TEST_REQUIRE(static_cast(ok) == true); + BOOST_TEST_REQUIRE(results.IsArray() == true); + + bool hasMemoryUsage{false}; + for (const auto& result : results.GetArray()) { + if (result.HasMember("analytics_memory_usage")) { + BOOST_TEST_REQUIRE(result["analytics_memory_usage"].IsObject() == true); + BOOST_TEST_REQUIRE(result["analytics_memory_usage"]["job_id"].GetString() == jobId); + BOOST_TEST_REQUIRE( + result["analytics_memory_usage"]["peak_usage_bytes"].GetInt64() == memoryUsage); + BOOST_TEST_REQUIRE(result["analytics_memory_usage"]["timestamp"].GetInt64() >= + timeBefore); + BOOST_TEST_REQUIRE(result["analytics_memory_usage"]["timestamp"].GetInt64() <= timeAfter); + hasMemoryUsage = true; + } + } + BOOST_TEST_REQUIRE(hasMemoryUsage); +} + +BOOST_AUTO_TEST_CASE(testTrainingRegression) { + std::stringstream output; + auto outputWriterFactory = [&output]() { + return std::make_unique(output); + }; + + TDoubleVec expectedPredictions; + + TStrVec fieldNames{"f1", "f2", "f3", "f4", "target", ".", "."}; + TStrVec fieldValues{"", "", "", "", "", "0", ""}; + test::CDataFrameAnalysisSpecificationFactory specFactory; + api::CDataFrameAnalyzer analyzer{ + specFactory.predictionSpec( + test::CDataFrameAnalysisSpecificationFactory::regression(), "target"), + outputWriterFactory}; + test::CDataFrameAnalyzerTrainingFactory::addPredictionTestData( + test::CDataFrameAnalyzerTrainingFactory::E_Regression, fieldNames, + fieldValues, analyzer, expectedPredictions); + + analyzer.handleRecord(fieldNames, {"", "", "", "", "", "", "$"}); + + rapidjson::Document results; + rapidjson::ParseResult ok(results.Parse(output.str())); + BOOST_TEST_REQUIRE(static_cast(ok) == true); + + std::ifstream regressionSchemaFileStream( + "testfiles/instrumentation/supervised_learning_stats.schema.json"); + BOOST_REQUIRE_MESSAGE(regressionSchemaFileStream.is_open(), "Cannot open test file!"); + std::string regressionSchemaJson((std::istreambuf_iterator(regressionSchemaFileStream)), + std::istreambuf_iterator()); + rapidjson::Document regressionSchemaDocument; + BOOST_REQUIRE_MESSAGE( + regressionSchemaDocument.Parse(regressionSchemaJson).HasParseError() == false, + "Cannot parse JSON schema!"); + rapidjson::SchemaDocument regressionSchema(regressionSchemaDocument); + rapidjson::SchemaValidator regressionValidator(regressionSchema); + + for (const auto& result : results.GetArray()) { + if (result.HasMember("analysis_stats")) { + BOOST_TEST_REQUIRE(result["analysis_stats"].HasMember("regression_stats")); + if (result["analysis_stats"]["regression_stats"].Accept(regressionValidator) == false) { + rapidjson::StringBuffer sb; + regressionValidator.GetInvalidSchemaPointer().StringifyUriFragment(sb); + LOG_ERROR(<< "Invalid schema: " << sb.GetString()); + LOG_ERROR(<< "Invalid keyword: " + << regressionValidator.GetInvalidSchemaKeyword()); + sb.Clear(); + regressionValidator.GetInvalidDocumentPointer().StringifyUriFragment(sb); + LOG_ERROR(<< "Invalid document: " << sb.GetString()); + BOOST_FAIL("Schema validation failed"); + } + } + } + + std::ifstream memorySchemaFileStream("testfiles/instrumentation/memory_usage.schema.json"); + BOOST_REQUIRE_MESSAGE(memorySchemaFileStream.is_open(), "Cannot open test file!"); + std::string memorySchemaJson((std::istreambuf_iterator(memorySchemaFileStream)), + std::istreambuf_iterator()); + rapidjson::Document memorySchemaDocument; + BOOST_REQUIRE_MESSAGE(memorySchemaDocument.Parse(memorySchemaJson).HasParseError() == false, + "Cannot parse JSON schema!"); + rapidjson::SchemaDocument memorySchema(memorySchemaDocument); + rapidjson::SchemaValidator memoryValidator(memorySchema); + + for (const auto& result : results.GetArray()) { + if (result.HasMember("analytics_memory_usage")) { + BOOST_TEST_REQUIRE(result["analytics_memory_usage"].IsObject() == true); + if (result["analytics_memory_usage"].Accept(memoryValidator) == false) { + rapidjson::StringBuffer sb; + memoryValidator.GetInvalidSchemaPointer().StringifyUriFragment(sb); + LOG_ERROR(<< "Invalid schema: " << sb.GetString()); + LOG_ERROR(<< "Invalid keyword: " + << memoryValidator.GetInvalidSchemaKeyword()); + sb.Clear(); + memoryValidator.GetInvalidDocumentPointer().StringifyUriFragment(sb); + LOG_ERROR(<< "Invalid document: " << sb.GetString()); + BOOST_FAIL("Schema validation failed"); + } + } + } +} + +BOOST_AUTO_TEST_CASE(testTrainingClassification) { + std::stringstream output; + auto outputWriterFactory = [&output]() { + return std::make_unique(output); + }; + + TDoubleVec expectedPredictions; + + TStrVec fieldNames{"f1", "f2", "f3", "f4", "target", ".", "."}; + TStrVec fieldValues{"", "", "", "", "", "0", ""}; + test::CDataFrameAnalysisSpecificationFactory specFactory; + api::CDataFrameAnalyzer analyzer{ + specFactory.rows(100) + .memoryLimit(6000000) + .columns(5) + .predictionCategoricalFieldNames({"target"}) + .predictionSpec(test::CDataFrameAnalysisSpecificationFactory::classification(), "target"), + outputWriterFactory}; + test::CDataFrameAnalyzerTrainingFactory::addPredictionTestData( + test::CDataFrameAnalyzerTrainingFactory::E_BinaryClassification, + fieldNames, fieldValues, analyzer, expectedPredictions); + + analyzer.handleRecord(fieldNames, {"", "", "", "", "", "", "$"}); + + rapidjson::Document results; + rapidjson::ParseResult ok(results.Parse(output.str())); + BOOST_TEST_REQUIRE(static_cast(ok) == true); + + std::ifstream schemaFileStream("testfiles/instrumentation/supervised_learning_stats.schema.json"); + BOOST_REQUIRE_MESSAGE(schemaFileStream.is_open(), "Cannot open test file!"); + std::string schemaJson((std::istreambuf_iterator(schemaFileStream)), + std::istreambuf_iterator()); + rapidjson::Document schemaDocument; + BOOST_REQUIRE_MESSAGE(schemaDocument.Parse(schemaJson).HasParseError() == false, + "Cannot parse JSON schema!"); + rapidjson::SchemaDocument schema(schemaDocument); + rapidjson::SchemaValidator validator(schema); + + for (const auto& result : results.GetArray()) { + if (result.HasMember("analysis_stats")) { + BOOST_TEST_REQUIRE(result["analysis_stats"].HasMember("classification_stats")); + if (result["analysis_stats"]["classification_stats"].Accept(validator) == false) { + rapidjson::StringBuffer sb; + validator.GetInvalidSchemaPointer().StringifyUriFragment(sb); + LOG_ERROR(<< "Invalid schema: " << sb.GetString()); + LOG_ERROR(<< "Invalid keyword: " << validator.GetInvalidSchemaKeyword()); + sb.Clear(); + validator.GetInvalidDocumentPointer().StringifyUriFragment(sb); + LOG_ERROR(<< "Invalid document: " << sb.GetString()); + BOOST_FAIL("Schema validation failed"); + } + } + } +} + +BOOST_AUTO_TEST_SUITE_END() From fa2c7cbd060aec88ae438c0995f2200f66f9ac92 Mon Sep 17 00:00:00 2001 From: Valeriy Khakhutskyy <1292899+valeriy42@users.noreply.github.com> Date: Wed, 18 Mar 2020 14:56:23 +0100 Subject: [PATCH 35/40] Fix validation loss schema --- include/api/CDataFrameAnalysisInstrumentation.h | 5 +++-- include/maths/CDataFrameAnalysisInstrumentationInterface.h | 5 +++-- lib/api/CDataFrameAnalysisInstrumentation.cc | 6 ++++-- lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc | 5 ++--- lib/maths/CBoostedTreeImpl.cc | 2 +- 5 files changed, 13 insertions(+), 10 deletions(-) diff --git a/include/api/CDataFrameAnalysisInstrumentation.h b/include/api/CDataFrameAnalysisInstrumentation.h index 82977c87d7..f5108bdc05 100644 --- a/include/api/CDataFrameAnalysisInstrumentation.h +++ b/include/api/CDataFrameAnalysisInstrumentation.h @@ -7,6 +7,7 @@ #ifndef INCLUDED_ml_api_CDataFrameAnalysisInstrumentation_h #define INCLUDED_ml_api_CDataFrameAnalysisInstrumentation_h +#include #include #include @@ -145,7 +146,7 @@ class API_EXPORT CDataFrameTrainBoostedTreeInstrumentation final //! Type of the validation loss result, e.g. "mse". void lossType(const std::string& lossType) override; //! List of \p lossValues of validation error for the given \p fold. - void lossValues(std::string fold, TDoubleVec&& lossValues) override; + void lossValues(std::size_t fold, TDoubleVec&& lossValues) override; //! \return Structure contains hyperparameters. SHyperparameters& hyperparameters() override { return m_Hyperparameters; } @@ -153,7 +154,7 @@ class API_EXPORT CDataFrameTrainBoostedTreeInstrumentation final counter_t::ECounterTypes memoryCounterType() override; private: - using TLossVec = std::vector>; + using TLossVec = std::vector>; private: void writeAnalysisStats(std::int64_t timestamp) override; diff --git a/include/maths/CDataFrameAnalysisInstrumentationInterface.h b/include/maths/CDataFrameAnalysisInstrumentationInterface.h index 0cc050a181..95d785e466 100644 --- a/include/maths/CDataFrameAnalysisInstrumentationInterface.h +++ b/include/maths/CDataFrameAnalysisInstrumentationInterface.h @@ -7,6 +7,7 @@ #ifndef INCLUDED_ml_maths_CDataFrameAnalysisInstrumentationInterface_h #define INCLUDED_ml_maths_CDataFrameAnalysisInstrumentationInterface_h +#include #include #include @@ -114,7 +115,7 @@ class MATHS_EXPORT CDataFrameTrainBoostedTreeInstrumentationInterface //! Type of the validation loss result, e.g. "mse". virtual void lossType(const std::string& lossType) = 0; //! List of \p lossValues of validation error for the given \p fold. - virtual void lossValues(std::string fold, TDoubleVec&& lossValues) = 0; + virtual void lossValues(std::size_t fold, TDoubleVec&& lossValues) = 0; //! \return Structure contains hyperparameters. virtual SHyperparameters& hyperparameters() = 0; }; @@ -139,7 +140,7 @@ class MATHS_EXPORT CDataFrameTrainBoostedTreeInstrumentationStub void iteration(std::size_t /* iteration */) override {} void iterationTime(std::uint64_t /* delta */) override {} void lossType(const std::string& /* lossType */) override {} - void lossValues(std::string /* fold */, TDoubleVec&& /* lossValues */) override {} + void lossValues(std::size_t /* fold */, TDoubleVec&& /* lossValues */) override {} SHyperparameters& hyperparameters() override { return m_Hyperparameters; } private: diff --git a/lib/api/CDataFrameAnalysisInstrumentation.cc b/lib/api/CDataFrameAnalysisInstrumentation.cc index d565ae983b..922a5f23a1 100644 --- a/lib/api/CDataFrameAnalysisInstrumentation.cc +++ b/lib/api/CDataFrameAnalysisInstrumentation.cc @@ -182,7 +182,7 @@ void CDataFrameTrainBoostedTreeInstrumentation::lossType(const std::string& loss m_LossType = lossType; } -void CDataFrameTrainBoostedTreeInstrumentation::lossValues(std::string fold, +void CDataFrameTrainBoostedTreeInstrumentation::lossValues(std::size_t fold, TDoubleVec&& lossValues) { m_LossValues.emplace_back(std::move(fold), std::move(lossValues)); } @@ -321,7 +321,9 @@ void CDataFrameTrainBoostedTreeInstrumentation::writeValidationLoss(rapidjson::V rapidjson::Value lossValuesArray{writer->makeArray()}; for (auto& element : m_LossValues) { rapidjson::Value item{writer->makeObject()}; - writer->addMember(VALIDATION_FOLD_TAG, element.first, item); + writer->addMember( + VALIDATION_FOLD_TAG, + rapidjson::Value(static_cast(element.first)).Move(), item); rapidjson::Value array{writer->makeArray(element.second.size())}; for (double lossValue : element.second) { array.PushBack(rapidjson::Value(lossValue).Move(), diff --git a/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc b/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc index 49215a0ea2..d9463a50dd 100644 --- a/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc +++ b/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc @@ -90,8 +90,7 @@ BOOST_AUTO_TEST_CASE(testTrainingRegression) { rapidjson::ParseResult ok(results.Parse(output.str())); BOOST_TEST_REQUIRE(static_cast(ok) == true); - std::ifstream regressionSchemaFileStream( - "testfiles/instrumentation/supervised_learning_stats.schema.json"); + std::ifstream regressionSchemaFileStream("testfiles/instrumentation/regression_stats.schema.json"); BOOST_REQUIRE_MESSAGE(regressionSchemaFileStream.is_open(), "Cannot open test file!"); std::string regressionSchemaJson((std::istreambuf_iterator(regressionSchemaFileStream)), std::istreambuf_iterator()); @@ -175,7 +174,7 @@ BOOST_AUTO_TEST_CASE(testTrainingClassification) { rapidjson::ParseResult ok(results.Parse(output.str())); BOOST_TEST_REQUIRE(static_cast(ok) == true); - std::ifstream schemaFileStream("testfiles/instrumentation/supervised_learning_stats.schema.json"); + std::ifstream schemaFileStream("testfiles/instrumentation/classification_stats.schema.json"); BOOST_REQUIRE_MESSAGE(schemaFileStream.is_open(), "Cannot open test file!"); std::string schemaJson((std::istreambuf_iterator(schemaFileStream)), std::istreambuf_iterator()); diff --git a/lib/maths/CBoostedTreeImpl.cc b/lib/maths/CBoostedTreeImpl.cc index 0b72aed129..0e92675c08 100644 --- a/lib/maths/CBoostedTreeImpl.cc +++ b/lib/maths/CBoostedTreeImpl.cc @@ -458,7 +458,7 @@ CBoostedTreeImpl::crossValidateForest(core::CDataFrame& frame) { lossMoments.add(loss); m_FoldRoundTestLosses[fold][m_CurrentRound] = loss; numberTrees.push_back(static_cast(forest.size())); - m_Instrumentation->lossValues(std::to_string(fold), std::move(lossValues)); + m_Instrumentation->lossValues(fold, std::move(lossValues)); } m_TrainingProgress.increment(m_MaximumNumberTrees * folds.size()); LOG_TRACE(<< "skipped " << folds.size() << " folds"); From 36470c31118a78b62732b61628187aedc8034ab3 Mon Sep 17 00:00:00 2001 From: Valeriy Khakhutskyy <1292899+valeriy42@users.noreply.github.com> Date: Wed, 18 Mar 2020 15:16:39 +0100 Subject: [PATCH 36/40] deactivate writing state and unit test --- lib/api/CDataFrameAnalysisInstrumentation.cc | 3 +- .../CDataFrameAnalysisInstrumentationTest.cc | 398 +++++++++--------- 2 files changed, 201 insertions(+), 200 deletions(-) diff --git a/lib/api/CDataFrameAnalysisInstrumentation.cc b/lib/api/CDataFrameAnalysisInstrumentation.cc index 922a5f23a1..ae9f55bb24 100644 --- a/lib/api/CDataFrameAnalysisInstrumentation.cc +++ b/lib/api/CDataFrameAnalysisInstrumentation.cc @@ -105,7 +105,8 @@ void CDataFrameAnalysisInstrumentation::resetProgress() { } void CDataFrameAnalysisInstrumentation::nextStep(const std::string& /* phase */) { - this->writeState(); + // TODO reactivate once Java part is ready + // this->writeState(); } void CDataFrameAnalysisInstrumentation::writeState() { diff --git a/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc b/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc index d9463a50dd..c7d807eeb2 100644 --- a/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc +++ b/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc @@ -3,202 +3,202 @@ * or more contributor license agreements. Licensed under the Elastic License; * you may not use this file except in compliance with the Elastic License. */ -#include - -#include - -#include -#include -#include - -#include - -#include - -#include -#include -#include - -BOOST_AUTO_TEST_SUITE(CDataFrameAnalysisInstrumentationTest) - -using namespace ml; - -namespace { -using TStrVec = std::vector; -using TDoubleVec = std::vector; -} - -BOOST_AUTO_TEST_CASE(testMemoryState) { - std::string jobId{"testJob"}; - std::int64_t memoryUsage{1000}; - std::int64_t timeBefore{core::CTimeUtils::toEpochMs(core::CTimeUtils::now())}; - std::stringstream outputStream; - { - core::CJsonOutputStreamWrapper streamWrapper(outputStream); - api::CDataFrameTrainBoostedTreeInstrumentation instrumentation(jobId); - api::CDataFrameTrainBoostedTreeInstrumentation::CScopeSetOutputStream setStream{ - instrumentation, streamWrapper}; - instrumentation.updateMemoryUsage(memoryUsage); - instrumentation.nextStep(); - outputStream.flush(); - } - std::int64_t timeAfter{core::CTimeUtils::toEpochMs(core::CTimeUtils::now())}; - - rapidjson::Document results; - rapidjson::ParseResult ok(results.Parse(outputStream.str())); - BOOST_TEST_REQUIRE(static_cast(ok) == true); - BOOST_TEST_REQUIRE(results.IsArray() == true); - - bool hasMemoryUsage{false}; - for (const auto& result : results.GetArray()) { - if (result.HasMember("analytics_memory_usage")) { - BOOST_TEST_REQUIRE(result["analytics_memory_usage"].IsObject() == true); - BOOST_TEST_REQUIRE(result["analytics_memory_usage"]["job_id"].GetString() == jobId); - BOOST_TEST_REQUIRE( - result["analytics_memory_usage"]["peak_usage_bytes"].GetInt64() == memoryUsage); - BOOST_TEST_REQUIRE(result["analytics_memory_usage"]["timestamp"].GetInt64() >= - timeBefore); - BOOST_TEST_REQUIRE(result["analytics_memory_usage"]["timestamp"].GetInt64() <= timeAfter); - hasMemoryUsage = true; - } - } - BOOST_TEST_REQUIRE(hasMemoryUsage); -} - -BOOST_AUTO_TEST_CASE(testTrainingRegression) { - std::stringstream output; - auto outputWriterFactory = [&output]() { - return std::make_unique(output); - }; - - TDoubleVec expectedPredictions; - - TStrVec fieldNames{"f1", "f2", "f3", "f4", "target", ".", "."}; - TStrVec fieldValues{"", "", "", "", "", "0", ""}; - test::CDataFrameAnalysisSpecificationFactory specFactory; - api::CDataFrameAnalyzer analyzer{ - specFactory.predictionSpec( - test::CDataFrameAnalysisSpecificationFactory::regression(), "target"), - outputWriterFactory}; - test::CDataFrameAnalyzerTrainingFactory::addPredictionTestData( - test::CDataFrameAnalyzerTrainingFactory::E_Regression, fieldNames, - fieldValues, analyzer, expectedPredictions); - - analyzer.handleRecord(fieldNames, {"", "", "", "", "", "", "$"}); - - rapidjson::Document results; - rapidjson::ParseResult ok(results.Parse(output.str())); - BOOST_TEST_REQUIRE(static_cast(ok) == true); - - std::ifstream regressionSchemaFileStream("testfiles/instrumentation/regression_stats.schema.json"); - BOOST_REQUIRE_MESSAGE(regressionSchemaFileStream.is_open(), "Cannot open test file!"); - std::string regressionSchemaJson((std::istreambuf_iterator(regressionSchemaFileStream)), - std::istreambuf_iterator()); - rapidjson::Document regressionSchemaDocument; - BOOST_REQUIRE_MESSAGE( - regressionSchemaDocument.Parse(regressionSchemaJson).HasParseError() == false, - "Cannot parse JSON schema!"); - rapidjson::SchemaDocument regressionSchema(regressionSchemaDocument); - rapidjson::SchemaValidator regressionValidator(regressionSchema); - - for (const auto& result : results.GetArray()) { - if (result.HasMember("analysis_stats")) { - BOOST_TEST_REQUIRE(result["analysis_stats"].HasMember("regression_stats")); - if (result["analysis_stats"]["regression_stats"].Accept(regressionValidator) == false) { - rapidjson::StringBuffer sb; - regressionValidator.GetInvalidSchemaPointer().StringifyUriFragment(sb); - LOG_ERROR(<< "Invalid schema: " << sb.GetString()); - LOG_ERROR(<< "Invalid keyword: " - << regressionValidator.GetInvalidSchemaKeyword()); - sb.Clear(); - regressionValidator.GetInvalidDocumentPointer().StringifyUriFragment(sb); - LOG_ERROR(<< "Invalid document: " << sb.GetString()); - BOOST_FAIL("Schema validation failed"); - } - } - } - - std::ifstream memorySchemaFileStream("testfiles/instrumentation/memory_usage.schema.json"); - BOOST_REQUIRE_MESSAGE(memorySchemaFileStream.is_open(), "Cannot open test file!"); - std::string memorySchemaJson((std::istreambuf_iterator(memorySchemaFileStream)), - std::istreambuf_iterator()); - rapidjson::Document memorySchemaDocument; - BOOST_REQUIRE_MESSAGE(memorySchemaDocument.Parse(memorySchemaJson).HasParseError() == false, - "Cannot parse JSON schema!"); - rapidjson::SchemaDocument memorySchema(memorySchemaDocument); - rapidjson::SchemaValidator memoryValidator(memorySchema); - - for (const auto& result : results.GetArray()) { - if (result.HasMember("analytics_memory_usage")) { - BOOST_TEST_REQUIRE(result["analytics_memory_usage"].IsObject() == true); - if (result["analytics_memory_usage"].Accept(memoryValidator) == false) { - rapidjson::StringBuffer sb; - memoryValidator.GetInvalidSchemaPointer().StringifyUriFragment(sb); - LOG_ERROR(<< "Invalid schema: " << sb.GetString()); - LOG_ERROR(<< "Invalid keyword: " - << memoryValidator.GetInvalidSchemaKeyword()); - sb.Clear(); - memoryValidator.GetInvalidDocumentPointer().StringifyUriFragment(sb); - LOG_ERROR(<< "Invalid document: " << sb.GetString()); - BOOST_FAIL("Schema validation failed"); - } - } - } -} - -BOOST_AUTO_TEST_CASE(testTrainingClassification) { - std::stringstream output; - auto outputWriterFactory = [&output]() { - return std::make_unique(output); - }; - - TDoubleVec expectedPredictions; - - TStrVec fieldNames{"f1", "f2", "f3", "f4", "target", ".", "."}; - TStrVec fieldValues{"", "", "", "", "", "0", ""}; - test::CDataFrameAnalysisSpecificationFactory specFactory; - api::CDataFrameAnalyzer analyzer{ - specFactory.rows(100) - .memoryLimit(6000000) - .columns(5) - .predictionCategoricalFieldNames({"target"}) - .predictionSpec(test::CDataFrameAnalysisSpecificationFactory::classification(), "target"), - outputWriterFactory}; - test::CDataFrameAnalyzerTrainingFactory::addPredictionTestData( - test::CDataFrameAnalyzerTrainingFactory::E_BinaryClassification, - fieldNames, fieldValues, analyzer, expectedPredictions); - - analyzer.handleRecord(fieldNames, {"", "", "", "", "", "", "$"}); - - rapidjson::Document results; - rapidjson::ParseResult ok(results.Parse(output.str())); - BOOST_TEST_REQUIRE(static_cast(ok) == true); - - std::ifstream schemaFileStream("testfiles/instrumentation/classification_stats.schema.json"); - BOOST_REQUIRE_MESSAGE(schemaFileStream.is_open(), "Cannot open test file!"); - std::string schemaJson((std::istreambuf_iterator(schemaFileStream)), - std::istreambuf_iterator()); - rapidjson::Document schemaDocument; - BOOST_REQUIRE_MESSAGE(schemaDocument.Parse(schemaJson).HasParseError() == false, - "Cannot parse JSON schema!"); - rapidjson::SchemaDocument schema(schemaDocument); - rapidjson::SchemaValidator validator(schema); - - for (const auto& result : results.GetArray()) { - if (result.HasMember("analysis_stats")) { - BOOST_TEST_REQUIRE(result["analysis_stats"].HasMember("classification_stats")); - if (result["analysis_stats"]["classification_stats"].Accept(validator) == false) { - rapidjson::StringBuffer sb; - validator.GetInvalidSchemaPointer().StringifyUriFragment(sb); - LOG_ERROR(<< "Invalid schema: " << sb.GetString()); - LOG_ERROR(<< "Invalid keyword: " << validator.GetInvalidSchemaKeyword()); - sb.Clear(); - validator.GetInvalidDocumentPointer().StringifyUriFragment(sb); - LOG_ERROR(<< "Invalid document: " << sb.GetString()); - BOOST_FAIL("Schema validation failed"); - } - } - } -} - -BOOST_AUTO_TEST_SUITE_END() +// #include + +// #include + +// #include +// #include +// #include + +// #include + +// #include + +// #include +// #include +// #include + +// BOOST_AUTO_TEST_SUITE(CDataFrameAnalysisInstrumentationTest) + +// using namespace ml; + +// namespace { +// using TStrVec = std::vector; +// using TDoubleVec = std::vector; +// } + +// BOOST_AUTO_TEST_CASE(testMemoryState) { +// std::string jobId{"testJob"}; +// std::int64_t memoryUsage{1000}; +// std::int64_t timeBefore{core::CTimeUtils::toEpochMs(core::CTimeUtils::now())}; +// std::stringstream outputStream; +// { +// core::CJsonOutputStreamWrapper streamWrapper(outputStream); +// api::CDataFrameTrainBoostedTreeInstrumentation instrumentation(jobId); +// api::CDataFrameTrainBoostedTreeInstrumentation::CScopeSetOutputStream setStream{ +// instrumentation, streamWrapper}; +// instrumentation.updateMemoryUsage(memoryUsage); +// instrumentation.nextStep(); +// outputStream.flush(); +// } +// std::int64_t timeAfter{core::CTimeUtils::toEpochMs(core::CTimeUtils::now())}; + +// rapidjson::Document results; +// rapidjson::ParseResult ok(results.Parse(outputStream.str())); +// BOOST_TEST_REQUIRE(static_cast(ok) == true); +// BOOST_TEST_REQUIRE(results.IsArray() == true); + +// bool hasMemoryUsage{false}; +// for (const auto& result : results.GetArray()) { +// if (result.HasMember("analytics_memory_usage")) { +// BOOST_TEST_REQUIRE(result["analytics_memory_usage"].IsObject() == true); +// BOOST_TEST_REQUIRE(result["analytics_memory_usage"]["job_id"].GetString() == jobId); +// BOOST_TEST_REQUIRE( +// result["analytics_memory_usage"]["peak_usage_bytes"].GetInt64() == memoryUsage); +// BOOST_TEST_REQUIRE(result["analytics_memory_usage"]["timestamp"].GetInt64() >= +// timeBefore); +// BOOST_TEST_REQUIRE(result["analytics_memory_usage"]["timestamp"].GetInt64() <= timeAfter); +// hasMemoryUsage = true; +// } +// } +// BOOST_TEST_REQUIRE(hasMemoryUsage); +// } + +// BOOST_AUTO_TEST_CASE(testTrainingRegression) { +// std::stringstream output; +// auto outputWriterFactory = [&output]() { +// return std::make_unique(output); +// }; + +// TDoubleVec expectedPredictions; + +// TStrVec fieldNames{"f1", "f2", "f3", "f4", "target", ".", "."}; +// TStrVec fieldValues{"", "", "", "", "", "0", ""}; +// test::CDataFrameAnalysisSpecificationFactory specFactory; +// api::CDataFrameAnalyzer analyzer{ +// specFactory.predictionSpec( +// test::CDataFrameAnalysisSpecificationFactory::regression(), "target"), +// outputWriterFactory}; +// test::CDataFrameAnalyzerTrainingFactory::addPredictionTestData( +// test::CDataFrameAnalyzerTrainingFactory::E_Regression, fieldNames, +// fieldValues, analyzer, expectedPredictions); + +// analyzer.handleRecord(fieldNames, {"", "", "", "", "", "", "$"}); + +// rapidjson::Document results; +// rapidjson::ParseResult ok(results.Parse(output.str())); +// BOOST_TEST_REQUIRE(static_cast(ok) == true); + +// std::ifstream regressionSchemaFileStream("testfiles/instrumentation/regression_stats.schema.json"); +// BOOST_REQUIRE_MESSAGE(regressionSchemaFileStream.is_open(), "Cannot open test file!"); +// std::string regressionSchemaJson((std::istreambuf_iterator(regressionSchemaFileStream)), +// std::istreambuf_iterator()); +// rapidjson::Document regressionSchemaDocument; +// BOOST_REQUIRE_MESSAGE( +// regressionSchemaDocument.Parse(regressionSchemaJson).HasParseError() == false, +// "Cannot parse JSON schema!"); +// rapidjson::SchemaDocument regressionSchema(regressionSchemaDocument); +// rapidjson::SchemaValidator regressionValidator(regressionSchema); + +// for (const auto& result : results.GetArray()) { +// if (result.HasMember("analysis_stats")) { +// BOOST_TEST_REQUIRE(result["analysis_stats"].HasMember("regression_stats")); +// if (result["analysis_stats"]["regression_stats"].Accept(regressionValidator) == false) { +// rapidjson::StringBuffer sb; +// regressionValidator.GetInvalidSchemaPointer().StringifyUriFragment(sb); +// LOG_ERROR(<< "Invalid schema: " << sb.GetString()); +// LOG_ERROR(<< "Invalid keyword: " +// << regressionValidator.GetInvalidSchemaKeyword()); +// sb.Clear(); +// regressionValidator.GetInvalidDocumentPointer().StringifyUriFragment(sb); +// LOG_ERROR(<< "Invalid document: " << sb.GetString()); +// BOOST_FAIL("Schema validation failed"); +// } +// } +// } + +// std::ifstream memorySchemaFileStream("testfiles/instrumentation/memory_usage.schema.json"); +// BOOST_REQUIRE_MESSAGE(memorySchemaFileStream.is_open(), "Cannot open test file!"); +// std::string memorySchemaJson((std::istreambuf_iterator(memorySchemaFileStream)), +// std::istreambuf_iterator()); +// rapidjson::Document memorySchemaDocument; +// BOOST_REQUIRE_MESSAGE(memorySchemaDocument.Parse(memorySchemaJson).HasParseError() == false, +// "Cannot parse JSON schema!"); +// rapidjson::SchemaDocument memorySchema(memorySchemaDocument); +// rapidjson::SchemaValidator memoryValidator(memorySchema); + +// for (const auto& result : results.GetArray()) { +// if (result.HasMember("analytics_memory_usage")) { +// BOOST_TEST_REQUIRE(result["analytics_memory_usage"].IsObject() == true); +// if (result["analytics_memory_usage"].Accept(memoryValidator) == false) { +// rapidjson::StringBuffer sb; +// memoryValidator.GetInvalidSchemaPointer().StringifyUriFragment(sb); +// LOG_ERROR(<< "Invalid schema: " << sb.GetString()); +// LOG_ERROR(<< "Invalid keyword: " +// << memoryValidator.GetInvalidSchemaKeyword()); +// sb.Clear(); +// memoryValidator.GetInvalidDocumentPointer().StringifyUriFragment(sb); +// LOG_ERROR(<< "Invalid document: " << sb.GetString()); +// BOOST_FAIL("Schema validation failed"); +// } +// } +// } +// } + +// BOOST_AUTO_TEST_CASE(testTrainingClassification) { +// std::stringstream output; +// auto outputWriterFactory = [&output]() { +// return std::make_unique(output); +// }; + +// TDoubleVec expectedPredictions; + +// TStrVec fieldNames{"f1", "f2", "f3", "f4", "target", ".", "."}; +// TStrVec fieldValues{"", "", "", "", "", "0", ""}; +// test::CDataFrameAnalysisSpecificationFactory specFactory; +// api::CDataFrameAnalyzer analyzer{ +// specFactory.rows(100) +// .memoryLimit(6000000) +// .columns(5) +// .predictionCategoricalFieldNames({"target"}) +// .predictionSpec(test::CDataFrameAnalysisSpecificationFactory::classification(), "target"), +// outputWriterFactory}; +// test::CDataFrameAnalyzerTrainingFactory::addPredictionTestData( +// test::CDataFrameAnalyzerTrainingFactory::E_BinaryClassification, +// fieldNames, fieldValues, analyzer, expectedPredictions); + +// analyzer.handleRecord(fieldNames, {"", "", "", "", "", "", "$"}); + +// rapidjson::Document results; +// rapidjson::ParseResult ok(results.Parse(output.str())); +// BOOST_TEST_REQUIRE(static_cast(ok) == true); + +// std::ifstream schemaFileStream("testfiles/instrumentation/classification_stats.schema.json"); +// BOOST_REQUIRE_MESSAGE(schemaFileStream.is_open(), "Cannot open test file!"); +// std::string schemaJson((std::istreambuf_iterator(schemaFileStream)), +// std::istreambuf_iterator()); +// rapidjson::Document schemaDocument; +// BOOST_REQUIRE_MESSAGE(schemaDocument.Parse(schemaJson).HasParseError() == false, +// "Cannot parse JSON schema!"); +// rapidjson::SchemaDocument schema(schemaDocument); +// rapidjson::SchemaValidator validator(schema); + +// for (const auto& result : results.GetArray()) { +// if (result.HasMember("analysis_stats")) { +// BOOST_TEST_REQUIRE(result["analysis_stats"].HasMember("classification_stats")); +// if (result["analysis_stats"]["classification_stats"].Accept(validator) == false) { +// rapidjson::StringBuffer sb; +// validator.GetInvalidSchemaPointer().StringifyUriFragment(sb); +// LOG_ERROR(<< "Invalid schema: " << sb.GetString()); +// LOG_ERROR(<< "Invalid keyword: " << validator.GetInvalidSchemaKeyword()); +// sb.Clear(); +// validator.GetInvalidDocumentPointer().StringifyUriFragment(sb); +// LOG_ERROR(<< "Invalid document: " << sb.GetString()); +// BOOST_FAIL("Schema validation failed"); +// } +// } +// } +// } + +// BOOST_AUTO_TEST_SUITE_END() From 57447f9ef62c4372e3d8fb72a69982d60f98d07a Mon Sep 17 00:00:00 2001 From: Valeriy Khakhutskyy <1292899+valeriy42@users.noreply.github.com> Date: Wed, 18 Mar 2020 16:24:52 +0100 Subject: [PATCH 37/40] fix header --- include/maths/CDataFrameAnalysisInstrumentationInterface.h | 1 - lib/maths/CBoostedTreeImpl.cc | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/include/maths/CDataFrameAnalysisInstrumentationInterface.h b/include/maths/CDataFrameAnalysisInstrumentationInterface.h index 95d785e466..ec231b9253 100644 --- a/include/maths/CDataFrameAnalysisInstrumentationInterface.h +++ b/include/maths/CDataFrameAnalysisInstrumentationInterface.h @@ -7,7 +7,6 @@ #ifndef INCLUDED_ml_maths_CDataFrameAnalysisInstrumentationInterface_h #define INCLUDED_ml_maths_CDataFrameAnalysisInstrumentationInterface_h -#include #include #include diff --git a/lib/maths/CBoostedTreeImpl.cc b/lib/maths/CBoostedTreeImpl.cc index 0e92675c08..8bb2c0b735 100644 --- a/lib/maths/CBoostedTreeImpl.cc +++ b/lib/maths/CBoostedTreeImpl.cc @@ -248,9 +248,9 @@ void CBoostedTreeImpl::train(core::CDataFrame& frame, this->restoreBestHyperparameters(); std::tie(m_BestForest, std::ignore, std::ignore) = this->trainForest( frame, allTrainingRowsMask, allTrainingRowsMask, m_TrainingProgress); + this->recordState(recordTrainStateCallback); m_Instrumentation->iteration(m_CurrentRound); m_Instrumentation->nextStep(TRAINING_FINAL_TREE_PHASE); - this->recordState(recordTrainStateCallback); timeAccumulator.add(static_cast(stopWatch.stop())); From 0216f5bf9b23bf57b3eed2a71599fbb8ae4d804d Mon Sep 17 00:00:00 2001 From: Valeriy Khakhutskyy <1292899+valeriy42@users.noreply.github.com> Date: Wed, 18 Mar 2020 16:35:48 +0100 Subject: [PATCH 38/40] fix include error --- include/api/CDataFrameAnalysisInstrumentation.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/api/CDataFrameAnalysisInstrumentation.h b/include/api/CDataFrameAnalysisInstrumentation.h index f5108bdc05..64d52cfb9c 100644 --- a/include/api/CDataFrameAnalysisInstrumentation.h +++ b/include/api/CDataFrameAnalysisInstrumentation.h @@ -7,7 +7,6 @@ #ifndef INCLUDED_ml_api_CDataFrameAnalysisInstrumentation_h #define INCLUDED_ml_api_CDataFrameAnalysisInstrumentation_h -#include #include #include From 6134faad9c97e26972ff8b95cb9fcb65629b1c13 Mon Sep 17 00:00:00 2001 From: Valeriy Khakhutskyy <1292899+valeriy42@users.noreply.github.com> Date: Wed, 18 Mar 2020 16:50:37 +0100 Subject: [PATCH 39/40] remote analysis_stats as key --- lib/api/CDataFrameAnalysisInstrumentation.cc | 26 +- .../CDataFrameAnalysisInstrumentationTest.cc | 398 +++++++++--------- .../analysis_stats.schema.json | 43 -- .../classification_stats.schema.json | 245 +++++------ .../outlier_detection_stats.schema.json | 10 + .../regression_stats.schema.json | 10 + 6 files changed, 362 insertions(+), 370 deletions(-) delete mode 100644 lib/api/unittest/testfiles/instrumentation/analysis_stats.schema.json diff --git a/lib/api/CDataFrameAnalysisInstrumentation.cc b/lib/api/CDataFrameAnalysisInstrumentation.cc index ae9f55bb24..497f13c946 100644 --- a/lib/api/CDataFrameAnalysisInstrumentation.cc +++ b/lib/api/CDataFrameAnalysisInstrumentation.cc @@ -18,12 +18,12 @@ namespace api { namespace { // clang-format off -const std::string ANALYSIS_TYPE_TAG{"analysis_stats"}; const std::string CLASSIFICATION_STATS_TAG{"classification_stats"}; const std::string HYPERPARAMETERS_TAG{"hyperparameters"}; const std::string ITERATION_TAG{"iteration"}; const std::string JOB_ID_TAG{"job_id"}; const std::string MEMORY_TYPE_TAG{"analytics_memory_usage"}; +const std::string OUTLIER_DETECTION_STATS{"outlier_detection_stats"}; const std::string PEAK_MEMORY_USAGE_TAG{"peak_usage_bytes"}; const std::string PROGRESS_TAG{"progress"}; const std::string REGRESSION_STATS_TAG{"regression_stats"}; @@ -56,6 +56,15 @@ const std::string REGULARIZATION_LEAF_WEIGHT_PENALTY_MULTIPLIER_TAG{"regularizat const std::string REGULARIZATION_SOFT_TREE_DEPTH_LIMIT_TAG{"regularization_soft_tree_depth_limit"}; const std::string REGULARIZATION_SOFT_TREE_DEPTH_TOLERANCE_TAG{"regularization_soft_tree_depth_tolerance"}; const std::string REGULARIZATION_TREE_SIZE_PENALTY_MULTIPLIER_TAG{"regularization_tree_size_penalty_multiplier"}; + +// Outlier detection parameters +const std::string N_NEIGHBORS{"n_neighbors"}; +const std::string METHODS{"methods"}; +const std::string COMPUTE_FEATURE_INFLUENCE{"compute_feature_influence"}; +const std::string FEATURE_INFLUENCE_THRESHOLD{"feature_influence_threshold"}; +const std::string OUTLIER_FRACTION{"outlier_fraction"}; +const std::string STANDARDIZATION_ENABLED{"standardization_enabled"}; + // clang-format on const std::size_t MAXIMUM_FRACTIONAL_PROGRESS{std::size_t{1} @@ -106,7 +115,7 @@ void CDataFrameAnalysisInstrumentation::resetProgress() { void CDataFrameAnalysisInstrumentation::nextStep(const std::string& /* phase */) { // TODO reactivate once Java part is ready - // this->writeState(); + this->writeState(); } void CDataFrameAnalysisInstrumentation::writeState() { @@ -115,7 +124,6 @@ void CDataFrameAnalysisInstrumentation::writeState() { m_Writer->StartObject(); m_Writer->Key(MEMORY_TYPE_TAG); this->writeMemory(timestamp); - m_Writer->Key(ANALYSIS_TYPE_TAG); this->writeAnalysisStats(timestamp); m_Writer->EndObject(); } @@ -157,6 +165,7 @@ counter_t::ECounterTypes CDataFrameTrainBoostedTreeInstrumentation::memoryCounte void CDataFrameOutliersInstrumentation::writeAnalysisStats(std::int64_t timestamp) { auto writer = this->writer(); if (writer != nullptr) { + writer->Key(OUTLIER_DETECTION_STATS); writer->StartObject(); writer->Key(JOB_ID_TAG); writer->String(this->jobId()); @@ -191,11 +200,6 @@ void CDataFrameTrainBoostedTreeInstrumentation::lossValues(std::size_t fold, void CDataFrameTrainBoostedTreeInstrumentation::writeAnalysisStats(std::int64_t timestamp) { auto* writer = this->writer(); if (writer != nullptr) { - writer->StartObject(); - writer->Key(JOB_ID_TAG); - writer->String(this->jobId()); - writer->Key(TIMESTAMP_TAG); - writer->Int64(timestamp); switch (m_Type) { case E_Regression: writer->Key(REGRESSION_STATS_TAG); @@ -205,10 +209,13 @@ void CDataFrameTrainBoostedTreeInstrumentation::writeAnalysisStats(std::int64_t break; default: LOG_ERROR(<< "Supervised learning type unknown or not set."); - writer->EndObject(); return; } writer->StartObject(); + writer->Key(JOB_ID_TAG); + writer->String(this->jobId()); + writer->Key(TIMESTAMP_TAG); + writer->Int64(timestamp); writer->Key(ITERATION_TAG); writer->Uint64(m_Iteration); @@ -228,7 +235,6 @@ void CDataFrameTrainBoostedTreeInstrumentation::writeAnalysisStats(std::int64_t writer->write(timingStatsObject); writer->EndObject(); - writer->EndObject(); } this->reset(); } diff --git a/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc b/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc index c7d807eeb2..d9463a50dd 100644 --- a/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc +++ b/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc @@ -3,202 +3,202 @@ * or more contributor license agreements. Licensed under the Elastic License; * you may not use this file except in compliance with the Elastic License. */ -// #include - -// #include - -// #include -// #include -// #include - -// #include - -// #include - -// #include -// #include -// #include - -// BOOST_AUTO_TEST_SUITE(CDataFrameAnalysisInstrumentationTest) - -// using namespace ml; - -// namespace { -// using TStrVec = std::vector; -// using TDoubleVec = std::vector; -// } - -// BOOST_AUTO_TEST_CASE(testMemoryState) { -// std::string jobId{"testJob"}; -// std::int64_t memoryUsage{1000}; -// std::int64_t timeBefore{core::CTimeUtils::toEpochMs(core::CTimeUtils::now())}; -// std::stringstream outputStream; -// { -// core::CJsonOutputStreamWrapper streamWrapper(outputStream); -// api::CDataFrameTrainBoostedTreeInstrumentation instrumentation(jobId); -// api::CDataFrameTrainBoostedTreeInstrumentation::CScopeSetOutputStream setStream{ -// instrumentation, streamWrapper}; -// instrumentation.updateMemoryUsage(memoryUsage); -// instrumentation.nextStep(); -// outputStream.flush(); -// } -// std::int64_t timeAfter{core::CTimeUtils::toEpochMs(core::CTimeUtils::now())}; - -// rapidjson::Document results; -// rapidjson::ParseResult ok(results.Parse(outputStream.str())); -// BOOST_TEST_REQUIRE(static_cast(ok) == true); -// BOOST_TEST_REQUIRE(results.IsArray() == true); - -// bool hasMemoryUsage{false}; -// for (const auto& result : results.GetArray()) { -// if (result.HasMember("analytics_memory_usage")) { -// BOOST_TEST_REQUIRE(result["analytics_memory_usage"].IsObject() == true); -// BOOST_TEST_REQUIRE(result["analytics_memory_usage"]["job_id"].GetString() == jobId); -// BOOST_TEST_REQUIRE( -// result["analytics_memory_usage"]["peak_usage_bytes"].GetInt64() == memoryUsage); -// BOOST_TEST_REQUIRE(result["analytics_memory_usage"]["timestamp"].GetInt64() >= -// timeBefore); -// BOOST_TEST_REQUIRE(result["analytics_memory_usage"]["timestamp"].GetInt64() <= timeAfter); -// hasMemoryUsage = true; -// } -// } -// BOOST_TEST_REQUIRE(hasMemoryUsage); -// } - -// BOOST_AUTO_TEST_CASE(testTrainingRegression) { -// std::stringstream output; -// auto outputWriterFactory = [&output]() { -// return std::make_unique(output); -// }; - -// TDoubleVec expectedPredictions; - -// TStrVec fieldNames{"f1", "f2", "f3", "f4", "target", ".", "."}; -// TStrVec fieldValues{"", "", "", "", "", "0", ""}; -// test::CDataFrameAnalysisSpecificationFactory specFactory; -// api::CDataFrameAnalyzer analyzer{ -// specFactory.predictionSpec( -// test::CDataFrameAnalysisSpecificationFactory::regression(), "target"), -// outputWriterFactory}; -// test::CDataFrameAnalyzerTrainingFactory::addPredictionTestData( -// test::CDataFrameAnalyzerTrainingFactory::E_Regression, fieldNames, -// fieldValues, analyzer, expectedPredictions); - -// analyzer.handleRecord(fieldNames, {"", "", "", "", "", "", "$"}); - -// rapidjson::Document results; -// rapidjson::ParseResult ok(results.Parse(output.str())); -// BOOST_TEST_REQUIRE(static_cast(ok) == true); - -// std::ifstream regressionSchemaFileStream("testfiles/instrumentation/regression_stats.schema.json"); -// BOOST_REQUIRE_MESSAGE(regressionSchemaFileStream.is_open(), "Cannot open test file!"); -// std::string regressionSchemaJson((std::istreambuf_iterator(regressionSchemaFileStream)), -// std::istreambuf_iterator()); -// rapidjson::Document regressionSchemaDocument; -// BOOST_REQUIRE_MESSAGE( -// regressionSchemaDocument.Parse(regressionSchemaJson).HasParseError() == false, -// "Cannot parse JSON schema!"); -// rapidjson::SchemaDocument regressionSchema(regressionSchemaDocument); -// rapidjson::SchemaValidator regressionValidator(regressionSchema); - -// for (const auto& result : results.GetArray()) { -// if (result.HasMember("analysis_stats")) { -// BOOST_TEST_REQUIRE(result["analysis_stats"].HasMember("regression_stats")); -// if (result["analysis_stats"]["regression_stats"].Accept(regressionValidator) == false) { -// rapidjson::StringBuffer sb; -// regressionValidator.GetInvalidSchemaPointer().StringifyUriFragment(sb); -// LOG_ERROR(<< "Invalid schema: " << sb.GetString()); -// LOG_ERROR(<< "Invalid keyword: " -// << regressionValidator.GetInvalidSchemaKeyword()); -// sb.Clear(); -// regressionValidator.GetInvalidDocumentPointer().StringifyUriFragment(sb); -// LOG_ERROR(<< "Invalid document: " << sb.GetString()); -// BOOST_FAIL("Schema validation failed"); -// } -// } -// } - -// std::ifstream memorySchemaFileStream("testfiles/instrumentation/memory_usage.schema.json"); -// BOOST_REQUIRE_MESSAGE(memorySchemaFileStream.is_open(), "Cannot open test file!"); -// std::string memorySchemaJson((std::istreambuf_iterator(memorySchemaFileStream)), -// std::istreambuf_iterator()); -// rapidjson::Document memorySchemaDocument; -// BOOST_REQUIRE_MESSAGE(memorySchemaDocument.Parse(memorySchemaJson).HasParseError() == false, -// "Cannot parse JSON schema!"); -// rapidjson::SchemaDocument memorySchema(memorySchemaDocument); -// rapidjson::SchemaValidator memoryValidator(memorySchema); - -// for (const auto& result : results.GetArray()) { -// if (result.HasMember("analytics_memory_usage")) { -// BOOST_TEST_REQUIRE(result["analytics_memory_usage"].IsObject() == true); -// if (result["analytics_memory_usage"].Accept(memoryValidator) == false) { -// rapidjson::StringBuffer sb; -// memoryValidator.GetInvalidSchemaPointer().StringifyUriFragment(sb); -// LOG_ERROR(<< "Invalid schema: " << sb.GetString()); -// LOG_ERROR(<< "Invalid keyword: " -// << memoryValidator.GetInvalidSchemaKeyword()); -// sb.Clear(); -// memoryValidator.GetInvalidDocumentPointer().StringifyUriFragment(sb); -// LOG_ERROR(<< "Invalid document: " << sb.GetString()); -// BOOST_FAIL("Schema validation failed"); -// } -// } -// } -// } - -// BOOST_AUTO_TEST_CASE(testTrainingClassification) { -// std::stringstream output; -// auto outputWriterFactory = [&output]() { -// return std::make_unique(output); -// }; - -// TDoubleVec expectedPredictions; - -// TStrVec fieldNames{"f1", "f2", "f3", "f4", "target", ".", "."}; -// TStrVec fieldValues{"", "", "", "", "", "0", ""}; -// test::CDataFrameAnalysisSpecificationFactory specFactory; -// api::CDataFrameAnalyzer analyzer{ -// specFactory.rows(100) -// .memoryLimit(6000000) -// .columns(5) -// .predictionCategoricalFieldNames({"target"}) -// .predictionSpec(test::CDataFrameAnalysisSpecificationFactory::classification(), "target"), -// outputWriterFactory}; -// test::CDataFrameAnalyzerTrainingFactory::addPredictionTestData( -// test::CDataFrameAnalyzerTrainingFactory::E_BinaryClassification, -// fieldNames, fieldValues, analyzer, expectedPredictions); - -// analyzer.handleRecord(fieldNames, {"", "", "", "", "", "", "$"}); - -// rapidjson::Document results; -// rapidjson::ParseResult ok(results.Parse(output.str())); -// BOOST_TEST_REQUIRE(static_cast(ok) == true); - -// std::ifstream schemaFileStream("testfiles/instrumentation/classification_stats.schema.json"); -// BOOST_REQUIRE_MESSAGE(schemaFileStream.is_open(), "Cannot open test file!"); -// std::string schemaJson((std::istreambuf_iterator(schemaFileStream)), -// std::istreambuf_iterator()); -// rapidjson::Document schemaDocument; -// BOOST_REQUIRE_MESSAGE(schemaDocument.Parse(schemaJson).HasParseError() == false, -// "Cannot parse JSON schema!"); -// rapidjson::SchemaDocument schema(schemaDocument); -// rapidjson::SchemaValidator validator(schema); - -// for (const auto& result : results.GetArray()) { -// if (result.HasMember("analysis_stats")) { -// BOOST_TEST_REQUIRE(result["analysis_stats"].HasMember("classification_stats")); -// if (result["analysis_stats"]["classification_stats"].Accept(validator) == false) { -// rapidjson::StringBuffer sb; -// validator.GetInvalidSchemaPointer().StringifyUriFragment(sb); -// LOG_ERROR(<< "Invalid schema: " << sb.GetString()); -// LOG_ERROR(<< "Invalid keyword: " << validator.GetInvalidSchemaKeyword()); -// sb.Clear(); -// validator.GetInvalidDocumentPointer().StringifyUriFragment(sb); -// LOG_ERROR(<< "Invalid document: " << sb.GetString()); -// BOOST_FAIL("Schema validation failed"); -// } -// } -// } -// } - -// BOOST_AUTO_TEST_SUITE_END() +#include + +#include + +#include +#include +#include + +#include + +#include + +#include +#include +#include + +BOOST_AUTO_TEST_SUITE(CDataFrameAnalysisInstrumentationTest) + +using namespace ml; + +namespace { +using TStrVec = std::vector; +using TDoubleVec = std::vector; +} + +BOOST_AUTO_TEST_CASE(testMemoryState) { + std::string jobId{"testJob"}; + std::int64_t memoryUsage{1000}; + std::int64_t timeBefore{core::CTimeUtils::toEpochMs(core::CTimeUtils::now())}; + std::stringstream outputStream; + { + core::CJsonOutputStreamWrapper streamWrapper(outputStream); + api::CDataFrameTrainBoostedTreeInstrumentation instrumentation(jobId); + api::CDataFrameTrainBoostedTreeInstrumentation::CScopeSetOutputStream setStream{ + instrumentation, streamWrapper}; + instrumentation.updateMemoryUsage(memoryUsage); + instrumentation.nextStep(); + outputStream.flush(); + } + std::int64_t timeAfter{core::CTimeUtils::toEpochMs(core::CTimeUtils::now())}; + + rapidjson::Document results; + rapidjson::ParseResult ok(results.Parse(outputStream.str())); + BOOST_TEST_REQUIRE(static_cast(ok) == true); + BOOST_TEST_REQUIRE(results.IsArray() == true); + + bool hasMemoryUsage{false}; + for (const auto& result : results.GetArray()) { + if (result.HasMember("analytics_memory_usage")) { + BOOST_TEST_REQUIRE(result["analytics_memory_usage"].IsObject() == true); + BOOST_TEST_REQUIRE(result["analytics_memory_usage"]["job_id"].GetString() == jobId); + BOOST_TEST_REQUIRE( + result["analytics_memory_usage"]["peak_usage_bytes"].GetInt64() == memoryUsage); + BOOST_TEST_REQUIRE(result["analytics_memory_usage"]["timestamp"].GetInt64() >= + timeBefore); + BOOST_TEST_REQUIRE(result["analytics_memory_usage"]["timestamp"].GetInt64() <= timeAfter); + hasMemoryUsage = true; + } + } + BOOST_TEST_REQUIRE(hasMemoryUsage); +} + +BOOST_AUTO_TEST_CASE(testTrainingRegression) { + std::stringstream output; + auto outputWriterFactory = [&output]() { + return std::make_unique(output); + }; + + TDoubleVec expectedPredictions; + + TStrVec fieldNames{"f1", "f2", "f3", "f4", "target", ".", "."}; + TStrVec fieldValues{"", "", "", "", "", "0", ""}; + test::CDataFrameAnalysisSpecificationFactory specFactory; + api::CDataFrameAnalyzer analyzer{ + specFactory.predictionSpec( + test::CDataFrameAnalysisSpecificationFactory::regression(), "target"), + outputWriterFactory}; + test::CDataFrameAnalyzerTrainingFactory::addPredictionTestData( + test::CDataFrameAnalyzerTrainingFactory::E_Regression, fieldNames, + fieldValues, analyzer, expectedPredictions); + + analyzer.handleRecord(fieldNames, {"", "", "", "", "", "", "$"}); + + rapidjson::Document results; + rapidjson::ParseResult ok(results.Parse(output.str())); + BOOST_TEST_REQUIRE(static_cast(ok) == true); + + std::ifstream regressionSchemaFileStream("testfiles/instrumentation/regression_stats.schema.json"); + BOOST_REQUIRE_MESSAGE(regressionSchemaFileStream.is_open(), "Cannot open test file!"); + std::string regressionSchemaJson((std::istreambuf_iterator(regressionSchemaFileStream)), + std::istreambuf_iterator()); + rapidjson::Document regressionSchemaDocument; + BOOST_REQUIRE_MESSAGE( + regressionSchemaDocument.Parse(regressionSchemaJson).HasParseError() == false, + "Cannot parse JSON schema!"); + rapidjson::SchemaDocument regressionSchema(regressionSchemaDocument); + rapidjson::SchemaValidator regressionValidator(regressionSchema); + + for (const auto& result : results.GetArray()) { + if (result.HasMember("analysis_stats")) { + BOOST_TEST_REQUIRE(result["analysis_stats"].HasMember("regression_stats")); + if (result["analysis_stats"]["regression_stats"].Accept(regressionValidator) == false) { + rapidjson::StringBuffer sb; + regressionValidator.GetInvalidSchemaPointer().StringifyUriFragment(sb); + LOG_ERROR(<< "Invalid schema: " << sb.GetString()); + LOG_ERROR(<< "Invalid keyword: " + << regressionValidator.GetInvalidSchemaKeyword()); + sb.Clear(); + regressionValidator.GetInvalidDocumentPointer().StringifyUriFragment(sb); + LOG_ERROR(<< "Invalid document: " << sb.GetString()); + BOOST_FAIL("Schema validation failed"); + } + } + } + + std::ifstream memorySchemaFileStream("testfiles/instrumentation/memory_usage.schema.json"); + BOOST_REQUIRE_MESSAGE(memorySchemaFileStream.is_open(), "Cannot open test file!"); + std::string memorySchemaJson((std::istreambuf_iterator(memorySchemaFileStream)), + std::istreambuf_iterator()); + rapidjson::Document memorySchemaDocument; + BOOST_REQUIRE_MESSAGE(memorySchemaDocument.Parse(memorySchemaJson).HasParseError() == false, + "Cannot parse JSON schema!"); + rapidjson::SchemaDocument memorySchema(memorySchemaDocument); + rapidjson::SchemaValidator memoryValidator(memorySchema); + + for (const auto& result : results.GetArray()) { + if (result.HasMember("analytics_memory_usage")) { + BOOST_TEST_REQUIRE(result["analytics_memory_usage"].IsObject() == true); + if (result["analytics_memory_usage"].Accept(memoryValidator) == false) { + rapidjson::StringBuffer sb; + memoryValidator.GetInvalidSchemaPointer().StringifyUriFragment(sb); + LOG_ERROR(<< "Invalid schema: " << sb.GetString()); + LOG_ERROR(<< "Invalid keyword: " + << memoryValidator.GetInvalidSchemaKeyword()); + sb.Clear(); + memoryValidator.GetInvalidDocumentPointer().StringifyUriFragment(sb); + LOG_ERROR(<< "Invalid document: " << sb.GetString()); + BOOST_FAIL("Schema validation failed"); + } + } + } +} + +BOOST_AUTO_TEST_CASE(testTrainingClassification) { + std::stringstream output; + auto outputWriterFactory = [&output]() { + return std::make_unique(output); + }; + + TDoubleVec expectedPredictions; + + TStrVec fieldNames{"f1", "f2", "f3", "f4", "target", ".", "."}; + TStrVec fieldValues{"", "", "", "", "", "0", ""}; + test::CDataFrameAnalysisSpecificationFactory specFactory; + api::CDataFrameAnalyzer analyzer{ + specFactory.rows(100) + .memoryLimit(6000000) + .columns(5) + .predictionCategoricalFieldNames({"target"}) + .predictionSpec(test::CDataFrameAnalysisSpecificationFactory::classification(), "target"), + outputWriterFactory}; + test::CDataFrameAnalyzerTrainingFactory::addPredictionTestData( + test::CDataFrameAnalyzerTrainingFactory::E_BinaryClassification, + fieldNames, fieldValues, analyzer, expectedPredictions); + + analyzer.handleRecord(fieldNames, {"", "", "", "", "", "", "$"}); + + rapidjson::Document results; + rapidjson::ParseResult ok(results.Parse(output.str())); + BOOST_TEST_REQUIRE(static_cast(ok) == true); + + std::ifstream schemaFileStream("testfiles/instrumentation/classification_stats.schema.json"); + BOOST_REQUIRE_MESSAGE(schemaFileStream.is_open(), "Cannot open test file!"); + std::string schemaJson((std::istreambuf_iterator(schemaFileStream)), + std::istreambuf_iterator()); + rapidjson::Document schemaDocument; + BOOST_REQUIRE_MESSAGE(schemaDocument.Parse(schemaJson).HasParseError() == false, + "Cannot parse JSON schema!"); + rapidjson::SchemaDocument schema(schemaDocument); + rapidjson::SchemaValidator validator(schema); + + for (const auto& result : results.GetArray()) { + if (result.HasMember("analysis_stats")) { + BOOST_TEST_REQUIRE(result["analysis_stats"].HasMember("classification_stats")); + if (result["analysis_stats"]["classification_stats"].Accept(validator) == false) { + rapidjson::StringBuffer sb; + validator.GetInvalidSchemaPointer().StringifyUriFragment(sb); + LOG_ERROR(<< "Invalid schema: " << sb.GetString()); + LOG_ERROR(<< "Invalid keyword: " << validator.GetInvalidSchemaKeyword()); + sb.Clear(); + validator.GetInvalidDocumentPointer().StringifyUriFragment(sb); + LOG_ERROR(<< "Invalid document: " << sb.GetString()); + BOOST_FAIL("Schema validation failed"); + } + } + } +} + +BOOST_AUTO_TEST_SUITE_END() diff --git a/lib/api/unittest/testfiles/instrumentation/analysis_stats.schema.json b/lib/api/unittest/testfiles/instrumentation/analysis_stats.schema.json deleted file mode 100644 index 80adb2baaf..0000000000 --- a/lib/api/unittest/testfiles/instrumentation/analysis_stats.schema.json +++ /dev/null @@ -1,43 +0,0 @@ -{ - "$schema": "http://json-schema.org/draft-04/schema#", - "$id": "https://raw.githubusercontent.com/elastic/ml-json-schemas-private/master/schemas/instrumentation/analysis_stats.schema.json", - "title": "analysis_stats", - "type": "object", - "properties": { - "job_id": { - "type": "string", - "description": "Data Frame Analytics Job ID." - }, - "timestamp": { - "type": "integer", - "description": "Milliseconds since Unix Epoch" - }, - "regression_stats": { - "$ref": "regression_stats.schema.json" - }, - "classification_stats": { - "$ref": "classification_stats.schema.json" - }, - "outlier_detection_stats": { - "$ref": "outlier_detection_stats.schema.json" - } - }, - "oneOf": [ - { - "required": [ - "regression_stats" - ] - }, - { - "required": [ - "classification_stats" - ] - }, - { - "required": [ - "outlier_detection_stats" - ] - } - ], - "additionalProperties": false -} \ No newline at end of file diff --git a/lib/api/unittest/testfiles/instrumentation/classification_stats.schema.json b/lib/api/unittest/testfiles/instrumentation/classification_stats.schema.json index a63011391c..db4cf801f6 100644 --- a/lib/api/unittest/testfiles/instrumentation/classification_stats.schema.json +++ b/lib/api/unittest/testfiles/instrumentation/classification_stats.schema.json @@ -1,124 +1,133 @@ { - "$schema": "http://json-schema.org/draft-04/schema#", - "$id": "https://raw.githubusercontent.com/elastic/ml-json-schemas-private/master/schemas/instrumentation/supervised_learning_stats.schema.json", - "description": "Instrumentation data specific to the supervised learning jobs.", - "title": "classification_stats", - "type": "object", - "properties": { - "iteration": { - "type": "integer" - }, - "hyperparameters": { - "type": "object", - "properties": { - "eta": { - "type": "number" - }, - "class_assignment_objective": { - "type": "string", - "enum": [ - "accuracy", - "minimum_recall" - ] - }, - "regularization_depth_penalty_multiplier": { - "type": "number" - }, - "regularization_soft_tree_depth_limit": { - "type": "number" - }, - "regularization_soft_tree_depth_tolerance": { - "type": "number" - }, - "regularization_tree_size_penalty_multiplier": { - "type": "number" - }, - "regularization_leaf_weight_penalty_multiplier": { - "type": "number" - }, - "downsample_factor": { - "type": "number" - }, - "num_folds": { - "type": "integer" - }, - "max_trees": { - "type": "integer" - }, - "feature_bag_fraction": { - "type": "number" - }, - "eta_growth_rate_per_tree": { - "type": "number" - }, - "max_attempts_to_add_tree": { - "type": "integer" - }, - "num_splits_per_feature": { - "type": "integer" - }, - "max_optimization_rounds_per_hyperparameter": { - "type": "integer" - } + "$schema": "http://json-schema.org/draft-04/schema#", + "$id": "https://raw.githubusercontent.com/elastic/ml-json-schemas-private/master/schemas/instrumentation/supervised_learning_stats.schema.json", + "description": "Instrumentation data specific to the supervised learning jobs.", + "title": "classification_stats", + "type": "object", + "properties": { + "job_id": { + "type": "string", + "description": "Data Frame Analytics Job ID." + }, + "timestamp": { + "type": "integer", + "description": "Milliseconds since Unix Epoch" + }, + "iteration": { + "type": "integer" + }, + "hyperparameters": { + "type": "object", + "properties": { + "eta": { + "type": "number" + }, + "class_assignment_objective": { + "type": "string", + "enum": [ + "accuracy", + "minimum_recall" + ] + }, + "regularization_depth_penalty_multiplier": { + "type": "number" + }, + "regularization_soft_tree_depth_limit": { + "type": "number" + }, + "regularization_soft_tree_depth_tolerance": { + "type": "number" + }, + "regularization_tree_size_penalty_multiplier": { + "type": "number" + }, + "regularization_leaf_weight_penalty_multiplier": { + "type": "number" + }, + "downsample_factor": { + "type": "number" + }, + "num_folds": { + "type": "integer" + }, + "max_trees": { + "type": "integer" + }, + "feature_bag_fraction": { + "type": "number" + }, + "eta_growth_rate_per_tree": { + "type": "number" + }, + "max_attempts_to_add_tree": { + "type": "integer" + }, + "num_splits_per_feature": { + "type": "integer" + }, + "max_optimization_rounds_per_hyperparameter": { + "type": "integer" } - }, - "validation_loss": { - "type": "object", - "properties": { - "loss_type": { - "description": "Loss metric name", - "type": "string", - "enum": [ - "binomial_logistic" - ] - }, - "fold_values": { - "description": "Validation loss values for every added decision tree during forest growing procedure", - "type": "array", - "items": { - "type": "object", - "properties": { - "fold": { - "type": "integer" - }, - "values": { - "type": "array", - "items": { - "type": "number" - } - } + } + }, + "validation_loss": { + "type": "object", + "properties": { + "loss_type": { + "description": "Loss metric name", + "type": "string", + "enum": [ + "binomial_logistic" + ] + }, + "fold_values": { + "description": "Validation loss values for every added decision tree during forest growing procedure", + "type": "array", + "items": { + "type": "object", + "properties": { + "fold": { + "type": "integer" }, - "additionalProperties": false - } + "values": { + "type": "array", + "items": { + "type": "number" + } + } + }, + "additionalProperties": false } - }, - "additionalProperties": false, - "required": [ - "loss_type", - "fold_values" - ] + } }, - "timing_stats": { - "type": "object", - "properties": { - "elapsed_time": { - "description": "Job runtime so far in ms.", - "type": "integer" - }, - "iteration_time": { - "description": "Runtime of the last iteration in ms.", - "type": "integer" - } - }, - "additionalProperties": false - } + "additionalProperties": false, + "required": [ + "loss_type", + "fold_values" + ] }, - "additionalProperties": false, - "required": [ - "iteration", - "hyperparameters", - "validation_loss", - "timing_stats" - ] - } - \ No newline at end of file + "timing_stats": { + "type": "object", + "properties": { + "elapsed_time": { + "description": "Job runtime so far in ms.", + "type": "integer" + }, + "iteration_time": { + "description": "Runtime of the last iteration in ms.", + "type": "integer" + } + }, + "additionalProperties": false + } + }, + "additionalProperties": false, + "required": [ + "job_id", + "timestamp", + "iteration", + "hyperparameters", + "validation_loss", + "timing_stats" + ] +} diff --git a/lib/api/unittest/testfiles/instrumentation/outlier_detection_stats.schema.json b/lib/api/unittest/testfiles/instrumentation/outlier_detection_stats.schema.json index 1ef5558ad3..0f81f1585c 100644 --- a/lib/api/unittest/testfiles/instrumentation/outlier_detection_stats.schema.json +++ b/lib/api/unittest/testfiles/instrumentation/outlier_detection_stats.schema.json @@ -5,6 +5,14 @@ "description": "Instrumentation data specific to the outlier detection jobs.", "type": "object", "properties": { + "job_id": { + "type": "string", + "description": "Data Frame Analytics Job ID." + }, + "timestamp": { + "type": "integer", + "description": "Milliseconds since Unix Epoch" + }, "parameters": { "type": "object", "description": "List of job parameters specified by user or determined by algorithmic heuristics", @@ -53,6 +61,8 @@ } }, "required": [ + "job_id", + "timestamp", "parameters", "timing_stats" ], diff --git a/lib/api/unittest/testfiles/instrumentation/regression_stats.schema.json b/lib/api/unittest/testfiles/instrumentation/regression_stats.schema.json index 2a90566561..2b1378fb49 100644 --- a/lib/api/unittest/testfiles/instrumentation/regression_stats.schema.json +++ b/lib/api/unittest/testfiles/instrumentation/regression_stats.schema.json @@ -5,6 +5,14 @@ "title": "regression_stats", "type": "object", "properties": { + "job_id": { + "type": "string", + "description": "Data Frame Analytics Job ID." + }, + "timestamp": { + "type": "integer", + "description": "Milliseconds since Unix Epoch" + }, "iteration": { "type": "integer" }, @@ -108,6 +116,8 @@ }, "additionalProperties": false, "required": [ + "job_id", + "timestamp", "iteration", "hyperparameters", "validation_loss", From 7e7b95204995d7e5cd74ba65ae7a44f470e899be Mon Sep 17 00:00:00 2001 From: Valeriy Khakhutskyy <1292899+valeriy42@users.noreply.github.com> Date: Wed, 18 Mar 2020 16:53:19 +0100 Subject: [PATCH 40/40] deactivate unit test --- lib/api/CDataFrameAnalysisInstrumentation.cc | 2 +- .../CDataFrameAnalysisInstrumentationTest.cc | 398 +++++++++--------- 2 files changed, 200 insertions(+), 200 deletions(-) diff --git a/lib/api/CDataFrameAnalysisInstrumentation.cc b/lib/api/CDataFrameAnalysisInstrumentation.cc index 497f13c946..0bddb55447 100644 --- a/lib/api/CDataFrameAnalysisInstrumentation.cc +++ b/lib/api/CDataFrameAnalysisInstrumentation.cc @@ -115,7 +115,7 @@ void CDataFrameAnalysisInstrumentation::resetProgress() { void CDataFrameAnalysisInstrumentation::nextStep(const std::string& /* phase */) { // TODO reactivate once Java part is ready - this->writeState(); + // this->writeState(); } void CDataFrameAnalysisInstrumentation::writeState() { diff --git a/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc b/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc index d9463a50dd..c7d807eeb2 100644 --- a/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc +++ b/lib/api/unittest/CDataFrameAnalysisInstrumentationTest.cc @@ -3,202 +3,202 @@ * or more contributor license agreements. Licensed under the Elastic License; * you may not use this file except in compliance with the Elastic License. */ -#include - -#include - -#include -#include -#include - -#include - -#include - -#include -#include -#include - -BOOST_AUTO_TEST_SUITE(CDataFrameAnalysisInstrumentationTest) - -using namespace ml; - -namespace { -using TStrVec = std::vector; -using TDoubleVec = std::vector; -} - -BOOST_AUTO_TEST_CASE(testMemoryState) { - std::string jobId{"testJob"}; - std::int64_t memoryUsage{1000}; - std::int64_t timeBefore{core::CTimeUtils::toEpochMs(core::CTimeUtils::now())}; - std::stringstream outputStream; - { - core::CJsonOutputStreamWrapper streamWrapper(outputStream); - api::CDataFrameTrainBoostedTreeInstrumentation instrumentation(jobId); - api::CDataFrameTrainBoostedTreeInstrumentation::CScopeSetOutputStream setStream{ - instrumentation, streamWrapper}; - instrumentation.updateMemoryUsage(memoryUsage); - instrumentation.nextStep(); - outputStream.flush(); - } - std::int64_t timeAfter{core::CTimeUtils::toEpochMs(core::CTimeUtils::now())}; - - rapidjson::Document results; - rapidjson::ParseResult ok(results.Parse(outputStream.str())); - BOOST_TEST_REQUIRE(static_cast(ok) == true); - BOOST_TEST_REQUIRE(results.IsArray() == true); - - bool hasMemoryUsage{false}; - for (const auto& result : results.GetArray()) { - if (result.HasMember("analytics_memory_usage")) { - BOOST_TEST_REQUIRE(result["analytics_memory_usage"].IsObject() == true); - BOOST_TEST_REQUIRE(result["analytics_memory_usage"]["job_id"].GetString() == jobId); - BOOST_TEST_REQUIRE( - result["analytics_memory_usage"]["peak_usage_bytes"].GetInt64() == memoryUsage); - BOOST_TEST_REQUIRE(result["analytics_memory_usage"]["timestamp"].GetInt64() >= - timeBefore); - BOOST_TEST_REQUIRE(result["analytics_memory_usage"]["timestamp"].GetInt64() <= timeAfter); - hasMemoryUsage = true; - } - } - BOOST_TEST_REQUIRE(hasMemoryUsage); -} - -BOOST_AUTO_TEST_CASE(testTrainingRegression) { - std::stringstream output; - auto outputWriterFactory = [&output]() { - return std::make_unique(output); - }; - - TDoubleVec expectedPredictions; - - TStrVec fieldNames{"f1", "f2", "f3", "f4", "target", ".", "."}; - TStrVec fieldValues{"", "", "", "", "", "0", ""}; - test::CDataFrameAnalysisSpecificationFactory specFactory; - api::CDataFrameAnalyzer analyzer{ - specFactory.predictionSpec( - test::CDataFrameAnalysisSpecificationFactory::regression(), "target"), - outputWriterFactory}; - test::CDataFrameAnalyzerTrainingFactory::addPredictionTestData( - test::CDataFrameAnalyzerTrainingFactory::E_Regression, fieldNames, - fieldValues, analyzer, expectedPredictions); - - analyzer.handleRecord(fieldNames, {"", "", "", "", "", "", "$"}); - - rapidjson::Document results; - rapidjson::ParseResult ok(results.Parse(output.str())); - BOOST_TEST_REQUIRE(static_cast(ok) == true); - - std::ifstream regressionSchemaFileStream("testfiles/instrumentation/regression_stats.schema.json"); - BOOST_REQUIRE_MESSAGE(regressionSchemaFileStream.is_open(), "Cannot open test file!"); - std::string regressionSchemaJson((std::istreambuf_iterator(regressionSchemaFileStream)), - std::istreambuf_iterator()); - rapidjson::Document regressionSchemaDocument; - BOOST_REQUIRE_MESSAGE( - regressionSchemaDocument.Parse(regressionSchemaJson).HasParseError() == false, - "Cannot parse JSON schema!"); - rapidjson::SchemaDocument regressionSchema(regressionSchemaDocument); - rapidjson::SchemaValidator regressionValidator(regressionSchema); - - for (const auto& result : results.GetArray()) { - if (result.HasMember("analysis_stats")) { - BOOST_TEST_REQUIRE(result["analysis_stats"].HasMember("regression_stats")); - if (result["analysis_stats"]["regression_stats"].Accept(regressionValidator) == false) { - rapidjson::StringBuffer sb; - regressionValidator.GetInvalidSchemaPointer().StringifyUriFragment(sb); - LOG_ERROR(<< "Invalid schema: " << sb.GetString()); - LOG_ERROR(<< "Invalid keyword: " - << regressionValidator.GetInvalidSchemaKeyword()); - sb.Clear(); - regressionValidator.GetInvalidDocumentPointer().StringifyUriFragment(sb); - LOG_ERROR(<< "Invalid document: " << sb.GetString()); - BOOST_FAIL("Schema validation failed"); - } - } - } - - std::ifstream memorySchemaFileStream("testfiles/instrumentation/memory_usage.schema.json"); - BOOST_REQUIRE_MESSAGE(memorySchemaFileStream.is_open(), "Cannot open test file!"); - std::string memorySchemaJson((std::istreambuf_iterator(memorySchemaFileStream)), - std::istreambuf_iterator()); - rapidjson::Document memorySchemaDocument; - BOOST_REQUIRE_MESSAGE(memorySchemaDocument.Parse(memorySchemaJson).HasParseError() == false, - "Cannot parse JSON schema!"); - rapidjson::SchemaDocument memorySchema(memorySchemaDocument); - rapidjson::SchemaValidator memoryValidator(memorySchema); - - for (const auto& result : results.GetArray()) { - if (result.HasMember("analytics_memory_usage")) { - BOOST_TEST_REQUIRE(result["analytics_memory_usage"].IsObject() == true); - if (result["analytics_memory_usage"].Accept(memoryValidator) == false) { - rapidjson::StringBuffer sb; - memoryValidator.GetInvalidSchemaPointer().StringifyUriFragment(sb); - LOG_ERROR(<< "Invalid schema: " << sb.GetString()); - LOG_ERROR(<< "Invalid keyword: " - << memoryValidator.GetInvalidSchemaKeyword()); - sb.Clear(); - memoryValidator.GetInvalidDocumentPointer().StringifyUriFragment(sb); - LOG_ERROR(<< "Invalid document: " << sb.GetString()); - BOOST_FAIL("Schema validation failed"); - } - } - } -} - -BOOST_AUTO_TEST_CASE(testTrainingClassification) { - std::stringstream output; - auto outputWriterFactory = [&output]() { - return std::make_unique(output); - }; - - TDoubleVec expectedPredictions; - - TStrVec fieldNames{"f1", "f2", "f3", "f4", "target", ".", "."}; - TStrVec fieldValues{"", "", "", "", "", "0", ""}; - test::CDataFrameAnalysisSpecificationFactory specFactory; - api::CDataFrameAnalyzer analyzer{ - specFactory.rows(100) - .memoryLimit(6000000) - .columns(5) - .predictionCategoricalFieldNames({"target"}) - .predictionSpec(test::CDataFrameAnalysisSpecificationFactory::classification(), "target"), - outputWriterFactory}; - test::CDataFrameAnalyzerTrainingFactory::addPredictionTestData( - test::CDataFrameAnalyzerTrainingFactory::E_BinaryClassification, - fieldNames, fieldValues, analyzer, expectedPredictions); - - analyzer.handleRecord(fieldNames, {"", "", "", "", "", "", "$"}); - - rapidjson::Document results; - rapidjson::ParseResult ok(results.Parse(output.str())); - BOOST_TEST_REQUIRE(static_cast(ok) == true); - - std::ifstream schemaFileStream("testfiles/instrumentation/classification_stats.schema.json"); - BOOST_REQUIRE_MESSAGE(schemaFileStream.is_open(), "Cannot open test file!"); - std::string schemaJson((std::istreambuf_iterator(schemaFileStream)), - std::istreambuf_iterator()); - rapidjson::Document schemaDocument; - BOOST_REQUIRE_MESSAGE(schemaDocument.Parse(schemaJson).HasParseError() == false, - "Cannot parse JSON schema!"); - rapidjson::SchemaDocument schema(schemaDocument); - rapidjson::SchemaValidator validator(schema); - - for (const auto& result : results.GetArray()) { - if (result.HasMember("analysis_stats")) { - BOOST_TEST_REQUIRE(result["analysis_stats"].HasMember("classification_stats")); - if (result["analysis_stats"]["classification_stats"].Accept(validator) == false) { - rapidjson::StringBuffer sb; - validator.GetInvalidSchemaPointer().StringifyUriFragment(sb); - LOG_ERROR(<< "Invalid schema: " << sb.GetString()); - LOG_ERROR(<< "Invalid keyword: " << validator.GetInvalidSchemaKeyword()); - sb.Clear(); - validator.GetInvalidDocumentPointer().StringifyUriFragment(sb); - LOG_ERROR(<< "Invalid document: " << sb.GetString()); - BOOST_FAIL("Schema validation failed"); - } - } - } -} - -BOOST_AUTO_TEST_SUITE_END() +// #include + +// #include + +// #include +// #include +// #include + +// #include + +// #include + +// #include +// #include +// #include + +// BOOST_AUTO_TEST_SUITE(CDataFrameAnalysisInstrumentationTest) + +// using namespace ml; + +// namespace { +// using TStrVec = std::vector; +// using TDoubleVec = std::vector; +// } + +// BOOST_AUTO_TEST_CASE(testMemoryState) { +// std::string jobId{"testJob"}; +// std::int64_t memoryUsage{1000}; +// std::int64_t timeBefore{core::CTimeUtils::toEpochMs(core::CTimeUtils::now())}; +// std::stringstream outputStream; +// { +// core::CJsonOutputStreamWrapper streamWrapper(outputStream); +// api::CDataFrameTrainBoostedTreeInstrumentation instrumentation(jobId); +// api::CDataFrameTrainBoostedTreeInstrumentation::CScopeSetOutputStream setStream{ +// instrumentation, streamWrapper}; +// instrumentation.updateMemoryUsage(memoryUsage); +// instrumentation.nextStep(); +// outputStream.flush(); +// } +// std::int64_t timeAfter{core::CTimeUtils::toEpochMs(core::CTimeUtils::now())}; + +// rapidjson::Document results; +// rapidjson::ParseResult ok(results.Parse(outputStream.str())); +// BOOST_TEST_REQUIRE(static_cast(ok) == true); +// BOOST_TEST_REQUIRE(results.IsArray() == true); + +// bool hasMemoryUsage{false}; +// for (const auto& result : results.GetArray()) { +// if (result.HasMember("analytics_memory_usage")) { +// BOOST_TEST_REQUIRE(result["analytics_memory_usage"].IsObject() == true); +// BOOST_TEST_REQUIRE(result["analytics_memory_usage"]["job_id"].GetString() == jobId); +// BOOST_TEST_REQUIRE( +// result["analytics_memory_usage"]["peak_usage_bytes"].GetInt64() == memoryUsage); +// BOOST_TEST_REQUIRE(result["analytics_memory_usage"]["timestamp"].GetInt64() >= +// timeBefore); +// BOOST_TEST_REQUIRE(result["analytics_memory_usage"]["timestamp"].GetInt64() <= timeAfter); +// hasMemoryUsage = true; +// } +// } +// BOOST_TEST_REQUIRE(hasMemoryUsage); +// } + +// BOOST_AUTO_TEST_CASE(testTrainingRegression) { +// std::stringstream output; +// auto outputWriterFactory = [&output]() { +// return std::make_unique(output); +// }; + +// TDoubleVec expectedPredictions; + +// TStrVec fieldNames{"f1", "f2", "f3", "f4", "target", ".", "."}; +// TStrVec fieldValues{"", "", "", "", "", "0", ""}; +// test::CDataFrameAnalysisSpecificationFactory specFactory; +// api::CDataFrameAnalyzer analyzer{ +// specFactory.predictionSpec( +// test::CDataFrameAnalysisSpecificationFactory::regression(), "target"), +// outputWriterFactory}; +// test::CDataFrameAnalyzerTrainingFactory::addPredictionTestData( +// test::CDataFrameAnalyzerTrainingFactory::E_Regression, fieldNames, +// fieldValues, analyzer, expectedPredictions); + +// analyzer.handleRecord(fieldNames, {"", "", "", "", "", "", "$"}); + +// rapidjson::Document results; +// rapidjson::ParseResult ok(results.Parse(output.str())); +// BOOST_TEST_REQUIRE(static_cast(ok) == true); + +// std::ifstream regressionSchemaFileStream("testfiles/instrumentation/regression_stats.schema.json"); +// BOOST_REQUIRE_MESSAGE(regressionSchemaFileStream.is_open(), "Cannot open test file!"); +// std::string regressionSchemaJson((std::istreambuf_iterator(regressionSchemaFileStream)), +// std::istreambuf_iterator()); +// rapidjson::Document regressionSchemaDocument; +// BOOST_REQUIRE_MESSAGE( +// regressionSchemaDocument.Parse(regressionSchemaJson).HasParseError() == false, +// "Cannot parse JSON schema!"); +// rapidjson::SchemaDocument regressionSchema(regressionSchemaDocument); +// rapidjson::SchemaValidator regressionValidator(regressionSchema); + +// for (const auto& result : results.GetArray()) { +// if (result.HasMember("analysis_stats")) { +// BOOST_TEST_REQUIRE(result["analysis_stats"].HasMember("regression_stats")); +// if (result["analysis_stats"]["regression_stats"].Accept(regressionValidator) == false) { +// rapidjson::StringBuffer sb; +// regressionValidator.GetInvalidSchemaPointer().StringifyUriFragment(sb); +// LOG_ERROR(<< "Invalid schema: " << sb.GetString()); +// LOG_ERROR(<< "Invalid keyword: " +// << regressionValidator.GetInvalidSchemaKeyword()); +// sb.Clear(); +// regressionValidator.GetInvalidDocumentPointer().StringifyUriFragment(sb); +// LOG_ERROR(<< "Invalid document: " << sb.GetString()); +// BOOST_FAIL("Schema validation failed"); +// } +// } +// } + +// std::ifstream memorySchemaFileStream("testfiles/instrumentation/memory_usage.schema.json"); +// BOOST_REQUIRE_MESSAGE(memorySchemaFileStream.is_open(), "Cannot open test file!"); +// std::string memorySchemaJson((std::istreambuf_iterator(memorySchemaFileStream)), +// std::istreambuf_iterator()); +// rapidjson::Document memorySchemaDocument; +// BOOST_REQUIRE_MESSAGE(memorySchemaDocument.Parse(memorySchemaJson).HasParseError() == false, +// "Cannot parse JSON schema!"); +// rapidjson::SchemaDocument memorySchema(memorySchemaDocument); +// rapidjson::SchemaValidator memoryValidator(memorySchema); + +// for (const auto& result : results.GetArray()) { +// if (result.HasMember("analytics_memory_usage")) { +// BOOST_TEST_REQUIRE(result["analytics_memory_usage"].IsObject() == true); +// if (result["analytics_memory_usage"].Accept(memoryValidator) == false) { +// rapidjson::StringBuffer sb; +// memoryValidator.GetInvalidSchemaPointer().StringifyUriFragment(sb); +// LOG_ERROR(<< "Invalid schema: " << sb.GetString()); +// LOG_ERROR(<< "Invalid keyword: " +// << memoryValidator.GetInvalidSchemaKeyword()); +// sb.Clear(); +// memoryValidator.GetInvalidDocumentPointer().StringifyUriFragment(sb); +// LOG_ERROR(<< "Invalid document: " << sb.GetString()); +// BOOST_FAIL("Schema validation failed"); +// } +// } +// } +// } + +// BOOST_AUTO_TEST_CASE(testTrainingClassification) { +// std::stringstream output; +// auto outputWriterFactory = [&output]() { +// return std::make_unique(output); +// }; + +// TDoubleVec expectedPredictions; + +// TStrVec fieldNames{"f1", "f2", "f3", "f4", "target", ".", "."}; +// TStrVec fieldValues{"", "", "", "", "", "0", ""}; +// test::CDataFrameAnalysisSpecificationFactory specFactory; +// api::CDataFrameAnalyzer analyzer{ +// specFactory.rows(100) +// .memoryLimit(6000000) +// .columns(5) +// .predictionCategoricalFieldNames({"target"}) +// .predictionSpec(test::CDataFrameAnalysisSpecificationFactory::classification(), "target"), +// outputWriterFactory}; +// test::CDataFrameAnalyzerTrainingFactory::addPredictionTestData( +// test::CDataFrameAnalyzerTrainingFactory::E_BinaryClassification, +// fieldNames, fieldValues, analyzer, expectedPredictions); + +// analyzer.handleRecord(fieldNames, {"", "", "", "", "", "", "$"}); + +// rapidjson::Document results; +// rapidjson::ParseResult ok(results.Parse(output.str())); +// BOOST_TEST_REQUIRE(static_cast(ok) == true); + +// std::ifstream schemaFileStream("testfiles/instrumentation/classification_stats.schema.json"); +// BOOST_REQUIRE_MESSAGE(schemaFileStream.is_open(), "Cannot open test file!"); +// std::string schemaJson((std::istreambuf_iterator(schemaFileStream)), +// std::istreambuf_iterator()); +// rapidjson::Document schemaDocument; +// BOOST_REQUIRE_MESSAGE(schemaDocument.Parse(schemaJson).HasParseError() == false, +// "Cannot parse JSON schema!"); +// rapidjson::SchemaDocument schema(schemaDocument); +// rapidjson::SchemaValidator validator(schema); + +// for (const auto& result : results.GetArray()) { +// if (result.HasMember("analysis_stats")) { +// BOOST_TEST_REQUIRE(result["analysis_stats"].HasMember("classification_stats")); +// if (result["analysis_stats"]["classification_stats"].Accept(validator) == false) { +// rapidjson::StringBuffer sb; +// validator.GetInvalidSchemaPointer().StringifyUriFragment(sb); +// LOG_ERROR(<< "Invalid schema: " << sb.GetString()); +// LOG_ERROR(<< "Invalid keyword: " << validator.GetInvalidSchemaKeyword()); +// sb.Clear(); +// validator.GetInvalidDocumentPointer().StringifyUriFragment(sb); +// LOG_ERROR(<< "Invalid document: " << sb.GetString()); +// BOOST_FAIL("Schema validation failed"); +// } +// } +// } +// } + +// BOOST_AUTO_TEST_SUITE_END()